From 251e1c289dd67d919cbee35dfcfd20ab2433dc8e Mon Sep 17 00:00:00 2001
From: Edresson Casanova
Date: Mon, 13 Jun 2022 13:47:31 +0000
Subject: [PATCH] Add support for inference using a specific reference file
 instead of the averaged embeddings

---
 TTS/utils/synthesizer.py                      | 34 +++++++++++++------
 ...est_vits_speaker_emb_with_emotion_train.py |  8 +++++
 2 files changed, 31 insertions(+), 11 deletions(-)

diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py
index e9a79220..716f9fb7 100644
--- a/TTS/utils/synthesizer.py
+++ b/TTS/utils/synthesizer.py
@@ -252,17 +252,24 @@ class Synthesizer(object):
             if speaker_name and isinstance(speaker_name, str):
                 if self.tts_config.use_d_vector_file:
                     # get the average speaker embedding from the saved d_vectors.
-                    speaker_embedding = self.tts_model.speaker_manager.get_mean_embedding(
-                        speaker_name, num_samples=None, randomize=False
-                    )
+                    if speaker_name in self.tts_model.speaker_manager.ids:
+                        speaker_embedding = self.tts_model.speaker_manager.get_mean_embedding(
+                            speaker_name, num_samples=None, randomize=False
+                        )
+                    else:
+                        speaker_embedding = self.tts_model.speaker_manager.embeddings[speaker_name]["embedding"]
+
                     speaker_embedding = np.array(speaker_embedding)[None, :]  # [1 x embedding_dim]
                     if style_speaker_name is not None:
-                        style_speaker_embedding = self.tts_model.speaker_manager.get_mean_embedding(
-                            style_speaker_name, num_samples=None, randomize=False
-                        )
-                        style_speaker_embedding = np.array(style_speaker_embedding)[None, :]  # [1 x embedding_dim]
+                        if style_speaker_name in self.tts_model.speaker_manager.ids:
+                            style_speaker_embedding = self.tts_model.speaker_manager.get_mean_embedding(
+                                style_speaker_name, num_samples=None, randomize=False
+                            )
+                        else:
+                            style_speaker_embedding = self.tts_model.speaker_manager.embeddings[style_speaker_name]["embedding"]
+                        style_speaker_embedding = np.array(style_speaker_embedding)[None, :]  # [1 x embedding_dim]
                 else:
                     # get speaker idx from the speaker name
                     speaker_id = self.tts_model.speaker_manager.ids[speaker_name]
@@ -322,11 +329,16 @@ class Synthesizer(object):
                 getattr(self.tts_config, "model_args", None)
                 and getattr(self.tts_config.model_args, "use_external_emotions_embeddings", False)
             ):
-                # get the average speaker embedding from the saved embeddings.
-                emotion_embedding = self.tts_model.emotion_manager.get_mean_embedding(
-                    emotion_name, num_samples=None, randomize=False
-                )
+                if emotion_name in self.tts_model.emotion_manager.ids:
+                    # get the average emotion embedding from the saved embeddings.
+                    emotion_embedding = self.tts_model.emotion_manager.get_mean_embedding(
+                        emotion_name, num_samples=None, randomize=False
+                    )
+                else:
+                    emotion_embedding = self.tts_model.emotion_manager.embeddings[emotion_name]["embedding"]
+
                 emotion_embedding = np.array(emotion_embedding)[None, :]  # [1 x embedding_dim]
+
             else:
                 # get speaker idx from the speaker name
                 emotion_id = self.tts_model.emotion_manager.ids[emotion_name]
diff --git a/tests/tts_tests/test_vits_speaker_emb_with_emotion_train.py b/tests/tts_tests/test_vits_speaker_emb_with_emotion_train.py
index bef67ee5..5f2bd4a9 100644
--- a/tests/tts_tests/test_vits_speaker_emb_with_emotion_train.py
+++ b/tests/tts_tests/test_vits_speaker_emb_with_emotion_train.py
@@ -27,6 +27,7 @@ config = VitsConfig(
     print_eval=True,
     test_sentences=[
         ["Be a voice, not an echo.", "ljspeech-1", None, None, "ljspeech-1"],
+        ["Be a voice, not an echo.", "LJ001-0001.wav", None, None, "LJ001-0002.wav"],
     ],
 )
 # set audio config
@@ -83,6 +84,13 @@ continue_emotion_path = os.path.join(continue_path, "emotions.json")
 
 inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --emotion_idx {emotion_id} --speakers_file_path {continue_speakers_path} --emotions_file_path {continue_emotion_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}"
 run_cli(inference_command)
+# inference with a specific reference embedding instead of the average
+speaker_id = "LJ001-0001.wav"
+emotion_id = "LJ001-0002.wav"
+inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --emotion_idx {emotion_id} --speakers_file_path {continue_speakers_path} --emotions_file_path {continue_emotion_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}"
+run_cli(inference_command)
+
+
 # restore the model and continue training for one more epoch
 command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} "
 run_cli(command_train)
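
Note for reviewers: the two synthesizer.py hunks apply the same lookup rule three times
(speaker, style speaker, emotion). Below is a minimal standalone sketch of that rule;
resolve_embedding is a hypothetical helper, not part of the patch. It assumes the manager
layout the patch itself relies on: `ids` holds the known speaker/emotion names, and
`embeddings` maps per-utterance reference keys (e.g. "LJ001-0001.wav") to
{"name": ..., "embedding": [...]} entries, as in the saved d-vector files.

    import numpy as np

    def resolve_embedding(manager, name):
        """Return a [1 x embedding_dim] embedding for `name`.

        `name` can be a speaker/emotion name (yields the averaged embedding)
        or a reference-file key (yields that single file's embedding).
        """
        if name in manager.ids:
            # known name -> average of all embeddings saved for it
            embedding = manager.get_mean_embedding(name, num_samples=None, randomize=False)
        else:
            # unknown name -> treat it as a specific reference-file key
            embedding = manager.embeddings[name]["embedding"]
        return np.array(embedding)[None, :]

With this change, the tts CLI flags --speaker_idx and --emotion_idx also accept a
reference-file key (e.g. LJ001-0001.wav) in place of a name, as exercised by the new
inference commands in the test above.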