Add Emotion Support for the VITS model

Edresson Casanova 2022-03-15 01:16:48 +00:00
parent ad7ce05ac9
commit bd99548016
7 changed files with 311 additions and 46 deletions

View File

@ -152,6 +152,7 @@ If you don't specify any models, then it uses LJSpeech based English model.
# args for multi-speaker synthesis # args for multi-speaker synthesis
parser.add_argument("--speakers_file_path", type=str, help="JSON file for multi-speaker model.", default=None) parser.add_argument("--speakers_file_path", type=str, help="JSON file for multi-speaker model.", default=None)
parser.add_argument("--emotions_file_path", type=str, help="JSON file for emotion model.", default=None)
parser.add_argument("--language_ids_file_path", type=str, help="JSON file for multi-lingual model.", default=None) parser.add_argument("--language_ids_file_path", type=str, help="JSON file for multi-lingual model.", default=None)
parser.add_argument( parser.add_argument(
"--speaker_idx", "--speaker_idx",
@ -165,6 +166,12 @@ If you don't specify any models, then it uses LJSpeech based English model.
help="Target language ID for a multi-lingual TTS model.", help="Target language ID for a multi-lingual TTS model.",
default=None, default=None,
) )
parser.add_argument(
"--emotion_idx",
type=str,
help="Target emotion ID.",
default=None,
)
parser.add_argument( parser.add_argument(
"--speaker_wav", "--speaker_wav",
nargs="+", nargs="+",
@ -254,6 +261,7 @@ If you don't specify any models, then it uses LJSpeech based English model.
model_path = args.model_path model_path = args.model_path
config_path = args.config_path config_path = args.config_path
speakers_file_path = args.speakers_file_path speakers_file_path = args.speakers_file_path
emotions_file_path = args.emotions_file_path
language_ids_file_path = args.language_ids_file_path language_ids_file_path = args.language_ids_file_path
if args.vocoder_path is not None: if args.vocoder_path is not None:
@ -269,6 +277,7 @@ If you don't specify any models, then it uses LJSpeech based English model.
model_path, model_path,
config_path, config_path,
speakers_file_path, speakers_file_path,
emotions_file_path,
language_ids_file_path, language_ids_file_path,
vocoder_path, vocoder_path,
vocoder_config_path, vocoder_config_path,
@ -315,6 +324,7 @@ If you don't specify any models, then it uses LJSpeech based English model.
style_wav=args.capacitron_style_wav, style_wav=args.capacitron_style_wav,
style_text=args.capacitron_style_text, style_text=args.capacitron_style_text,
reference_speaker_name=args.reference_speaker_idx, reference_speaker_name=args.reference_speaker_idx,
emotion_name=args.emotion_idx,
) )
# save the results # save the results
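The new `--emotions_file_path` and `--emotion_idx` flags are threaded through to the Synthesizer the same way as the speaker flags. A minimal sketch of the equivalent Python call path added in this commit; the checkpoint, config, and JSON paths below are placeholders:

from TTS.utils.synthesizer import Synthesizer

# Sketch: what the CLI does internally with the new emotion flags (placeholder paths).
synthesizer = Synthesizer(
    tts_checkpoint="path/to/checkpoint.pth",
    tts_config_path="path/to/config.json",
    tts_speakers_file="path/to/speakers.json",
    tts_emotions_file="path/to/emotions.json",  # new argument in this commit
    use_cuda=False,
)
wav = synthesizer.tts(
    "Be a voice, not an echo.",
    speaker_name="ljspeech-1",
    emotion_name="Happy",  # new argument, resolved through the EmotionManager
)
synthesizer.save_wav(wav, "output.wav")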

View File

@ -428,3 +428,25 @@ class BaseTTS(BaseTrainerModel):
trainer.config.save_json(os.path.join(trainer.output_path, "config.json")) trainer.config.save_json(os.path.join(trainer.output_path, "config.json"))
print(f" > `language_ids.json` is saved to {output_path}.") print(f" > `language_ids.json` is saved to {output_path}.")
print(" > `language_ids_file` is updated in the config.json.") print(" > `language_ids_file` is updated in the config.json.")
if hasattr(self, "emotion_manager") and self.emotion_manager is not None:
output_path = os.path.join(trainer.output_path, "emotions.json")
if hasattr(trainer.config, "model_args"):
if trainer.config.model_args.use_emotion_embedding and not trainer.config.model_args.external_emotions_embs_file:
self.emotion_manager.save_ids_to_file(output_path)
trainer.config.model_args.emotions_ids_file = output_path
else:
self.emotion_manager.save_embeddings_to_file(output_path)
trainer.config.model_args.external_emotions_embs_file = output_path
else:
if trainer.config.use_emotion_embedding and not trainer.config.external_emotions_embs_file:
self.emotion_manager.save_ids_to_file(output_path)
trainer.config.emotions_ids_file = output_path
else:
self.emotion_manager.save_embeddings_to_file(output_path)
trainer.config.external_emotions_embs_file = output_path
trainer.config.save_json(os.path.join(trainer.output_path, "config.json"))
print(f" > `emotions.json` is saved to {output_path}.")
print(" > `emotions_ids_file` or `external_emotions_embs_file` is updated in the config.json.")

View File

@ -25,6 +25,7 @@ from TTS.tts.models.base_tts import BaseTTS
from TTS.tts.utils.helpers import generate_path, maximum_path, rand_segments, segment, sequence_mask from TTS.tts.utils.helpers import generate_path, maximum_path, rand_segments, segment, sequence_mask
from TTS.tts.utils.languages import LanguageManager from TTS.tts.utils.languages import LanguageManager
from TTS.tts.utils.speakers import SpeakerManager from TTS.tts.utils.speakers import SpeakerManager
from TTS.tts.utils.emotions import EmotionManager
from TTS.tts.utils.synthesis import synthesis from TTS.tts.utils.synthesis import synthesis
from TTS.tts.utils.text.characters import BaseCharacters, _characters, _pad, _phonemes, _punctuations from TTS.tts.utils.text.characters import BaseCharacters, _characters, _pad, _phonemes, _punctuations
from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.tts.utils.text.tokenizer import TTSTokenizer
@ -300,7 +301,7 @@ class VitsDataset(TTSDataset):
"waveform_lens": wav_lens, # (B) "waveform_lens": wav_lens, # (B)
"waveform_rel_lens": wav_rel_lens, "waveform_rel_lens": wav_rel_lens,
"speaker_names": batch["speaker_name"], "speaker_names": batch["speaker_name"],
"language_names": batch["language_name"], "f": batch["language_name"],
"audio_files": batch["wav_file"], "audio_files": batch["wav_file"],
"raw_text": batch["raw_text"], "raw_text": batch["raw_text"],
} }
@ -529,6 +530,15 @@ class VitsArgs(Coqpit):
speaker_embedding_channels: int = 256 speaker_embedding_channels: int = 256
use_d_vector_file: bool = False use_d_vector_file: bool = False
d_vector_dim: int = 0 d_vector_dim: int = 0
# use emotion embeddings
use_emotion_embedding: bool = False
use_external_emotions_embeddings: bool = False
emotions_ids_file: str = None
external_emotions_embs_file: str = None
emotion_embedding_dim: int = 0
num_emotions: int = 0
detach_dp_input: bool = True detach_dp_input: bool = True
use_language_embedding: bool = False use_language_embedding: bool = False
embedded_language_dim: int = 4 embedded_language_dim: int = 4
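The new VitsArgs fields choose between a trainable embedding looked up by emotion ID and externally computed emotion embeddings. A hedged configuration sketch for the learned-embedding path (the dimension value is illustrative):

from TTS.tts.configs.vits_config import VitsConfig

config = VitsConfig()
# learned emotion embedding, looked up by emotion id
config.model_args.use_emotion_embedding = True
config.model_args.use_external_emotions_embeddings = False
config.model_args.emotion_embedding_dim = 64   # illustrative size
config.model_args.emotions_ids_file = None     # filled in by on_init_end during training
# alternatively, switch the flags to use precomputed embeddings:
# config.model_args.use_external_emotions_embeddings = True
# config.model_args.external_emotions_embs_file = "emotions.json"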
@ -584,13 +594,14 @@ class Vits(BaseTTS):
tokenizer: "TTSTokenizer" = None, tokenizer: "TTSTokenizer" = None,
speaker_manager: SpeakerManager = None, speaker_manager: SpeakerManager = None,
language_manager: LanguageManager = None, language_manager: LanguageManager = None,
emotion_manager: EmotionManager = None,
): ):
super().__init__(config, ap, tokenizer, speaker_manager, language_manager) super().__init__(config, ap, tokenizer, speaker_manager, language_manager)
self.init_multispeaker(config) self.init_multispeaker(config)
self.init_multilingual(config) self.init_multilingual(config)
self.init_upsampling() self.init_upsampling()
self.init_emotion(config, emotion_manager)
self.length_scale = self.args.length_scale self.length_scale = self.args.length_scale
self.noise_scale = self.args.noise_scale self.noise_scale = self.args.noise_scale
@ -619,7 +630,7 @@ class Vits(BaseTTS):
kernel_size=self.args.kernel_size_posterior_encoder, kernel_size=self.args.kernel_size_posterior_encoder,
dilation_rate=self.args.dilation_rate_posterior_encoder, dilation_rate=self.args.dilation_rate_posterior_encoder,
num_layers=self.args.num_layers_posterior_encoder, num_layers=self.args.num_layers_posterior_encoder,
cond_channels=self.embedded_speaker_dim, cond_channels=self.cond_embedding_dim,
) )
self.flow = ResidualCouplingBlocks( self.flow = ResidualCouplingBlocks(
@ -628,7 +639,7 @@ class Vits(BaseTTS):
kernel_size=self.args.kernel_size_flow, kernel_size=self.args.kernel_size_flow,
dilation_rate=self.args.dilation_rate_flow, dilation_rate=self.args.dilation_rate_flow,
num_layers=self.args.num_layers_flow, num_layers=self.args.num_layers_flow,
cond_channels=self.embedded_speaker_dim, cond_channels=self.cond_embedding_dim,
) )
if self.args.use_sdp: if self.args.use_sdp:
@ -638,7 +649,7 @@ class Vits(BaseTTS):
3, 3,
self.args.dropout_p_duration_predictor, self.args.dropout_p_duration_predictor,
4, 4,
cond_channels=self.embedded_speaker_dim if self.args.condition_dp_on_speaker else 0, cond_channels=self.cond_embedding_dim if self.args.condition_dp_on_speaker else 0,
language_emb_dim=self.embedded_language_dim, language_emb_dim=self.embedded_language_dim,
) )
else: else:
@ -647,7 +658,7 @@ class Vits(BaseTTS):
256, 256,
3, 3,
self.args.dropout_p_duration_predictor, self.args.dropout_p_duration_predictor,
cond_channels=self.embedded_speaker_dim, cond_channels=self.cond_embedding_dim,
language_emb_dim=self.embedded_language_dim, language_emb_dim=self.embedded_language_dim,
) )
@ -661,7 +672,7 @@ class Vits(BaseTTS):
self.args.upsample_initial_channel_decoder, self.args.upsample_initial_channel_decoder,
self.args.upsample_rates_decoder, self.args.upsample_rates_decoder,
inference_padding=0, inference_padding=0,
cond_channels=self.embedded_speaker_dim, cond_channels=self.cond_embedding_dim,
conv_pre_weight_norm=False, conv_pre_weight_norm=False,
conv_post_weight_norm=False, conv_post_weight_norm=False,
conv_post_bias=False, conv_post_bias=False,
@ -683,7 +694,7 @@ class Vits(BaseTTS):
config (Coqpit): Model configuration. config (Coqpit): Model configuration.
data (List, optional): Dataset items to infer number of speakers. Defaults to None. data (List, optional): Dataset items to infer number of speakers. Defaults to None.
""" """
self.embedded_speaker_dim = 0 self.cond_embedding_dim = 0
self.num_speakers = self.args.num_speakers self.num_speakers = self.args.num_speakers
self.audio_transform = None self.audio_transform = None
@ -726,14 +737,14 @@ class Vits(BaseTTS):
# pylint: disable=attribute-defined-outside-init # pylint: disable=attribute-defined-outside-init
if self.num_speakers > 0: if self.num_speakers > 0:
print(" > initialization of speaker-embedding layers.") print(" > initialization of speaker-embedding layers.")
self.embedded_speaker_dim = self.args.speaker_embedding_channels self.cond_embedding_dim += self.args.speaker_embedding_channels
self.emb_g = nn.Embedding(self.num_speakers, self.embedded_speaker_dim) self.emb_g = nn.Embedding(self.num_speakers, self.args.speaker_embedding_channels)
def _init_d_vector(self): def _init_d_vector(self):
# pylint: disable=attribute-defined-outside-init # pylint: disable=attribute-defined-outside-init
if hasattr(self, "emb_g"): if hasattr(self, "emb_g"):
raise ValueError("[!] Speaker embedding layer already initialized before d_vector settings.") raise ValueError("[!] Speaker embedding layer already initialized before d_vector settings.")
self.embedded_speaker_dim = self.args.d_vector_dim self.cond_embedding_dim += self.args.d_vector_dim
def init_multilingual(self, config: Coqpit): def init_multilingual(self, config: Coqpit):
"""Initialize multilingual modules of a model. """Initialize multilingual modules of a model.
@ -785,9 +796,36 @@ class Vits(BaseTTS):
raise RuntimeError(" [!] The weights of Text Encoder was not reinit check it !") raise RuntimeError(" [!] The weights of Text Encoder was not reinit check it !")
print(" > Text Encoder was reinit.") print(" > Text Encoder was reinit.")
def init_emotion(self, config: Coqpit, emotion_manager: EmotionManager):
# pylint: disable=attribute-defined-outside-init
"""Initialize emotion modules of a model. A model can be trained either with a emotion embedding layer
or with external `embeddings` computed from a emotion encoder model.
You must provide a `emotion_manager` at initialization to set up the emotion modules.
Args:
config (Coqpit): Model configuration.
emotion_manager (Coqpit): Emotion Manager.
"""
self.emotion_manager = emotion_manager
self.num_emotions = self.args.num_emotions
if self.emotion_manager:
self.num_emotions = self.emotion_manager.num_emotions
if self.args.use_emotion_embedding:
if self.num_emotions > 0:
print(" > initialization of emotion-embedding layers.")
self.emb_emotion = nn.Embedding(self.num_emotions, self.args.emotion_embedding_dim)
self.cond_embedding_dim += self.args.emotion_embedding_dim
if self.args.use_external_emotions_embeddings:
self.cond_embedding_dim += self.args.emotion_embedding_dim
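With the rename from `embedded_speaker_dim` to `cond_embedding_dim`, the conditioning width now accumulates the speaker channels and, when enabled, the emotion embedding size; that total is what the posterior encoder, flow, duration predictor, and decoder receive as `cond_channels`. A small arithmetic sketch with illustrative sizes:

# Illustrative accumulation of the conditioning width used as `cond_channels`:
speaker_embedding_channels = 256  # from _init_speaker_embedding (or d_vector_dim)
emotion_embedding_dim = 64        # from init_emotion, when emotion support is enabled

cond_embedding_dim = 0
cond_embedding_dim += speaker_embedding_channels
cond_embedding_dim += emotion_embedding_dim
print(cond_embedding_dim)  # 320 -> matches the channel dim of torch.cat([g, eg], dim=1)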
def get_aux_input(self, aux_input: Dict): def get_aux_input(self, aux_input: Dict):
sid, g, lid = self._set_cond_input(aux_input) sid, g, lid, eid, eg = self._set_cond_input(aux_input)
return {"speaker_ids": sid, "style_wav": None, "d_vectors": g, "language_ids": lid} return {"speaker_ids": sid, "style_wav": None, "d_vectors": g, "language_ids": lid,
"emotion_embeddings": eg, "emotion_ids": eid}
def _freeze_layers(self): def _freeze_layers(self):
if self.args.freeze_encoder: if self.args.freeze_encoder:
@ -817,7 +855,7 @@ class Vits(BaseTTS):
@staticmethod @staticmethod
def _set_cond_input(aux_input: Dict): def _set_cond_input(aux_input: Dict):
"""Set the speaker conditioning input based on the multi-speaker mode.""" """Set the speaker conditioning input based on the multi-speaker mode."""
sid, g, lid = None, None, None sid, g, lid, eid, eg = None, None, None, None, None
if "speaker_ids" in aux_input and aux_input["speaker_ids"] is not None: if "speaker_ids" in aux_input and aux_input["speaker_ids"] is not None:
sid = aux_input["speaker_ids"] sid = aux_input["speaker_ids"]
if sid.ndim == 0: if sid.ndim == 0:
@ -832,7 +870,17 @@ class Vits(BaseTTS):
if lid.ndim == 0: if lid.ndim == 0:
lid = lid.unsqueeze_(0) lid = lid.unsqueeze_(0)
return sid, g, lid if "emotion_ids" in aux_input and aux_input["emotion_ids"] is not None:
eid = aux_input["emotion_ids"]
if eid.ndim == 0:
eid = eid.unsqueeze_(0)
if "emotion_embeddings" in aux_input and aux_input["emotion_embeddings"] is not None:
eg = F.normalize(aux_input["emotion_embeddings"]).unsqueeze(-1)
if eg.ndim == 2:
eg = eg.unsqueeze_(0)
return sid, g, lid, eid, eg
def _set_speaker_input(self, aux_input: Dict): def _set_speaker_input(self, aux_input: Dict):
d_vectors = aux_input.get("d_vectors", None) d_vectors = aux_input.get("d_vectors", None)
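`_set_cond_input` now also pulls `emotion_ids` and `emotion_embeddings` out of `aux_input`, unsqueezing scalar ids and L2-normalizing external embeddings. A standalone shape sketch of that normalization step (reimplemented here for illustration, with an assumed embedding size of 64):

import torch
import torch.nn.functional as F

# Shapes expected by the new emotion branch of _set_cond_input (sketch, B=1):
emotion_id = torch.tensor(2)            # 0-d id -> unsqueezed to [1]
emotion_embedding = torch.randn(1, 64)  # [B, emotion_embedding_dim]

eid = emotion_id.unsqueeze(0) if emotion_id.ndim == 0 else emotion_id
eg = F.normalize(emotion_embedding).unsqueeze(-1)  # [B, emotion_embedding_dim, 1]
print(eid.shape, eg.shape)  # torch.Size([1]) torch.Size([1, 64, 1])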
@ -906,7 +954,7 @@ class Vits(BaseTTS):
y: torch.tensor, y: torch.tensor,
y_lengths: torch.tensor, y_lengths: torch.tensor,
waveform: torch.tensor, waveform: torch.tensor,
aux_input={"d_vectors": None, "speaker_ids": None, "language_ids": None}, aux_input={"d_vectors": None, "speaker_ids": None, "language_ids": None, "emotion_embeddings": None, "emotion_ids": None},
) -> Dict: ) -> Dict:
"""Forward pass of the model. """Forward pass of the model.
@ -946,11 +994,19 @@ class Vits(BaseTTS):
- syn_spk_emb: :math:`[B, 1, speaker_encoder.proj_dim]` - syn_spk_emb: :math:`[B, 1, speaker_encoder.proj_dim]`
""" """
outputs = {} outputs = {}
sid, g, lid = self._set_cond_input(aux_input) sid, g, lid, eid, eg = self._set_cond_input(aux_input)
# speaker embedding # speaker embedding
if self.args.use_speaker_embedding and sid is not None: if self.args.use_speaker_embedding and sid is not None:
g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1] g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1]
# emotion embedding
if self.args.use_emotion_embedding and eid is not None and eg is None:
eg = self.emb_emotion(eid).unsqueeze(-1) # [b, h, 1]
# concat the emotion embedding and speaker embedding
if eg is not None and (self.args.use_emotion_embedding or self.args.use_external_emotions_embeddings):
g = torch.cat([g, eg], dim=1) # [b, h1+h2, 1]
# language embedding # language embedding
lang_emb = None lang_emb = None
if self.args.use_language_embedding and lid is not None: if self.args.use_language_embedding and lid is not None:
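In the forward pass the emotion embedding is concatenated with the speaker embedding along the channel axis, so the rest of the network sees a single conditioning tensor of width `cond_embedding_dim`. A shape-only sketch with illustrative sizes:

import torch

B = 2
g = torch.randn(B, 256, 1)     # speaker embedding / d-vector, [B, h_spk, 1]
eg = torch.randn(B, 64, 1)     # emotion embedding, [B, h_emo, 1]

g = torch.cat([g, eg], dim=1)  # [B, h_spk + h_emo, 1] -> fed as `g` to flow, SDP, decoder
print(g.shape)                 # torch.Size([2, 320, 1])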
@ -1028,7 +1084,7 @@ class Vits(BaseTTS):
@torch.no_grad() @torch.no_grad()
def inference( def inference(
self, x, aux_input={"x_lengths": None, "d_vectors": None, "speaker_ids": None, "language_ids": None} self, x, aux_input={"x_lengths": None, "d_vectors": None, "speaker_ids": None, "language_ids": None, "emotion_embeddings": None, "emotion_ids": None}
): # pylint: disable=dangerous-default-value ): # pylint: disable=dangerous-default-value
""" """
Note: Note:
@ -1048,13 +1104,21 @@ class Vits(BaseTTS):
- m_p: :math:`[B, C, T_dec]` - m_p: :math:`[B, C, T_dec]`
- logs_p: :math:`[B, C, T_dec]` - logs_p: :math:`[B, C, T_dec]`
""" """
sid, g, lid = self._set_cond_input(aux_input) sid, g, lid, eid, eg = self._set_cond_input(aux_input)
x_lengths = self._set_x_lengths(x, aux_input) x_lengths = self._set_x_lengths(x, aux_input)
# speaker embedding # speaker embedding
if self.args.use_speaker_embedding and sid is not None: if self.args.use_speaker_embedding and sid is not None:
g = self.emb_g(sid).unsqueeze(-1) g = self.emb_g(sid).unsqueeze(-1)
# emotion embedding
if self.args.use_emotion_embedding and eid is not None and eg is None:
eg = self.emb_emotion(eid).unsqueeze(-1) # [b, h, 1]
# concat the emotion embedding and speaker embedding
if eg is not None and (self.args.use_emotion_embedding or self.args.use_external_emotions_embeddings):
g = torch.cat([g, eg], dim=1) # [b, h1+h2, 1]
# language embedding # language embedding
lang_emb = None lang_emb = None
if self.args.use_language_embedding and lid is not None: if self.args.use_language_embedding and lid is not None:
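At inference time the same aux_input keys are accepted, so the emotion can be selected either by id (learned embedding) or by a precomputed embedding vector. A sketch of the dictionary that would be passed to `Vits.inference`; the model instance and tokenized input are assumed, not constructed here:

import torch

# Sketch of the aux_input accepted by Vits.inference() after this change.
# With `use_emotion_embedding`, pass integer `emotion_ids`; with
# `use_external_emotions_embeddings`, pass a [B, emotion_embedding_dim] tensor instead.
aux_input = {
    "x_lengths": torch.tensor([50]),
    "speaker_ids": torch.tensor([0]),
    "d_vectors": None,
    "language_ids": None,
    "emotion_ids": torch.tensor([1]),
    "emotion_embeddings": None,
}
# outputs = model.inference(token_ids, aux_input=aux_input)  # `model`, `token_ids` assumed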
@ -1187,6 +1251,8 @@ class Vits(BaseTTS):
d_vectors = batch["d_vectors"] d_vectors = batch["d_vectors"]
speaker_ids = batch["speaker_ids"] speaker_ids = batch["speaker_ids"]
language_ids = batch["language_ids"] language_ids = batch["language_ids"]
emotion_embeddings = batch["emotion_embeddings"]
emotion_ids = batch["emotion_ids"]
waveform = batch["waveform"] waveform = batch["waveform"]
# generator pass # generator pass
@ -1196,7 +1262,8 @@ class Vits(BaseTTS):
spec, spec,
spec_lens, spec_lens,
waveform, waveform,
aux_input={"d_vectors": d_vectors, "speaker_ids": speaker_ids, "language_ids": language_ids}, aux_input={"d_vectors": d_vectors, "speaker_ids": speaker_ids, "language_ids": language_ids,
"emotion_embeddings": emotion_embeddings, "emotion_ids": emotion_ids},
) )
# cache tensors for the generator pass # cache tensors for the generator pass
@ -1322,7 +1389,7 @@ class Vits(BaseTTS):
config = self.config config = self.config
# extract speaker and language info # extract speaker and language info
text, speaker_name, style_wav, language_name = None, None, None, None text, speaker_name, style_wav, language_name, emotion_name = None, None, None, None, None
if isinstance(sentence_info, list): if isinstance(sentence_info, list):
if len(sentence_info) == 1: if len(sentence_info) == 1:
@ -1333,11 +1400,13 @@ class Vits(BaseTTS):
text, speaker_name, style_wav = sentence_info text, speaker_name, style_wav = sentence_info
elif len(sentence_info) == 4: elif len(sentence_info) == 4:
text, speaker_name, style_wav, language_name = sentence_info text, speaker_name, style_wav, language_name = sentence_info
elif len(sentence_info) == 5:
text, speaker_name, style_wav, language_name, emotion_name = sentence_info
else: else:
text = sentence_info text = sentence_info
# get speaker id/d_vector # get speaker id/d_vector
speaker_id, d_vector, language_id = None, None, None speaker_id, d_vector, language_id, emotion_id, emotion_embedding = None, None, None, None, None
if hasattr(self, "speaker_manager"): if hasattr(self, "speaker_manager"):
if config.use_d_vector_file: if config.use_d_vector_file:
if speaker_name is None: if speaker_name is None:
@ -1354,6 +1423,19 @@ class Vits(BaseTTS):
if hasattr(self, "language_manager") and config.use_language_embedding and language_name is not None: if hasattr(self, "language_manager") and config.use_language_embedding and language_name is not None:
language_id = self.language_manager.ids[language_name] language_id = self.language_manager.ids[language_name]
# get emotion id/embedding
if hasattr(self, "emotion_manager"):
if config.use_external_emotions_embeddings:
if emotion_name is None:
emotion_embedding = self.emotion_manager.get_random_embeddings()
else:
emotion_embedding = self.emotion_manager.get_mean_embedding(emotion_name, num_samples=None, randomize=False)
elif config.use_emotion_embedding:
if emotion_name is None:
emotion_id = self.emotion_manager.get_random_id()
else:
emotion_id = self.emotion_manager.ids[emotion_name]
return { return {
"text": text, "text": text,
"speaker_id": speaker_id, "speaker_id": speaker_id,
@ -1361,6 +1443,8 @@ class Vits(BaseTTS):
"d_vector": d_vector, "d_vector": d_vector,
"language_id": language_id, "language_id": language_id,
"language_name": language_name, "language_name": language_name,
"emotion_embedding": emotion_embedding,
"emotion_ids": emotion_id
} }
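Test sentences can now carry a fifth element naming the emotion, which is exactly the format the training test at the end of this diff uses. A short example of the 5-element layout parsed above (the second sentence is illustrative):

# [text, speaker_name, style_wav, language_name, emotion_name]
test_sentences = [
    ["Be a voice, not an echo.", "ljspeech-1", None, None, "ljspeech-1"],
    ["This is another test sentence.", "ljspeech-1", None, None, "ljspeech-1"],
]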
@torch.no_grad() @torch.no_grad()
@ -1387,6 +1471,8 @@ class Vits(BaseTTS):
d_vector=aux_inputs["d_vector"], d_vector=aux_inputs["d_vector"],
style_wav=aux_inputs["style_wav"], style_wav=aux_inputs["style_wav"],
language_id=aux_inputs["language_id"], language_id=aux_inputs["language_id"],
emotion_embedding=aux_inputs["emotion_embedding"],
emotion_id=aux_inputs["emotion_ids"],
use_griffin_lim=True, use_griffin_lim=True,
do_trim_silence=False, do_trim_silence=False,
).values() ).values()
@ -1401,10 +1487,12 @@ class Vits(BaseTTS):
logger.test_figures(steps, outputs["figures"]) logger.test_figures(steps, outputs["figures"])
def format_batch(self, batch: Dict) -> Dict: def format_batch(self, batch: Dict) -> Dict:
"""Compute speaker, langugage IDs and d_vector for the batch if necessary.""" """Compute speaker, langugage IDs, d_vector and emotion embeddings for the batch if necessary."""
speaker_ids = None speaker_ids = None
language_ids = None language_ids = None
d_vectors = None d_vectors = None
emotion_embeddings = None
emotion_ids = None
# get numerical speaker ids from speaker names # get numerical speaker ids from speaker names
if self.speaker_manager is not None and self.speaker_manager.ids and self.args.use_speaker_embedding: if self.speaker_manager is not None and self.speaker_manager.ids and self.args.use_speaker_embedding:
@ -1421,15 +1509,33 @@ class Vits(BaseTTS):
d_vectors = torch.FloatTensor(d_vectors) d_vectors = torch.FloatTensor(d_vectors)
# get language ids from language names # get language ids from language names
if self.language_manager is not None and self.language_manager.ids and self.args.use_language_embedding: if (
self.language_manager is not None
and self.language_manager.ids
and self.args.use_language_embedding
):
language_ids = [self.language_manager.ids[ln] for ln in batch["language_names"]] language_ids = [self.language_manager.ids[ln] for ln in batch["language_names"]]
if language_ids is not None: if language_ids is not None:
language_ids = torch.LongTensor(language_ids) language_ids = torch.LongTensor(language_ids)
# get emotion embedding
if self.emotion_manager is not None and self.emotion_manager.embeddings and self.args.use_external_emotions_embeddings:
emotion_mapping = self.emotion_manager.embeddings
emotion_embeddings = [emotion_mapping[w]["embedding"] for w in batch["audio_files"]]
emotion_embeddings = torch.FloatTensor(emotion_embeddings)
if self.emotion_manager is not None and self.emotion_manager.embeddings and self.args.use_emotion_embedding:
emotion_mapping = self.emotion_manager.embeddings
emotion_names = [emotion_mapping[w]["name"] for w in batch["audio_files"]]
emotion_ids = [self.emotion_manager.ids[en] for en in emotion_names]
emotion_ids = torch.LongTensor(emotion_ids)
batch["language_ids"] = language_ids batch["language_ids"] = language_ids
batch["d_vectors"] = d_vectors batch["d_vectors"] = d_vectors
batch["speaker_ids"] = speaker_ids batch["speaker_ids"] = speaker_ids
batch["emotion_embeddings"] = emotion_embeddings
batch["emotion_ids"] = emotion_ids
return batch return batch
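`format_batch` looks up emotion information per audio file path, so the EmotionManager's embeddings mapping is keyed by wav path with a name and an embedding per entry. A runnable sketch of that lookup; the mapping layout is inferred from this code, and the paths, labels, and 3-dim embeddings are illustrative:

import torch

# Inferred layout: audio file path -> {"name": <emotion>, "embedding": <vector>}
emotion_mapping = {
    "wavs/sample_0001.wav": {"name": "Happy", "embedding": [0.01, -0.43, 0.22]},
    "wavs/sample_0002.wav": {"name": "Sad", "embedding": [0.18, 0.05, -0.31]},
}
audio_files = ["wavs/sample_0002.wav", "wavs/sample_0001.wav"]

emotion_embeddings = torch.FloatTensor([emotion_mapping[w]["embedding"] for w in audio_files])
emotion_ids_map = {"Happy": 0, "Sad": 1}
emotion_ids = torch.LongTensor([emotion_ids_map[emotion_mapping[w]["name"]] for w in audio_files])
print(emotion_embeddings.shape, emotion_ids)  # torch.Size([2, 3]) tensor([1, 0])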
def format_batch_on_device(self, batch): def format_batch_on_device(self, batch):
@ -1643,12 +1749,13 @@ class Vits(BaseTTS):
tokenizer, new_config = TTSTokenizer.init_from_config(config) tokenizer, new_config = TTSTokenizer.init_from_config(config)
speaker_manager = SpeakerManager.init_from_config(config, samples) speaker_manager = SpeakerManager.init_from_config(config, samples)
language_manager = LanguageManager.init_from_config(config) language_manager = LanguageManager.init_from_config(config)
emotion_manager = EmotionManager.init_from_config(config)
if config.model_args.speaker_encoder_model_path: if config.model_args.speaker_encoder_model_path:
speaker_manager.init_encoder( speaker_manager.init_encoder(
config.model_args.speaker_encoder_model_path, config.model_args.speaker_encoder_config_path config.model_args.speaker_encoder_model_path, config.model_args.speaker_encoder_config_path
) )
return Vits(new_config, ap, tokenizer, speaker_manager, language_manager) return Vits(new_config, ap, tokenizer, speaker_manager, language_manager, emotion_manager=emotion_manager)
################################## ##################################

View File

@ -1,10 +1,8 @@
import json import json
import os import os
from typing import Any, Dict, List, Tuple, Union from typing import Any, List
import fsspec import fsspec
import numpy as np
import torch
from coqpit import Coqpit from coqpit import Coqpit
from TTS.config import get_from_config_or_model_args_with_default from TTS.config import get_from_config_or_model_args_with_default
@ -34,7 +32,7 @@ class EmotionManager(EmbeddingManager):
computes the embeddings for a given clip or emotion. computes the embeddings for a given clip or emotion.
Args: Args:
emo_embeddings_file_path (str, optional): Path to the metafile including x vectors. Defaults to "". embeddings_file_path (str, optional): Path to the metafile that includes the emotion embeddings. Defaults to "".
emotion_id_file_path (str, optional): Path to the metafile that maps emotion names to ids used by emotion_id_file_path (str, optional): Path to the metafile that maps emotion names to ids used by
TTS models. Defaults to "". TTS models. Defaults to "".
encoder_model_path (str, optional): Path to the emotion encoder model file. Defaults to "". encoder_model_path (str, optional): Path to the emotion encoder model file. Defaults to "".
@ -50,14 +48,14 @@ class EmotionManager(EmbeddingManager):
def __init__( def __init__(
self, self,
emo_embeddings_file_path: str = "", embeddings_file_path: str = "",
emotion_id_file_path: str = "", emotion_id_file_path: str = "",
encoder_model_path: str = "", encoder_model_path: str = "",
encoder_config_path: str = "", encoder_config_path: str = "",
use_cuda: bool = False, use_cuda: bool = False,
): ):
super().__init__( super().__init__(
external_emotions_ids_file_path=emo_embeddings_file_path, embedding_file_path=embeddings_file_path,
id_file_path=emotion_id_file_path, id_file_path=emotion_id_file_path,
encoder_model_path=encoder_model_path, encoder_model_path=encoder_model_path,
encoder_config_path=encoder_config_path, encoder_config_path=encoder_config_path,
@ -98,11 +96,15 @@ class EmotionManager(EmbeddingManager):
emotion_manager = EmotionManager( emotion_manager = EmotionManager(
emotion_id_file_path=get_from_config_or_model_args_with_default(config, "emotions_ids_file", None) emotion_id_file_path=get_from_config_or_model_args_with_default(config, "emotions_ids_file", None)
) )
elif get_from_config_or_model_args_with_default(config, "external_emotions_embs_file", None):
if get_from_config_or_model_args_with_default(config, "use_external_emotion_embedding", False):
if get_from_config_or_model_args_with_default(config, "external_emotions_ids_file", None):
emotion_manager = EmotionManager( emotion_manager = EmotionManager(
embeddings_file_path=get_from_config_or_model_args_with_default(config, "external_emotions_ids_file", None) embeddings_file_path=get_from_config_or_model_args_with_default(config, "external_emotions_embs_file", None)
)
if get_from_config_or_model_args_with_default(config, "use_external_emotions_embeddings", False):
if get_from_config_or_model_args_with_default(config, "external_emotions_embs_file", None):
emotion_manager = EmotionManager(
embeddings_file_path=get_from_config_or_model_args_with_default(config, "external_emotions_embs_file", None)
) )
return emotion_manager return emotion_manager
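`EmotionManager.init_from_config` picks either the ids file or the external embeddings file depending on the config flags; the same manager can also be constructed directly. A hedged construction sketch, with placeholder file paths:

from TTS.tts.utils.emotions import EmotionManager

# Learned-embedding setup: the ids file maps emotion names to integer ids (placeholder path).
manager = EmotionManager(emotion_id_file_path="emotions.json")
print(manager.num_emotions)

# External-embedding setup: the embeddings file maps wav paths to name/embedding entries.
manager = EmotionManager(embeddings_file_path="emotions.json")
print(manager.num_emotions)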
@ -157,26 +159,26 @@ def get_emotion_manager(c: Coqpit, restore_path: str = None, out_path: str = Non
if c.use_external_emotions_embeddings: if c.use_external_emotions_embeddings:
# restore emotion manager with the embedding file # restore emotion manager with the embedding file
if not os.path.exists(emotions_ids_file): if not os.path.exists(emotions_ids_file):
print("WARNING: emotions.json was not found in restore_path, trying to use CONFIG.external_emotions_ids_file") print("WARNING: emotions.json was not found in restore_path, trying to use CONFIG.external_emotions_embs_file")
if not os.path.exists(c.external_emotions_ids_file): if not os.path.exists(c.external_emotions_embs_file):
raise RuntimeError( raise RuntimeError(
"You must copy the file emotions.json to restore_path, or set a valid file in CONFIG.external_emotions_ids_file" "You must copy the file emotions.json to restore_path, or set a valid file in CONFIG.external_emotions_embs_file"
) )
emotion_manager.load_embeddings_from_file(c.external_emotions_ids_file) emotion_manager.load_embeddings_from_file(c.external_emotions_embs_file)
emotion_manager.load_embeddings_from_file(emotions_ids_file) emotion_manager.load_embeddings_from_file(emotions_ids_file)
elif not c.use_external_emotions_embeddings: # restore emotion manager with the emotion ID file. elif not c.use_external_emotions_embeddings: # restore emotion manager with the emotion ID file.
emotion_manager.load_ids_from_file(emotions_ids_file) emotion_manager.load_ids_from_file(emotions_ids_file)
elif c.use_external_emotions_embeddings and c.external_emotions_ids_file: elif c.use_external_emotions_embeddings and c.external_emotions_embs_file:
# new emotion manager with external emotion embeddings. # new emotion manager with external emotion embeddings.
emotion_manager.load_embeddings_from_file(c.external_emotions_ids_file) emotion_manager.load_embeddings_from_file(c.external_emotions_embs_file)
elif c.use_external_emotions_embeddings and not c.external_emotions_ids_file: elif c.use_external_emotions_embeddings and not c.external_emotions_embs_file:
raise "use_external_emotions_embeddings is True, so you need pass a external emotion embedding file." raise "use_external_emotions_embeddings is True, so you need pass a external emotion embedding file."
elif c.use_emotion_embedding: elif c.use_emotion_embedding:
if "emotions_ids_file" in c and c.emotions_ids_file: if "emotions_ids_file" in c and c.emotions_ids_file:
emotion_manager.load_ids_from_file(c.emotions_ids_file) emotion_manager.load_ids_from_file(c.emotions_ids_file)
else: # get the ids from the external embeddings file else: # get the ids from the external embeddings file
emotion_manager.load_embeddings_from_file(c.external_emotions_ids_file) emotion_manager.load_embeddings_from_file(c.external_emotions_embs_file)
if emotion_manager.num_emotions > 0: if emotion_manager.num_emotions > 0:
print( print(
@ -189,9 +191,8 @@ def get_emotion_manager(c: Coqpit, restore_path: str = None, out_path: str = Non
if out_path: if out_path:
out_file_path = os.path.join(out_path, "emotions.json") out_file_path = os.path.join(out_path, "emotions.json")
print(f" > Saving `emotions.json` to {out_file_path}.") print(f" > Saving `emotions.json` to {out_file_path}.")
if c.use_external_emotions_embeddings and c.external_emotions_ids_file: if c.use_external_emotions_embeddings and c.external_emotions_embs_file:
emotion_manager.save_embeddings_to_file(out_file_path) emotion_manager.save_embeddings_to_file(out_file_path)
else: else:
emotion_manager.save_ids_to_file(out_file_path) emotion_manager.save_ids_to_file(out_file_path)
return emotion_manager return emotion_manager

View File

@ -29,6 +29,8 @@ def run_model_torch(
style_text: str = None, style_text: str = None,
d_vector: torch.Tensor = None, d_vector: torch.Tensor = None,
language_id: torch.Tensor = None, language_id: torch.Tensor = None,
emotion_id: torch.Tensor = None,
emotion_embedding: torch.Tensor = None,
) -> Dict: ) -> Dict:
"""Run a torch model for inference. It does not support batch inference. """Run a torch model for inference. It does not support batch inference.
@ -56,6 +58,8 @@ def run_model_torch(
"style_mel": style_mel, "style_mel": style_mel,
"style_text": style_text, "style_text": style_text,
"language_ids": language_id, "language_ids": language_id,
"emotion_ids": emotion_id,
"emotion_embeddings": emotion_embedding,
}, },
) )
return outputs return outputs
@ -122,6 +126,8 @@ def synthesis(
do_trim_silence=False, do_trim_silence=False,
d_vector=None, d_vector=None,
language_id=None, language_id=None,
emotion_id=None,
emotion_embedding=None
): ):
"""Synthesize voice for the given text using Griffin-Lim vocoder or just compute output features to be passed to """Synthesize voice for the given text using Griffin-Lim vocoder or just compute output features to be passed to
the vocoder model. the vocoder model.
@ -190,6 +196,12 @@ def synthesis(
if language_id is not None: if language_id is not None:
language_id = id_to_torch(language_id, cuda=use_cuda) language_id = id_to_torch(language_id, cuda=use_cuda)
if emotion_id is not None:
emotion_id = id_to_torch(emotion_id, cuda=use_cuda)
if emotion_embedding is not None:
emotion_embedding = embedding_to_torch(emotion_embedding, cuda=use_cuda)
if not isinstance(style_mel, dict): if not isinstance(style_mel, dict):
# GST or Capacitron style mel # GST or Capacitron style mel
style_mel = numpy_to_torch(style_mel, torch.float, cuda=use_cuda) style_mel = numpy_to_torch(style_mel, torch.float, cuda=use_cuda)
@ -212,6 +224,8 @@ def synthesis(
style_text, style_text,
d_vector=d_vector, d_vector=d_vector,
language_id=language_id, language_id=language_id,
emotion_id=emotion_id,
emotion_embedding=emotion_embedding,
) )
model_outputs = outputs["model_outputs"] model_outputs = outputs["model_outputs"]
model_outputs = model_outputs[0].data.cpu().numpy() model_outputs = model_outputs[0].data.cpu().numpy()
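`synthesis()` now forwards `emotion_id` or `emotion_embedding` down to `run_model_torch`, mirroring the speaker arguments. A hedged call sketch wrapped in a helper; `model` is assumed to be a loaded, emotion-enabled Vits instance and `config` its Coqpit config, and only one of the two emotion arguments should be set:

from TTS.tts.utils.synthesis import synthesis

def synthesize_with_emotion(model, config, text, emotion_id=None, emotion_embedding=None):
    """Sketch: run an emotion-enabled Vits through the updated synthesis() helper."""
    outputs = synthesis(
        model,
        text,
        config,
        use_cuda=False,
        speaker_id=0,
        emotion_id=emotion_id,            # for a learned emotion embedding
        emotion_embedding=emotion_embedding,  # or a precomputed embedding vector
        use_griffin_lim=True,
        do_trim_silence=False,
    )
    return outputs["wav"]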

View File

@ -22,6 +22,7 @@ class Synthesizer(object):
tts_checkpoint: str, tts_checkpoint: str,
tts_config_path: str, tts_config_path: str,
tts_speakers_file: str = "", tts_speakers_file: str = "",
tts_emotions_file: str = "",
tts_languages_file: str = "", tts_languages_file: str = "",
vocoder_checkpoint: str = "", vocoder_checkpoint: str = "",
vocoder_config: str = "", vocoder_config: str = "",
@ -52,6 +53,7 @@ class Synthesizer(object):
self.tts_checkpoint = tts_checkpoint self.tts_checkpoint = tts_checkpoint
self.tts_config_path = tts_config_path self.tts_config_path = tts_config_path
self.tts_speakers_file = tts_speakers_file self.tts_speakers_file = tts_speakers_file
self.tts_emotions_file = tts_emotions_file
self.tts_languages_file = tts_languages_file self.tts_languages_file = tts_languages_file
self.vocoder_checkpoint = vocoder_checkpoint self.vocoder_checkpoint = vocoder_checkpoint
self.vocoder_config = vocoder_config self.vocoder_config = vocoder_config
@ -183,6 +185,7 @@ class Synthesizer(object):
style_text=None, style_text=None,
reference_wav=None, reference_wav=None,
reference_speaker_name=None, reference_speaker_name=None,
emotion_name=None,
) -> List[int]: ) -> List[int]:
"""🐸 TTS magic. Run all the models and generate speech. """🐸 TTS magic. Run all the models and generate speech.
@ -240,7 +243,7 @@ class Synthesizer(object):
"Define path for speaker.json if it is a multi-speaker model or remove defined speaker idx. " "Define path for speaker.json if it is a multi-speaker model or remove defined speaker idx. "
) )
# handle multi-lingaul # handle multi-lingual
language_id = None language_id = None
if self.tts_languages_file or ( if self.tts_languages_file or (
hasattr(self.tts_model, "language_manager") and self.tts_model.language_manager is not None hasattr(self.tts_model, "language_manager") and self.tts_model.language_manager is not None
@ -260,6 +263,29 @@ class Synthesizer(object):
"Define path for language_ids.json if it is a multi-lingual model or remove defined language idx. " "Define path for language_ids.json if it is a multi-lingual model or remove defined language idx. "
) )
# handle emotion
emotion_embedding, emotion_id = None, None
if self.tts_emotions_file or (hasattr(self.tts_model, "emotion_manager") and self.tts_model.emotion_manager is not None):
if emotion_name and isinstance(emotion_name, str):
if getattr(self.tts_config, "use_external_emotions_embeddings", False) or getattr(self.tts_config.model_args, "use_external_emotions_embeddings", False):
# get the average emotion embedding from the saved embeddings.
emotion_embedding = self.tts_model.emotion_manager.get_mean_embedding(emotion_name, num_samples=None, randomize=False)
emotion_embedding = np.array(emotion_embedding)[None, :] # [1 x embedding_dim]
else:
# get the emotion id from the emotion name
emotion_id = self.tts_model.emotion_manager.ids[emotion_name]
elif not emotion_name:
raise ValueError(
" [!] Look like you use an emotion model. "
"You need to define either a `emotion_name` to use an emotion model."
)
else:
if emotion_name:
raise ValueError(
f" [!] Missing emotion.json file path for selecting the emotion {emotion_name}."
"Define path for emotion.json if it is an emotion model or remove defined emotion idx. "
)
# compute a new d_vector from the given clip. # compute a new d_vector from the given clip.
if speaker_wav is not None: if speaker_wav is not None:
speaker_embedding = self.tts_model.speaker_manager.compute_embedding_from_clip(speaker_wav) speaker_embedding = self.tts_model.speaker_manager.compute_embedding_from_clip(speaker_wav)
@ -280,6 +306,8 @@ class Synthesizer(object):
use_griffin_lim=use_gl, use_griffin_lim=use_gl,
d_vector=speaker_embedding, d_vector=speaker_embedding,
language_id=language_id, language_id=language_id,
emotion_embedding=emotion_embedding,
emotion_id=emotion_id,
) )
waveform = outputs["wav"] waveform = outputs["wav"]
mel_postnet_spec = outputs["outputs"]["model_outputs"][0].detach().cpu().numpy() mel_postnet_spec = outputs["outputs"]["model_outputs"][0].detach().cpu().numpy()

View File

@ -0,0 +1,83 @@
import glob
import os
import shutil
from trainer import get_last_checkpoint
from tests import get_device_id, get_tests_output_path, run_cli
from TTS.tts.configs.vits_config import VitsConfig
config_path = os.path.join(get_tests_output_path(), "test_model_config.json")
output_path = os.path.join(get_tests_output_path(), "train_outputs")
config = VitsConfig(
batch_size=2,
eval_batch_size=2,
num_loader_workers=0,
num_eval_loader_workers=0,
text_cleaner="english_cleaners",
use_phonemes=True,
phoneme_language="en-us",
phoneme_cache_path="tests/data/ljspeech/phoneme_cache/",
run_eval=True,
test_delay_epochs=-1,
epochs=1,
print_step=1,
print_eval=True,
test_sentences=[
["Be a voice, not an echo.", "ljspeech-1", None, None, "ljspeech-1"],
],
)
# set audio config
config.audio.do_trim_silence = True
config.audio.trim_db = 60
# enable multi-speaker mode with a learned speaker embedding (no d-vector file)
config.model_args.use_speaker_embedding = True
config.model_args.use_d_vector_file = False
config.model_args.d_vector_file = None
config.model_args.d_vector_dim = 256
# enable emotion support with external emotion embeddings (the speakers.json d-vector file doubles as the emotion embeddings in this test)
config.model_args.use_external_emotions_embeddings = True
config.model_args.use_emotion_embedding = False
config.model_args.emotion_embedding_dim = 256
config.model_args.external_emotions_embs_file = "tests/data/ljspeech/speakers.json"
config.save_json(config_path)
# train the model for one epoch
command_train = (
f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} "
f"--coqpit.output_path {output_path} "
"--coqpit.datasets.0.name ljspeech_test "
"--coqpit.datasets.0.meta_file_train metadata.csv "
"--coqpit.datasets.0.meta_file_val metadata.csv "
"--coqpit.datasets.0.path tests/data/ljspeech "
"--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt "
"--coqpit.test_delay_epochs 0"
)
run_cli(command_train)
# Find latest folder
continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime)
# Inference using TTS API
continue_config_path = os.path.join(continue_path, "config.json")
continue_restore_path, _ = get_last_checkpoint(continue_path)
out_wav_path = os.path.join(get_tests_output_path(), "output.wav")
speaker_id = "ljspeech-1"
emotion_id = "ljspeech-1"
continue_speakers_path = os.path.join(continue_path, "speakers.json")
continue_emotion_path = os.path.join(continue_path, "speakers.json")
inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --emotion_idx {emotion_id} --speakers_file_path {continue_speakers_path} --emotions_file_path {continue_emotion_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}"
run_cli(inference_command)
# restore the model and continue training for one more epoch
command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} "
run_cli(command_train)
shutil.rmtree(continue_path)