From 046b137946086fd729b9b1e3909ab6ea285cd582 Mon Sep 17 00:00:00 2001
From: Yehor Smoliakov
Date: Fri, 16 Dec 2022 12:30:44 +0200
Subject: [PATCH 01/24] Add Ukrainian LADA (female) voice

---
 docs/source/tts_datasets.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/docs/source/tts_datasets.md b/docs/source/tts_datasets.md
index 852ccd37..11da1b76 100644
--- a/docs/source/tts_datasets.md
+++ b/docs/source/tts_datasets.md
@@ -12,5 +12,6 @@ Some of the known public datasets that we successfully applied 🐸TTS:
 - [German - Thorsten OGVD](https://github.com/thorstenMueller/deep-learning-german-tts)
 - [Japanese - Kokoro](https://www.kaggle.com/kaiida/kokoro-speech-dataset-v11-small/version/1)
 - [Chinese](https://www.data-baker.com/data/index/source/)
+- [Ukrainian - LADA](https://github.com/egorsmkv/ukrainian-tts-datasets/tree/main/lada)
 
-Let us know if you use 🐸TTS on a different dataset.
\ No newline at end of file
+Let us know if you use 🐸TTS on a different dataset.

From cf765cb3f2c4b29f8c91eb5eda52ba2203a09eb3 Mon Sep 17 00:00:00 2001
From: Eren Gölge
Date: Mon, 26 Dec 2022 14:29:10 +0100
Subject: [PATCH 02/24] Add ca and fa models

---
 TTS/.models.json | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/TTS/.models.json b/TTS/.models.json
index 069de683..52cdf795 100644
--- a/TTS/.models.json
+++ b/TTS/.models.json
@@ -617,6 +617,30 @@
                 "license": "bsd-3-clause"
             }
         }
+        },
+        "ca": {
+            "custom": {
+                "vits":{
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.10.1_models/tts_models--ca--custom--vits.zip",
+                    "default_vocoder": null,
+                    "commit": null,
+                    "description": "It is trained from scratch with 101460 utterances consisting of 257 speakers, approx. 138 hours of speech. We used three datasets:\nFestcat and Google Catalan TTS (both TTS datasets) and also a part of Common Voice 8. It is trained with TTS v0.8.0.\nhttps://github.com/coqui-ai/TTS/discussions/930#discussioncomment-4466345",
+                    "author": "@gullabi",
+                    "license": "CC-BY-4.0"
+                }
+            }
+        },
+        "fa":{
+            "custom":{
+                "glow-tts": {
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.10.1_models/tts_models--fa--custom--glow-tts.zip",
+                    "default_vocoder": null,
+                    "commit": null,
+                    "description": "persian-tts-female-glow_tts model for text-to-speech purposes. Single-speaker female voice, trained on persian-tts-dataset-famale.\nThis model has no compatible vocoder, thus the output quality is not very good.
\nDataset: https://www.kaggle.com/datasets/magnoliasis/persian-tts-dataset-famale.",
+                    "author": "@karim23657",
+                    "license": "CC-BY-4.0"
+                }
+            }
         }
     },
     "vocoder_models": {

From 8c32a6998a7a6f2ad47f54ca1dc1ece29ae235d8 Mon Sep 17 00:00:00 2001
From: Eren Gölge
Date: Mon, 26 Dec 2022 14:29:25 +0100
Subject: [PATCH 03/24] Add pth files to manager

---
 TTS/utils/manage.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/TTS/utils/manage.py b/TTS/utils/manage.py
index 33011921..44348b29 100644
--- a/TTS/utils/manage.py
+++ b/TTS/utils/manage.py
@@ -298,7 +298,9 @@ class ModelManager(object):
         """
        output_stats_path = os.path.join(output_path, "scale_stats.npy")
         output_d_vector_file_path = os.path.join(output_path, "speakers.json")
+        output_d_vector_file_pth_path = os.path.join(output_path, "speakers.pth")
         output_speaker_ids_file_path = os.path.join(output_path, "speaker_ids.json")
+        output_speaker_ids_file_pth_path = os.path.join(output_path, "speaker_ids.pth")
         speaker_encoder_config_path = os.path.join(output_path, "config_se.json")
         speaker_encoder_model_path = self._find_speaker_encoder(output_path)
 
@@ -307,11 +309,15 @@ class ModelManager(object):
 
         # update the speakers.json file path in the model config.json to the current path
         self._update_path("d_vector_file", output_d_vector_file_path, config_path)
+        self._update_path("d_vector_file", output_d_vector_file_pth_path, config_path)
         self._update_path("model_args.d_vector_file", output_d_vector_file_path, config_path)
+        self._update_path("model_args.d_vector_file", output_d_vector_file_pth_path, config_path)
 
         # update the speaker_ids.json file path in the model config.json to the current path
         self._update_path("speakers_file", output_speaker_ids_file_path, config_path)
+        self._update_path("speakers_file", output_speaker_ids_file_pth_path, config_path)
         self._update_path("model_args.speakers_file", output_speaker_ids_file_path, config_path)
+        self._update_path("model_args.speakers_file", output_speaker_ids_file_pth_path, config_path)
 
         # update the speaker_encoder file path in the model config.json to the current path
         self._update_path("speaker_encoder_model_path", speaker_encoder_model_path, config_path)

From f814d523945fc43071d037a1fb9edcdad99949b2 Mon Sep 17 00:00:00 2001
From: Eren Gölge
Date: Mon, 26 Dec 2022 14:29:46 +0100
Subject: [PATCH 04/24] Bump up to v0.10.1

---
 TTS/VERSION | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/TTS/VERSION b/TTS/VERSION
index 2774f858..71172b43 100644
--- a/TTS/VERSION
+++ b/TTS/VERSION
@@ -1 +1 @@
-0.10.0
\ No newline at end of file
+0.10.1
\ No newline at end of file

From a07397733ba45e28316e75617704ec47ec28cc46 Mon Sep 17 00:00:00 2001
From: Julian Weber
Date: Mon, 2 Jan 2023 10:03:19 +0100
Subject: [PATCH 05/24] Multilingual tokenizer (#2229)

* Implement multilingual tokenizer

* Add multi_phonemizer recipe

* Fix lint

* Add TestMultiPhonemizer

* Fix lint

* make style
---
 TTS/config/shared_configs.py | 4 +
 TTS/tts/datasets/dataset.py | 6 +-
 TTS/tts/utils/synthesis.py | 8 +-
 TTS/tts/utils/text/phonemizers/base.py | 2 +-
 .../text/phonemizers/ja_jp_phonemizer.py | 2 +-
 .../text/phonemizers/ko_kr_phonemizer.py | 2 +-
 .../text/phonemizers/multi_phonemizer.py | 28 ++--
 TTS/tts/utils/text/tokenizer.py | 39 +++---
 .../vits_tts/train_vits_tts_phonemes.py | 126 ++++++++++++++++++
 tests/text_tests/test_phonemizer.py | 44 ++++++
 10 files changed, 230 insertions(+), 31 deletions(-)
 create mode 100644 recipes/multilingual/vits_tts/train_vits_tts_phonemes.py

diff --git a/TTS/config/shared_configs.py
b/TTS/config/shared_configs.py index 77583332..7fae77d6 100644 --- a/TTS/config/shared_configs.py +++ b/TTS/config/shared_configs.py @@ -212,6 +212,9 @@ class BaseDatasetConfig(Coqpit): language (str): Language code of the dataset. If defined, it overrides `phoneme_language`. Defaults to `""`. + phonemizer (str): + Phonemizer used for that dataset's language. By default it uses `DEF_LANG_TO_PHONEMIZER`. Defaults to `""`. + meta_file_val (str): Name of the dataset meta file that defines the instances used at validation. @@ -226,6 +229,7 @@ class BaseDatasetConfig(Coqpit): meta_file_train: str = "" ignored_speakers: List[str] = None language: str = "" + phonemizer: str = "" meta_file_val: str = "" meta_file_attn_mask: str = "" diff --git a/TTS/tts/datasets/dataset.py b/TTS/tts/datasets/dataset.py index 3bedebd4..9ec61e52 100644 --- a/TTS/tts/datasets/dataset.py +++ b/TTS/tts/datasets/dataset.py @@ -569,14 +569,14 @@ class PhonemeDataset(Dataset): def __getitem__(self, index): item = self.samples[index] - ids = self.compute_or_load(string2filename(item["audio_unique_name"]), item["text"]) + ids = self.compute_or_load(string2filename(item["audio_unique_name"]), item["text"], item["language"]) ph_hat = self.tokenizer.ids_to_text(ids) return {"text": item["text"], "ph_hat": ph_hat, "token_ids": ids, "token_ids_len": len(ids)} def __len__(self): return len(self.samples) - def compute_or_load(self, file_name, text): + def compute_or_load(self, file_name, text, language): """Compute phonemes for the given text. If the phonemes are already cached, load them from cache. @@ -586,7 +586,7 @@ class PhonemeDataset(Dataset): try: ids = np.load(cache_path) except FileNotFoundError: - ids = self.tokenizer.text_to_ids(text) + ids = self.tokenizer.text_to_ids(text, language=language) np.save(cache_path, ids) return ids diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py index 2cdc7b84..039816db 100644 --- a/TTS/tts/utils/synthesis.py +++ b/TTS/tts/utils/synthesis.py @@ -175,9 +175,15 @@ def synthesis( style_mel = compute_style_mel(style_wav, model.ap, cuda=use_cuda) style_mel = style_mel.transpose(1, 2) # [1, time, depth] + language_name = None + if language_id is not None: + language = [k for k, v in model.language_manager.name_to_id.items() if v == language_id] + assert len(language) == 1, "language_id must be a valid language" + language_name = language[0] + # convert text to sequence of token IDs text_inputs = np.asarray( - model.tokenizer.text_to_ids(text, language=language_id), + model.tokenizer.text_to_ids(text, language=language_name), dtype=np.int32, ) # pass tensors to backend diff --git a/TTS/tts/utils/text/phonemizers/base.py b/TTS/tts/utils/text/phonemizers/base.py index 08fa8e13..3f8e8eaf 100644 --- a/TTS/tts/utils/text/phonemizers/base.py +++ b/TTS/tts/utils/text/phonemizers/base.py @@ -114,7 +114,7 @@ class BasePhonemizer(abc.ABC): return self._punctuator.restore(phonemized, punctuations)[0] return phonemized[0] - def phonemize(self, text: str, separator="|") -> str: + def phonemize(self, text: str, separator="|", language: str = None) -> str: # pylint: disable=unused-argument """Returns the `text` phonemized for the given language Args: diff --git a/TTS/tts/utils/text/phonemizers/ja_jp_phonemizer.py b/TTS/tts/utils/text/phonemizers/ja_jp_phonemizer.py index 60b965f9..878e5e52 100644 --- a/TTS/tts/utils/text/phonemizers/ja_jp_phonemizer.py +++ b/TTS/tts/utils/text/phonemizers/ja_jp_phonemizer.py @@ -43,7 +43,7 @@ class JA_JP_Phonemizer(BasePhonemizer): return 
separator.join(ph) return ph - def phonemize(self, text: str, separator="|") -> str: + def phonemize(self, text: str, separator="|", language=None) -> str: """Custom phonemize for JP_JA Skip pre-post processing steps used by the other phonemizers. diff --git a/TTS/tts/utils/text/phonemizers/ko_kr_phonemizer.py b/TTS/tts/utils/text/phonemizers/ko_kr_phonemizer.py index c4aeb354..0bdba213 100644 --- a/TTS/tts/utils/text/phonemizers/ko_kr_phonemizer.py +++ b/TTS/tts/utils/text/phonemizers/ko_kr_phonemizer.py @@ -40,7 +40,7 @@ class KO_KR_Phonemizer(BasePhonemizer): return separator.join(ph) return ph - def phonemize(self, text: str, separator: str = "", character: str = "hangeul") -> str: + def phonemize(self, text: str, separator: str = "", character: str = "hangeul", language=None) -> str: return self._phonemize(text, separator, character) @staticmethod diff --git a/TTS/tts/utils/text/phonemizers/multi_phonemizer.py b/TTS/tts/utils/text/phonemizers/multi_phonemizer.py index e36b0a2a..62a9c393 100644 --- a/TTS/tts/utils/text/phonemizers/multi_phonemizer.py +++ b/TTS/tts/utils/text/phonemizers/multi_phonemizer.py @@ -14,30 +14,40 @@ class MultiPhonemizer: TODO: find a way to pass custom kwargs to the phonemizers """ - lang_to_phonemizer_name = DEF_LANG_TO_PHONEMIZER - language = "multi-lingual" + lang_to_phonemizer = {} - def __init__(self, custom_lang_to_phonemizer: Dict = {}) -> None: # pylint: disable=dangerous-default-value - self.lang_to_phonemizer_name.update(custom_lang_to_phonemizer) + def __init__(self, lang_to_phonemizer_name: Dict = {}) -> None: # pylint: disable=dangerous-default-value + for k, v in lang_to_phonemizer_name.items(): + if v == "" and k in DEF_LANG_TO_PHONEMIZER.keys(): + lang_to_phonemizer_name[k] = DEF_LANG_TO_PHONEMIZER[k] + elif v == "": + raise ValueError(f"Phonemizer wasn't set for language {k} and doesn't have a default.") + self.lang_to_phonemizer_name = lang_to_phonemizer_name self.lang_to_phonemizer = self.init_phonemizers(self.lang_to_phonemizer_name) @staticmethod def init_phonemizers(lang_to_phonemizer_name: Dict) -> Dict: lang_to_phonemizer = {} for k, v in lang_to_phonemizer_name.items(): - phonemizer = get_phonemizer_by_name(v, language=k) - lang_to_phonemizer[k] = phonemizer + lang_to_phonemizer[k] = get_phonemizer_by_name(v, language=k) return lang_to_phonemizer @staticmethod def name(): return "multi-phonemizer" - def phonemize(self, text, language, separator="|"): + def phonemize(self, text, separator="|", language=""): + if language == "": + raise ValueError("Language must be set for multi-phonemizer to phonemize.") return self.lang_to_phonemizer[language].phonemize(text, separator) def supported_languages(self) -> List: - return list(self.lang_to_phonemizer_name.keys()) + return list(self.lang_to_phonemizer.keys()) + + def print_logs(self, level: int = 0): + indent = "\t" * level + print(f"{indent}| > phoneme language: {self.supported_languages()}") + print(f"{indent}| > phoneme backend: {self.name()}") # if __name__ == "__main__": @@ -48,7 +58,7 @@ class MultiPhonemizer: # "zh-cn": "这是中国的例子", # } # phonemes = {} -# ph = MultiPhonemizer() +# ph = MultiPhonemizer({"tr": "espeak", "en-us": "", "de": "gruut", "zh-cn": ""}) # for lang, text in texts.items(): # phoneme = ph.phonemize(text, lang) # phonemes[lang] = phoneme diff --git a/TTS/tts/utils/text/tokenizer.py b/TTS/tts/utils/text/tokenizer.py index 1569c634..04cbbd32 100644 --- a/TTS/tts/utils/text/tokenizer.py +++ b/TTS/tts/utils/text/tokenizer.py @@ -3,6 +3,7 @@ from typing import Callable, 
Dict, List, Union from TTS.tts.utils.text import cleaners from TTS.tts.utils.text.characters import Graphemes, IPAPhonemes from TTS.tts.utils.text.phonemizers import DEF_LANG_TO_PHONEMIZER, get_phonemizer_by_name +from TTS.tts.utils.text.phonemizers.multi_phonemizer import MultiPhonemizer from TTS.utils.generic_utils import get_import_path, import_class @@ -106,7 +107,7 @@ class TTSTokenizer: if self.text_cleaner is not None: text = self.text_cleaner(text) if self.use_phonemes: - text = self.phonemizer.phonemize(text, separator="") + text = self.phonemizer.phonemize(text, separator="", language=language) if self.add_blank: text = self.intersperse_blank_char(text, True) if self.use_eos_bos: @@ -182,21 +183,29 @@ class TTSTokenizer: # init phonemizer phonemizer = None if config.use_phonemes: - phonemizer_kwargs = {"language": config.phoneme_language} - - if "phonemizer" in config and config.phonemizer: - phonemizer = get_phonemizer_by_name(config.phonemizer, **phonemizer_kwargs) + if "phonemizer" in config and config.phonemizer == "multi_phonemizer": + lang_to_phonemizer_name = {} + for dataset in config.datasets: + if dataset.language != "": + lang_to_phonemizer_name[dataset.language] = dataset.phonemizer + else: + raise ValueError("Multi phonemizer requires language to be set for each dataset.") + phonemizer = MultiPhonemizer(lang_to_phonemizer_name) else: - try: - phonemizer = get_phonemizer_by_name( - DEF_LANG_TO_PHONEMIZER[config.phoneme_language], **phonemizer_kwargs - ) - new_config.phonemizer = phonemizer.name() - except KeyError as e: - raise ValueError( - f"""No phonemizer found for language {config.phoneme_language}. - You may need to install a third party library for this language.""" - ) from e + phonemizer_kwargs = {"language": config.phoneme_language} + if "phonemizer" in config and config.phonemizer: + phonemizer = get_phonemizer_by_name(config.phonemizer, **phonemizer_kwargs) + else: + try: + phonemizer = get_phonemizer_by_name( + DEF_LANG_TO_PHONEMIZER[config.phoneme_language], **phonemizer_kwargs + ) + new_config.phonemizer = phonemizer.name() + except KeyError as e: + raise ValueError( + f"""No phonemizer found for language {config.phoneme_language}. 
+                            You may need to install a third party library for this language."""
+                    ) from e
 
     return (
         TTSTokenizer(
diff --git a/recipes/multilingual/vits_tts/train_vits_tts_phonemes.py b/recipes/multilingual/vits_tts/train_vits_tts_phonemes.py
new file mode 100644
index 00000000..24e9e51a
--- /dev/null
+++ b/recipes/multilingual/vits_tts/train_vits_tts_phonemes.py
@@ -0,0 +1,126 @@
+import os
+from glob import glob
+
+from trainer import Trainer, TrainerArgs
+
+from TTS.tts.configs.shared_configs import BaseDatasetConfig
+from TTS.tts.configs.vits_config import VitsConfig
+from TTS.tts.datasets import load_tts_samples
+from TTS.tts.models.vits import Vits, VitsArgs, VitsAudioConfig
+from TTS.tts.utils.languages import LanguageManager
+from TTS.tts.utils.speakers import SpeakerManager
+from TTS.tts.utils.text.tokenizer import TTSTokenizer
+from TTS.utils.audio import AudioProcessor
+
+output_path = "/media/julian/Workdisk/train"
+
+mailabs_path = "/home/julian/workspace/mailabs/**"
+dataset_paths = glob(mailabs_path)
+dataset_config = [
+    BaseDatasetConfig(
+        formatter="mailabs",
+        meta_file_train=None,
+        path=path,
+        language=path.split("/")[-1],  # language code is the folder name
+    )
+    for path in dataset_paths
+]
+
+audio_config = VitsAudioConfig(
+    sample_rate=16000,
+    win_length=1024,
+    hop_length=256,
+    num_mels=80,
+    mel_fmin=0,
+    mel_fmax=None,
+)
+
+vitsArgs = VitsArgs(
+    use_language_embedding=True,
+    embedded_language_dim=4,
+    use_speaker_embedding=True,
+    use_sdp=False,
+)
+
+config = VitsConfig(
+    model_args=vitsArgs,
+    audio=audio_config,
+    run_name="vits_vctk",
+    use_speaker_embedding=True,
+    batch_size=32,
+    eval_batch_size=16,
+    batch_group_size=0,
+    num_loader_workers=12,
+    num_eval_loader_workers=12,
+    precompute_num_workers=12,
+    run_eval=True,
+    test_delay_epochs=-1,
+    epochs=1000,
+    text_cleaner="multilingual_cleaners",
+    use_phonemes=True,
+    phoneme_language=None,
+    phonemizer="multi_phonemizer",
+    phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
+    compute_input_seq_cache=True,
+    print_step=25,
+    use_language_weighted_sampler=True,
+    print_eval=False,
+    mixed_precision=False,
+    min_audio_len=audio_config.sample_rate,
+    max_audio_len=audio_config.sample_rate * 10,
+    output_path=output_path,
+    datasets=dataset_config,
+    test_sentences=[
+        [
+            "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
+            "mary_ann",
+            None,
+            "en-us",
+        ],
+        [
+            "Il m'a fallu beaucoup de temps pour d\u00e9velopper une voix, et maintenant que je l'ai, je ne vais pas me taire.",
+            "ezwa",
+            None,
+            "fr-fr",
+        ],
+        ["Ich finde, dieses Startup ist wirklich unglaublich.", "eva_k", None, "de-de"],
+        ["Я думаю, что этот стартап действительно удивительный.", "nikolaev", None, "ru"],
+    ],
+)
+
+# force the conversion of the custom characters to a config attribute
+config.from_dict(config.to_dict())
+
+# init audio processor
+ap = AudioProcessor(**config.audio.to_dict())
+
+# load training samples
+train_samples, eval_samples = load_tts_samples(
+    dataset_config,
+    eval_split=True,
+    eval_split_max_size=config.eval_split_max_size,
+    eval_split_size=config.eval_split_size,
+)
+
+# init speaker manager for multi-speaker training
+# it maps speaker-id to speaker-name in the model and data-loader
+speaker_manager = SpeakerManager()
+speaker_manager.set_ids_from_data(train_samples + eval_samples, parse_key="speaker_name")
+config.model_args.num_speakers = speaker_manager.num_speakers
+
+language_manager = LanguageManager(config=config)
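+# the language manager parses the language codes from the dataset configs above and maps each one to an
+# integer id; `num_languages` sizes the language embedding table of the model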
+config.model_args.num_languages = language_manager.num_languages
+
+# INITIALIZE THE TOKENIZER
+# Tokenizer is used to convert text to sequences of token IDs.
+# config is updated with the default characters if not defined in the config.
+tokenizer, config = TTSTokenizer.init_from_config(config)
+
+# init model
+model = Vits(config, ap, tokenizer, speaker_manager, language_manager)
+
+# init the trainer and 🚀
+trainer = Trainer(
+    TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
+)
+trainer.fit()
diff --git a/tests/text_tests/test_phonemizer.py b/tests/text_tests/test_phonemizer.py
index 8261f2bb..794a8fd7 100644
--- a/tests/text_tests/test_phonemizer.py
+++ b/tests/text_tests/test_phonemizer.py
@@ -2,6 +2,7 @@ import unittest
 from distutils.version import LooseVersion
 
 from TTS.tts.utils.text.phonemizers import ESpeak, Gruut, JA_JP_Phonemizer, ZH_CN_Phonemizer
+from TTS.tts.utils.text.phonemizers.multi_phonemizer import MultiPhonemizer
 
 EXAMPLE_TEXTs = [
     "Recent research at Harvard has shown meditating",
@@ -226,3 +227,46 @@ class TestZH_CN_Phonemizer(unittest.TestCase):
 
     def test_is_available(self):
         self.assertTrue(self.phonemizer.is_available())
+
+
+class TestMultiPhonemizer(unittest.TestCase):
+    def setUp(self):
+        self.phonemizer = MultiPhonemizer({"tr": "espeak", "en-us": "", "de": "gruut", "zh-cn": ""})
+
+    def test_phonemize(self):
+
+        # English espeak
+        text = "Be a voice, not an! echo?"
+        gt = "biː ɐ vˈɔɪs, nˈɑːt æn! ˈɛkoʊ?"
+        output = self.phonemizer.phonemize(text, separator="|", language="en-us")
+        output = output.replace("|", "")
+        self.assertEqual(output, gt)
+
+        # German gruut
+        text = "Hallo, das ist ein Deutches Beipiel!"
+        gt = "haloː, das ɪst aeːn dɔɔʏ̯tçəs bəʔiːpiːl!"
+        output = self.phonemizer.phonemize(text, separator="|", language="de")
+        output = output.replace("|", "")
+        self.assertEqual(output, gt)
+
+    def test_phonemizer_initialization(self):
+        # test with unsupported language
+        with self.assertRaises(ValueError):
+            MultiPhonemizer({"tr": "espeak", "xx": ""})
+
+        # test with unsupported phonemizer
+        with self.assertRaises(ValueError):
+            MultiPhonemizer({"tr": "espeak", "fr": "xx"})
+
+    def test_sub_phonemizers(self):
+        for lang in self.phonemizer.lang_to_phonemizer_name.keys():
+            self.assertEqual(lang, self.phonemizer.lang_to_phonemizer[lang].language)
+            self.assertEqual(
+                self.phonemizer.lang_to_phonemizer_name[lang], self.phonemizer.lang_to_phonemizer[lang].name()
+            )
+
+    def test_name(self):
+        self.assertEqual(self.phonemizer.name(), "multi-phonemizer")
+
+    def test_get_supported_languages(self):
+        self.assertIsInstance(self.phonemizer.supported_languages(), list)

From da93d768b83d7a3b805820625c388cacbea63968 Mon Sep 17 00:00:00 2001
From: Eren Gölge
Date: Mon, 2 Jan 2023 10:07:03 +0100
Subject: [PATCH 06/24] Update docs

---
 docs/source/formatting_your_dataset.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/formatting_your_dataset.md b/docs/source/formatting_your_dataset.md
index 400f407c..f3167a3c 100644
--- a/docs/source/formatting_your_dataset.md
+++ b/docs/source/formatting_your_dataset.md
@@ -113,7 +113,7 @@ def formatter(root_path, manifest_file, **kwargs):  # pylint: disable=unused-arg
             cols = line.split("|")
             wav_file = os.path.join(root_path, "wavs", cols[0])
             text = cols[1]
-            items.append({"text":text, "audio_file":wav_file, "speaker_name":speaker_name})
+            items.append({"text":text, "audio_file":wav_file, "speaker_name":speaker_name, "root_path": root_path})
     return
items # load training samples From 42afad5e79164d619dc992cc024ea70905872d06 Mon Sep 17 00:00:00 2001 From: Khalid Bashir Date: Mon, 2 Jan 2023 18:20:02 +0500 Subject: [PATCH 07/24] Fixed bug related to yourtts speaker embeddings issue (#2234) * Fixed bug related to yourtts speaker embeddings issue * Reverted code for base_tts * Bug fix on VITS d_vector_file type * Ignore the test speakers on YourTTS recipe * Add speaker encoder model and config on YourTTS recipe to easily do zero-shot inference * Update YourTTS config file * Update ModelManager._update_path to deal with list attributes * Fix lint checks * Remove unused code * Fix unit tests * Reset name_to_id to get the right speaker ids on load_embeddings_from_list_of_files * Set weighted_sampler_multipliers as an empty dict to prevent users' mistakes Co-authored-by: Edresson Casanova --- TTS/.models.json | 2 +- TTS/tts/configs/vits_config.py | 2 +- TTS/tts/models/vits.py | 6 ++--- TTS/tts/utils/managers.py | 3 +++ TTS/tts/utils/speakers.py | 4 --- TTS/utils/manage.py | 12 +++++++-- recipes/vctk/yourtts/train_yourtts.py | 25 ++++++++++++++++--- tests/tts_tests/test_vits.py | 6 ++--- tests/tts_tests/test_vits_d-vectors_train.py | 2 +- .../test_vits_multilingual_train-d_vectors.py | 4 +-- 10 files changed, 46 insertions(+), 20 deletions(-) diff --git a/TTS/.models.json b/TTS/.models.json index 52cdf795..0b502073 100644 --- a/TTS/.models.json +++ b/TTS/.models.json @@ -4,7 +4,7 @@ "multi-dataset":{ "your_tts":{ "description": "Your TTS model accompanying the paper https://arxiv.org/abs/2112.02418", - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--multilingual--multi-dataset--your_tts.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.10.1_models/tts_models--multilingual--multi-dataset--your_tts.zip", "default_vocoder": null, "commit": "e9a1953e", "license": "CC BY-NC-ND 4.0", diff --git a/TTS/tts/configs/vits_config.py b/TTS/tts/configs/vits_config.py index 3469f701..4e574c5a 100644 --- a/TTS/tts/configs/vits_config.py +++ b/TTS/tts/configs/vits_config.py @@ -167,7 +167,7 @@ class VitsConfig(BaseTTSConfig): # use d-vectors use_d_vector_file: bool = False - d_vector_file: str = None + d_vector_file: List[str] = None d_vector_dim: int = None def __post_init__(self): diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index 518809b3..1b367cd7 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -477,8 +477,8 @@ class VitsArgs(Coqpit): use_d_vector_file (bool): Enable/Disable the use of d-vectors for multi-speaker training. Defaults to False. - d_vector_file (str): - Path to the file including pre-computed speaker embeddings. Defaults to None. + d_vector_file (List[str]): + List of paths to the files including pre-computed speaker embeddings. Defaults to None. d_vector_dim (int): Number of d-vector channels. Defaults to 0. 
@@ -573,7 +573,7 @@
     use_speaker_embedding: bool = False
     num_speakers: int = 0
     speakers_file: str = None
-    d_vector_file: str = None
+    d_vector_file: List[str] = None
     speaker_embedding_channels: int = 256
     use_d_vector_file: bool = False
     d_vector_dim: int = 0
diff --git a/TTS/tts/utils/managers.py b/TTS/tts/utils/managers.py
index 46d999a2..0159a9d2 100644
--- a/TTS/tts/utils/managers.py
+++ b/TTS/tts/utils/managers.py
@@ -235,6 +235,9 @@ class EmbeddingManager(BaseIDManager):
         self.embeddings_by_names.update(embeddings_by_names)
         self.embeddings.update(embeddings)
 
+        # reset name_to_id to get the right speaker ids
+        self.name_to_id = {name: i for i, name in enumerate(self.name_to_id)}
+
     def get_embedding_by_clip(self, clip_idx: str) -> List:
         """Get embedding by clip ID.
 
diff --git a/TTS/tts/utils/speakers.py b/TTS/tts/utils/speakers.py
index 21fefa0b..e4969526 100644
--- a/TTS/tts/utils/speakers.py
+++ b/TTS/tts/utils/speakers.py
@@ -109,10 +109,6 @@ class SpeakerManager(EmbeddingManager):
 
         if get_from_config_or_model_args_with_default(config, "use_d_vector_file", False):
             speaker_manager = SpeakerManager()
-            if get_from_config_or_model_args_with_default(config, "speakers_file", None):
-                speaker_manager = SpeakerManager(
-                    d_vectors_file_path=get_from_config_or_model_args_with_default(config, "speaker_file", None)
-                )
             if get_from_config_or_model_args_with_default(config, "d_vector_file", None):
                 speaker_manager = SpeakerManager(
                     d_vectors_file_path=get_from_config_or_model_args_with_default(config, "d_vector_file", None)
diff --git a/TTS/utils/manage.py b/TTS/utils/manage.py
index 44348b29..ef4c11f5 100644
--- a/TTS/utils/manage.py
+++ b/TTS/utils/manage.py
@@ -339,10 +339,18 @@ class ModelManager(object):
                     sub_conf = sub_conf[fd]
                 else:
                     return
-            sub_conf[field_names[-1]] = new_path
+            if isinstance(sub_conf[field_names[-1]], list):
+                sub_conf[field_names[-1]] = [new_path]
+            else:
+                sub_conf[field_names[-1]] = new_path
         else:
             # field name points to a top-level field
-            config[field_name] = new_path
+            if field_name not in config:
+                return
+            if isinstance(config[field_name], list):
+                config[field_name] = [new_path]
+            else:
+                config[field_name] = new_path
         config.save_json(config_path)
 
     @staticmethod
diff --git a/recipes/vctk/yourtts/train_yourtts.py b/recipes/vctk/yourtts/train_yourtts.py
index aa584396..b783c5d6 100644
--- a/recipes/vctk/yourtts/train_yourtts.py
+++ b/recipes/vctk/yourtts/train_yourtts.py
@@ -57,7 +57,25 @@ if not os.path.exists(VCTK_DOWNLOAD_PATH):
 
 # init configs
 vctk_config = BaseDatasetConfig(
-    formatter="vctk", dataset_name="vctk", meta_file_train="", meta_file_val="", path=VCTK_DOWNLOAD_PATH, language="en"
+    formatter="vctk",
+    dataset_name="vctk",
+    meta_file_train="",
+    meta_file_val="",
+    path=VCTK_DOWNLOAD_PATH,
+    language="en",
+    ignored_speakers=[
+        "p261",
+        "p225",
+        "p294",
+        "p347",
+        "p238",
+        "p234",
+        "p248",
+        "p335",
+        "p245",
+        "p326",
+        "p302",
+    ],  # Ignore the test speakers to fully replicate the paper experiment
 )
 
 # Add here all dataset configs; in our case we just want to train with the VCTK dataset, so we only add VCTK.
# Note: If you want to add new datasets, just add them here and the speaker embeddings (d-vectors) will be computed automatically for the new dataset :)
@@ -111,11 +129,11 @@ model_args = VitsArgs(
     use_d_vector_file=True,
     d_vector_dim=512,
     num_layers_text_encoder=10,
+    speaker_encoder_model_path=SPEAKER_ENCODER_CHECKPOINT_PATH,
+    speaker_encoder_config_path=SPEAKER_ENCODER_CONFIG_PATH,
     resblock_type_decoder="2",  # In the paper we accidentally trained YourTTS using ResNet blocks type 2; if you like, you can use ResNet blocks type 1 as in the VITS model
     # Useful parameters to enable the Speaker Consistency Loss (SCL) described in the paper
     # use_speaker_encoder_as_loss=True,
-    # speaker_encoder_model_path=SPEAKER_ENCODER_CHECKPOINT_PATH,
-    # speaker_encoder_config_path=SPEAKER_ENCODER_CONFIG_PATH,
     # Useful parameters to enable multilingual training
     # use_language_embedding=True,
     # embedded_language_dim=4,
@@ -207,6 +225,7 @@ config = VitsConfig(
     use_weighted_sampler=True,
     # Ensures that all speakers are seen in the training batch equally no matter how many samples each speaker has
     weighted_sampler_attrs={"speaker_name": 1.0},
+    weighted_sampler_multipliers={},
    # Set the Speaker Consistency Loss (SCL) α to 9, as in the paper
     speaker_encoder_loss_alpha=9.0,
 )
diff --git a/tests/tts_tests/test_vits.py b/tests/tts_tests/test_vits.py
index ccc3be1c..8e408519 100644
--- a/tests/tts_tests/test_vits.py
+++ b/tests/tts_tests/test_vits.py
@@ -210,7 +210,7 @@ class TestVits(unittest.TestCase):
             num_chars=32,
             use_d_vector_file=True,
             d_vector_dim=256,
-            d_vector_file=os.path.join(get_tests_data_path(), "dummy_speakers.json"),
+            d_vector_file=[os.path.join(get_tests_data_path(), "dummy_speakers.json")],
         )
         config = VitsConfig(model_args=args)
         model = Vits.init_from_config(config, verbose=False).to(device)
@@ -355,7 +355,7 @@ class TestVits(unittest.TestCase):
             num_chars=32,
             use_d_vector_file=True,
             d_vector_dim=256,
-            d_vector_file=os.path.join(get_tests_data_path(), "dummy_speakers.json"),
+            d_vector_file=[os.path.join(get_tests_data_path(), "dummy_speakers.json")],
         )
         config = VitsConfig(model_args=args)
         model = Vits.init_from_config(config, verbose=False).to(device)
@@ -587,7 +587,7 @@ class TestVits(unittest.TestCase):
                 num_chars=32,
                 use_d_vector_file=True,
                 d_vector_dim=256,
-                d_vector_file=os.path.join(get_tests_data_path(), "dummy_speakers.json"),
+                d_vector_file=[os.path.join(get_tests_data_path(), "dummy_speakers.json")],
             )
         )
         model = Vits.init_from_config(config, verbose=False).to(device)
diff --git a/tests/tts_tests/test_vits_d-vectors_train.py b/tests/tts_tests/test_vits_d-vectors_train.py
index 29c5b438..741bda91 100644
--- a/tests/tts_tests/test_vits_d-vectors_train.py
+++ b/tests/tts_tests/test_vits_d-vectors_train.py
@@ -33,7 +33,7 @@ config.audio.trim_db = 60
 
 # active multispeaker d-vec mode
 config.model_args.use_d_vector_file = True
-config.model_args.d_vector_file = "tests/data/ljspeech/speakers.json"
+config.model_args.d_vector_file = ["tests/data/ljspeech/speakers.json"]
 config.model_args.d_vector_dim = 256
 
 
diff --git a/tests/tts_tests/test_vits_multilingual_train-d_vectors.py b/tests/tts_tests/test_vits_multilingual_train-d_vectors.py
index db66802b..fd58db53 100644
--- a/tests/tts_tests/test_vits_multilingual_train-d_vectors.py
+++ b/tests/tts_tests/test_vits_multilingual_train-d_vectors.py
@@ -63,8 +63,8 @@ config.use_speaker_embedding = False
 
 # active multispeaker d-vec mode
 config.model_args.use_d_vector_file = True
 config.use_d_vector_file = True
-config.model_args.d_vector_file = "tests/data/ljspeech/speakers.json" -config.d_vector_file = "tests/data/ljspeech/speakers.json" +config.model_args.d_vector_file = ["tests/data/ljspeech/speakers.json"] +config.d_vector_file = ["tests/data/ljspeech/speakers.json"] config.model_args.d_vector_dim = 256 config.d_vector_dim = 256 From 49dfaa52341fd7f27e239e50abb75ab67227064e Mon Sep 17 00:00:00 2001 From: Edresson Casanova Date: Tue, 10 Jan 2023 21:01:46 -0300 Subject: [PATCH 08/24] Update the Trainer requirement version for a compatible one (#2276) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 376da35f..8464d7cb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -24,7 +24,7 @@ pandas # deps for training matplotlib # coqui stack -trainer +trainer==0.0.20 # config management coqpit>=0.0.16 # chinese g2p deps From 14d45b53470d862d4df1966d3984ef883077aa5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 11 Jan 2023 01:06:02 +0100 Subject: [PATCH 09/24] Bump up to v0.10.2 --- TTS/VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TTS/VERSION b/TTS/VERSION index 71172b43..42624f31 100644 --- a/TTS/VERSION +++ b/TTS/VERSION @@ -1 +1 @@ -0.10.1 \ No newline at end of file +0.10.2 \ No newline at end of file From bc422f2f3c3d033c5e2b920375e0c0a6ff8d07db Mon Sep 17 00:00:00 2001 From: manmay nakhashi Date: Mon, 16 Jan 2023 03:09:22 +0530 Subject: [PATCH 10/24] Fastspeech2 (#2073) * added EnergyDataset * add energy to Dataset * add comupte_energy * added energy params * added energy to forward_tts * added plot_avg_energy for visualisation * Update forward_tts.py * create file * added fastspeech2 recipe * add fastspeech2 config * removed energy from fast pitch * add energy loss to forward tts * Update fastspeech2_config.py * change run_name * Update numpy_transforms.py * fix typo * fix typo * fix typo * linting issues * use_energy default value --> False * Update numpy_transforms.py * linting fixes * fix typo * liniting_fix * liniting_fix * fix * fixes * fixes * lint fix * lint fixws * added training test * wrong import * wrong import * trailing whitespace * style fix * changed class name because of error * class name change * class name change * change class name * fixed styles --- TTS/tts/configs/fast_pitch_config.py | 7 + TTS/tts/configs/fastspeech2_config.py | 198 ++++++++++++++++++ TTS/tts/configs/shared_configs.py | 4 + TTS/tts/datasets/dataset.py | 185 +++++++++++++++- TTS/tts/layers/losses.py | 11 + TTS/tts/models/forward_tts.py | 120 ++++++++++- TTS/tts/utils/visual.py | 33 +++ TTS/utils/audio/numpy_transforms.py | 23 +- .../ljspeech/fastspeech2/train_fastspeech2.py | 102 +++++++++ .../test_fastspeech_2_speaker_emb_train.py | 95 +++++++++ tests/tts_tests/test_fastspeech_2_train.py | 94 +++++++++ 11 files changed, 866 insertions(+), 6 deletions(-) create mode 100644 TTS/tts/configs/fastspeech2_config.py create mode 100644 recipes/ljspeech/fastspeech2/train_fastspeech2.py create mode 100644 tests/tts_tests/test_fastspeech_2_speaker_emb_train.py create mode 100644 tests/tts_tests/test_fastspeech_2_train.py diff --git a/TTS/tts/configs/fast_pitch_config.py b/TTS/tts/configs/fast_pitch_config.py index 024040f8..90b15021 100644 --- a/TTS/tts/configs/fast_pitch_config.py +++ b/TTS/tts/configs/fast_pitch_config.py @@ -100,6 +100,13 @@ class FastPitchConfig(BaseTTSConfig): max_seq_len (int): Maximum input sequence length to be used at training. 
Larger values result in more VRAM usage.
+
+        # dataset configs
+        compute_f0 (bool):
+            Compute pitch. Defaults to True.
+
+        f0_cache_path (str):
+            Pitch cache path. Defaults to None.
     """
 
     model: str = "fast_pitch"
diff --git a/TTS/tts/configs/fastspeech2_config.py b/TTS/tts/configs/fastspeech2_config.py
new file mode 100644
index 00000000..f7ff219a
--- /dev/null
+++ b/TTS/tts/configs/fastspeech2_config.py
@@ -0,0 +1,198 @@
+from dataclasses import dataclass, field
+from typing import List
+
+from TTS.tts.configs.shared_configs import BaseTTSConfig
+from TTS.tts.models.forward_tts import ForwardTTSArgs
+
+
+@dataclass
+class Fastspeech2Config(BaseTTSConfig):
+    """Configure `ForwardTTS` as a FastSpeech2 model.
+
+    Example:
+
+        >>> from TTS.tts.configs.fastspeech2_config import Fastspeech2Config
+        >>> config = Fastspeech2Config()
+
+    Args:
+        model (str):
+            Model name used for selecting the right model at initialization. Defaults to `fastspeech2`.
+
+        base_model (str):
+            Name of the base model being configured as this model so that 🐸 TTS knows it needs to initiate
+            the base model rather than searching for the `model` implementation. Defaults to `forward_tts`.
+
+        model_args (Coqpit):
+            Model class arguments. Check `ForwardTTSArgs` for more details. Defaults to `ForwardTTSArgs()`.
+
+        data_dep_init_steps (int):
+            Number of steps used for computing normalization parameters at the beginning of the training. GlowTTS uses
+            Activation Normalization that pre-computes normalization stats at the beginning and use the same values
+            for the rest. Defaults to 10.
+
+        speakers_file (str):
+            Path to the file containing the list of speakers. Needed at inference for loading matching speaker ids to
+            speaker names. Defaults to `None`.
+
+        use_speaker_embedding (bool):
+            enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
+            in the multi-speaker mode. Defaults to False.
+
+        use_d_vector_file (bool):
+            enable / disable using external speaker embeddings in place of the learned embeddings. Defaults to False.
+
+        d_vector_file (str):
+            Path to the file including pre-computed speaker embeddings. Defaults to None.
+
+        d_vector_dim (int):
+            Dimension of the external speaker embeddings. Defaults to 0.
+
+        optimizer (str):
+            Name of the model optimizer. Defaults to `Adam`.
+
+        optimizer_params (dict):
+            Arguments of the model optimizer. Defaults to `{"betas": [0.9, 0.998], "weight_decay": 1e-6}`.
+
+        lr_scheduler (str):
+            Name of the learning rate scheduler. Defaults to `Noam`.
+
+        lr_scheduler_params (dict):
+            Arguments of the learning rate scheduler. Defaults to `{"warmup_steps": 4000}`.
+
+        lr (float):
+            Initial learning rate. Defaults to `1e-4`.
+
+        grad_clip (float):
+            Gradient norm clipping value. Defaults to `5.0`.
+
+        spec_loss_type (str):
+            Type of the spectrogram loss. Check `ForwardTTSLoss` for possible values. Defaults to `mse`.
+
+        duration_loss_type (str):
+            Type of the duration loss. Check `ForwardTTSLoss` for possible values. Defaults to `mse`.
+
+        use_ssim_loss (bool):
+            Enable/disable the use of SSIM (Structural Similarity) loss. Defaults to True.
+
+        wd (float):
+            Weight decay coefficient. Defaults to `1e-7`.
+
+        ssim_loss_alpha (float):
+            Weight for the SSIM loss. If set 0, disables the SSIM loss. Defaults to 1.0.
+
+        dur_loss_alpha (float):
+            Weight for the duration predictor's loss. If set 0, disables the duration loss. Defaults to 0.1.
+
+        spec_loss_alpha (float):
+            Weight for the L1 spectrogram loss. If set 0, disables the L1 loss. Defaults to 1.0.
+
+        pitch_loss_alpha (float):
+            Weight for the pitch predictor's loss. If set 0, disables the pitch predictor. Defaults to 0.1.
+
+        energy_loss_alpha (float):
+            Weight for the energy predictor's loss. If set 0, disables the energy predictor. Defaults to 0.1.
+
+        binary_align_loss_alpha (float):
+            Weight for the binary loss. If set 0, disables the binary loss. Defaults to 0.1.
+
+        binary_loss_warmup_epochs (int):
+            Number of epochs to gradually increase the binary loss impact. Defaults to 150.
+
+        min_seq_len (int):
+            Minimum input sequence length to be used at training.
+
+        max_seq_len (int):
+            Maximum input sequence length to be used at training. Larger values result in more VRAM usage.
+
+        # dataset configs
+        compute_f0 (bool):
+            Compute pitch. Defaults to True.
+
+        f0_cache_path (str):
+            Pitch cache path. Defaults to None.
+
+        compute_energy (bool):
+            Compute energy. Defaults to True.
+
+        energy_cache_path (str):
+            Energy cache path. Defaults to None.
+    """
+
+    model: str = "fastspeech2"
+    base_model: str = "forward_tts"
+
+    # model specific params
+    model_args: ForwardTTSArgs = ForwardTTSArgs()
+
+    # multi-speaker settings
+    num_speakers: int = 0
+    speakers_file: str = None
+    use_speaker_embedding: bool = False
+    use_d_vector_file: bool = False
+    d_vector_file: str = None
+    d_vector_dim: int = 0
+
+    # optimizer parameters
+    optimizer: str = "Adam"
+    optimizer_params: dict = field(default_factory=lambda: {"betas": [0.9, 0.998], "weight_decay": 1e-6})
+    lr_scheduler: str = "NoamLR"
+    lr_scheduler_params: dict = field(default_factory=lambda: {"warmup_steps": 4000})
+    lr: float = 1e-4
+    grad_clip: float = 5.0
+
+    # loss params
+    spec_loss_type: str = "mse"
+    duration_loss_type: str = "mse"
+    use_ssim_loss: bool = True
+    ssim_loss_alpha: float = 1.0
+    spec_loss_alpha: float = 1.0
+    aligner_loss_alpha: float = 1.0
+    pitch_loss_alpha: float = 0.1
+    energy_loss_alpha: float = 0.1
+    dur_loss_alpha: float = 0.1
+    binary_align_loss_alpha: float = 0.1
+    binary_loss_warmup_epochs: int = 150
+
+    # overrides
+    min_seq_len: int = 13
+    max_seq_len: int = 200
+    r: int = 1  # DO NOT CHANGE
+
+    # dataset configs
+    compute_f0: bool = True
+    f0_cache_path: str = None
+    compute_energy: bool = True
+    energy_cache_path: str = None
+
+    # testing
+    test_sentences: List[str] = field(
+        default_factory=lambda: [
+            "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
+            "Be a voice, not an echo.",
+            "I'm sorry Dave. I'm afraid I can't do that.",
+            "This cake is great. It's so delicious and moist.",
+            "Prior to November 22, 1963.",
+        ]
+    )
+
+    def __post_init__(self):
+        # Pass multi-speaker parameters to the model args as `model.init_multispeaker()` looks for it there.
+        if self.num_speakers > 0:
+            self.model_args.num_speakers = self.num_speakers
+
+        # speaker embedding settings
+        if self.use_speaker_embedding:
+            self.model_args.use_speaker_embedding = True
+        if self.speakers_file:
+            self.model_args.speakers_file = self.speakers_file
+
+        # d-vector settings
+        if self.use_d_vector_file:
+            self.model_args.use_d_vector_file = True
+        if self.d_vector_dim is not None and self.d_vector_dim > 0:
+            self.model_args.d_vector_dim = self.d_vector_dim
+        if self.d_vector_file:
+            self.model_args.d_vector_file = self.d_vector_file
diff --git a/TTS/tts/configs/shared_configs.py b/TTS/tts/configs/shared_configs.py
index 16b77c38..bf17322c 100644
--- a/TTS/tts/configs/shared_configs.py
+++ b/TTS/tts/configs/shared_configs.py
@@ -217,6 +217,9 @@ class BaseTTSConfig(BaseTrainingConfig):
         compute_f0 (int):
             (Not in use yet).
 
+        compute_energy (bool):
+            If True, the data loader computes the energy of each sample. Defaults to False.
+
         compute_linear_spec (bool):
             If True data loader computes and returns linear spectrograms alongside the other data.
 
@@ -312,6 +315,7 @@ class BaseTTSConfig(BaseTrainingConfig):
     min_text_len: int = 1
     max_text_len: int = float("inf")
     compute_f0: bool = False
+    compute_energy: bool = False
     compute_linear_spec: bool = False
     precompute_num_workers: int = 0
     use_noise_augment: bool = False
diff --git a/TTS/tts/datasets/dataset.py b/TTS/tts/datasets/dataset.py
index 9ec61e52..a8ff9772 100644
--- a/TTS/tts/datasets/dataset.py
+++ b/TTS/tts/datasets/dataset.py
@@ -11,6 +11,7 @@ from torch.utils.data import Dataset
 
 from TTS.tts.utils.data import prepare_data, prepare_stop_target, prepare_tensor
 from TTS.utils.audio import AudioProcessor
+from TTS.utils.audio.numpy_transforms import compute_energy as calculate_energy
 
 # to prevent too many open files error as suggested here
 # https://github.com/pytorch/pytorch/issues/11201#issuecomment-421146936
@@ -50,7 +51,9 @@ class TTSDataset(Dataset):
         samples: List[Dict] = None,
         tokenizer: "TTSTokenizer" = None,
         compute_f0: bool = False,
+        compute_energy: bool = False,
         f0_cache_path: str = None,
+        energy_cache_path: str = None,
         return_wav: bool = False,
         batch_group_size: int = 0,
         min_text_len: int = 0,
@@ -84,8 +87,12 @@ class TTSDataset(Dataset):
 
             compute_f0 (bool): compute f0 if True. Defaults to False.
 
+            compute_energy (bool): compute energy if True. Defaults to False.
+
             f0_cache_path (str): Path to store f0 cache. Defaults to None.
 
+            energy_cache_path (str): Path to store energy cache. Defaults to None.
+
             return_wav (bool): Return the waveform of the sample. Defaults to False.
            batch_group_size (int): Range of batch randomization after sorting
@@ -128,7 +135,9 @@ class TTSDataset(Dataset):
         self.compute_linear_spec = compute_linear_spec
         self.return_wav = return_wav
         self.compute_f0 = compute_f0
+        self.compute_energy = compute_energy
         self.f0_cache_path = f0_cache_path
+        self.energy_cache_path = energy_cache_path
         self.min_audio_len = min_audio_len
         self.max_audio_len = max_audio_len
         self.min_text_len = min_text_len
@@ -155,7 +164,10 @@ class TTSDataset(Dataset):
             self.f0_dataset = F0Dataset(
                 self.samples, self.ap, cache_path=f0_cache_path, precompute_num_workers=precompute_num_workers
             )
-
+        if compute_energy:
+            self.energy_dataset = EnergyDataset(
+                self.samples, self.ap, cache_path=energy_cache_path, precompute_num_workers=precompute_num_workers
+            )
         if self.verbose:
             self.print_logs()
 
@@ -211,6 +223,12 @@ class TTSDataset(Dataset):
         assert item["audio_unique_name"] == out_dict["audio_unique_name"]
         return out_dict
 
+    def get_energy(self, idx):
+        out_dict = self.energy_dataset[idx]
+        item = self.samples[idx]
+        assert item["audio_unique_name"] == out_dict["audio_unique_name"]
+        return out_dict
+
     @staticmethod
     def get_attn_mask(attn_file):
         return np.load(attn_file)
@@ -252,12 +270,16 @@ class TTSDataset(Dataset):
         f0 = None
         if self.compute_f0:
             f0 = self.get_f0(idx)["f0"]
+        energy = None
+        if self.compute_energy:
+            energy = self.get_energy(idx)["energy"]
 
         sample = {
             "raw_text": raw_text,
             "token_ids": token_ids,
             "wav": wav,
             "pitch": f0,
+            "energy": energy,
             "attn": attn,
             "item_idx": item["audio_file"],
             "speaker_name": item["speaker_name"],
@@ -490,7 +512,13 @@ class TTSDataset(Dataset):
                 pitch = torch.FloatTensor(pitch)[:, None, :].contiguous()  # B x 1 xT
             else:
                 pitch = None
-
+            # format energy
+            if self.compute_energy:
+                energy = prepare_data(batch["energy"])
+                assert mel.shape[1] == energy.shape[1], f"[!] {mel.shape} vs {energy.shape}"
+                energy = torch.FloatTensor(energy)[:, None, :].contiguous()  # B x 1 xT
+            else:
+                energy = None
             # format attention masks
             attns = None
             if batch["attn"][0] is not None:
@@ -519,6 +547,7 @@ class TTSDataset(Dataset):
                 "waveform": wav_padded,
                 "raw_text": batch["raw_text"],
                 "pitch": pitch,
+                "energy": energy,
                 "language_ids": language_ids,
                 "audio_unique_names": batch["audio_unique_name"],
             }
@@ -777,3 +806,155 @@ class F0Dataset:
         print("\n")
         print(f"{indent}> F0Dataset ")
         print(f"{indent}| > Number of instances : {len(self.samples)}")
+
+
+class EnergyDataset:
+    """Energy Dataset for computing energy from wav files on CPU.
+
+    Pre-compute energy values for all the samples at initialization if `cache_path` is not None or already present. It
+    also computes the mean and std of energy values if `normalize_energy` is True.
+
+    Args:
+        samples (Union[List[List], List[Dict]]):
+            List of samples. Each sample is a list or a dict.
+
+        ap (AudioProcessor):
+            AudioProcessor to compute energy from wav files.
+
+        cache_path (str):
+            Path to cache energy values. If `cache_path` is already present or None, it skips the pre-computation.
+            Defaults to None.
+
+        precompute_num_workers (int):
+            Number of workers used for pre-computing the energy values. Defaults to 0.
+
+        normalize_energy (bool):
+            Whether to normalize energy values by mean and std. Defaults to True.
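+
+    Example (a minimal usage sketch; `samples` and `ap` come from the surrounding training setup and the
+    cache path is illustrative):
+
+        >>> energy_dataset = EnergyDataset(samples, ap, cache_path="energy_cache/", precompute_num_workers=4)
+        >>> item = energy_dataset[0]
+        >>> item["energy"].shape  # (T_energy,) numpy array, mean-std normalized by default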
+ """ + + def __init__( + self, + samples: Union[List[List], List[Dict]], + ap: "AudioProcessor", + verbose=False, + cache_path: str = None, + precompute_num_workers=0, + normalize_energy=True, + ): + self.samples = samples + self.ap = ap + self.verbose = verbose + self.cache_path = cache_path + self.normalize_energy = normalize_energy + self.pad_id = 0.0 + self.mean = None + self.std = None + if cache_path is not None and not os.path.exists(cache_path): + os.makedirs(cache_path) + self.precompute(precompute_num_workers) + if normalize_energy: + self.load_stats(cache_path) + + def __getitem__(self, idx): + item = self.samples[idx] + energy = self.compute_or_load(item["audio_file"]) + if self.normalize_energy: + assert self.mean is not None and self.std is not None, " [!] Mean and STD is not available" + energy = self.normalize(energy) + return {"audio_file": item["audio_file"], "energy": energy} + + def __len__(self): + return len(self.samples) + + def precompute(self, num_workers=0): + print("[*] Pre-computing energys...") + with tqdm.tqdm(total=len(self)) as pbar: + batch_size = num_workers if num_workers > 0 else 1 + # we do not normalize at preproessing + normalize_energy = self.normalize_energy + self.normalize_energy = False + dataloder = torch.utils.data.DataLoader( + batch_size=batch_size, dataset=self, shuffle=False, num_workers=num_workers, collate_fn=self.collate_fn + ) + computed_data = [] + for batch in dataloder: + energy = batch["energy"] + computed_data.append(e for e in energy) + pbar.update(batch_size) + self.normalize_energy = normalize_energy + + if self.normalize_energy: + computed_data = [tensor for batch in computed_data for tensor in batch] # flatten + energy_mean, energy_std = self.compute_pitch_stats(computed_data) + energy_stats = {"mean": energy_mean, "std": energy_std} + np.save(os.path.join(self.cache_path, "energy_stats"), energy_stats, allow_pickle=True) + + def get_pad_id(self): + return self.pad_id + + @staticmethod + def create_energy_file_path(wav_file, cache_path): + file_name = os.path.splitext(os.path.basename(wav_file))[0] + energy_file = os.path.join(cache_path, file_name + "_energy.npy") + return energy_file + + @staticmethod + def _compute_and_save_energy(ap, wav_file, energy_file=None): + wav = ap.load_wav(wav_file) + energy = calculate_energy(wav) + if energy_file: + np.save(energy_file, energy) + return energy + + @staticmethod + def compute_energy_stats(energy_vecs): + nonzeros = np.concatenate([v[np.where(v != 0.0)[0]] for v in energy_vecs]) + mean, std = np.mean(nonzeros), np.std(nonzeros) + return mean, std + + def load_stats(self, cache_path): + stats_path = os.path.join(cache_path, "energy_stats.npy") + stats = np.load(stats_path, allow_pickle=True).item() + self.mean = stats["mean"].astype(np.float32) + self.std = stats["std"].astype(np.float32) + + def normalize(self, energy): + zero_idxs = np.where(energy == 0.0)[0] + energy = energy - self.mean + energy = energy / self.std + energy[zero_idxs] = 0.0 + return energy + + def denormalize(self, energy): + zero_idxs = np.where(energy == 0.0)[0] + energy *= self.std + energy += self.mean + energy[zero_idxs] = 0.0 + return energy + + def compute_or_load(self, wav_file): + """ + compute energy and return a numpy array of energy values + """ + energy_file = self.create_Energy_file_path(wav_file, self.cache_path) + if not os.path.exists(energy_file): + energy = self._compute_and_save_energy(self.ap, wav_file, energy_file) + else: + energy = np.load(energy_file) + return energy.astype(np.float32) 
+    def collate_fn(self, batch):
+        audio_file = [item["audio_file"] for item in batch]
+        energys = [item["energy"] for item in batch]
+        energy_lens = [len(item["energy"]) for item in batch]
+        energy_lens_max = max(energy_lens)
+        # energy values are floats, so pad into a FloatTensor (a LongTensor would truncate them)
+        energys_torch = torch.FloatTensor(len(energys), energy_lens_max).fill_(self.get_pad_id())
+        for i, energy_len in enumerate(energy_lens):
+            energys_torch[i, :energy_len] = torch.FloatTensor(energys[i])
+        return {"audio_file": audio_file, "energy": energys_torch, "energy_lens": energy_lens}
+
+    def print_logs(self, level: int = 0) -> None:
+        indent = "\t" * level
+        print("\n")
+        print(f"{indent}> EnergyDataset ")
+        print(f"{indent}| > Number of instances : {len(self.samples)}")
diff --git a/TTS/tts/layers/losses.py b/TTS/tts/layers/losses.py
index 9933df6b..f39431fa 100644
--- a/TTS/tts/layers/losses.py
+++ b/TTS/tts/layers/losses.py
@@ -801,6 +801,10 @@ class ForwardTTSLoss(nn.Module):
             self.pitch_loss = MSELossMasked(False)
             self.pitch_loss_alpha = c.pitch_loss_alpha
 
+        if c.model_args.use_energy:
+            self.energy_loss = MSELossMasked(False)
+            self.energy_loss_alpha = c.energy_loss_alpha
+
         if c.use_ssim_loss:
             self.ssim = SSIMLoss() if c.use_ssim_loss else None
             self.ssim_loss_alpha = c.ssim_loss_alpha
@@ -826,6 +830,8 @@ class ForwardTTSLoss(nn.Module):
         dur_target,
         pitch_output,
         pitch_target,
+        energy_output,
+        energy_target,
         input_lens,
         alignment_logprob=None,
         alignment_hard=None,
@@ -855,6 +861,11 @@ class ForwardTTSLoss(nn.Module):
             loss = loss + self.pitch_loss_alpha * pitch_loss
             return_dict["loss_pitch"] = self.pitch_loss_alpha * pitch_loss
 
+        if hasattr(self, "energy_loss") and self.energy_loss_alpha > 0:
+            energy_loss = self.energy_loss(energy_output.transpose(1, 2), energy_target.transpose(1, 2), input_lens)
+            loss = loss + self.energy_loss_alpha * energy_loss
+            return_dict["loss_energy"] = self.energy_loss_alpha * energy_loss
+
         if hasattr(self, "aligner_loss") and self.aligner_loss_alpha > 0:
             aligner_loss = self.aligner_loss(alignment_logprob, input_lens, decoder_output_lens)
             loss = loss + self.aligner_loss_alpha * aligner_loss
diff --git a/TTS/tts/models/forward_tts.py b/TTS/tts/models/forward_tts.py
index c1132df2..6d1e90ca 100644
--- a/TTS/tts/models/forward_tts.py
+++ b/TTS/tts/models/forward_tts.py
@@ -15,7 +15,7 @@ from TTS.tts.models.base_tts import BaseTTS
 from TTS.tts.utils.helpers import average_over_durations, generate_path, maximum_path, sequence_mask
 from TTS.tts.utils.speakers import SpeakerManager
 from TTS.tts.utils.text.tokenizer import TTSTokenizer
-from TTS.tts.utils.visual import plot_alignment, plot_avg_pitch, plot_spectrogram
+from TTS.tts.utils.visual import plot_alignment, plot_avg_energy, plot_avg_pitch, plot_spectrogram
 from TTS.utils.io import load_fsspec
 
 
@@ -42,6 +42,9 @@ class ForwardTTSArgs(Coqpit):
         use_pitch (bool):
             Use pitch predictor to learn the pitch. Defaults to True.
 
+        use_energy (bool):
+            Use energy predictor to learn the energy. Defaults to False.
+
         duration_predictor_hidden_channels (int):
             Number of hidden channels in the duration predictor. Defaults to 256.
 
@@ -63,6 +66,18 @@ class ForwardTTSArgs(Coqpit):
         pitch_embedding_kernel_size (int):
             Kernel size of the projection layer in the pitch predictor. Defaults to 3.
 
+        energy_predictor_hidden_channels (int):
+            Number of hidden channels in the energy predictor. Defaults to 256.
+
+        energy_predictor_dropout_p (float):
+            Dropout rate for the energy predictor. Defaults to 0.1.
+
+        energy_predictor_kernel_size (int):
+            Kernel size of conv layers in the energy predictor. Defaults to 3.
+
+        energy_embedding_kernel_size (int):
+            Kernel size of the projection layer in the energy predictor. Defaults to 3.
+
         positional_encoding (bool):
             Whether to use positional encoding. Defaults to True.
 
@@ -114,14 +129,25 @@ class ForwardTTSArgs(Coqpit):
     out_channels: int = 80
     hidden_channels: int = 384
     use_aligner: bool = True
+
+    # pitch params
     use_pitch: bool = True
     pitch_predictor_hidden_channels: int = 256
     pitch_predictor_kernel_size: int = 3
     pitch_predictor_dropout_p: float = 0.1
     pitch_embedding_kernel_size: int = 3
+
+    # energy params
+    use_energy: bool = False
+    energy_predictor_hidden_channels: int = 256
+    energy_predictor_kernel_size: int = 3
+    energy_predictor_dropout_p: float = 0.1
+    energy_embedding_kernel_size: int = 3
+
+    # duration params
     duration_predictor_hidden_channels: int = 256
     duration_predictor_kernel_size: int = 3
     duration_predictor_dropout_p: float = 0.1
+
     positional_encoding: bool = True
     poisitonal_encoding_use_scale: bool = True
     length_scale: int = 1
@@ -158,7 +184,7 @@ class ForwardTTS(BaseTTS):
         - FastPitch
         - SpeedySpeech
         - FastSpeech
-        - TODO: FastSpeech2 (requires average speech energy predictor)
+        - FastSpeech2 (requires average speech energy predictor)
 
     Args:
         config (Coqpit): Model coqpit class.
@@ -187,6 +213,7 @@ class ForwardTTS(BaseTTS):
         self.max_duration = self.args.max_duration
         self.use_aligner = self.args.use_aligner
         self.use_pitch = self.args.use_pitch
+        self.use_energy = self.args.use_energy
        self.binary_loss_weight = 0.0
 
         self.length_scale = (
@@ -234,6 +261,20 @@ class ForwardTTS(BaseTTS):
                 padding=int((self.args.pitch_embedding_kernel_size - 1) / 2),
             )
 
+        if self.args.use_energy:
+            self.energy_predictor = DurationPredictor(
+                self.args.hidden_channels + self.embedded_speaker_dim,
+                self.args.energy_predictor_hidden_channels,
+                self.args.energy_predictor_kernel_size,
+                self.args.energy_predictor_dropout_p,
+            )
+            self.energy_emb = nn.Conv1d(
+                1,
+                self.args.hidden_channels,
+                kernel_size=self.args.energy_embedding_kernel_size,
+                padding=int((self.args.energy_embedding_kernel_size - 1) / 2),
+            )
+
         if self.args.use_aligner:
             self.aligner = AlignmentNetwork(
                 in_query_channels=self.args.out_channels, in_key_channels=self.args.hidden_channels
@@ -440,6 +481,42 @@ class ForwardTTS(BaseTTS):
             o_pitch_emb = self.pitch_emb(o_pitch)
             return o_pitch_emb, o_pitch
 
+    def _forward_energy_predictor(
+        self,
+        o_en: torch.FloatTensor,
+        x_mask: torch.IntTensor,
+        energy: torch.FloatTensor = None,
+        dr: torch.IntTensor = None,
+    ) -> Tuple[torch.FloatTensor, torch.FloatTensor]:
+        """Energy predictor forward pass.
+
+        1. Predict energy from encoder outputs.
+        2. In training - Compute average energy values for each input character from the ground truth energy values.
+        3. Embed the average energy values.
+
+        Args:
+            o_en (torch.FloatTensor): Encoder output.
+            x_mask (torch.IntTensor): Input sequence mask.
+            energy (torch.FloatTensor, optional): Ground truth energy values. Defaults to None.
+            dr (torch.IntTensor, optional): Ground truth durations. Defaults to None.
+
+        Returns:
+            Tuple[torch.FloatTensor, torch.FloatTensor]: Energy embedding, energy prediction (plus the averaged ground-truth energy during training).
+ + Shapes: + - o_en: :math:`(B, C, T_{en})` + - x_mask: :math:`(B, 1, T_{en})` + - pitch: :math:`(B, 1, T_{de})` + - dr: :math:`(B, T_{en})` + """ + o_energy = self.energy_predictor(o_en, x_mask) + if energy is not None: + avg_energy = average_over_durations(energy, dr) + o_energy_emb = self.energy_emb(avg_energy) + return o_energy_emb, o_energy, avg_energy + o_energy_emb = self.energy_emb(o_energy) + return o_energy_emb, o_energy + def _forward_aligner( self, x: torch.FloatTensor, y: torch.FloatTensor, x_mask: torch.IntTensor, y_mask: torch.IntTensor ) -> Tuple[torch.IntTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]: @@ -502,6 +579,7 @@ class ForwardTTS(BaseTTS): y: torch.FloatTensor = None, dr: torch.IntTensor = None, pitch: torch.FloatTensor = None, + energy: torch.FloatTensor = None, aux_input: Dict = {"d_vectors": None, "speaker_ids": None}, # pylint: disable=unused-argument ) -> Dict: """Model's forward pass. @@ -513,6 +591,7 @@ class ForwardTTS(BaseTTS): y (torch.FloatTensor): Spectrogram frames. Only used when the alignment network is on. Defaults to None. dr (torch.IntTensor): Character durations over the spectrogram frames. Only used when the alignment network is off. Defaults to None. pitch (torch.FloatTensor): Pitch values for each spectrogram frame. Only used when the pitch predictor is on. Defaults to None. + energy (torch.FloatTensor): energy values for each spectrogram frame. Only used when the energy predictor is on. Defaults to None. aux_input (Dict): Auxiliary model inputs for multi-speaker training. Defaults to `{"d_vectors": 0, "speaker_ids": None}`. Shapes: @@ -556,6 +635,12 @@ class ForwardTTS(BaseTTS): if self.args.use_pitch: o_pitch_emb, o_pitch, avg_pitch = self._forward_pitch_predictor(o_en, x_mask, pitch, dr) o_en = o_en + o_pitch_emb + # energy predictor pass + o_energy = None + avg_energy = None + if self.args.use_energy: + o_energy_emb, o_energy, avg_energy = self._forward_energy_predictor(o_en, x_mask, energy, dr) + o_en = o_en + o_energy_emb # decoder pass o_de, attn = self._forward_decoder( o_en, dr, x_mask, y_lengths, g=None @@ -567,6 +652,8 @@ class ForwardTTS(BaseTTS): "attn_durations": o_attn, # for visualization [B, T_en, T_de'] "pitch_avg": o_pitch, "pitch_avg_gt": avg_pitch, + "energy_avg": o_energy, + "energy_avg_gt": avg_energy, "alignments": attn, # [B, T_de, T_en] "alignment_soft": alignment_soft, "alignment_mas": alignment_mas, @@ -604,12 +691,18 @@ class ForwardTTS(BaseTTS): if self.args.use_pitch: o_pitch_emb, o_pitch = self._forward_pitch_predictor(o_en, x_mask) o_en = o_en + o_pitch_emb + # energy predictor pass + o_energy = None + if self.args.use_energy: + o_energy_emb, o_energy = self._forward_energy_predictor(o_en, x_mask) + o_en = o_en + o_energy_emb # decoder pass o_de, attn = self._forward_decoder(o_en, o_dr, x_mask, y_lengths, g=None) outputs = { "model_outputs": o_de, "alignments": attn, "pitch": o_pitch, + "energy": o_energy, "durations_log": o_dr_log, } return outputs @@ -620,6 +713,7 @@ class ForwardTTS(BaseTTS): mel_input = batch["mel_input"] mel_lengths = batch["mel_lengths"] pitch = batch["pitch"] if self.args.use_pitch else None + energy = batch["energy"] if self.args.use_energy else None d_vectors = batch["d_vectors"] speaker_ids = batch["speaker_ids"] durations = batch["durations"] @@ -627,7 +721,14 @@ class ForwardTTS(BaseTTS): # forward pass outputs = self.forward( - text_input, text_lengths, mel_lengths, y=mel_input, dr=durations, pitch=pitch, aux_input=aux_input + text_input, + text_lengths, + 
mel_lengths, + y=mel_input, + dr=durations, + pitch=pitch, + energy=energy, + aux_input=aux_input, ) # use aligner's output as the duration target if self.use_aligner: @@ -643,6 +744,8 @@ class ForwardTTS(BaseTTS): dur_target=durations, pitch_output=outputs["pitch_avg"] if self.use_pitch else None, pitch_target=outputs["pitch_avg_gt"] if self.use_pitch else None, + energy_output=outputs["energy_avg"] if self.use_energy else None, + energy_target=outputs["energy_avg_gt"] if self.use_energy else None, input_lens=text_lengths, alignment_logprob=outputs["alignment_logprob"] if self.use_aligner else None, alignment_soft=outputs["alignment_soft"], @@ -683,6 +786,17 @@ class ForwardTTS(BaseTTS): } figures.update(pitch_figures) + # plot energy figures + if self.args.use_energy: + energy_avg = abs(outputs["energy_avg_gt"][0, 0].data.cpu().numpy()) + energy_avg_hat = abs(outputs["energy_avg"][0, 0].data.cpu().numpy()) + chars = self.tokenizer.decode(batch["text_input"][0].data.cpu().numpy()) + energy_figures = { + "energy_ground_truth": plot_avg_energy(energy_avg, chars, output_fig=False), + "energy_avg_predicted": plot_avg_energy(energy_avg_hat, chars, output_fig=False), + } + figures.update(energy_figures) + # plot the attention mask computed from the predicted durations if "attn_durations" in outputs: alignments_hat = outputs["attn_durations"][0].data.cpu().numpy() diff --git a/TTS/tts/utils/visual.py b/TTS/tts/utils/visual.py index a823738d..fba7bc50 100644 --- a/TTS/tts/utils/visual.py +++ b/TTS/tts/utils/visual.py @@ -123,6 +123,39 @@ def plot_avg_pitch(pitch, chars, fig_size=(30, 10), output_fig=False): return fig +def plot_avg_energy(energy, chars, fig_size=(30, 10), output_fig=False): + """Plot energy curves on top of the input characters. + + Args: + energy (np.array): energy values. + chars (str): Characters to place to the x-axis. + + Shapes: + energy: :math:`(T,)` + """ + old_fig_size = plt.rcParams["figure.figsize"] + if fig_size is not None: + plt.rcParams["figure.figsize"] = fig_size + + fig, ax = plt.subplots() + + x = np.array(range(len(chars))) + my_xticks = chars + plt.xticks(x, my_xticks) + + ax.set_xlabel("characters") + ax.set_ylabel("freq") + + ax2 = ax.twinx() + ax2.plot(energy, linewidth=5.0, color="red") + ax2.set_ylabel("energy") + + plt.rcParams["figure.figsize"] = old_fig_size + if not output_fig: + plt.close() + return fig + + def visualize( alignment, postnet_output, diff --git a/TTS/utils/audio/numpy_transforms.py b/TTS/utils/audio/numpy_transforms.py index 952b2243..60f8e0dd 100644 --- a/TTS/utils/audio/numpy_transforms.py +++ b/TTS/utils/audio/numpy_transforms.py @@ -4,7 +4,7 @@ import librosa import numpy as np import scipy import soundfile as sf -from librosa import pyin +from librosa import magphase, pyin # For using kwargs # pylint: disable=unused-argument @@ -303,6 +303,27 @@ def compute_f0( return f0 +def compute_energy(y: np.ndarray, **kwargs) -> np.ndarray: + """Compute energy of a waveform using the same parameters used for computing melspectrogram. + Args: + x (np.ndarray): Waveform. Shape :math:`[T_wav,]` + Returns: + np.ndarray: energy. Shape :math:`[T_energy,]`. 
:math:`T_energy == T_wav / hop_length` + Examples: + >>> WAV_FILE = filename = librosa.util.example_audio_file() + >>> from TTS.config import BaseAudioConfig + >>> from TTS.utils.audio import AudioProcessor + >>> conf = BaseAudioConfig() + >>> ap = AudioProcessor(**conf) + >>> wav = ap.load_wav(WAV_FILE, sr=ap.sample_rate)[:5 * ap.sample_rate] + >>> energy = ap.compute_energy(wav) + """ + x = stft(y=y, **kwargs) + mag, _ = magphase(x) + energy = np.sqrt(np.sum(mag**2, axis=0)) + return energy + + ### Audio Processing ### def find_endpoint( *, diff --git a/recipes/ljspeech/fastspeech2/train_fastspeech2.py b/recipes/ljspeech/fastspeech2/train_fastspeech2.py new file mode 100644 index 00000000..93737dba --- /dev/null +++ b/recipes/ljspeech/fastspeech2/train_fastspeech2.py @@ -0,0 +1,102 @@ +import os + +from trainer import Trainer, TrainerArgs + +from TTS.config.shared_configs import BaseAudioConfig, BaseDatasetConfig +from TTS.tts.configs.fastspeech2_config import Fastspeech2Config +from TTS.tts.datasets import load_tts_samples +from TTS.tts.models.forward_tts import ForwardTTS +from TTS.tts.utils.text.tokenizer import TTSTokenizer +from TTS.utils.audio import AudioProcessor +from TTS.utils.manage import ModelManager + +output_path = os.path.dirname(os.path.abspath(__file__)) + +# init configs +dataset_config = BaseDatasetConfig( + formatter="ljspeech", + meta_file_train="metadata.csv", + # meta_file_attn_mask=os.path.join(output_path, "../LJSpeech-1.1/metadata_attn_mask.txt"), + path=os.path.join(output_path, "../LJSpeech-1.1/"), +) + +audio_config = BaseAudioConfig( + sample_rate=22050, + do_trim_silence=True, + trim_db=60.0, + signal_norm=False, + mel_fmin=0.0, + mel_fmax=8000, + spec_gain=1.0, + log_func="np.log", + ref_level_db=20, + preemphasis=0.0, +) + +config = Fastspeech2Config( + run_name="fastspeech2_ljspeech", + audio=audio_config, + batch_size=32, + eval_batch_size=16, + num_loader_workers=8, + num_eval_loader_workers=4, + compute_input_seq_cache=True, + compute_f0=True, + f0_cache_path=os.path.join(output_path, "f0_cache"), + compute_energy=True, + energy_cache_path=os.path.join(output_path, "energy_cache"), + run_eval=True, + test_delay_epochs=-1, + epochs=1000, + text_cleaner="english_cleaners", + use_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path=os.path.join(output_path, "phoneme_cache"), + precompute_num_workers=4, + print_step=50, + print_eval=False, + mixed_precision=False, + max_seq_len=500000, + output_path=output_path, + datasets=[dataset_config], +) + +# compute alignments +if not config.model_args.use_aligner: + manager = ModelManager() + model_path, config_path, _ = manager.download_model("tts_models/en/ljspeech/tacotron2-DCA") + # TODO: make compute_attention python callable + os.system( + f"python TTS/bin/compute_attention_masks.py --model_path {model_path} --config_path {config_path} --dataset ljspeech --dataset_metafile metadata.csv --data_path ./recipes/ljspeech/LJSpeech-1.1/ --use_cuda true" + ) + +# INITIALIZE THE AUDIO PROCESSOR +# Audio processor is used for feature extraction and audio I/O. +# It mainly serves to the dataloader and the training loggers. +ap = AudioProcessor.init_from_config(config) + +# INITIALIZE THE TOKENIZER +# Tokenizer is used to convert text to sequences of token IDs. 
+# If characters are not defined in the config, default characters are passed to the config +tokenizer, config = TTSTokenizer.init_from_config(config) + +# LOAD DATA SAMPLES +# Each sample is a list of ```[text, audio_file_path, speaker_name]``` +# You can define your custom sample loader returning the list of samples. +# Or define your custom formatter and pass it to the `load_tts_samples`. +# Check `TTS.tts.datasets.load_tts_samples` for more details. +train_samples, eval_samples = load_tts_samples( + dataset_config, + eval_split=True, + eval_split_max_size=config.eval_split_max_size, + eval_split_size=config.eval_split_size, +) + +# init the model +model = ForwardTTS(config, ap, tokenizer, speaker_manager=None) + +# init the trainer and 🚀 +trainer = Trainer( + TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples +) +trainer.fit() diff --git a/tests/tts_tests/test_fastspeech_2_speaker_emb_train.py b/tests/tts_tests/test_fastspeech_2_speaker_emb_train.py new file mode 100644 index 00000000..d12f8bed --- /dev/null +++ b/tests/tts_tests/test_fastspeech_2_speaker_emb_train.py @@ -0,0 +1,95 @@ +import glob +import json +import os +import shutil + +from trainer import get_last_checkpoint + +from tests import get_device_id, get_tests_output_path, run_cli +from TTS.config.shared_configs import BaseAudioConfig +from TTS.tts.configs.fastspeech2_config import Fastspeech2Config + +config_path = os.path.join(get_tests_output_path(), "fast_pitch_speaker_emb_config.json") +output_path = os.path.join(get_tests_output_path(), "train_outputs") + +audio_config = BaseAudioConfig( + sample_rate=22050, + do_trim_silence=True, + trim_db=60.0, + signal_norm=False, + mel_fmin=0.0, + mel_fmax=8000, + spec_gain=1.0, + log_func="np.log", + ref_level_db=20, + preemphasis=0.0, +) + +config = Fastspeech2Config( + audio=audio_config, + batch_size=8, + eval_batch_size=8, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", + f0_cache_path="tests/data/ljspeech/f0_cache/", + compute_f0=True, + compute_energy=True, + energy_cache_path="tests/data/ljspeech/f0_cache/", + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + use_speaker_embedding=True, + test_sentences=[ + "Be a voice, not an echo.", + ], +) +config.audio.do_trim_silence = True +config.use_speaker_embedding = True +config.model_args.use_speaker_embedding = True +config.audio.trim_db = 60 +config.save_json(config_path) + +# train the model for one epoch +command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech_test " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " + "--coqpit.test_delay_epochs 0" +) +run_cli(command_train) + +# Find latest folder +continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + +# Inference using TTS API +continue_config_path = os.path.join(continue_path, "config.json") +continue_restore_path, _ = get_last_checkpoint(continue_path) +out_wav_path = os.path.join(get_tests_output_path(), "output.wav") +speaker_id = "ljspeech-1" 
+continue_speakers_path = os.path.join(continue_path, "speakers.json") + +# Check integrity of the config +with open(continue_config_path, "r", encoding="utf-8") as f: + config_loaded = json.load(f) +assert config_loaded["characters"] is not None +assert config_loaded["output_path"] in continue_path +assert config_loaded["test_delay_epochs"] == 0 + +# Load the model and run inference +inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" +run_cli(inference_command) + +# restore the model and continue training for one more epoch +command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " +run_cli(command_train) +shutil.rmtree(continue_path) diff --git a/tests/tts_tests/test_fastspeech_2_train.py b/tests/tts_tests/test_fastspeech_2_train.py new file mode 100644 index 00000000..f54e6351 --- /dev/null +++ b/tests/tts_tests/test_fastspeech_2_train.py @@ -0,0 +1,94 @@ +import glob +import json +import os +import shutil + +from trainer import get_last_checkpoint + +from tests import get_device_id, get_tests_output_path, run_cli +from TTS.config.shared_configs import BaseAudioConfig +from TTS.tts.configs.fastspeech2_config import Fastspeech2Config + +config_path = os.path.join(get_tests_output_path(), "test_model_config.json") +output_path = os.path.join(get_tests_output_path(), "train_outputs") + +audio_config = BaseAudioConfig( + sample_rate=22050, + do_trim_silence=True, + trim_db=60.0, + signal_norm=False, + mel_fmin=0.0, + mel_fmax=8000, + spec_gain=1.0, + log_func="np.log", + ref_level_db=20, + preemphasis=0.0, +) + +config = Fastspeech2Config( + audio=audio_config, + batch_size=8, + eval_batch_size=8, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="english_cleaners", + use_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", + f0_cache_path="tests/data/ljspeech/f0_cache/", + compute_f0=True, + compute_energy=True, + energy_cache_path="tests/data/ljspeech/f0_cache/", + run_eval=True, + test_delay_epochs=-1, + epochs=1, + print_step=1, + print_eval=True, + test_sentences=[ + "Be a voice, not an echo.", + ], + use_speaker_embedding=False, +) +config.audio.do_trim_silence = True +config.use_speaker_embedding = False +config.model_args.use_speaker_embedding = False +config.audio.trim_db = 60 +config.save_json(config_path) + +# train the model for one epoch +command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt " + "--coqpit.test_delay_epochs 0" +) + +run_cli(command_train) + +# Find latest folder +continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + +# Inference using TTS API +continue_config_path = os.path.join(continue_path, "config.json") +continue_restore_path, _ = get_last_checkpoint(continue_path) +out_wav_path = os.path.join(get_tests_output_path(), "output.wav") + +# Check integrity of the config +with 
open(continue_config_path, "r", encoding="utf-8") as f: + config_loaded = json.load(f) +assert config_loaded["characters"] is not None +assert config_loaded["output_path"] in continue_path +assert config_loaded["test_delay_epochs"] == 0 + +# Load the model and run inference +inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" +run_cli(inference_command) + +# restore the model and continue training for one more epoch +command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} " +run_cli(command_train) +shutil.rmtree(continue_path) From 6e3f74fc2927892b5575d33e4df757e6d9f19fd3 Mon Sep 17 00:00:00 2001 From: Eren G??lge Date: Sun, 15 Jan 2023 23:11:57 +0100 Subject: [PATCH 11/24] Fix #2191 --- TTS/tts/models/base_tts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TTS/tts/models/base_tts.py b/TTS/tts/models/base_tts.py index 58d740d2..69980e72 100644 --- a/TTS/tts/models/base_tts.py +++ b/TTS/tts/models/base_tts.py @@ -345,7 +345,7 @@ class BaseTTS(BaseTrainerModel): loader = DataLoader( dataset, batch_size=config.eval_batch_size if is_eval else config.batch_size, - shuffle=config.shuffle if sampler is not None else False, # if there is no other sampler + shuffle=config.shuffle if sampler is None else False, # if there is no other sampler collate_fn=dataset.collate_fn, drop_last=config.drop_last, # setting this False might cause issues in AMP training. sampler=sampler, From 497f22b20b40763098b580f8869164a4ea121f31 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Mon, 23 Jan 2023 11:49:51 +0100 Subject: [PATCH 12/24] Cache speaker encoder model (#2284) --- TTS/tts/utils/managers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TTS/tts/utils/managers.py b/TTS/tts/utils/managers.py index 0159a9d2..1f94c533 100644 --- a/TTS/tts/utils/managers.py +++ b/TTS/tts/utils/managers.py @@ -324,7 +324,7 @@ class EmbeddingManager(BaseIDManager): self.encoder_config = load_config(config_path) self.encoder = setup_encoder_model(self.encoder_config) self.encoder_criterion = self.encoder.load_checkpoint( - self.encoder_config, model_path, eval=True, use_cuda=use_cuda + self.encoder_config, model_path, eval=True, use_cuda=use_cuda, cache=True ) self.encoder_ap = AudioProcessor(**self.encoder_config.audio) From d83ee8fe45e3c0d776d4a865aca21d7c2ac324c4 Mon Sep 17 00:00:00 2001 From: Shivam Mehta Date: Mon, 23 Jan 2023 11:53:04 +0100 Subject: [PATCH 13/24] Adding neural HMM TTS Model (#2272) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Adding neural HMM TTS * Adding tests * Adding neural hmm on readme * renaming training recipe * Removing overflow\s decoder parameters from the config * Update the Trainer requirement version for a compatible one (#2276) * Bump up to v0.10.2 * Adding neural HMM TTS * Adding tests * Adding neural hmm on readme * renaming training recipe * Removing overflow\s decoder parameters from the config * fixing documentation Co-authored-by: Edresson Casanova Co-authored-by: Eren Gölge --- README.md | 1 + TTS/tts/configs/neuralhmm_tts_config.py | 170 ++++++++ TTS/tts/layers/overflow/plotting_utils.py | 15 +- TTS/tts/models/neuralhmm_tts.py | 384 ++++++++++++++++++ TTS/tts/models/overflow.py | 3 - .../neuralhmm_tts/train_neuralhmmtts.py | 96 +++++ tests/tts_tests/test_neuralhmm_tts_train.py 
| 92 +++++
 7 files changed, 753 insertions(+), 8 deletions(-)
 create mode 100644 TTS/tts/configs/neuralhmm_tts_config.py
 create mode 100644 TTS/tts/models/neuralhmm_tts.py
 create mode 100644 recipes/ljspeech/neuralhmm_tts/train_neuralhmmtts.py
 create mode 100644 tests/tts_tests/test_neuralhmm_tts_train.py

diff --git a/README.md b/README.md
index 3c960af6..5a28bc8d 100644
--- a/README.md
+++ b/README.md
@@ -95,6 +95,7 @@ Underlined "TTS*" and "Judy*" are 🐸TTS models
 - SC-GlowTTS: [paper](https://arxiv.org/abs/2104.05557)
 - Capacitron: [paper](https://arxiv.org/abs/1906.03402)
 - OverFlow: [paper](https://arxiv.org/abs/2211.06892)
+- Neural HMM TTS: [paper](https://arxiv.org/abs/2108.13320)
 
 ### End-to-End Models
 - VITS: [paper](https://arxiv.org/pdf/2106.06103)
diff --git a/TTS/tts/configs/neuralhmm_tts_config.py b/TTS/tts/configs/neuralhmm_tts_config.py
new file mode 100644
index 00000000..50f72847
--- /dev/null
+++ b/TTS/tts/configs/neuralhmm_tts_config.py
@@ -0,0 +1,170 @@
+from dataclasses import dataclass, field
+from typing import List
+
+from TTS.tts.configs.shared_configs import BaseTTSConfig
+
+
+@dataclass
+class NeuralhmmTTSConfig(BaseTTSConfig):
+    """
+    Define parameters for the Neural HMM TTS model.
+
+    Example:
+
+        >>> from TTS.tts.configs.neuralhmm_tts_config import NeuralhmmTTSConfig
+        >>> config = NeuralhmmTTSConfig()
+
+    Args:
+        model (str):
+            Model name used to select the right model class to initialize. Defaults to `NeuralHMM_TTS`.
+        run_eval_steps (int):
+            Run evaluation epoch after N steps. If None, waits until the training epoch is completed. Defaults to None.
+        save_step (int):
+            Save local checkpoint every save_step steps. Defaults to 500.
+        plot_step (int):
+            Plot training stats on the logger every plot_step steps. Defaults to 1.
+        model_param_stats (bool):
+            Log model parameter stats on the logger dashboard. Defaults to False.
+        force_generate_statistics (bool):
+            Force regeneration of the mel normalization statistics. Defaults to False.
+        mel_statistics_parameter_path (str):
+            Path to the mel normalization statistics. If the model doesn't find a file there, it will generate the statistics.
+            Defaults to None.
+        num_chars (int):
+            Number of characters used by the model. It must be defined before initializing the model. Defaults to None.
+        state_per_phone (int):
+            Generates N states per phone. Similar to the `add_blank` parameter in GlowTTS, but here it is upsampled by the model's encoder. Defaults to 2.
+        encoder_in_out_features (int):
+            Channels of encoder input and character embedding tensors. Defaults to 512.
+        encoder_n_convolutions (int):
+            Number of convolution layers in the encoder. Defaults to 3.
+        out_channels (int):
+            Channels of the final model output. It must match the spectrogram size. Defaults to 80.
+        ar_order (int):
+            Autoregressive order of the model. Defaults to 1. Ablations of Neural HMM found that more autoregression, while giving more variation, hurts the naturalness of the synthesised audio.
+        sampling_temp (float):
+            Variation added to the sample from the latent space of the neural HMM. Defaults to 0.
+        deterministic_transition (bool):
+            Deterministic duration generation based on duration quantiles, as defined in "S. Ronanki, O. Watts, S. King, and G. E. Henter, 'Median-based generation of synthetic speech durations using a non-parametric approach,' in Proc. SLT, 2016.". Defaults to True.
+        duration_threshold (float):
+            Threshold for duration quantiles. Defaults to 0.43. Tune this to change the speaking rate of the synthesis, where lower values give a slower speaking rate and higher values a faster one.
+        use_grad_checkpointing (bool):
+            Use gradient checkpointing to save memory. In a multi-GPU setting, PyTorch currently does not support gradient checkpointing inside a loop, so it has to be turned off there. Adjust it depending on whichever gives you a larger batch size, a single GPU or multiple GPUs. Defaults to True.
+        max_sampling_time (int):
+            Maximum sampling time while synthesising latents from the neural HMM. Defaults to 1000.
+        prenet_type (str):
+            `original` or `bn`. `original` sets the default Prenet and `bn` uses the Batch Normalization version of the
+            Prenet. Defaults to `original`.
+        prenet_dim (int):
+            Dimension of the Prenet. Defaults to 256.
+        prenet_n_layers (int):
+            Number of layers in the Prenet. Defaults to 2.
+        prenet_dropout (float):
+            Dropout rate of the Prenet. Defaults to 0.5.
+        prenet_dropout_at_inference (bool):
+            Use dropout at inference time. Defaults to True.
+        memory_rnn_dim (int):
+            Dimension of the memory LSTM that processes the prenet output. Defaults to 1024.
+        outputnet_size (list[int]):
+            Size of the output network inside the neural HMM. Defaults to [1024].
+        flat_start_params (dict):
+            Parameters for the flat start initialization of the neural HMM. Defaults to `{"mean": 0.0, "std": 1.0, "transition_p": 0.14}`.
+            They are recomputed when you pass the dataset.
+        std_floor (float):
+            Floor value for the standard deviation of the neural HMM. Prevents the model from cheating by putting point mass on a datapoint and getting infinite likelihood. Defaults to 0.001.
+            It is called `variance flooring` in standard HMM literature.
+        optimizer (str):
+            Optimizer to use for training. Defaults to `Adam`.
+        optimizer_params (dict):
+            Parameters for the optimizer. Defaults to `{"weight_decay": 1e-6}`.
+        grad_clip (float):
+            Gradient clipping threshold. Defaults to 40_000.
+        lr (float):
+            Learning rate. Defaults to 1e-3.
+        lr_scheduler (str):
+            Learning rate scheduler for the training. Use one of the `torch.optim.Scheduler` schedulers or
+            `TTS.utils.training`. Defaults to `None`.
+        min_seq_len (int):
+            Minimum input sequence length to be used at training.
+        max_seq_len (int):
+            Maximum input sequence length to be used at training. Larger values result in more VRAM usage.
+    """
+
+    model: str = "NeuralHMM_TTS"
+
+    # Training and Checkpoint configs
+    run_eval_steps: int = 100
+    save_step: int = 500
+    plot_step: int = 1
+    model_param_stats: bool = False
+
+    # data parameters
+    force_generate_statistics: bool = False
+    mel_statistics_parameter_path: str = None
+
+    # Encoder parameters
+    num_chars: int = None
+    state_per_phone: int = 2
+    encoder_in_out_features: int = 512
+    encoder_n_convolutions: int = 3
+
+    # HMM parameters
+    out_channels: int = 80
+    ar_order: int = 1
+    sampling_temp: float = 0
+    deterministic_transition: bool = True
+    duration_threshold: float = 0.43
+    use_grad_checkpointing: bool = True
+    max_sampling_time: int = 1000
+
+    ## Prenet parameters
+    prenet_type: str = "original"
+    prenet_dim: int = 256
+    prenet_n_layers: int = 2
+    prenet_dropout: float = 0.5
+    prenet_dropout_at_inference: bool = True
+    memory_rnn_dim: int = 1024
+
+    ## Outputnet parameters
+    outputnet_size: List[int] = field(default_factory=lambda: [1024])
+    flat_start_params: dict = field(default_factory=lambda: {"mean": 0.0, "std": 1.0, "transition_p": 0.14})
+    std_floor: float = 0.001
+
+    # optimizer parameters
+    optimizer: str = "Adam"
+    optimizer_params: dict = field(default_factory=lambda: {"weight_decay": 1e-6})
+    grad_clip: float = 40000.0
+    lr: float = 1e-3
+    lr_scheduler: str = None
+
+    # overrides
+    min_text_len: int = 10
+    max_text_len: int = 500
+    min_audio_len: int = 512
+
+    # testing
+    test_sentences: List[str] = field(
+        default_factory=lambda: [
+            "Be a voice, not an echo.",
+        ]
+    )
+
+    # Extra needed config
+    r: int = 1
+    use_d_vector_file: bool = False
+    use_speaker_embedding: bool = False
+
+    def check_values(self):
+        """Validate the hyperparameters.
+
+        Raises:
+            AssertionError: when the parameter network is not defined
+            AssertionError: transition probability is not between 0 and 1
+        """
+        assert self.ar_order > 0, "AR order must be greater than 0; it is an autoregressive model."
+        assert (
+            len(self.outputnet_size) >= 1
+        ), f"Parameter network must have at least one layer; check the config file for the parameter network. Provided: {self.outputnet_size}"
+        assert (
+            0 < self.flat_start_params["transition_p"] < 1
+        ), f"Transition probability must be between 0 and 1. Provided: {self.flat_start_params['transition_p']}"
diff --git a/TTS/tts/layers/overflow/plotting_utils.py b/TTS/tts/layers/overflow/plotting_utils.py
index a9f9c301..a63aeb37 100644
--- a/TTS/tts/layers/overflow/plotting_utils.py
+++ b/TTS/tts/layers/overflow/plotting_utils.py
@@ -30,7 +30,7 @@ def validate_numpy_array(value: Any):
     return value
 
 
-def get_spec_from_most_probable_state(log_alpha_scaled, means, decoder):
+def get_spec_from_most_probable_state(log_alpha_scaled, means, decoder=None):
     """Get the most probable state means from the log_alpha_scaled.
 
     Args:
         log_alpha_scaled (torch.Tensor):
             - Shape: :math:`(T, N)`
         means (torch.Tensor): Means of the states.
             - Shape: :math:`(N, T, D_out)`
-        decoder (torch.nn.Module): Decoder module to decode the latent to melspectrogram
+        decoder (torch.nn.Module): Decoder module to decode the latent to melspectrogram. Defaults to None.
""" max_state_numbers = torch.max(log_alpha_scaled, dim=1)[1] max_len = means.shape[0] n_mel_channels = means.shape[2] max_state_numbers = max_state_numbers.unsqueeze(1).unsqueeze(1).expand(max_len, 1, n_mel_channels) means = torch.gather(means, 1, max_state_numbers).squeeze(1).to(log_alpha_scaled.dtype) - mel = ( - decoder(means.T.unsqueeze(0), torch.tensor([means.shape[0]], device=means.device), reverse=True)[0].squeeze(0).T - ) + if decoder is not None: + mel = ( + decoder(means.T.unsqueeze(0), torch.tensor([means.shape[0]], device=means.device), reverse=True)[0] + .squeeze(0) + .T + ) + else: + mel = means return mel diff --git a/TTS/tts/models/neuralhmm_tts.py b/TTS/tts/models/neuralhmm_tts.py new file mode 100644 index 00000000..e4f88452 --- /dev/null +++ b/TTS/tts/models/neuralhmm_tts.py @@ -0,0 +1,384 @@ +import os +from typing import Dict, List, Union + +import torch +from coqpit import Coqpit +from torch import nn +from trainer.logging.tensorboard_logger import TensorboardLogger + +from TTS.tts.layers.overflow.common_layers import Encoder, OverflowUtils +from TTS.tts.layers.overflow.neural_hmm import NeuralHMM +from TTS.tts.layers.overflow.plotting_utils import ( + get_spec_from_most_probable_state, + plot_transition_probabilities_to_numpy, +) +from TTS.tts.models.base_tts import BaseTTS +from TTS.tts.utils.speakers import SpeakerManager +from TTS.tts.utils.text.tokenizer import TTSTokenizer +from TTS.tts.utils.visual import plot_alignment, plot_spectrogram +from TTS.utils.generic_utils import format_aux_input +from TTS.utils.io import load_fsspec + + +class NeuralhmmTTS(BaseTTS): + """Neural HMM TTS model. + + Paper:: + https://arxiv.org/abs/2108.13320 + + Paper abstract:: + Neural sequence-to-sequence TTS has achieved significantly better output quality + than statistical speech synthesis using HMMs.However, neural TTS is generally not probabilistic + and uses non-monotonic attention. Attention failures increase training time and can make + synthesis babble incoherently. This paper describes how the old and new paradigms can be + combined to obtain the advantages of both worlds, by replacing attention in neural TTS with + an autoregressive left-right no-skip hidden Markov model defined by a neural network. + Based on this proposal, we modify Tacotron 2 to obtain an HMM-based neural TTS model with + monotonic alignment, trained to maximise the full sequence likelihood without approximation. + We also describe how to combine ideas from classical and contemporary TTS for best results. + The resulting example system is smaller and simpler than Tacotron 2, and learns to speak with + fewer iterations and less data, whilst achieving comparable naturalness prior to the post-net. + Our approach also allows easy control over speaking rate. Audio examples and code + are available at https://shivammehta25.github.io/Neural-HMM/ . + + Note: + - This is a parameter efficient version of OverFlow (15.3M vs 28.6M). Since it has half the + number of parameters as OverFlow the synthesis output quality is suboptimal (but comparable to Tacotron2 + without Postnet), but it learns to speak with even lesser amount of data and is still significantly faster + than other attention-based methods. + + - Neural HMMs uses flat start initialization i.e it computes the means and std and transition probabilities + of the dataset and uses them to initialize the model. 
This benefits the model and helps with faster learning + If you change the dataset or want to regenerate the parameters change the `force_generate_statistics` and + `mel_statistics_parameter_path` accordingly. + + - To enable multi-GPU training, set the `use_grad_checkpointing=False` in config. + This will significantly increase the memory usage. This is because to compute + the actual data likelihood (not an approximation using MAS/Viterbi) we must use + all the states at the previous time step during the forward pass to decide the + probability distribution at the current step i.e the difference between the forward + algorithm and viterbi approximation. + + Check :class:`TTS.tts.configs.neuralhmm_tts_config.NeuralhmmTTSConfig` for class arguments. + """ + + def __init__( + self, + config: "NeuralhmmTTSConfig", + ap: "AudioProcessor" = None, + tokenizer: "TTSTokenizer" = None, + speaker_manager: SpeakerManager = None, + ): + super().__init__(config, ap, tokenizer, speaker_manager) + + # pass all config fields to `self` + # for fewer code change + self.config = config + for key in config: + setattr(self, key, config[key]) + + self.encoder = Encoder(config.num_chars, config.state_per_phone, config.encoder_in_out_features) + self.neural_hmm = NeuralHMM( + frame_channels=self.out_channels, + ar_order=self.ar_order, + deterministic_transition=self.deterministic_transition, + encoder_dim=self.encoder_in_out_features, + prenet_type=self.prenet_type, + prenet_dim=self.prenet_dim, + prenet_n_layers=self.prenet_n_layers, + prenet_dropout=self.prenet_dropout, + prenet_dropout_at_inference=self.prenet_dropout_at_inference, + memory_rnn_dim=self.memory_rnn_dim, + outputnet_size=self.outputnet_size, + flat_start_params=self.flat_start_params, + std_floor=self.std_floor, + use_grad_checkpointing=self.use_grad_checkpointing, + ) + + self.register_buffer("mean", torch.tensor(0)) + self.register_buffer("std", torch.tensor(1)) + + def update_mean_std(self, statistics_dict: Dict): + self.mean.data = torch.tensor(statistics_dict["mean"]) + self.std.data = torch.tensor(statistics_dict["std"]) + + def preprocess_batch(self, text, text_len, mels, mel_len): + if self.mean.item() == 0 or self.std.item() == 1: + statistics_dict = torch.load(self.mel_statistics_parameter_path) + self.update_mean_std(statistics_dict) + + mels = self.normalize(mels) + return text, text_len, mels, mel_len + + def normalize(self, x): + return x.sub(self.mean).div(self.std) + + def inverse_normalize(self, x): + return x.mul(self.std).add(self.mean) + + def forward(self, text, text_len, mels, mel_len): + """ + Forward pass for training and computing the log likelihood of a given batch. 
+
+        Shapes:
+            text: :math:`[B, T_in]`
+            text_len: :math:`[B]`
+            mels: :math:`[B, T_out, C]`
+            mel_len: :math:`[B]`
+        """
+        text, text_len, mels, mel_len = self.preprocess_batch(text, text_len, mels, mel_len)
+        encoder_outputs, encoder_output_len = self.encoder(text, text_len)
+
+        log_probs, fwd_alignments, transition_vectors, means = self.neural_hmm(
+            encoder_outputs, encoder_output_len, mels.transpose(1, 2), mel_len
+        )
+
+        outputs = {
+            "log_probs": log_probs,
+            "alignments": fwd_alignments,
+            "transition_vectors": transition_vectors,
+            "means": means,
+        }
+
+        return outputs
+
+    @staticmethod
+    def _training_stats(batch):
+        stats = {}
+        stats["avg_text_length"] = batch["text_lengths"].float().mean()
+        stats["avg_spec_length"] = batch["mel_lengths"].float().mean()
+        stats["avg_text_batch_occupancy"] = (batch["text_lengths"].float() / batch["text_lengths"].float().max()).mean()
+        stats["avg_spec_batch_occupancy"] = (batch["mel_lengths"].float() / batch["mel_lengths"].float().max()).mean()
+        return stats
+
+    def train_step(self, batch: dict, criterion: nn.Module):
+        text_input = batch["text_input"]
+        text_lengths = batch["text_lengths"]
+        mel_input = batch["mel_input"]
+        mel_lengths = batch["mel_lengths"]
+
+        outputs = self.forward(
+            text=text_input,
+            text_len=text_lengths,
+            mels=mel_input,
+            mel_len=mel_lengths,
+        )
+        loss_dict = criterion(outputs["log_probs"] / (mel_lengths.sum() + text_lengths.sum()))
+
+        # for printing useful statistics on the terminal
+        loss_dict.update(self._training_stats(batch))
+        return outputs, loss_dict
+
+    def eval_step(self, batch: Dict, criterion: nn.Module):
+        return self.train_step(batch, criterion)
+
+    def _format_aux_input(self, aux_input: Dict, default_input_dict):
+        """Set missing fields to their default value.
+
+        Args:
+            aux_input (Dict): Dictionary containing the auxiliary inputs.
+        """
+        default_input_dict.update(
+            {
+                "sampling_temp": self.sampling_temp,
+                "max_sampling_time": self.max_sampling_time,
+                "duration_threshold": self.duration_threshold,
+            }
+        )
+        if aux_input:
+            return format_aux_input(aux_input, default_input_dict)
+        return None
+
+    @torch.no_grad()
+    def inference(
+        self,
+        text: torch.Tensor,
+        aux_input={"x_lengths": None, "sampling_temp": None, "max_sampling_time": None, "duration_threshold": None},
+    ):  # pylint: disable=dangerous-default-value
+        """Sampling from the model.
+
+        Args:
+            text (torch.Tensor): :math:`[B, T_in]`
+            aux_input (Dict, optional): Auxiliary model inputs such as `x_lengths`, `sampling_temp`, `max_sampling_time` and `duration_threshold`. Missing fields are set to their default values.
+
+        Returns:
+            outputs: Dictionary containing the following
+                - mel (torch.Tensor): :math:`[B, T_out, C]`
+                - hmm_outputs_len (torch.Tensor): :math:`[B]`
+                - state_travelled (List[List[int]]): List of lists containing the state travelled for each sample in the batch.
+                - input_parameters (list[torch.FloatTensor]): Input parameters to the neural HMM.
+                - output_parameters (list[torch.FloatTensor]): Output parameters to the neural HMM.
+ """ + default_input_dict = { + "x_lengths": torch.sum(text != 0, dim=1), + } + aux_input = self._format_aux_input(aux_input, default_input_dict) + encoder_outputs, encoder_output_len = self.encoder.inference(text, aux_input["x_lengths"]) + outputs = self.neural_hmm.inference( + encoder_outputs, + encoder_output_len, + sampling_temp=aux_input["sampling_temp"], + max_sampling_time=aux_input["max_sampling_time"], + duration_threshold=aux_input["duration_threshold"], + ) + mels, mel_outputs_len = outputs["hmm_outputs"], outputs["hmm_outputs_len"] + + mels = self.inverse_normalize(mels) + outputs.update({"model_outputs": mels, "model_outputs_len": mel_outputs_len}) + outputs["alignments"] = OverflowUtils.double_pad(outputs["alignments"]) + return outputs + + @staticmethod + def get_criterion(): + return NLLLoss() + + @staticmethod + def init_from_config(config: "NeuralhmmTTSConfig", samples: Union[List[List], List[Dict]] = None, verbose=True): + """Initiate model from config + + Args: + config (VitsConfig): Model config. + samples (Union[List[List], List[Dict]]): Training samples to parse speaker ids for training. + Defaults to None. + verbose (bool): If True, print init messages. Defaults to True. + """ + from TTS.utils.audio import AudioProcessor + + ap = AudioProcessor.init_from_config(config, verbose) + tokenizer, new_config = TTSTokenizer.init_from_config(config) + speaker_manager = SpeakerManager.init_from_config(config, samples) + return NeuralhmmTTS(new_config, ap, tokenizer, speaker_manager) + + def load_checkpoint( + self, config: Coqpit, checkpoint_path: str, eval: bool = False, strict: bool = True, cache=False + ): # pylint: disable=unused-argument, redefined-builtin + state = load_fsspec(checkpoint_path, map_location=torch.device("cpu")) + self.load_state_dict(state["model"]) + if eval: + self.eval() + assert not self.training + + def on_init_start(self, trainer): + """If the current dataset does not have normalisation statistics and initialisation transition_probability it computes them otherwise loads.""" + if not os.path.isfile(trainer.config.mel_statistics_parameter_path) or trainer.config.force_generate_statistics: + dataloader = trainer.get_train_dataloader( + training_assets=None, samples=trainer.train_samples, verbose=False + ) + print( + f" | > Data parameters not found for: {trainer.config.mel_statistics_parameter_path}. Computing mel normalization parameters..." + ) + data_mean, data_std, init_transition_prob = OverflowUtils.get_data_parameters_for_flat_start( + dataloader, trainer.config.out_channels, trainer.config.state_per_phone + ) + print( + f" | > Saving data parameters to: {trainer.config.mel_statistics_parameter_path}: value: {data_mean, data_std, init_transition_prob}" + ) + statistics = { + "mean": data_mean.item(), + "std": data_std.item(), + "init_transition_prob": init_transition_prob.item(), + } + torch.save(statistics, trainer.config.mel_statistics_parameter_path) + + else: + print( + f" | > Data parameters found for: {trainer.config.mel_statistics_parameter_path}. Loading mel normalization parameters..." 
+            )
+            statistics = torch.load(trainer.config.mel_statistics_parameter_path)
+            data_mean, data_std, init_transition_prob = (
+                statistics["mean"],
+                statistics["std"],
+                statistics["init_transition_prob"],
+            )
+            print(f" | > Data parameters loaded with value: {data_mean, data_std, init_transition_prob}")
+
+        trainer.config.flat_start_params["transition_p"] = (
+            init_transition_prob.item() if torch.is_tensor(init_transition_prob) else init_transition_prob
+        )
+        OverflowUtils.update_flat_start_transition(trainer.model, init_transition_prob)
+        trainer.model.update_mean_std(statistics)
+
+    @torch.inference_mode()
+    def _create_logs(self, batch, outputs, ap):  # pylint: disable=no-self-use, unused-argument
+        alignments, transition_vectors = outputs["alignments"], outputs["transition_vectors"]
+        means = torch.stack(outputs["means"], dim=1)
+
+        figures = {
+            "alignment": plot_alignment(alignments[0].exp(), title="Forward alignment", fig_size=(20, 20)),
+            "log_alignment": plot_alignment(
+                alignments[0].exp(), title="Forward log alignment", plot_log=True, fig_size=(20, 20)
+            ),
+            "transition_vectors": plot_alignment(transition_vectors[0], title="Transition vectors", fig_size=(20, 20)),
+            "mel_from_most_probable_state": plot_spectrogram(
+                get_spec_from_most_probable_state(alignments[0], means[0]), fig_size=(12, 3)
+            ),
+            "mel_target": plot_spectrogram(batch["mel_input"][0], fig_size=(12, 3)),
+        }
+
+        # sample one item from the batch; -1 will give the smallest item
+        print(" | > Synthesising audio from the model...")
+        inference_output = self.inference(
+            batch["text_input"][-1].unsqueeze(0), aux_input={"x_lengths": batch["text_lengths"][-1].unsqueeze(0)}
+        )
+        figures["synthesised"] = plot_spectrogram(inference_output["model_outputs"][0], fig_size=(12, 3))
+
+        states = [p[1] for p in inference_output["input_parameters"][0]]
+        transition_probability_synthesising = [p[2].cpu().numpy() for p in inference_output["output_parameters"][0]]
+
+        for i in range((len(transition_probability_synthesising) // 200) + 1):
+            start = i * 200
+            end = (i + 1) * 200
+            figures[f"synthesised_transition_probabilities/{i}"] = plot_transition_probabilities_to_numpy(
+                states[start:end], transition_probability_synthesising[start:end]
+            )
+
+        audio = ap.inv_melspectrogram(inference_output["model_outputs"][0].T.cpu().numpy())
+        return figures, {"audios": audio}
+
+    def train_log(
+        self, batch: dict, outputs: dict, logger: "Logger", assets: dict, steps: int
+    ):  # pylint: disable=unused-argument
+        """Log training progress."""
+        figures, audios = self._create_logs(batch, outputs, self.ap)
+        logger.train_figures(steps, figures)
+        logger.train_audios(steps, audios, self.ap.sample_rate)
+
+    def eval_log(
+        self, batch: Dict, outputs: Dict, logger: "Logger", assets: Dict, steps: int
+    ):  # pylint: disable=unused-argument
+        """Compute and log evaluation metrics."""
+        # Plot model parameter histograms
+        if isinstance(logger, TensorboardLogger):
+            # I don't know if any other loggers support this
+            for tag, value in self.named_parameters():
+                tag = tag.replace(".", "/")
+                logger.writer.add_histogram(tag, value.data.cpu().numpy(), steps)
+
+        figures, audios = self._create_logs(batch, outputs, self.ap)
+        logger.eval_figures(steps, figures)
+        logger.eval_audios(steps, audios, self.ap.sample_rate)
+
+    def test_log(
+        self, outputs: dict, logger: "Logger", assets: dict, steps: int  # pylint: disable=unused-argument
+    ) -> None:
+        logger.test_audios(steps, outputs[1], self.ap.sample_rate)
+        logger.test_figures(steps, outputs[0])
+
+
+class NLLLoss(nn.Module):
+    """Negative log likelihood loss."""
+
+    def forward(self, log_prob: torch.Tensor) -> dict:  # pylint: disable=no-self-use
+        """Compute the loss.
+
+        Args:
+            log_prob (Tensor): [B, T, D]
+
+        Returns:
+            Tensor: [1]
+
+        """
+        return_dict = {}
+        return_dict["loss"] = -log_prob.mean()
+        return return_dict
diff --git a/TTS/tts/models/overflow.py b/TTS/tts/models/overflow.py
index ee5ff411..c2b5b7c2 100644
--- a/TTS/tts/models/overflow.py
+++ b/TTS/tts/models/overflow.py
@@ -111,9 +111,6 @@ class Overflow(BaseTTS):
         self.register_buffer("mean", torch.tensor(0))
         self.register_buffer("std", torch.tensor(1))
 
-        # self.mean = nn.Parameter(torch.zeros(1), requires_grad=False)
-        # self.std = nn.Parameter(torch.ones(1), requires_grad=False)
-
     def update_mean_std(self, statistics_dict: Dict):
         self.mean.data = torch.tensor(statistics_dict["mean"])
         self.std.data = torch.tensor(statistics_dict["std"])
diff --git a/recipes/ljspeech/neuralhmm_tts/train_neuralhmmtts.py b/recipes/ljspeech/neuralhmm_tts/train_neuralhmmtts.py
new file mode 100644
index 00000000..28d37799
--- /dev/null
+++ b/recipes/ljspeech/neuralhmm_tts/train_neuralhmmtts.py
@@ -0,0 +1,96 @@
+import os
+
+from trainer import Trainer, TrainerArgs
+
+from TTS.config.shared_configs import BaseAudioConfig
+from TTS.tts.configs.neuralhmm_tts_config import NeuralhmmTTSConfig
+from TTS.tts.configs.shared_configs import BaseDatasetConfig
+from TTS.tts.datasets import load_tts_samples
+from TTS.tts.models.neuralhmm_tts import NeuralhmmTTS
+from TTS.tts.utils.text.tokenizer import TTSTokenizer
+from TTS.utils.audio import AudioProcessor
+
+output_path = os.path.dirname(os.path.abspath(__file__))
+
+# init configs
+dataset_config = BaseDatasetConfig(
+    formatter="ljspeech", meta_file_train="metadata.csv", path=os.path.join("data", "LJSpeech-1.1/")
+)
+
+audio_config = BaseAudioConfig(
+    sample_rate=22050,
+    do_trim_silence=True,
+    trim_db=60.0,
+    signal_norm=False,
+    mel_fmin=0.0,
+    mel_fmax=8000,
+    spec_gain=1.0,
+    log_func="np.log",
+    ref_level_db=20,
+    preemphasis=0.0,
+)
+
+config = NeuralhmmTTSConfig(  # This is the config that is saved for future use
+    run_name="neuralhmmtts_ljspeech",
+    audio=audio_config,
+    batch_size=32,
+    eval_batch_size=16,
+    num_loader_workers=4,
+    num_eval_loader_workers=4,
+    run_eval=True,
+    test_delay_epochs=-1,
+    epochs=1000,
+    text_cleaner="phoneme_cleaners",
+    use_phonemes=True,
+    phoneme_language="en-us",
+    phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
+    precompute_num_workers=8,
+    mel_statistics_parameter_path=os.path.join(output_path, "lj_parameters.pt"),
+    force_generate_statistics=False,
+    print_step=1,
+    print_eval=True,
+    mixed_precision=True,
+    output_path=output_path,
+    datasets=[dataset_config],
+)
+
+# INITIALIZE THE AUDIO PROCESSOR
+# Audio processor is used for feature extraction and audio I/O.
+# It mainly serves the dataloader and the training loggers.
+ap = AudioProcessor.init_from_config(config)
+
+# INITIALIZE THE TOKENIZER
+# Tokenizer is used to convert text to sequences of token IDs.
+# If characters are not defined in the config, default characters are passed to the config
+tokenizer, config = TTSTokenizer.init_from_config(config)
+
+# LOAD DATA SAMPLES
+# Each sample is a list of ```[text, audio_file_path, speaker_name]```
+# You can define your custom sample loader returning the list of samples.
+# Or define your custom formatter and pass it to the `load_tts_samples`.
+# Check `TTS.tts.datasets.load_tts_samples` for more details.
+train_samples, eval_samples = load_tts_samples( + dataset_config, + eval_split=True, + eval_split_max_size=config.eval_split_max_size, + eval_split_size=config.eval_split_size, +) + +# INITIALIZE THE MODEL +# Models take a config object and a speaker manager as input +# Config defines the details of the model like the number of layers, the size of the embedding, etc. +# Speaker manager is used by multi-speaker models. +model = NeuralhmmTTS(config, ap, tokenizer) + + +# init the trainer and 🚀 +trainer = Trainer( + TrainerArgs(), + config, + output_path, + model=model, + train_samples=train_samples, + eval_samples=eval_samples, + gpu=1, +) +trainer.fit() diff --git a/tests/tts_tests/test_neuralhmm_tts_train.py b/tests/tts_tests/test_neuralhmm_tts_train.py new file mode 100644 index 00000000..25d9aa81 --- /dev/null +++ b/tests/tts_tests/test_neuralhmm_tts_train.py @@ -0,0 +1,92 @@ +import glob +import json +import os +import shutil + +import torch +from trainer import get_last_checkpoint + +from tests import get_device_id, get_tests_output_path, run_cli +from TTS.tts.configs.neuralhmm_tts_config import NeuralhmmTTSConfig + +config_path = os.path.join(get_tests_output_path(), "test_model_config.json") +output_path = os.path.join(get_tests_output_path(), "train_outputs") +parameter_path = os.path.join(get_tests_output_path(), "lj_parameters.pt") + +torch.save({"mean": -5.5138, "std": 2.0636, "init_transition_prob": 0.3212}, parameter_path) + +config = NeuralhmmTTSConfig( + batch_size=3, + eval_batch_size=3, + num_loader_workers=0, + num_eval_loader_workers=0, + text_cleaner="phoneme_cleaners", + use_phonemes=True, + phoneme_language="en-us", + phoneme_cache_path=os.path.join(get_tests_output_path(), "train_outputs/phoneme_cache/"), + run_eval=True, + test_delay_epochs=-1, + mel_statistics_parameter_path=parameter_path, + epochs=1, + print_step=1, + test_sentences=[ + "Be a voice, not an echo.", + ], + print_eval=True, + max_sampling_time=50, +) +config.audio.do_trim_silence = True +config.audio.trim_db = 60 +config.save_json(config_path) + + +# train the model for one epoch when mel parameters exists +command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.test_delay_epochs 0 " +) +run_cli(command_train) + + +# train the model for one epoch when mel parameters have to be computed from the dataset +if os.path.exists(parameter_path): + os.remove(parameter_path) +command_train = ( + f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} " + f"--coqpit.output_path {output_path} " + "--coqpit.datasets.0.formatter ljspeech " + "--coqpit.datasets.0.meta_file_train metadata.csv " + "--coqpit.datasets.0.meta_file_val metadata.csv " + "--coqpit.datasets.0.path tests/data/ljspeech " + "--coqpit.test_delay_epochs 0 " +) +run_cli(command_train) + +# Find latest folder +continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) + +# Inference using TTS API +continue_config_path = os.path.join(continue_path, "config.json") +continue_restore_path, _ = get_last_checkpoint(continue_path) +out_wav_path = os.path.join(get_tests_output_path(), "output.wav") + +# Check integrity of the config +with open(continue_config_path, "r", 
encoding="utf-8") as f:
    config_loaded = json.load(f)
assert config_loaded["characters"] is not None
assert config_loaded["output_path"] in continue_path
assert config_loaded["test_delay_epochs"] == 0

# Load the model and run inference
inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}"
run_cli(inference_command)

# restore the model and continue training for one more epoch
command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} "
run_cli(command_train)
shutil.rmtree(continue_path)

From c59b3f75b8c5b5e2f5588c7cdc0bc8958928e6ac Mon Sep 17 00:00:00 2001
From: Gerard Sant Muniesa <90964413+GerrySant@users.noreply.github.com>
Date: Mon, 23 Jan 2023 11:56:30 +0100
Subject: [PATCH 14/24] Add Catalan text cleaners for Catalan support (#2295)

---
 TTS/tts/utils/text/cleaners.py | 22 +++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/TTS/tts/utils/text/cleaners.py b/TTS/tts/utils/text/cleaners.py
index f02f8fb4..a9261ecb 100644
--- a/TTS/tts/utils/text/cleaners.py
+++ b/TTS/tts/utils/text/cleaners.py
@@ -44,8 +44,25 @@ def remove_aux_symbols(text):
 
 
 def replace_symbols(text, lang="en"):
+    """Replace symbols based on the language tag.
+
+    Args:
+        text:
+            Input text.
+        lang:
+            Language identifier. ex: "en", "fr", "pt", "ca".
+
+    Returns:
+        The modified text.
+    Example:
+        Input:
+            text: "si l'avi cau, diguem-ho"
+            lang: "ca"
+        Output:
+            text: "si lavi cau, diguemho"
+    """
     text = text.replace(";", ",")
-    text = text.replace("-", " ")
+    text = text.replace("-", " ") if lang != "ca" else text.replace("-", "")
    text = text.replace(":", ",")
    if lang == "en":
        text = text.replace("&", " and ")
@@ -53,6 +70,9 @@ def replace_symbols(text, lang="en"):
         text = text.replace("&", " et ")
     elif lang == "pt":
         text = text.replace("&", " e ")
+    elif lang == "ca":
+        text = text.replace("&", " i ")
+        text = text.replace("'", "")
     return text
 
 
From 13334d507ca81d4dd02444b94349019b72e67d30 Mon Sep 17 00:00:00 2001
From: Eren Gölge
Date: Mon, 23 Jan 2023 13:45:45 +0100
Subject: [PATCH 15/24] Load model from path

---
 TTS/api.py | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/TTS/api.py b/TTS/api.py
index 99c3e522..da571414 100644
--- a/TTS/api.py
+++ b/TTS/api.py
@@ -7,7 +7,7 @@ from TTS.utils.synthesizer import Synthesizer
 class TTS:
     """TODO: Add voice conversion and Capacitron support."""
 
-    def __init__(self, model_name: str = None, progress_bar: bool = True, gpu=False):
+    def __init__(self, model_name: str = None, model_path:str = None, config_path:str=None, progress_bar: bool = True, gpu=False):
         """🐸TTS python interface that allows to load and use the released models.
 
         Example with a multi-speaker model:
@@ -20,6 +20,10 @@ class TTS:
             >>> tts = TTS(model_name="tts_models/de/thorsten/tacotron2-DDC", progress_bar=False, gpu=False)
             >>> tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path="output.wav")
 
+        Example loading a model from a path:
+            >>> tts = TTS(model_path="/path/to/checkpoint_100000.pth", config_path="/path/to/config.json", progress_bar=False, gpu=False)
+            >>> tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path="output.wav")
+
         Args:
             model_name (str, optional): Model name to load. You can list models by ```tts.models```. Defaults to None.
progress_bar (bool, optional): Whether to print a progress bar while downloading a model. Defaults to True.
             gpu (bool, optional): Enable/disable GPU. Some models might be too slow to run on CPU. Defaults to False.
         """
         self.synthesizer = None
         if model_name:
             self.load_model_by_name(model_name, gpu)
+        if model_path:
+            self.load_model_by_path(model_path, config_path, gpu)
 
     @property
     def models(self):
@@ -90,6 +96,19 @@ class TTS:
             use_cuda=gpu,
         )
 
+    def load_model_by_path(self, model_path: str, config_path: str, gpu: bool = False):
+        self.synthesizer = Synthesizer(
+            tts_checkpoint=model_path,
+            tts_config_path=config_path,
+            tts_speakers_file=None,
+            tts_languages_file=None,
+            vocoder_checkpoint=None,
+            vocoder_config=None,
+            encoder_checkpoint=None,
+            encoder_config=None,
+            use_cuda=gpu,
+        )
+
     def _check_arguments(self, speaker: str = None, language: str = None):
         if self.is_multi_speaker and speaker is None:
             raise ValueError("Model is multi-speaker but no speaker is provided.")

From cf076345e7ddb44584f15127f8d4c595a1428e04 Mon Sep 17 00:00:00 2001
From: Eren Gölge
Date: Mon, 23 Jan 2023 13:49:51 +0100
Subject: [PATCH 16/24] Make style

---
 TTS/api.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/TTS/api.py b/TTS/api.py
index da571414..ed7e6e6b 100644
--- a/TTS/api.py
+++ b/TTS/api.py
@@ -7,7 +7,14 @@ from TTS.utils.synthesizer import Synthesizer
 class TTS:
     """TODO: Add voice conversion and Capacitron support."""
 
-    def __init__(self, model_name: str = None, model_path:str = None, config_path:str=None, progress_bar: bool = True, gpu=False):
+    def __init__(
+        self,
+        model_name: str = None,
+        model_path: str = None,
+        config_path: str = None,
+        progress_bar: bool = True,
+        gpu=False,
+    ):
         """🐸TTS python interface that allows to load and use the released models.
 
         Example with a multi-speaker model:

From 994be163e1ce916b9594e5227fc6551e015b152f Mon Sep 17 00:00:00 2001
From: Martin Weinelt
Date: Sun, 29 Jan 2023 22:47:00 +0000
Subject: [PATCH 17/24] Use packaging.version for version comparisons (#2310)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Use packaging.version for version comparisons

The distutils package is deprecated¹ and relies on PEP 386² version
comparisons, which have been superseded by PEP 440³, which is implemented
through the packaging module.

With more recent distutils versions, provided through setuptools
vendoring, we are seeing the following exception during version
comparisons:

> TypeError: '<' not supported between instances of 'str' and 'int'

This is fixed by this migration.

[1] https://docs.python.org/3/library/distutils.html
[2] https://peps.python.org/pep-0386/
[3] https://peps.python.org/pep-0440/

* Improve espeak version detection robustness

On many modern systems espeak is just a symlink to espeak-ng. In that
case looking for the 3rd word in the version output will break the
version comparison, when it finds `text-to-speech:` instead of a
proper version.

This will not break during runtime, where espeak-ng would be
prioritized, but the phonemizer and tokenizer tests force the backend
to `espeak`, which exhibits this breakage.

This improves the version detection by simply looking for the version
after the "text-to-speech:" token.

* Replace distutils.copy_tree with shutil.copytree

The distutils module is deprecated and slated for removal in
Python 3.12. Its usage should be replaced, in this case by a
compatible method from shutil.
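As a minimal sketch of the difference (the version strings below are
only illustrative examples, not values observed in this repository):

    from packaging.version import Version

    # PEP 440 comparisons stay well-defined for pre-release/local tags
    # (illustrative version strings):
    assert Version("1.13.0a0+gitabc") < Version("1.13.0")

    # The distutils equivalent can hit mixed component types, e.g.
    #   LooseVersion("1.13.0a") < LooseVersion("1.13.0.1")
    # ends up comparing 'a' (str) against 1 (int) -> TypeError on Python 3.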
--- TTS/bin/resample.py | 4 ++-- TTS/tts/layers/glow_tts/glow.py | 5 ++--- TTS/tts/utils/text/phonemizers/espeak_wrapper.py | 14 +++++++++++--- pyproject.toml | 4 ++-- requirements.txt | 1 + setup.py | 5 +++-- tests/text_tests/test_phonemizer.py | 11 ++++++----- 7 files changed, 27 insertions(+), 17 deletions(-) diff --git a/TTS/bin/resample.py b/TTS/bin/resample.py index ec96dcc0..eb4ee58e 100644 --- a/TTS/bin/resample.py +++ b/TTS/bin/resample.py @@ -2,8 +2,8 @@ import argparse import glob import os from argparse import RawTextHelpFormatter -from distutils.dir_util import copy_tree from multiprocessing import Pool +from shutil import copytree import librosa import soundfile as sf @@ -19,7 +19,7 @@ def resample_file(func_args): def resample_files(input_dir, output_sr, output_dir=None, file_ext="wav", n_jobs=10): if output_dir: print("Recursively copying the input folder...") - copy_tree(input_dir, output_dir) + copytree(input_dir, output_dir) input_dir = output_dir print("Resampling the audio files...") diff --git a/TTS/tts/layers/glow_tts/glow.py b/TTS/tts/layers/glow_tts/glow.py index 3b745018..273c62a5 100644 --- a/TTS/tts/layers/glow_tts/glow.py +++ b/TTS/tts/layers/glow_tts/glow.py @@ -1,6 +1,5 @@ -from distutils.version import LooseVersion - import torch +from packaging.version import Version from torch import nn from torch.nn import functional as F @@ -91,7 +90,7 @@ class InvConvNear(nn.Module): self.no_jacobian = no_jacobian self.weight_inv = None - if LooseVersion(torch.__version__) < LooseVersion("1.9"): + if Version(torch.__version__) < Version("1.9"): w_init = torch.qr(torch.FloatTensor(self.num_splits, self.num_splits).normal_())[0] else: w_init = torch.linalg.qr(torch.FloatTensor(self.num_splits, self.num_splits).normal_(), "complete")[0] diff --git a/TTS/tts/utils/text/phonemizers/espeak_wrapper.py b/TTS/tts/utils/text/phonemizers/espeak_wrapper.py index 5c0865bc..8982a893 100644 --- a/TTS/tts/utils/text/phonemizers/espeak_wrapper.py +++ b/TTS/tts/utils/text/phonemizers/espeak_wrapper.py @@ -1,9 +1,10 @@ import logging import re import subprocess -from distutils.version import LooseVersion from typing import Dict, List +from packaging.version import Version + from TTS.tts.utils.text.phonemizers.base import BasePhonemizer from TTS.tts.utils.text.punctuation import Punctuation @@ -14,9 +15,16 @@ def is_tool(name): return which(name) is not None +# Use a regex pattern to match the espeak version, because it may be +# symlinked to espeak-ng, which moves the version bits to another spot. 
+espeak_version_pattern = re.compile(r"text-to-speech:\s(?P<version>\d+\.\d+(\.\d+)?)")
+
+
 def get_espeak_version():
     output = subprocess.getoutput("espeak --version")
-    return output.split()[2]
+    match = espeak_version_pattern.search(output)
+
+    return match.group("version")
 
 
 def get_espeakng_version():
@@ -168,7 +176,7 @@ class ESpeak(BasePhonemizer):
         else:  # split with '_'
             if self.backend == "espeak":
-                if LooseVersion(self.backend_version) >= LooseVersion("1.48.15"):
+                if Version(self.backend_version) >= Version("1.48.15"):
                     args.append("--ipa=1")
                 else:
                     args.append("--ipa=3")
diff --git a/pyproject.toml b/pyproject.toml
index b790159d..8bc91b45 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,5 +1,5 @@
 [build-system]
-requires = ["setuptools", "wheel", "cython==0.29.28", "numpy==1.21.6"]
+requires = ["setuptools", "wheel", "cython==0.29.28", "numpy==1.21.6", "packaging"]
 
 [flake8]
 max-line-length=120
@@ -30,4 +30,4 @@ exclude = '''
 [tool.isort]
 line_length = 120
 profile = "black"
-multi_line_output = 3
\ No newline at end of file
+multi_line_output = 3
diff --git a/requirements.txt b/requirements.txt
index 8464d7cb..7ee23dab 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -14,6 +14,7 @@ tqdm
 anyascii
 pyyaml
 fsspec>=2021.04.0
+packaging
 # deps for examples
 flask
 # deps for inference
diff --git a/setup.py b/setup.py
index f95d79f1..259c3cd1 100644
--- a/setup.py
+++ b/setup.py
@@ -23,7 +23,7 @@ import os
 import subprocess
 import sys
 
-from distutils.version import LooseVersion
+from packaging.version import Version
 
 import numpy
 import setuptools.command.build_py
@@ -31,7 +31,8 @@ import setuptools.command.develop
 from Cython.Build import cythonize
 from setuptools import Extension, find_packages, setup
 
-if LooseVersion(sys.version) < LooseVersion("3.7") or LooseVersion(sys.version) >= LooseVersion("3.11"):
+python_version = sys.version.split()[0]
+if Version(python_version) < Version("3.7") or Version(python_version) >= Version("3.11"):
     raise RuntimeError("TTS requires python >= 3.7 and < 3.11 " "but your Python version is {}".format(sys.version))
 
diff --git a/tests/text_tests/test_phonemizer.py b/tests/text_tests/test_phonemizer.py
index 794a8fd7..4ca62384 100644
--- a/tests/text_tests/test_phonemizer.py
+++ b/tests/text_tests/test_phonemizer.py
@@ -1,5 +1,6 @@
 import unittest
-from distutils.version import LooseVersion
+
+from packaging.version import Version
 
 from TTS.tts.utils.text.phonemizers import ESpeak, Gruut, JA_JP_Phonemizer, ZH_CN_Phonemizer
 from TTS.tts.utils.text.phonemizers.multi_phonemizer import MultiPhonemizer
@@ -40,7 +41,7 @@ class TestEspeakPhonemizer(unittest.TestCase):
     def setUp(self):
         self.phonemizer = ESpeak(language="en-us", backend="espeak")
 
-        if LooseVersion(self.phonemizer.backend_version) >= LooseVersion("1.48.15"):
+        if Version(self.phonemizer.backend_version) >= Version("1.48.15"):
             target_phonemes = EXPECTED_ESPEAK_v1_48_15_PHONEMES
         else:
             target_phonemes = EXPECTED_ESPEAK_PHONEMES
@@ -52,7 +53,7 @@
         # multiple punctuations
         text = "Be a voice, not an! echo?"
         gt = "biː ɐ vˈɔɪs, nˈɑːt ɐn! ˈɛkoʊ?"
-        if LooseVersion(self.phonemizer.backend_version) >= LooseVersion("1.48.15"):
+        if Version(self.phonemizer.backend_version) >= Version("1.48.15"):
             gt = "biː ɐ vˈɔɪs, nˈɑːt æn! ˈɛkoʊ?"
         output = self.phonemizer.phonemize(text, separator="|")
         output = output.replace("|", "")
@@ -61,7 +62,7 @@
         # not ending with punctuation
         text = "Be a voice, not an! 
echo" gt = "biː ɐ vˈɔɪs, nˈɑːt ɐn! ˈɛkoʊ" - if LooseVersion(self.phonemizer.backend_version) >= LooseVersion("1.48.15"): + if Version(self.phonemizer.backend_version) >= Version("1.48.15"): gt = "biː ɐ vˈɔɪs, nˈɑːt æn! ˈɛkoʊ" output = self.phonemizer.phonemize(text, separator="") self.assertEqual(output, gt) @@ -69,7 +70,7 @@ class TestEspeakPhonemizer(unittest.TestCase): # extra space after the sentence text = "Be a voice, not an! echo. " gt = "biː ɐ vˈɔɪs, nˈɑːt ɐn! ˈɛkoʊ." - if LooseVersion(self.phonemizer.backend_version) >= LooseVersion("1.48.15"): + if Version(self.phonemizer.backend_version) >= Version("1.48.15"): gt = "biː ɐ vˈɔɪs, nˈɑːt æn! ˈɛkoʊ." output = self.phonemizer.phonemize(text, separator="") self.assertEqual(output, gt) From 335b8ed44e7b252f7c17069bb621a1dab592008a Mon Sep 17 00:00:00 2001 From: Eren G??lge Date: Mon, 30 Jan 2023 12:59:29 +0100 Subject: [PATCH 18/24] Add vocoder path --- TTS/api.py | 36 ++++++++++++++++++++++++++++++++---- 1 file changed, 32 insertions(+), 4 deletions(-) diff --git a/TTS/api.py b/TTS/api.py index ed7e6e6b..22b81ba4 100644 --- a/TTS/api.py +++ b/TTS/api.py @@ -12,6 +12,8 @@ class TTS: model_name: str = None, model_path: str = None, config_path: str = None, + vocoder_path: str = None, + vocoder_config_path: str = None, progress_bar: bool = True, gpu=False, ): @@ -33,6 +35,10 @@ class TTS: Args: model_name (str, optional): Model name to load. You can list models by ```tts.models```. Defaults to None. + model_path (str, optional): Path to the model checkpoint. Defaults to None. + config_path (str, optional): Path to the model config. Defaults to None. + vocoder_path (str, optional): Path to the vocoder checkpoint. Defaults to None. + vocoder_config_path (str, optional): Path to the vocoder config. Defaults to None. progress_bar (bool, optional): Whether to pring a progress bar while downloading a model. Defaults to True. gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False. """ @@ -41,7 +47,9 @@ class TTS: if model_name: self.load_model_by_name(model_name, gpu) if model_path: - self.load_model_by_path(model_path, config_path, gpu) + self.load_model_by_path( + model_path, config_path, vocoder_path=vocoder_path, vocoder_config=vocoder_config_path, gpu=gpu + ) @property def models(self): @@ -89,6 +97,14 @@ class TTS: def load_model_by_name(self, model_name: str, gpu: bool = False): model_path, config_path, vocoder_path, vocoder_config_path = self.download_model_by_name(model_name) + """ Load one of 🐸TTS models by name. + + Args: + model_name (str): Model name to load. You can list models by ```tts.models```. + gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False. + + TODO: Add tests + """ # init synthesizer # None values are fetch from the model self.synthesizer = Synthesizer( @@ -103,14 +119,26 @@ class TTS: use_cuda=gpu, ) - def load_model_by_path(self, model_path: str, config_path: str, gpu: bool = False): + def load_model_by_path( + self, model_path: str, config_path: str, vocoder_path: str = None, vocoder_config: str = None, gpu: bool = False + ): + """Load a model from a path. + + Args: + model_path (str): Path to the model checkpoint. + config_path (str): Path to the model config. + vocoder_path (str, optional): Path to the vocoder checkpoint. Defaults to None. + vocoder_config (str, optional): Path to the vocoder config. Defaults to None. + gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False. 
+ """ + self.synthesizer = Synthesizer( tts_checkpoint=model_path, tts_config_path=config_path, tts_speakers_file=None, tts_languages_file=None, - vocoder_checkpoint=None, - vocoder_config=None, + vocoder_checkpoint=vocoder_path, + vocoder_config=vocoder_config, encoder_checkpoint=None, encoder_config=None, use_cuda=gpu, From 7fddabc8ac9c84a9d05ba7928f454e45558c9422 Mon Sep 17 00:00:00 2001 From: Eren G??lge Date: Mon, 30 Jan 2023 13:35:48 +0100 Subject: [PATCH 19/24] Implement cloning in API --- TTS/api.py | 33 +++++++++++++++++++----- TTS/utils/synthesizer.py | 2 +- tests/inference_tests/test_python_api.py | 9 ++++++- 3 files changed, 35 insertions(+), 9 deletions(-) diff --git a/TTS/api.py b/TTS/api.py index 22b81ba4..6fa8c606 100644 --- a/TTS/api.py +++ b/TTS/api.py @@ -33,6 +33,12 @@ class TTS: >>> tts = TTS(model_path="/path/to/checkpoint_100000.pth", config_path="/path/to/config.json", progress_bar=False, gpu=False) >>> tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path="output.wav") + Example voice cloning with YourTTS in English, French and Portuguese: + >>> tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False, gpu=True) + >>> tts.tts_to_file("This is voice cloning.", speaker_wav="my/cloning/audio.wav", language="en", file_path="thisisit.wav") + >>> tts.tts_to_file("C'est le clonage de la voix.", speaker_wav="my/cloning/audio.wav", language="fr", file_path="thisisit.wav") + >>> tts.tts_to_file("Isso é clonagem de voz.", speaker_wav="my/cloning/audio.wav", language="pt", file_path="thisisit.wav") + Args: model_name (str, optional): Model name to load. You can list models by ```tts.models```. Defaults to None. model_path (str, optional): Path to the model checkpoint. Defaults to None. @@ -144,8 +150,8 @@ class TTS: use_cuda=gpu, ) - def _check_arguments(self, speaker: str = None, language: str = None): - if self.is_multi_speaker and speaker is None: + def _check_arguments(self, speaker: str = None, language: str = None, speaker_wav: str = None): + if self.is_multi_speaker and (speaker is None and speaker_wav is None): raise ValueError("Model is multi-speaker but no speaker is provided.") if self.is_multi_lingual and language is None: raise ValueError("Model is multi-lingual but no language is provided.") @@ -154,7 +160,7 @@ class TTS: if not self.is_multi_lingual and language is not None: raise ValueError("Model is not multi-lingual but language is provided.") - def tts(self, text: str, speaker: str = None, language: str = None): + def tts(self, text: str, speaker: str = None, language: str = None, speaker_wav: str = None): """Convert text to speech. Args: @@ -166,14 +172,17 @@ class TTS: language (str, optional): Language code for multi-lingual models. You can check whether loaded model is multi-lingual `tts.is_multi_lingual` and list available languages by `tts.languages`. Defaults to None. + speaker_wav (str, optional): + Path to a reference wav file to use for voice cloning with supporting models like YourTTS. + Defaults to None. 
""" - self._check_arguments(speaker=speaker, language=language) + self._check_arguments(speaker=speaker, language=language, speaker_wav=speaker_wav) wav = self.synthesizer.tts( text=text, speaker_name=speaker, language_name=language, - speaker_wav=None, + speaker_wav=speaker_wav, reference_wav=None, style_wav=None, style_text=None, @@ -181,7 +190,14 @@ class TTS: ) return wav - def tts_to_file(self, text: str, speaker: str = None, language: str = None, file_path: str = "output.wav"): + def tts_to_file( + self, + text: str, + speaker: str = None, + language: str = None, + speaker_wav: str = None, + file_path: str = "output.wav", + ): """Convert text to speech. Args: @@ -193,8 +209,11 @@ class TTS: language (str, optional): Language code for multi-lingual models. You can check whether loaded model is multi-lingual `tts.is_multi_lingual` and list available languages by `tts.languages`. Defaults to None. + speaker_wav (str, optional): + Path to a reference wav file to use for voice cloning with supporting models like YourTTS. + Defaults to None. file_path (str, optional): Output file path. Defaults to "output.wav". """ - wav = self.tts(text=text, speaker=speaker, language=language) + wav = self.tts(text=text, speaker=speaker, language=language, speaker_wav=speaker_wav) self.synthesizer.save_wav(wav=wav, path=file_path) diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index 4a0ab038..498dc7ba 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -187,7 +187,7 @@ class Synthesizer(object): text (str): input text. speaker_name (str, optional): spekaer id for multi-speaker models. Defaults to "". language_name (str, optional): language id for multi-language models. Defaults to "". - speaker_wav (Union[str, List[str]], optional): path to the speaker wav. Defaults to None. + speaker_wav (Union[str, List[str]], optional): path to the speaker wav for voice cloning. Defaults to None. style_wav ([type], optional): style waveform for GST. Defaults to None. style_text ([type], optional): transcription of style_wav for Capacitron. Defaults to None. reference_wav ([type], optional): reference waveform for voice conversion. Defaults to None. 
diff --git a/tests/inference_tests/test_python_api.py b/tests/inference_tests/test_python_api.py index fdd7e1cb..b306b5ea 100644 --- a/tests/inference_tests/test_python_api.py +++ b/tests/inference_tests/test_python_api.py @@ -1,10 +1,12 @@ import os import unittest -from tests import get_tests_output_path +from tests import get_tests_data_path, get_tests_output_path + from TTS.api import TTS OUTPUT_PATH = os.path.join(get_tests_output_path(), "test_python_api.wav") +cloning_test_wav_path = os.path.join(get_tests_data_path(), "ljspeech/wavs/LJ001-0028.wav") class TTSTest(unittest.TestCase): @@ -34,3 +36,8 @@ class TTSTest(unittest.TestCase): self.assertTrue(tts.is_multi_lingual) self.assertGreater(len(tts.speakers), 1) self.assertGreater(len(tts.languages), 1) + + def test_voice_cloning(): + tts = TTS() + tts.load_model_by_name("tts_models/multilingual/multi-dataset/your_tts") + tts.tts_to_file("Hello world!", speaker_wav=cloning_test_wav_path, language="en", file_path=OUTPUT_PATH) From 6ee94f8badb32bfbc0ed61c000fd976899ecb5d0 Mon Sep 17 00:00:00 2001 From: Eren G??lge Date: Mon, 30 Jan 2023 14:02:25 +0100 Subject: [PATCH 20/24] Fixup --- TTS/api.py | 4 +++- tests/inference_tests/test_python_api.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/TTS/api.py b/TTS/api.py index 6fa8c606..850f0681 100644 --- a/TTS/api.py +++ b/TTS/api.py @@ -102,7 +102,6 @@ class TTS: return model_path, config_path, vocoder_path, vocoder_config_path def load_model_by_name(self, model_name: str, gpu: bool = False): - model_path, config_path, vocoder_path, vocoder_config_path = self.download_model_by_name(model_name) """ Load one of 🐸TTS models by name. Args: @@ -111,6 +110,9 @@ class TTS: TODO: Add tests """ + + model_path, config_path, vocoder_path, vocoder_config_path = self.download_model_by_name(model_name) + # init synthesizer # None values are fetch from the model self.synthesizer = Synthesizer( diff --git a/tests/inference_tests/test_python_api.py b/tests/inference_tests/test_python_api.py index b306b5ea..6114c803 100644 --- a/tests/inference_tests/test_python_api.py +++ b/tests/inference_tests/test_python_api.py @@ -37,7 +37,7 @@ class TTSTest(unittest.TestCase): self.assertGreater(len(tts.speakers), 1) self.assertGreater(len(tts.languages), 1) - def test_voice_cloning(): + def test_voice_cloning(self): tts = TTS() tts.load_model_by_name("tts_models/multilingual/multi-dataset/your_tts") tts.tts_to_file("Hello world!", speaker_wav=cloning_test_wav_path, language="en", file_path=OUTPUT_PATH) From 1f4d8bf0f108479120db2f7f2b850850122e4021 Mon Sep 17 00:00:00 2001 From: marius851000 Date: Mon, 6 Feb 2023 10:54:34 +0100 Subject: [PATCH 21/24] Fix tts-server for multi-lingual models (#2257) --- TTS/server/server.py | 13 +++++++++++-- TTS/server/templates/index.html | 19 +++++++++++++++---- TTS/utils/synthesizer.py | 2 +- 3 files changed, 27 insertions(+), 7 deletions(-) diff --git a/TTS/server/server.py b/TTS/server/server.py index 345e4d50..c276a142 100644 --- a/TTS/server/server.py +++ b/TTS/server/server.py @@ -115,8 +115,13 @@ synthesizer = Synthesizer( use_multi_speaker = hasattr(synthesizer.tts_model, "num_speakers") and ( synthesizer.tts_model.num_speakers > 1 or synthesizer.tts_speakers_file is not None ) - speaker_manager = getattr(synthesizer.tts_model, "speaker_manager", None) + +use_multi_language = hasattr(synthesizer.tts_model, "num_languages") and ( + synthesizer.tts_model.num_languages > 1 or synthesizer.tts_languages_file is not None +) +language_manager = 
getattr(synthesizer.tts_model, "language_manager", None) + # TODO: set this from SpeakerManager use_gst = synthesizer.tts_config.get("use_gst", False) app = Flask(__name__) @@ -147,7 +152,9 @@ def index(): "index.html", show_details=args.show_details, use_multi_speaker=use_multi_speaker, + use_multi_language=use_multi_language, speaker_ids=speaker_manager.name_to_id if speaker_manager is not None else None, + language_ids=language_manager.name_to_id if language_manager is not None else None, use_gst=use_gst, ) @@ -177,11 +184,13 @@ def tts(): with lock: text = request.args.get("text") speaker_idx = request.args.get("speaker_id", "") + language_idx = request.args.get("language_id", "") style_wav = request.args.get("style_wav", "") style_wav = style_wav_uri_to_dict(style_wav) print(" > Model input: {}".format(text)) print(" > Speaker Idx: {}".format(speaker_idx)) - wavs = synthesizer.tts(text, speaker_name=speaker_idx, style_wav=style_wav) + print(" > Language Idx: {}".format(language_idx)) + wavs = synthesizer.tts(text, speaker_name=speaker_idx, language_name=language_idx, style_wav=style_wav) out = io.BytesIO() synthesizer.save_wav(wavs, out) return send_file(out, mimetype="audio/wav") diff --git a/TTS/server/templates/index.html b/TTS/server/templates/index.html index b0eab291..6354d391 100644 --- a/TTS/server/templates/index.html +++ b/TTS/server/templates/index.html @@ -65,7 +65,7 @@ {%if use_gst%} - {%endif%} @@ -81,6 +81,16 @@
         </select><br /><br />
         {%endif%}
 
+        {%if use_multi_language%}
+        Choose a language:
+        <select id="language_id" name=language_id>
+            <option value="" SELECTED>default</option>
+            {% for language_id in language_ids %}
+            <option value="{{language_id}}">{{language_id}}</option>
+            {% endfor %}
+        </select><br /><br />
+        {%endif%}
+
         {%if show_details%}
@@ -106,11 +116,12 @@ const text = q('#text').value const speaker_id = getTextValue('#speaker_id') const style_wav = getTextValue('#style_wav') + const language_id = getTextValue('#language_id') if (text) { q('#message').textContent = 'Synthesizing...' q('#speak-button').disabled = true q('#audio').hidden = true - synthesize(text, speaker_id, style_wav) + synthesize(text, speaker_id, style_wav, language_id) } e.preventDefault() return false @@ -121,8 +132,8 @@ do_tts(e) } }) - function synthesize(text, speaker_id = "", style_wav = "") { - fetch(`/api/tts?text=${encodeURIComponent(text)}&speaker_id=${encodeURIComponent(speaker_id)}&style_wav=${encodeURIComponent(style_wav)}`, { cache: 'no-cache' }) + function synthesize(text, speaker_id = "", style_wav = "", language_id = "") { + fetch(`/api/tts?text=${encodeURIComponent(text)}&speaker_id=${encodeURIComponent(speaker_id)}&style_wav=${encodeURIComponent(style_wav)}&language_id=${encodeURIComponent(language_id)}`, { cache: 'no-cache' }) .then(function (res) { if (!res.ok) throw Error(res.statusText) return res.blob() diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index 4a0ab038..2cef8d70 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -242,7 +242,7 @@ class Synthesizer(object): "Define path for speaker.json if it is a multi-speaker model or remove defined speaker idx. " ) - # handle multi-lingaul + # handle multi-lingual language_id = None if self.tts_languages_file or ( hasattr(self.tts_model, "language_manager") and self.tts_model.language_manager is not None From baed2a2c2b4bb006da571ea0b0e6e6ad48d7d8d7 Mon Sep 17 00:00:00 2001 From: Eren G??lge Date: Mon, 6 Feb 2023 11:15:43 +0100 Subject: [PATCH 22/24] Update README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 5a28bc8d..bbcd170e 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,6 @@ ---- ### 📣 Clone your voice with a single click on [🐸Coqui.ai](https://app.coqui.ai/auth/signin) -### 📣 🐸Coqui Studio is launching soon!! Join our [waiting list](https://coqui.ai/)!! 
---- @@ -92,6 +91,7 @@ Underlined "TTS*" and "Judy*" are 🐸TTS models - Align-TTS: [paper](https://arxiv.org/abs/2003.01950) - FastPitch: [paper](https://arxiv.org/pdf/2006.06873.pdf) - FastSpeech: [paper](https://arxiv.org/abs/1905.09263) +- FastSpeech2: [paper](https://arxiv.org/abs/2006.04558) - SC-GlowTTS: [paper](https://arxiv.org/abs/2104.05557) - Capacitron: [paper](https://arxiv.org/abs/1906.03402) - OverFlow: [paper](https://arxiv.org/abs/2211.06892) From c496b1a986c9a507fbe90b01f02f870baa70cfa5 Mon Sep 17 00:00:00 2001 From: Eren G??lge Date: Mon, 6 Feb 2023 11:17:28 +0100 Subject: [PATCH 23/24] Linter fix --- tests/inference_tests/test_python_api.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/inference_tests/test_python_api.py b/tests/inference_tests/test_python_api.py index 6114c803..71690440 100644 --- a/tests/inference_tests/test_python_api.py +++ b/tests/inference_tests/test_python_api.py @@ -37,6 +37,7 @@ class TTSTest(unittest.TestCase): self.assertGreater(len(tts.speakers), 1) self.assertGreater(len(tts.languages), 1) + @staticmethod def test_voice_cloning(self): tts = TTS() tts.load_model_by_name("tts_models/multilingual/multi-dataset/your_tts") From 4e75b6262c212b59d6402b6f068b257d6f7f1cec Mon Sep 17 00:00:00 2001 From: Eren G??lge Date: Mon, 6 Feb 2023 11:20:32 +0100 Subject: [PATCH 24/24] Update docs --- README.md | 6 ++++++ docs/source/inference.md | 9 +++++++++ 2 files changed, 15 insertions(+) diff --git a/README.md b/README.md index bbcd170e..7490af2a 100644 --- a/README.md +++ b/README.md @@ -191,6 +191,12 @@ tts.tts_to_file(text="Hello world!", speaker=tts.speakers[0], language=tts.langu tts = TTS(model_name="tts_models/de/thorsten/tacotron2-DDC", progress_bar=False, gpu=False) # Run TTS tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path=OUTPUT_PATH) + +# Example voice cloning with YourTTS in English, French and Portuguese: +tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False, gpu=True) +tts.tts_to_file("This is voice cloning.", speaker_wav="my/cloning/audio.wav", language="en", file_path="output.wav") +tts.tts_to_file("C'est le clonage de la voix.", speaker_wav="my/cloning/audio.wav", language="fr", file_path="output.wav") +tts.tts_to_file("Isso é clonagem de voz.", speaker_wav="my/cloning/audio.wav", language="pt", file_path="output.wav") ``` ### Command line `tts` diff --git a/docs/source/inference.md b/docs/source/inference.md index d7d63a69..60f787d3 100644 --- a/docs/source/inference.md +++ b/docs/source/inference.md @@ -126,4 +126,13 @@ Here is an example for a single speaker model. tts = TTS(model_name="tts_models/de/thorsten/tacotron2-DDC", progress_bar=False, gpu=False) # Run TTS tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path=OUTPUT_PATH) +``` + +Example voice cloning with YourTTS in English, French and Portuguese: + +```python +tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False, gpu=True) +tts.tts_to_file("This is voice cloning.", speaker_wav="my/cloning/audio.wav", language="en", file_path="output.wav") +tts.tts_to_file("C'est le clonage de la voix.", speaker_wav="my/cloning/audio.wav", language="fr", file_path="output.wav") +tts.tts_to_file("Isso é clonagem de voz.", speaker_wav="my/cloning/audio.wav", language="pt", file_path="output.wav") ``` \ No newline at end of file
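
To exercise the multi-lingual server fix from [PATCH 21/24] end to end, a
client along these lines should work once `tts-server` is running with a
multi-lingual model (the host, port, and the speaker/language IDs below
are assumptions, not taken from this series):

```python
import urllib.parse
import urllib.request

# Query parameters mirror those read in TTS/server/server.py above.
params = urllib.parse.urlencode(
    {
        "text": "C'est le clonage de la voix.",
        "speaker_id": "female-en-5",  # assumed speaker name
        "language_id": "fr-fr",  # assumed language id
        "style_wav": "",
    }
)
url = "http://localhost:5002/api/tts?" + params  # assumed default host/port
with urllib.request.urlopen(url) as response, open("output.wav", "wb") as f:
    f.write(response.read())
```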