Merge pull request #717 from coqui-ai/dev

v0.2.0
Commit 01a2b0b5c0 by Eren Gölge, 2021-08-11 10:08:04 +02:00 (committed via GitHub)
94 changed files with 3610 additions and 657 deletions

View File

@ -1,10 +1,15 @@
--- # Pull request guidelines
name: 'Contribution Guideline '
about: Refer to Contirbution Guideline
title: ''
labels: ''
assignees: ''
--- Welcome to the 🐸TTS project! We are excited to see your interest, and appreciate your support!
👐 Please check our [CONTRIBUTION GUIDELINE](https://github.com/coqui-ai/TTS#contribution-guidelines). This repository is governed by the Contributor Covenant Code of Conduct. For more details, see the [CODE_OF_CONDUCT.md](CODE_OF_CONDUCT.md) file.
In order to make a good pull request, please see our [CONTRIBUTING.md](CONTRIBUTING.md) file.
Before accepting your pull request, you will be asked to sign a [Contributor License Agreement](https://cla-assistant.io/coqui-ai/TTS).
This [Contributor License Agreement](https://cla-assistant.io/coqui-ai/TTS):
- Protects you, Coqui, and the users of the code.
- Does not change your rights to use your contributions for any purpose.
- Does not change the license of the 🐸TTS project. It just makes the terms of your contribution clearer and lets us know you are OK to contribute.

.gitignore
View File

@ -136,6 +136,7 @@ TTS/tts/layers/glow_tts/monotonic_align/core.c
temp_build/* temp_build/*
recipes/WIP/* recipes/WIP/*
recipes/ljspeech/LJSpeech-1.1/* recipes/ljspeech/LJSpeech-1.1/*
recipes/ljspeech/tacotron2-DDC/LJSpeech-1.1/*
events.out* events.out*
old_configs/* old_configs/*
model_importers/* model_importers/*
@ -152,4 +153,6 @@ output.wav
tts_output.wav tts_output.wav
deps.json deps.json
speakers.json speakers.json
internal/* internal/*
*_pitch.npy
*_phoneme.npy

View File

@ -4,7 +4,7 @@
help: help:
@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}' @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}'
target_dirs := tests TTS notebooks target_dirs := tests TTS notebooks recipes
test_all: ## run tests and don't stop on an error. test_all: ## run tests and don't stop on an error.
nosetests --with-cov -cov --cover-erase --cover-package TTS tests --nologcapture --with-id nosetests --with-cov -cov --cover-erase --cover-package TTS tests --nologcapture --with-id

View File

@ -73,10 +73,13 @@ Underlined "TTS*" and "Judy*" are 🐸TTS models
- Speedy-Speech: [paper](https://arxiv.org/abs/2008.03802) - Speedy-Speech: [paper](https://arxiv.org/abs/2008.03802)
- Align-TTS: [paper](https://arxiv.org/abs/2003.01950) - Align-TTS: [paper](https://arxiv.org/abs/2003.01950)
### End-to-End Models
- VITS: [paper](https://arxiv.org/pdf/2106.06103)
### Attention Methods ### Attention Methods
- Guided Attention: [paper](https://arxiv.org/abs/1710.08969) - Guided Attention: [paper](https://arxiv.org/abs/1710.08969)
- Forward Backward Decoding: [paper](https://arxiv.org/abs/1907.09006) - Forward Backward Decoding: [paper](https://arxiv.org/abs/1907.09006)
- Graves Attention: [paper](https://arxiv.org/abs/1907.09006) - Graves Attention: [paper](https://arxiv.org/abs/1910.10288)
- Double Decoder Consistency: [blog](https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency/) - Double Decoder Consistency: [blog](https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency/)
- Dynamic Convolutional Attention: [paper](https://arxiv.org/pdf/1910.10288.pdf) - Dynamic Convolutional Attention: [paper](https://arxiv.org/pdf/1910.10288.pdf)
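
The newly listed VITS entry is end-to-end, so no separate vocoder is needed at synthesis time. A hedged sketch of using it through the Python API: the positional arguments mirror the `Synthesizer(...)` call in the server.py hunk later in this commit, and the paths are placeholders rather than real release files.

```python
from TTS.utils.synthesizer import Synthesizer

# Placeholder paths; in practice they come from ModelManager.download_model()
# for "tts_models/en/ljspeech/vits", one of the entries added in this release.
model_path = "path/to/tts_models--en--ljspeech--vits/model_file.pth.tar"
config_path = "path/to/tts_models--en--ljspeech--vits/config.json"

# Speakers file, vocoder checkpoint and vocoder config are all None for VITS.
synthesizer = Synthesizer(model_path, config_path, None, None, None)
wav = synthesizer.tts("Hello from Coqui TTS.")
synthesizer.save_wav(wav, "vits_sample.wav")
```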

View File

@ -1,7 +1,7 @@
{ {
"tts_models":{ "tts_models": {
"en":{ "en": {
"ek1":{ "ek1": {
"tacotron2": { "tacotron2": {
"description": "EK1 en-rp tacotron2 by NMStoker", "description": "EK1 en-rp tacotron2 by NMStoker",
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.1.0/tts_models--en--ek1--tacotron2.zip", "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.1.0/tts_models--en--ek1--tacotron2.zip",
@ -9,7 +9,7 @@
"commit": "c802255" "commit": "c802255"
} }
}, },
"ljspeech":{ "ljspeech": {
"tacotron2-DDC": { "tacotron2-DDC": {
"description": "Tacotron2 with Double Decoder Consistency.", "description": "Tacotron2 with Double Decoder Consistency.",
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.0.12/tts_models--en--ljspeech--tacotron2-DDC.zip", "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.0.12/tts_models--en--ljspeech--tacotron2-DDC.zip",
@ -17,9 +17,18 @@
"commit": "bae2ad0f", "commit": "bae2ad0f",
"author": "Eren Gölge @erogol", "author": "Eren Gölge @erogol",
"license": "", "license": "",
"contact":"egolge@coqui.com" "contact": "egolge@coqui.com"
}, },
"glow-tts":{ "tacotron2-DDC_ph": {
"description": "Tacotron2 with Double Decoder Consistency with phonemes.",
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.2.0/tts_models--en--ljspeech--tacotronDDC_ph.zip",
"default_vocoder": "vocoder_models/en/ljspeech/univnet",
"commit": "3900448",
"author": "Eren Gölge @erogol",
"license": "",
"contact": "egolge@coqui.com"
},
"glow-tts": {
"description": "", "description": "",
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.0.9/tts_models--en--ljspeech--glow-tts.zip", "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.0.9/tts_models--en--ljspeech--glow-tts.zip",
"stats_file": null, "stats_file": null,
@ -27,7 +36,7 @@
"commit": "", "commit": "",
"author": "Eren Gölge @erogol", "author": "Eren Gölge @erogol",
"license": "MPL", "license": "MPL",
"contact":"egolge@coqui.com" "contact": "egolge@coqui.com"
}, },
"tacotron2-DCA": { "tacotron2-DCA": {
"description": "", "description": "",
@ -36,19 +45,28 @@
"commit": "", "commit": "",
"author": "Eren Gölge @erogol", "author": "Eren Gölge @erogol",
"license": "MPL", "license": "MPL",
"contact":"egolge@coqui.com" "contact": "egolge@coqui.com"
}, },
"speedy-speech-wn":{ "speedy-speech-wn": {
"description": "Speedy Speech model with wavenet decoder.", "description": "Speedy Speech model with wavenet decoder.",
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.1.0/tts_models--en--ljspeech--speedy-speech-wn.zip", "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.1.0/tts_models--en--ljspeech--speedy-speech-wn.zip",
"default_vocoder": "vocoder_models/en/ljspeech/multiband-melgan", "default_vocoder": "vocoder_models/en/ljspeech/multiband-melgan",
"commit": "77b6145", "commit": "77b6145",
"author": "Eren Gölge @erogol", "author": "Eren Gölge @erogol",
"license": "MPL", "license": "MPL",
"contact":"egolge@coqui.com" "contact": "egolge@coqui.com"
},
"vits": {
"description": "VITS is an End2End TTS model trained on LJSpeech dataset with phonemes.",
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.2.0/tts_models--en--ljspeech--vits.zip",
"default_vocoder": null,
"commit": "3900448",
"author": "Eren Gölge @erogol",
"license": "",
"contact": "egolge@coqui.com"
} }
}, },
"vctk":{ "vctk": {
"sc-glow-tts": { "sc-glow-tts": {
"description": "Multi-Speaker Transformers based SC-Glow model from https://arxiv.org/abs/2104.05557.", "description": "Multi-Speaker Transformers based SC-Glow model from https://arxiv.org/abs/2104.05557.",
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.1.0/tts_models--en--vctk--sc-glow-tts.zip", "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.1.0/tts_models--en--vctk--sc-glow-tts.zip",
@ -56,12 +74,19 @@
"commit": "b531fa69", "commit": "b531fa69",
"author": "Edresson Casanova", "author": "Edresson Casanova",
"license": "", "license": "",
"contact":"" "contact": ""
},
"vits": {
"description": "VITS End2End TTS model trained on VCTK dataset with 109 different speakers with EN accent.",
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.2.0/tts_models--en--vctk--vits.zip",
"default_vocoder": null,
"commit": "3900448",
"author": "Eren @erogol",
"license": "",
"contact": "egolge@coqui.ai"
} }
}, },
"sam":{ "sam": {
"tacotron-DDC": { "tacotron-DDC": {
"description": "Tacotron2 with Double Decoder Consistency trained with Aceenture's Sam dataset.", "description": "Tacotron2 with Double Decoder Consistency trained with Aceenture's Sam dataset.",
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.0.13/tts_models--en--sam--tacotron_DDC.zip", "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.0.13/tts_models--en--sam--tacotron_DDC.zip",
@ -69,46 +94,47 @@
"commit": "bae2ad0f", "commit": "bae2ad0f",
"author": "Eren Gölge @erogol", "author": "Eren Gölge @erogol",
"license": "", "license": "",
"contact":"egolge@coqui.com" "contact": "egolge@coqui.com"
} }
} }
}, },
"es":{ "es": {
"mai":{ "mai": {
"tacotron2-DDC":{ "tacotron2-DDC": {
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.0.9/tts_models--es--mai--tacotron2-DDC.zip", "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.0.9/tts_models--es--mai--tacotron2-DDC.zip",
"default_vocoder": "vocoder_models/universal/libri-tts/fullband-melgan", "default_vocoder": "vocoder_models/universal/libri-tts/fullband-melgan",
"commit": "", "commit": "",
"author": "Eren Gölge @erogol", "author": "Eren Gölge @erogol",
"license": "MPL", "license": "MPL",
"contact":"egolge@coqui.com" "contact": "egolge@coqui.com"
} }
} }
}, },
"fr":{ "fr": {
"mai":{ "mai": {
"tacotron2-DDC":{ "tacotron2-DDC": {
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.0.9/tts_models--fr--mai--tacotron2-DDC.zip", "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.0.9/tts_models--fr--mai--tacotron2-DDC.zip",
"default_vocoder": "vocoder_models/universal/libri-tts/fullband-melgan", "default_vocoder": "vocoder_models/universal/libri-tts/fullband-melgan",
"commit": "", "commit": "",
"author": "Eren Gölge @erogol", "author": "Eren Gölge @erogol",
"license": "MPL", "license": "MPL",
"contact":"egolge@coqui.com" "contact": "egolge@coqui.com"
} }
} }
}, },
"zh-CN":{ "zh-CN": {
"baker":{ "baker": {
"tacotron2-DDC-GST":{ "tacotron2-DDC-GST": {
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.0.10/tts_models--zh-CN--baker--tacotron2-DDC-GST.zip", "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.0.10/tts_models--zh-CN--baker--tacotron2-DDC-GST.zip",
"commit": "unknown", "commit": "unknown",
"author": "@kirianguiller" "author": "@kirianguiller",
"default_vocoder": null
} }
} }
}, },
"nl":{ "nl": {
"mai":{ "mai": {
"tacotron2-DDC":{ "tacotron2-DDC": {
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.0.10/tts_models--nl--mai--tacotron2-DDC.zip", "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.0.10/tts_models--nl--mai--tacotron2-DDC.zip",
"author": "@r-dh", "author": "@r-dh",
"default_vocoder": "vocoder_models/nl/mai/parallel-wavegan", "default_vocoder": "vocoder_models/nl/mai/parallel-wavegan",
@ -117,20 +143,9 @@
} }
} }
}, },
"ru":{ "de": {
"ruslan":{ "thorsten": {
"tacotron2-DDC":{ "tacotron2-DCA": {
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.0.10/tts_models--ru--ruslan--tacotron2-DDC.zip",
"author": "@erogol",
"default_vocoder": "vocoder_models/universal/libri-tts/fullband-melgan",
"license":"",
"contact": "egolge@coqui.com"
}
}
},
"de":{
"thorsten":{
"tacotron2-DCA":{
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.0.11/tts_models--de--thorsten--tacotron2-DCA.zip", "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.0.11/tts_models--de--thorsten--tacotron2-DCA.zip",
"default_vocoder": "vocoder_models/de/thorsten/fullband-melgan", "default_vocoder": "vocoder_models/de/thorsten/fullband-melgan",
"author": "@thorstenMueller", "author": "@thorstenMueller",
@ -138,9 +153,9 @@
} }
} }
}, },
"ja":{ "ja": {
"kokoro":{ "kokoro": {
"tacotron2-DDC":{ "tacotron2-DDC": {
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.0.15/tts_models--jp--kokoro--tacotron2-DDC.zip", "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.0.15/tts_models--jp--kokoro--tacotron2-DDC.zip",
"default_vocoder": "vocoder_models/universal/libri-tts/wavegrad", "default_vocoder": "vocoder_models/universal/libri-tts/wavegrad",
"description": "Tacotron2 with Double Decoder Consistency trained with Kokoro Speech Dataset.", "description": "Tacotron2 with Double Decoder Consistency trained with Kokoro Speech Dataset.",
@ -150,54 +165,62 @@
} }
} }
}, },
"vocoder_models":{ "vocoder_models": {
"universal":{ "universal": {
"libri-tts":{ "libri-tts": {
"wavegrad":{ "wavegrad": {
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.0.9/vocoder_models--universal--libri-tts--wavegrad.zip", "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.0.9/vocoder_models--universal--libri-tts--wavegrad.zip",
"commit": "ea976b0", "commit": "ea976b0",
"author": "Eren Gölge @erogol", "author": "Eren Gölge @erogol",
"license": "MPL", "license": "MPL",
"contact":"egolge@coqui.com" "contact": "egolge@coqui.com"
}, },
"fullband-melgan":{ "fullband-melgan": {
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.0.9/vocoder_models--universal--libri-tts--fullband-melgan.zip", "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.0.9/vocoder_models--universal--libri-tts--fullband-melgan.zip",
"commit": "4132240", "commit": "4132240",
"author": "Eren Gölge @erogol", "author": "Eren Gölge @erogol",
"license": "MPL", "license": "MPL",
"contact":"egolge@coqui.com" "contact": "egolge@coqui.com"
} }
} }
}, },
"en": { "en": {
"ek1":{ "ek1": {
"wavegrad": { "wavegrad": {
"description": "EK1 en-rp wavegrad by NMStoker", "description": "EK1 en-rp wavegrad by NMStoker",
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.0.10/vocoder_models--en--ek1--wavegrad.zip", "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.0.10/vocoder_models--en--ek1--wavegrad.zip",
"commit": "c802255" "commit": "c802255"
} }
}, },
"ljspeech":{ "ljspeech": {
"multiband-melgan":{ "multiband-melgan": {
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.0.9/vocoder_models--en--ljspeech--mulitband-melgan.zip", "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.0.9/vocoder_models--en--ljspeech--mulitband-melgan.zip",
"commit": "ea976b0", "commit": "ea976b0",
"author": "Eren Gölge @erogol", "author": "Eren Gölge @erogol",
"license": "MPL", "license": "MPL",
"contact":"egolge@coqui.com" "contact": "egolge@coqui.com"
}, },
"hifigan_v2":{ "hifigan_v2": {
"description": "HiFiGAN_v2 LJSpeech vocoder from https://arxiv.org/abs/2010.05646.", "description": "HiFiGAN_v2 LJSpeech vocoder from https://arxiv.org/abs/2010.05646.",
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.0.12/vocoder_model--en--ljspeech-hifigan_v2.zip", "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.0.12/vocoder_model--en--ljspeech-hifigan_v2.zip",
"commit": "bae2ad0f", "commit": "bae2ad0f",
"author": "@erogol", "author": "@erogol",
"license": "", "license": "",
"contact": "egolge@coqui.ai" "contact": "egolge@coqui.ai"
},
"univnet": {
"description": "UnivNet model trained on LJSpeech to complement the TacotronDDC_ph model.",
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.2.0/vocoder_models--en--ljspeech--univnet.zip",
"commit": "3900448",
"author": "Eren @erogol",
"license": "",
"contact": "egolge@coqui.ai"
} }
}, },
"vctk":{ "vctk": {
"hifigan_v2":{ "hifigan_v2": {
"description": "Finetuned and intended to be used with tts_models/en/vctk/sc-glow-tts", "description": "Finetuned and intended to be used with tts_models/en/vctk/sc-glow-tts",
"github_rls_url":"https://github.com/coqui-ai/TTS/releases/download/v0.0.12/vocoder_model--en--vctk--hifigan_v2.zip", "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.0.12/vocoder_model--en--vctk--hifigan_v2.zip",
"commit": "2f07160", "commit": "2f07160",
"author": "Edresson Casanova", "author": "Edresson Casanova",
"license": "", "license": "",
@ -205,9 +228,9 @@
} }
}, },
"sam": { "sam": {
"hifigan_v2":{ "hifigan_v2": {
"description": "Finetuned and intended to be used with tts_models/en/sam/tacotron_DDC", "description": "Finetuned and intended to be used with tts_models/en/sam/tacotron_DDC",
"github_rls_url":"https://github.com/coqui-ai/TTS/releases/download/v0.0.13/vocoder_models--en--sam--hifigan_v2.zip", "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.0.13/vocoder_models--en--sam--hifigan_v2.zip",
"commit": "2f07160", "commit": "2f07160",
"author": "Eren Gölge @erogol", "author": "Eren Gölge @erogol",
"license": "", "license": "",
@ -215,28 +238,38 @@
} }
} }
}, },
"nl":{ "nl": {
"mai":{ "mai": {
"parallel-wavegan":{ "parallel-wavegan": {
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.0.10/vocoder_models--nl--mai--parallel-wavegan.zip", "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.0.10/vocoder_models--nl--mai--parallel-wavegan.zip",
"author": "@r-dh", "author": "@r-dh",
"commit": "unknown" "commit": "unknown"
} }
} }
}, },
"de":{ "de": {
"thorsten":{ "thorsten": {
"wavegrad":{ "wavegrad": {
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.0.11/vocoder_models--de--thorsten--wavegrad.zip", "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.0.11/vocoder_models--de--thorsten--wavegrad.zip",
"author": "@thorstenMueller", "author": "@thorstenMueller",
"commit": "unknown" "commit": "unknown"
}, },
"fullband-melgan":{ "fullband-melgan": {
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.1.3/vocoder_models--de--thorsten--fullband-melgan.zip", "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.1.3/vocoder_models--de--thorsten--fullband-melgan.zip",
"author": "@thorstenMueller", "author": "@thorstenMueller",
"commit": "unknown" "commit": "unknown"
} }
} }
},
"ja": {
"kokoro": {
"hifigan_v1": {
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.2.0/vocoder_models--ja--kokoro--hifigan_v1.zip",
"description": "HifiGAN model trained for kokoro dataset by @kaiidams",
"author": "@kaiidams",
"commit": "3900448"
}
}
} }
} }
} }
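
The catalogue above is what `TTS.utils.manage.ModelManager` resolves model names against; `synthesize.py` and `server.py` below rely on it. A rough usage sketch: the constructor argument (the path to this `.models.json`) is an assumption, while `download_model()` returning a three-tuple matches the synthesize.py hunk further down.

```python
from pathlib import Path

import TTS
from TTS.utils.manage import ModelManager

# Assumed: the catalogue shown above ships inside the installed package.
models_json = Path(TTS.__file__).parent / ".models.json"
manager = ModelManager(models_json)

# One of the new v0.2.0 entries; the zip is fetched from its github_rls_url.
model_path, config_path, model_item = manager.download_model("tts_models/en/ljspeech/vits")
print(model_item["default_vocoder"])  # None, since VITS is end-to-end
```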

View File

@ -6,7 +6,7 @@ import numpy as np
import tensorflow as tf import tensorflow as tf
import torch import torch
from TTS.utils.io import load_config from TTS.utils.io import load_config, load_fsspec
from TTS.vocoder.tf.utils.convert_torch_to_tf_utils import ( from TTS.vocoder.tf.utils.convert_torch_to_tf_utils import (
compare_torch_tf, compare_torch_tf,
convert_tf_name, convert_tf_name,
@ -33,7 +33,7 @@ num_speakers = 0
# init torch model # init torch model
model = setup_generator(c) model = setup_generator(c)
checkpoint = torch.load(args.torch_model_path, map_location=torch.device("cpu")) checkpoint = load_fsspec(args.torch_model_path, map_location=torch.device("cpu"))
state_dict = checkpoint["model"] state_dict = checkpoint["model"]
model.load_state_dict(state_dict) model.load_state_dict(state_dict)
model.remove_weight_norm() model.remove_weight_norm()
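
`load_fsspec` (and its counterpart `save_fsspec`, used in later hunks) lives in `TTS.utils.io`, but its body is not part of the hunks shown here. A minimal sketch of the assumed behaviour: wrap `torch.load`/`torch.save` in `fsspec.open` so checkpoints can sit on local disk or on remote object storage.

```python
import fsspec
import torch


def load_fsspec(path, map_location=None, **kwargs):
    """Assumed: torch.load through fsspec, so s3:// or gs:// URLs work like local paths."""
    with fsspec.open(path, "rb") as f:
        return torch.load(f, map_location=map_location, **kwargs)


def save_fsspec(state, path, **kwargs):
    """Assumed: torch.save through fsspec, mirroring load_fsspec."""
    with fsspec.open(path, "wb") as f:
        torch.save(state, f, **kwargs)
```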

View File

@ -13,7 +13,7 @@ from TTS.tts.tf.models.tacotron2 import Tacotron2
from TTS.tts.tf.utils.convert_torch_to_tf_utils import compare_torch_tf, convert_tf_name, transfer_weights_torch_to_tf from TTS.tts.tf.utils.convert_torch_to_tf_utils import compare_torch_tf, convert_tf_name, transfer_weights_torch_to_tf
from TTS.tts.tf.utils.generic_utils import save_checkpoint from TTS.tts.tf.utils.generic_utils import save_checkpoint
from TTS.tts.utils.text.symbols import phonemes, symbols from TTS.tts.utils.text.symbols import phonemes, symbols
from TTS.utils.io import load_config from TTS.utils.io import load_config, load_fsspec
sys.path.append("/home/erogol/Projects") sys.path.append("/home/erogol/Projects")
os.environ["CUDA_VISIBLE_DEVICES"] = "" os.environ["CUDA_VISIBLE_DEVICES"] = ""
@ -32,7 +32,7 @@ num_speakers = 0
# init torch model # init torch model
model = setup_model(c) model = setup_model(c)
checkpoint = torch.load(args.torch_model_path, map_location=torch.device("cpu")) checkpoint = load_fsspec(args.torch_model_path, map_location=torch.device("cpu"))
state_dict = checkpoint["model"] state_dict = checkpoint["model"]
model.load_state_dict(state_dict) model.load_state_dict(state_dict)

View File

@ -16,6 +16,7 @@ from TTS.tts.models import setup_model
from TTS.tts.utils.speakers import get_speaker_manager from TTS.tts.utils.speakers import get_speaker_manager
from TTS.utils.audio import AudioProcessor from TTS.utils.audio import AudioProcessor
from TTS.utils.generic_utils import count_parameters from TTS.utils.generic_utils import count_parameters
from TTS.utils.io import load_fsspec
use_cuda = torch.cuda.is_available() use_cuda = torch.cuda.is_available()
@ -239,7 +240,7 @@ def main(args): # pylint: disable=redefined-outer-name
model = setup_model(c) model = setup_model(c)
# restore model # restore model
checkpoint = torch.load(args.checkpoint_path, map_location="cpu") checkpoint = load_fsspec(args.checkpoint_path, map_location="cpu")
model.load_state_dict(checkpoint["model"]) model.load_state_dict(checkpoint["model"])
if use_cuda: if use_cuda:

View File

@ -208,7 +208,7 @@ def main():
if args.vocoder_name is not None and not args.vocoder_path: if args.vocoder_name is not None and not args.vocoder_path:
vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name) vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name)
# CASE3: set custome model paths # CASE3: set custom model paths
if args.model_path is not None: if args.model_path is not None:
model_path = args.model_path model_path = args.model_path
config_path = args.config_path config_path = args.config_path

View File

@ -17,6 +17,7 @@ from TTS.trainer import init_training
from TTS.tts.datasets import load_meta_data from TTS.tts.datasets import load_meta_data
from TTS.utils.audio import AudioProcessor from TTS.utils.audio import AudioProcessor
from TTS.utils.generic_utils import count_parameters, remove_experiment_folder, set_init_dict from TTS.utils.generic_utils import count_parameters, remove_experiment_folder, set_init_dict
from TTS.utils.io import load_fsspec
from TTS.utils.radam import RAdam from TTS.utils.radam import RAdam
from TTS.utils.training import NoamLR, check_update from TTS.utils.training import NoamLR, check_update
@ -115,12 +116,12 @@ def train(model, optimizer, scheduler, criterion, data_loader, global_step):
"step_time": step_time, "step_time": step_time,
"avg_loader_time": avg_loader_time, "avg_loader_time": avg_loader_time,
} }
tb_logger.tb_train_epoch_stats(global_step, train_stats) dashboard_logger.train_epoch_stats(global_step, train_stats)
figures = { figures = {
# FIXME: not constant # FIXME: not constant
"UMAP Plot": plot_embeddings(outputs.detach().cpu().numpy(), 10), "UMAP Plot": plot_embeddings(outputs.detach().cpu().numpy(), 10),
} }
tb_logger.tb_train_figures(global_step, figures) dashboard_logger.train_figures(global_step, figures)
if global_step % c.print_step == 0: if global_step % c.print_step == 0:
print( print(
@ -169,7 +170,7 @@ def main(args): # pylint: disable=redefined-outer-name
raise Exception("The %s not is a loss supported" % c.loss) raise Exception("The %s not is a loss supported" % c.loss)
if args.restore_path: if args.restore_path:
checkpoint = torch.load(args.restore_path) checkpoint = load_fsspec(args.restore_path)
try: try:
model.load_state_dict(checkpoint["model"]) model.load_state_dict(checkpoint["model"])
@ -207,7 +208,7 @@ def main(args): # pylint: disable=redefined-outer-name
if __name__ == "__main__": if __name__ == "__main__":
args, c, OUT_PATH, AUDIO_PATH, c_logger, tb_logger = init_training(sys.argv) args, c, OUT_PATH, AUDIO_PATH, c_logger, dashboard_logger = init_training(sys.argv)
try: try:
main(args) main(args)
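
The `tb_*` calls are renamed to a backend-agnostic logger interface here and throughout `trainer.py` below. The following sketch collects the methods those call sites assume (names taken from the hunks in this commit); it is not the actual `TTS.utils.logging` implementation, which also provides `TensorboardLogger`, `WandbLogger` and `init_logger`.

```python
from typing import Dict, List, Optional


class DashboardLogger:
    """Assumed protocol shared by TensorboardLogger and WandbLogger."""

    def train_step_stats(self, step: int, stats: Dict) -> None: ...

    def train_epoch_stats(self, step: int, stats: Dict) -> None: ...

    def train_figures(self, step: int, figures: Dict) -> None: ...

    def train_audios(self, step: int, audios: Dict, sample_rate: int) -> None: ...

    def model_weights(self, model, step: int) -> None: ...

    def log_artifact(self, file_or_dir: str, name: str, artifact_type: str,
                     aliases: Optional[List[str]] = None) -> None: ...

    def flush(self) -> None: ...
```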

View File

@ -5,8 +5,8 @@ from TTS.trainer import Trainer, init_training
def main(): def main():
"""Run 🐸TTS trainer from terminal. This is also necessary to run DDP training by ```distribute.py```""" """Run 🐸TTS trainer from terminal. This is also necessary to run DDP training by ```distribute.py```"""
args, config, output_path, _, c_logger, tb_logger = init_training(sys.argv) args, config, output_path, _, c_logger, dashboard_logger = init_training(sys.argv)
trainer = Trainer(args, config, output_path, c_logger, tb_logger, cudnn_benchmark=False) trainer = Trainer(args, config, output_path, c_logger, dashboard_logger, cudnn_benchmark=False)
trainer.fit() trainer.fit()

View File

@ -8,8 +8,8 @@ from TTS.utils.generic_utils import remove_experiment_folder
def main(): def main():
try: try:
args, config, output_path, _, c_logger, tb_logger = init_training(sys.argv) args, config, output_path, _, c_logger, dashboard_logger = init_training(sys.argv)
trainer = Trainer(args, config, output_path, c_logger, tb_logger) trainer = Trainer(args, config, output_path, c_logger, dashboard_logger)
trainer.fit() trainer.fit()
except KeyboardInterrupt: except KeyboardInterrupt:
remove_experiment_folder(output_path) remove_experiment_folder(output_path)

View File

@ -3,6 +3,7 @@ import os
import re import re
from typing import Dict from typing import Dict
import fsspec
import yaml import yaml
from coqpit import Coqpit from coqpit import Coqpit
@ -13,7 +14,7 @@ from TTS.utils.generic_utils import find_module
def read_json_with_comments(json_path): def read_json_with_comments(json_path):
"""for backward compat.""" """for backward compat."""
# fallback to json # fallback to json
with open(json_path, "r", encoding="utf-8") as f: with fsspec.open(json_path, "r", encoding="utf-8") as f:
input_str = f.read() input_str = f.read()
# handle comments # handle comments
input_str = re.sub(r"\\\n", "", input_str) input_str = re.sub(r"\\\n", "", input_str)
@ -76,13 +77,12 @@ def load_config(config_path: str) -> None:
config_dict = {} config_dict = {}
ext = os.path.splitext(config_path)[1] ext = os.path.splitext(config_path)[1]
if ext in (".yml", ".yaml"): if ext in (".yml", ".yaml"):
with open(config_path, "r", encoding="utf-8") as f: with fsspec.open(config_path, "r", encoding="utf-8") as f:
data = yaml.safe_load(f) data = yaml.safe_load(f)
elif ext == ".json": elif ext == ".json":
try: try:
with open(config_path, "r", encoding="utf-8") as f: with fsspec.open(config_path, "r", encoding="utf-8") as f:
input_str = f.read() data = json.load(f)
data = json.loads(input_str)
except json.decoder.JSONDecodeError: except json.decoder.JSONDecodeError:
# backwards compat. # backwards compat.
data = read_json_with_comments(config_path) data = read_json_with_comments(config_path)
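
With `fsspec` handling the file access, `load_config` is no longer limited to local paths. A hedged usage sketch, assuming `load_config` is exported from `TTS.config` (where this hunk appears to live); the remote bucket is made up and would additionally need the matching fsspec backend installed (e.g. `gcsfs`).

```python
from TTS.config import load_config

config = load_config("config.json")                          # local file, as before
# config = load_config("gs://my-bucket/run-1/config.json")   # now also possible via fsspec
print(config.model)
```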

View File

@ -36,6 +36,10 @@ class BaseAudioConfig(Coqpit):
Enable / Disable sound normalization to reconcile the volume differences among samples. Defaults to False. Enable / Disable sound normalization to reconcile the volume differences among samples. Defaults to False.
do_trim_silence (bool): do_trim_silence (bool):
Enable / Disable trimming silences at the beginning and the end of the audio clip. Defaults to ```True```. Enable / Disable trimming silences at the beginning and the end of the audio clip. Defaults to ```True```.
do_amp_to_db_linear (bool, optional):
enable/disable amplitude to dB conversion of linear spectrograms. Defaults to True.
do_amp_to_db_mel (bool, optional):
enable/disable amplitude to dB conversion of mel spectrograms. Defaults to True.
trim_db (int): trim_db (int):
Silence threshold used for silence trimming. Defaults to 45. Silence threshold used for silence trimming. Defaults to 45.
power (float): power (float):
@ -79,7 +83,7 @@ class BaseAudioConfig(Coqpit):
preemphasis: float = 0.0 preemphasis: float = 0.0
ref_level_db: int = 20 ref_level_db: int = 20
do_sound_norm: bool = False do_sound_norm: bool = False
log_func = "np.log10" log_func: str = "np.log10"
# silence trimming # silence trimming
do_trim_silence: bool = True do_trim_silence: bool = True
trim_db: int = 45 trim_db: int = 45
@ -91,6 +95,8 @@ class BaseAudioConfig(Coqpit):
mel_fmin: float = 0.0 mel_fmin: float = 0.0
mel_fmax: float = None mel_fmax: float = None
spec_gain: int = 20 spec_gain: int = 20
do_amp_to_db_linear: bool = True
do_amp_to_db_mel: bool = True
# normalization params # normalization params
signal_norm: bool = True signal_norm: bool = True
min_level_db: int = -100 min_level_db: int = -100
@ -182,51 +188,87 @@ class BaseTrainingConfig(Coqpit):
Args: Args:
model (str): model (str):
Name of the model that is used in the training. Name of the model that is used in the training.
run_name (str): run_name (str):
Name of the experiment. This prefixes the output folder name. Name of the experiment. This prefixes the output folder name.
run_description (str): run_description (str):
Short description of the experiment. Short description of the experiment.
epochs (int): epochs (int):
Number training epochs. Defaults to 10000. Number training epochs. Defaults to 10000.
batch_size (int): batch_size (int):
Training batch size. Training batch size.
eval_batch_size (int): eval_batch_size (int):
Validation batch size. Validation batch size.
mixed_precision (bool): mixed_precision (bool):
Enable / Disable mixed precision training. It reduces the VRAM use and allows larger batch sizes, however Enable / Disable mixed precision training. It reduces the VRAM use and allows larger batch sizes, however
it may also cause numerical unstability in some cases. it may also cause numerical unstability in some cases.
scheduler_after_epoch (bool):
If true, run the scheduler step after each epoch else run it after each model step.
run_eval (bool): run_eval (bool):
Enable / Disable evaluation (validation) run. Defaults to True. Enable / Disable evaluation (validation) run. Defaults to True.
test_delay_epochs (int): test_delay_epochs (int):
Number of epochs before starting to use evaluation runs. Initially, models do not generate meaningful Number of epochs before starting to use evaluation runs. Initially, models do not generate meaningful
results, hence waiting for a couple of epochs might save some time. results, hence waiting for a couple of epochs might save some time.
print_eval (bool): print_eval (bool):
Enable / Disable console logging for evalutaion steps. If disabled then it only shows the final values at Enable / Disable console logging for evalutaion steps. If disabled then it only shows the final values at
the end of the evaluation. Default to ```False```. the end of the evaluation. Default to ```False```.
print_step (int): print_step (int):
Number of steps required to print the next training log. Number of steps required to print the next training log.
tb_plot_step (int):
log_dashboard (str): "tensorboard" or "wandb"
Set the experiment tracking tool
plot_step (int):
Number of steps required to log training on Tensorboard. Number of steps required to log training on Tensorboard.
tb_model_param_stats (bool):
model_param_stats (bool):
Enable / Disable logging internal model stats for model diagnostic. It might be useful for model debugging. Enable / Disable logging internal model stats for model diagnostic. It might be useful for model debugging.
Defaults to ```False```. Defaults to ```False```.
project_name (str):
Name of the project. Defaults to config.model
wandb_entity (str):
Name of W&B entity/team. Enables collaboration across a team or org.
log_model_step (int):
Number of steps required to log a checkpoint as W&B artifact
save_step (int): save_step (int):
Number of steps required to save the next checkpoint. Number of steps required to save the next checkpoint.
checkpoint (bool): checkpoint (bool):
Enable / Disable checkpointing. Enable / Disable checkpointing.
keep_all_best (bool): keep_all_best (bool):
Enable / Disable keeping all the saved best models instead of overwriting the previous one. Defaults Enable / Disable keeping all the saved best models instead of overwriting the previous one. Defaults
to ```False```. to ```False```.
keep_after (int): keep_after (int):
Number of steps to wait before saving all the best models. In use if ```keep_all_best == True```. Defaults Number of steps to wait before saving all the best models. In use if ```keep_all_best == True```. Defaults
to 10000. to 10000.
num_loader_workers (int): num_loader_workers (int):
Number of workers for training time dataloader. Number of workers for training time dataloader.
num_eval_loader_workers (int): num_eval_loader_workers (int):
Number of workers for evaluation time dataloader. Number of workers for evaluation time dataloader.
output_path (str): output_path (str):
Path for training output folder. The nonexist part of the given path is created automatically. Path for training output folder, either a local file path or other
All training outputs are saved there. URLs supported by both fsspec and tensorboardX, e.g. GCS (gs://) or
S3 (s3://) paths. The nonexist part of the given path is created
automatically. All training artefacts are saved there.
""" """
model: str = None model: str = None
@ -237,14 +279,19 @@ class BaseTrainingConfig(Coqpit):
batch_size: int = None batch_size: int = None
eval_batch_size: int = None eval_batch_size: int = None
mixed_precision: bool = False mixed_precision: bool = False
scheduler_after_epoch: bool = False
# eval params # eval params
run_eval: bool = True run_eval: bool = True
test_delay_epochs: int = 0 test_delay_epochs: int = 0
print_eval: bool = False print_eval: bool = False
# logging # logging
dashboard_logger: str = "tensorboard"
print_step: int = 25 print_step: int = 25
tb_plot_step: int = 100 plot_step: int = 100
tb_model_param_stats: bool = False model_param_stats: bool = False
project_name: str = None
log_model_step: int = None
wandb_entity: str = None
# checkpointing # checkpointing
save_step: int = 10000 save_step: int = 10000
checkpoint: bool = True checkpoint: bool = True
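
Taken together, the renamed and newly added training fields can be set like any other Coqpit attribute. An illustrative sketch only: the import path is assumed to be `TTS.config.shared_configs`, and the values are placeholders, but every field name appears verbatim in the hunk above.

```python
from TTS.config.shared_configs import BaseTrainingConfig

config = BaseTrainingConfig(
    model="tacotron2",
    batch_size=32,
    eval_batch_size=16,
    scheduler_after_epoch=False,          # new: step the LR scheduler per epoch instead of per step
    dashboard_logger="wandb",             # new: "tensorboard" (default) or "wandb"
    project_name="ljspeech-experiments",  # new: W&B project, defaults to config.model
    wandb_entity="my-team",               # new: W&B team/entity
    log_model_step=10000,                 # new: log a checkpoint artifact every N steps
    plot_step=100,                        # renamed from tb_plot_step
    model_param_stats=False,              # renamed from tb_model_param_stats
    output_path="gs://my-bucket/tts-runs/",  # remote output paths now work through fsspec
)
```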

View File

@ -103,8 +103,8 @@ synthesizer = Synthesizer(
model_path, config_path, speakers_file_path, vocoder_path, vocoder_config_path, use_cuda=args.use_cuda model_path, config_path, speakers_file_path, vocoder_path, vocoder_config_path, use_cuda=args.use_cuda
) )
use_multi_speaker = synthesizer.tts_model.speaker_manager is not None and synthesizer.tts_model.num_speakers > 1 use_multi_speaker = hasattr(synthesizer.tts_model, "speaker_manager") and synthesizer.tts_model.num_speakers > 1
speaker_manager = synthesizer.tts_model.speaker_manager if hasattr(synthesizer.tts_model, "speaker_manager") else None speaker_manager = getattr(synthesizer.tts_model, "speaker_manager", None)
# TODO: set this from SpeakerManager # TODO: set this from SpeakerManager
use_gst = synthesizer.tts_config.get("use_gst", False) use_gst = synthesizer.tts_config.get("use_gst", False)
app = Flask(__name__) app = Flask(__name__)
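
The `getattr`/`hasattr` pattern above avoids assuming every `tts_model` carries a `speaker_manager`. A small self-contained sketch of the same idea; the paths are placeholders and `speaker_ids` is an assumed attribute of `SpeakerManager`.

```python
from TTS.utils.synthesizer import Synthesizer

# Placeholder paths for a multi-speaker model such as tts_models/en/vctk/vits.
synthesizer = Synthesizer("path/to/model.pth.tar", "path/to/config.json", None, None, None)

# Not every tts_model is multi-speaker, so probe defensively before using it.
speaker_manager = getattr(synthesizer.tts_model, "speaker_manager", None)
if speaker_manager is not None and synthesizer.tts_model.num_speakers > 1:
    print(sorted(speaker_manager.speaker_ids))  # assumed attribute listing speakers
```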

View File

@ -2,6 +2,8 @@ import numpy as np
import torch import torch
from torch import nn from torch import nn
from TTS.utils.io import load_fsspec
class LSTMWithProjection(nn.Module): class LSTMWithProjection(nn.Module):
def __init__(self, input_size, hidden_size, proj_size): def __init__(self, input_size, hidden_size, proj_size):
@ -120,7 +122,7 @@ class LSTMSpeakerEncoder(nn.Module):
# pylint: disable=unused-argument, redefined-builtin # pylint: disable=unused-argument, redefined-builtin
def load_checkpoint(self, config: dict, checkpoint_path: str, eval: bool = False, use_cuda: bool = False): def load_checkpoint(self, config: dict, checkpoint_path: str, eval: bool = False, use_cuda: bool = False):
state = torch.load(checkpoint_path, map_location=torch.device("cpu")) state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"))
self.load_state_dict(state["model"]) self.load_state_dict(state["model"])
if use_cuda: if use_cuda:
self.cuda() self.cuda()

View File

@ -2,6 +2,8 @@ import numpy as np
import torch import torch
import torch.nn as nn import torch.nn as nn
from TTS.utils.io import load_fsspec
class SELayer(nn.Module): class SELayer(nn.Module):
def __init__(self, channel, reduction=8): def __init__(self, channel, reduction=8):
@ -201,7 +203,7 @@ class ResNetSpeakerEncoder(nn.Module):
return embeddings return embeddings
def load_checkpoint(self, config: dict, checkpoint_path: str, eval: bool = False, use_cuda: bool = False): def load_checkpoint(self, config: dict, checkpoint_path: str, eval: bool = False, use_cuda: bool = False):
state = torch.load(checkpoint_path, map_location=torch.device("cpu")) state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"))
self.load_state_dict(state["model"]) self.load_state_dict(state["model"])
if use_cuda: if use_cuda:
self.cuda() self.cuda()

View File

@ -6,11 +6,11 @@ import re
from multiprocessing import Manager from multiprocessing import Manager
import numpy as np import numpy as np
import torch
from scipy import signal from scipy import signal
from TTS.speaker_encoder.models.lstm import LSTMSpeakerEncoder from TTS.speaker_encoder.models.lstm import LSTMSpeakerEncoder
from TTS.speaker_encoder.models.resnet import ResNetSpeakerEncoder from TTS.speaker_encoder.models.resnet import ResNetSpeakerEncoder
from TTS.utils.io import save_fsspec
class Storage(object): class Storage(object):
@ -198,7 +198,7 @@ def save_checkpoint(model, optimizer, criterion, model_loss, out_path, current_s
"loss": model_loss, "loss": model_loss,
"date": datetime.date.today().strftime("%B %d, %Y"), "date": datetime.date.today().strftime("%B %d, %Y"),
} }
torch.save(state, checkpoint_path) save_fsspec(state, checkpoint_path)
def save_best_model(model, optimizer, criterion, model_loss, best_loss, out_path, current_step): def save_best_model(model, optimizer, criterion, model_loss, best_loss, out_path, current_step):
@ -216,5 +216,5 @@ def save_best_model(model, optimizer, criterion, model_loss, best_loss, out_path
bestmodel_path = "best_model.pth.tar" bestmodel_path = "best_model.pth.tar"
bestmodel_path = os.path.join(out_path, bestmodel_path) bestmodel_path = os.path.join(out_path, bestmodel_path)
print("\n > BEST MODEL ({0:.5f}) : {1:}".format(model_loss, bestmodel_path)) print("\n > BEST MODEL ({0:.5f}) : {1:}".format(model_loss, bestmodel_path))
torch.save(state, bestmodel_path) save_fsspec(state, bestmodel_path)
return best_loss return best_loss

View File

@ -1,7 +1,7 @@
import datetime import datetime
import os import os
import torch from TTS.utils.io import save_fsspec
def save_checkpoint(model, optimizer, model_loss, out_path, current_step): def save_checkpoint(model, optimizer, model_loss, out_path, current_step):
@ -17,7 +17,7 @@ def save_checkpoint(model, optimizer, model_loss, out_path, current_step):
"loss": model_loss, "loss": model_loss,
"date": datetime.date.today().strftime("%B %d, %Y"), "date": datetime.date.today().strftime("%B %d, %Y"),
} }
torch.save(state, checkpoint_path) save_fsspec(state, checkpoint_path)
def save_best_model(model, optimizer, model_loss, best_loss, out_path, current_step): def save_best_model(model, optimizer, model_loss, best_loss, out_path, current_step):
@ -34,5 +34,5 @@ def save_best_model(model, optimizer, model_loss, best_loss, out_path, current_s
bestmodel_path = "best_model.pth.tar" bestmodel_path = "best_model.pth.tar"
bestmodel_path = os.path.join(out_path, bestmodel_path) bestmodel_path = os.path.join(out_path, bestmodel_path)
print("\n > BEST MODEL ({0:.5f}) : {1:}".format(model_loss, bestmodel_path)) print("\n > BEST MODEL ({0:.5f}) : {1:}".format(model_loss, bestmodel_path))
torch.save(state, bestmodel_path) save_fsspec(state, bestmodel_path)
return best_loss return best_loss

View File

@ -1,8 +1,8 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import glob
import importlib import importlib
import logging import logging
import multiprocessing
import os import os
import platform import platform
import re import re
@ -12,7 +12,9 @@ import traceback
from argparse import Namespace from argparse import Namespace
from dataclasses import dataclass, field from dataclasses import dataclass, field
from typing import Dict, List, Tuple, Union from typing import Dict, List, Tuple, Union
from urllib.parse import urlparse
import fsspec
import torch import torch
from coqpit import Coqpit from coqpit import Coqpit
from torch import nn from torch import nn
@ -29,18 +31,20 @@ from TTS.utils.distribute import init_distributed
from TTS.utils.generic_utils import ( from TTS.utils.generic_utils import (
KeepAverage, KeepAverage,
count_parameters, count_parameters,
create_experiment_folder, get_experiment_folder_path,
get_git_branch, get_git_branch,
remove_experiment_folder, remove_experiment_folder,
set_init_dict, set_init_dict,
to_cuda, to_cuda,
) )
from TTS.utils.io import copy_model_files, save_best_model, save_checkpoint from TTS.utils.io import copy_model_files, load_fsspec, save_best_model, save_checkpoint
from TTS.utils.logging import ConsoleLogger, TensorboardLogger from TTS.utils.logging import ConsoleLogger, TensorboardLogger, WandbLogger, init_logger
from TTS.utils.trainer_utils import get_optimizer, get_scheduler, is_apex_available, setup_torch_training_env from TTS.utils.trainer_utils import get_optimizer, get_scheduler, is_apex_available, setup_torch_training_env
from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data
from TTS.vocoder.models import setup_model as setup_vocoder_model from TTS.vocoder.models import setup_model as setup_vocoder_model
multiprocessing.set_start_method("fork")
if platform.system() != "Windows": if platform.system() != "Windows":
# https://github.com/pytorch/pytorch/issues/973 # https://github.com/pytorch/pytorch/issues/973
import resource import resource
@ -48,6 +52,7 @@ if platform.system() != "Windows":
rlimit = resource.getrlimit(resource.RLIMIT_NOFILE) rlimit = resource.getrlimit(resource.RLIMIT_NOFILE)
resource.setrlimit(resource.RLIMIT_NOFILE, (4096, rlimit[1])) resource.setrlimit(resource.RLIMIT_NOFILE, (4096, rlimit[1]))
if is_apex_available(): if is_apex_available():
from apex import amp from apex import amp
@ -87,7 +92,7 @@ class Trainer:
config: Coqpit, config: Coqpit,
output_path: str, output_path: str,
c_logger: ConsoleLogger = None, c_logger: ConsoleLogger = None,
tb_logger: TensorboardLogger = None, dashboard_logger: Union[TensorboardLogger, WandbLogger] = None,
model: nn.Module = None, model: nn.Module = None,
cudnn_benchmark: bool = False, cudnn_benchmark: bool = False,
) -> None: ) -> None:
@ -112,7 +117,7 @@ class Trainer:
c_logger (ConsoleLogger, optional): Console logger for printing training status. If not provided, the default c_logger (ConsoleLogger, optional): Console logger for printing training status. If not provided, the default
console logger is used. Defaults to None. console logger is used. Defaults to None.
tb_logger (TensorboardLogger, optional): Tensorboard logger. If not provided, the default logger is used. dashboard_logger Union[TensorboardLogger, WandbLogger]: Dashboard logger. If not provided, the tensorboard logger is used.
Defaults to None. Defaults to None.
model (nn.Module, optional): Initialized and ready-to-train model. If it is not defined, `Trainer` model (nn.Module, optional): Initialized and ready-to-train model. If it is not defined, `Trainer`
@ -134,8 +139,8 @@ class Trainer:
Running trainer on a config. Running trainer on a config.
>>> config = WavegradConfig(data_path="/home/erogol/nvme/gdrive/Datasets/LJSpeech-1.1/wavs/", output_path=output_path,) >>> config = WavegradConfig(data_path="/home/erogol/nvme/gdrive/Datasets/LJSpeech-1.1/wavs/", output_path=output_path,)
>>> args, config, output_path, _, c_logger, tb_logger = init_training(TrainingArgs(), config) >>> args, config, output_path, _, c_logger, dashboard_logger = init_training(TrainingArgs(), config)
>>> trainer = Trainer(args, config, output_path, c_logger, tb_logger) >>> trainer = Trainer(args, config, output_path, c_logger, dashboard_logger)
>>> trainer.fit() >>> trainer.fit()
TODO: TODO:
@ -148,22 +153,24 @@ class Trainer:
# set and initialize Pytorch runtime # set and initialize Pytorch runtime
self.use_cuda, self.num_gpus = setup_torch_training_env(True, cudnn_benchmark) self.use_cuda, self.num_gpus = setup_torch_training_env(True, cudnn_benchmark)
if config is None: if config is None:
# parse config from console arguments # parse config from console arguments
config, output_path, _, c_logger, tb_logger = process_args(args) config, output_path, _, c_logger, dashboard_logger = process_args(args)
self.output_path = output_path self.output_path = output_path
self.args = args self.args = args
self.config = config self.config = config
self.config.output_log_path = output_path
# init loggers # init loggers
self.c_logger = ConsoleLogger() if c_logger is None else c_logger self.c_logger = ConsoleLogger() if c_logger is None else c_logger
if tb_logger is None: self.dashboard_logger = dashboard_logger
self.tb_logger = TensorboardLogger(output_path, model_name=config.model)
self.tb_logger.tb_add_text("model-config", f"<pre>{config.to_json()}</pre>", 0) if self.dashboard_logger is None:
else: self.dashboard_logger = init_logger(config)
self.tb_logger = tb_logger
if not self.config.log_model_step:
self.config.log_model_step = self.config.save_step
log_file = os.path.join(self.output_path, f"trainer_{args.rank}_log.txt") log_file = os.path.join(self.output_path, f"trainer_{args.rank}_log.txt")
self._setup_logger_config(log_file) self._setup_logger_config(log_file)
@ -173,7 +180,6 @@ class Trainer:
self.best_loss = float("inf") self.best_loss = float("inf")
self.train_loader = None self.train_loader = None
self.eval_loader = None self.eval_loader = None
self.output_audio_path = os.path.join(output_path, "test_audios")
self.keep_avg_train = None self.keep_avg_train = None
self.keep_avg_eval = None self.keep_avg_eval = None
@ -184,7 +190,7 @@ class Trainer:
# init audio processor # init audio processor
self.ap = AudioProcessor(**self.config.audio.to_dict()) self.ap = AudioProcessor(**self.config.audio.to_dict())
# load dataset samples # load data samples
# TODO: refactor this # TODO: refactor this
if "datasets" in self.config: if "datasets" in self.config:
# load data for `tts` models # load data for `tts` models
@ -205,6 +211,10 @@ class Trainer:
else: else:
self.model = self.get_model(self.config) self.model = self.get_model(self.config)
# init multispeaker settings of the model
if hasattr(self.model, "init_multispeaker"):
self.model.init_multispeaker(self.config, self.data_train + self.data_eval)
# setup criterion # setup criterion
self.criterion = self.get_criterion(self.model) self.criterion = self.get_criterion(self.model)
@ -274,9 +284,9 @@ class Trainer:
""" """
# TODO: better model setup # TODO: better model setup
try: try:
model = setup_tts_model(config)
except ModuleNotFoundError:
model = setup_vocoder_model(config) model = setup_vocoder_model(config)
except ModuleNotFoundError:
model = setup_tts_model(config)
return model return model
def restore_model( def restore_model(
@ -309,7 +319,7 @@ class Trainer:
return obj return obj
print(" > Restoring from %s ..." % os.path.basename(restore_path)) print(" > Restoring from %s ..." % os.path.basename(restore_path))
checkpoint = torch.load(restore_path) checkpoint = load_fsspec(restore_path)
try: try:
print(" > Restoring Model...") print(" > Restoring Model...")
model.load_state_dict(checkpoint["model"]) model.load_state_dict(checkpoint["model"])
@ -417,7 +427,7 @@ class Trainer:
scheduler: Union[torch.optim.lr_scheduler._LRScheduler, List], # pylint: disable=protected-access scheduler: Union[torch.optim.lr_scheduler._LRScheduler, List], # pylint: disable=protected-access
config: Coqpit, config: Coqpit,
optimizer_idx: int = None, optimizer_idx: int = None,
) -> Tuple[Dict, Dict, int, torch.Tensor]: ) -> Tuple[Dict, Dict, int]:
"""Perform a forward - backward pass and run the optimizer. """Perform a forward - backward pass and run the optimizer.
Args: Args:
@ -426,7 +436,7 @@ class Trainer:
optimizer (Union[nn.optim.Optimizer, List]): Model's optimizer. If it is a list then, `optimizer_idx` must be defined to indicate the optimizer in use. optimizer (Union[nn.optim.Optimizer, List]): Model's optimizer. If it is a list then, `optimizer_idx` must be defined to indicate the optimizer in use.
scaler (AMPScaler): AMP scaler. scaler (AMPScaler): AMP scaler.
criterion (nn.Module): Model's criterion. criterion (nn.Module): Model's criterion.
scheduler (Union[torch.optim.lr_scheduler._LRScheduler, List]): LR scheduler used by the optimizer. scheduler (torch.optim.lr_scheduler._LRScheduler): LR scheduler used by the optimizer.
config (Coqpit): Model config. config (Coqpit): Model config.
optimizer_idx (int, optional): Target optimizer being used. Defaults to None. optimizer_idx (int, optional): Target optimizer being used. Defaults to None.
@ -436,6 +446,7 @@ class Trainer:
Returns: Returns:
Tuple[Dict, Dict, int, torch.Tensor]: model outputs, losses, step time and gradient norm. Tuple[Dict, Dict, int, torch.Tensor]: model outputs, losses, step time and gradient norm.
""" """
step_start_time = time.time() step_start_time = time.time()
# zero-out optimizer # zero-out optimizer
optimizer.zero_grad() optimizer.zero_grad()
@ -448,11 +459,11 @@ class Trainer:
# skip the rest # skip the rest
if outputs is None: if outputs is None:
step_time = time.time() - step_start_time step_time = time.time() - step_start_time
return None, {}, step_time, 0 return None, {}, step_time
# check nan loss # check nan loss
if torch.isnan(loss_dict["loss"]).any(): if torch.isnan(loss_dict["loss"]).any():
raise RuntimeError(f"Detected NaN loss at step {self.total_steps_done}.") raise RuntimeError(f" > Detected NaN loss - {loss_dict}.")
# set gradient clipping threshold # set gradient clipping threshold
if "grad_clip" in config and config.grad_clip is not None: if "grad_clip" in config and config.grad_clip is not None:
@ -463,7 +474,6 @@ class Trainer:
else: else:
grad_clip = 0.0 # meaning no gradient clipping grad_clip = 0.0 # meaning no gradient clipping
# TODO: compute grad norm
if grad_clip <= 0: if grad_clip <= 0:
grad_norm = 0 grad_norm = 0
@ -474,15 +484,17 @@ class Trainer:
with amp.scale_loss(loss_dict["loss"], optimizer) as scaled_loss: with amp.scale_loss(loss_dict["loss"], optimizer) as scaled_loss:
scaled_loss.backward() scaled_loss.backward()
grad_norm = torch.nn.utils.clip_grad_norm_( grad_norm = torch.nn.utils.clip_grad_norm_(
amp.master_params(optimizer), amp.master_params(optimizer), grad_clip, error_if_nonfinite=False
grad_clip,
) )
else: else:
# model optimizer step in mixed precision mode # model optimizer step in mixed precision mode
scaler.scale(loss_dict["loss"]).backward() scaler.scale(loss_dict["loss"]).backward()
scaler.unscale_(optimizer)
if grad_clip > 0: if grad_clip > 0:
grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip) scaler.unscale_(optimizer)
grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip, error_if_nonfinite=False)
# pytorch skips the step when the norm is 0. So ignore the norm value when it is NaN
if torch.isnan(grad_norm) or torch.isinf(grad_norm):
grad_norm = 0
scale_prev = scaler.get_scale() scale_prev = scaler.get_scale()
scaler.step(optimizer) scaler.step(optimizer)
scaler.update() scaler.update()
@ -491,13 +503,13 @@ class Trainer:
# main model optimizer step # main model optimizer step
loss_dict["loss"].backward() loss_dict["loss"].backward()
if grad_clip > 0: if grad_clip > 0:
grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip) grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip, error_if_nonfinite=False)
optimizer.step() optimizer.step()
step_time = time.time() - step_start_time step_time = time.time() - step_start_time
# setup lr # setup lr
if scheduler is not None and update_lr_scheduler: if scheduler is not None and update_lr_scheduler and not self.config.scheduler_after_epoch:
scheduler.step() scheduler.step()
# detach losses # detach losses
@ -505,7 +517,9 @@ class Trainer:
if optimizer_idx is not None: if optimizer_idx is not None:
loss_dict[f"loss_{optimizer_idx}"] = loss_dict.pop("loss") loss_dict[f"loss_{optimizer_idx}"] = loss_dict.pop("loss")
loss_dict[f"grad_norm_{optimizer_idx}"] = grad_norm loss_dict[f"grad_norm_{optimizer_idx}"] = grad_norm
return outputs, loss_dict, step_time, grad_norm else:
loss_dict["grad_norm"] = grad_norm
return outputs, loss_dict, step_time
@staticmethod @staticmethod
def _detach_loss_dict(loss_dict: Dict) -> Dict: def _detach_loss_dict(loss_dict: Dict) -> Dict:
@ -544,11 +558,10 @@ class Trainer:
# conteainers to hold model outputs and losses for each optimizer. # conteainers to hold model outputs and losses for each optimizer.
outputs_per_optimizer = None outputs_per_optimizer = None
log_dict = {}
loss_dict = {} loss_dict = {}
if not isinstance(self.optimizer, list): if not isinstance(self.optimizer, list):
# training with a single optimizer # training with a single optimizer
outputs, loss_dict_new, step_time, grad_norm = self._optimize( outputs, loss_dict_new, step_time = self._optimize(
batch, self.model, self.optimizer, self.scaler, self.criterion, self.scheduler, self.config batch, self.model, self.optimizer, self.scaler, self.criterion, self.scheduler, self.config
) )
loss_dict.update(loss_dict_new) loss_dict.update(loss_dict_new)
@ -560,25 +573,36 @@ class Trainer:
criterion = self.criterion criterion = self.criterion
scaler = self.scaler[idx] if self.use_amp_scaler else None scaler = self.scaler[idx] if self.use_amp_scaler else None
scheduler = self.scheduler[idx] scheduler = self.scheduler[idx]
outputs, loss_dict_new, step_time, grad_norm = self._optimize( outputs, loss_dict_new, step_time = self._optimize(
batch, self.model, optimizer, scaler, criterion, scheduler, self.config, idx batch, self.model, optimizer, scaler, criterion, scheduler, self.config, idx
) )
# skip the rest if the model returns None # skip the rest if the model returns None
total_step_time += step_time total_step_time += step_time
outputs_per_optimizer[idx] = outputs outputs_per_optimizer[idx] = outputs
# merge loss_dicts from each optimizer
# rename duplicates with the optimizer idx
# if None, model skipped this optimizer # if None, model skipped this optimizer
if loss_dict_new is not None: if loss_dict_new is not None:
loss_dict.update(loss_dict_new) for k, v in loss_dict_new.items():
if k in loss_dict:
loss_dict[f"{k}-{idx}"] = v
else:
loss_dict[k] = v
step_time = total_step_time
outputs = outputs_per_optimizer outputs = outputs_per_optimizer
# update avg stats # update avg runtime stats
keep_avg_update = dict() keep_avg_update = dict()
for key, value in log_dict.items():
keep_avg_update["avg_" + key] = value
keep_avg_update["avg_loader_time"] = loader_time keep_avg_update["avg_loader_time"] = loader_time
keep_avg_update["avg_step_time"] = step_time keep_avg_update["avg_step_time"] = step_time
self.keep_avg_train.update_values(keep_avg_update) self.keep_avg_train.update_values(keep_avg_update)
# update avg loss stats
update_eval_values = dict()
for key, value in loss_dict.items():
update_eval_values["avg_" + key] = value
self.keep_avg_train.update_values(update_eval_values)
# print training progress # print training progress
if self.total_steps_done % self.config.print_step == 0: if self.total_steps_done % self.config.print_step == 0:
# log learning rates # log learning rates
@ -590,33 +614,27 @@ class Trainer:
else: else:
current_lr = self.optimizer.param_groups[0]["lr"] current_lr = self.optimizer.param_groups[0]["lr"]
lrs = {"current_lr": current_lr} lrs = {"current_lr": current_lr}
log_dict.update(lrs)
if grad_norm > 0:
log_dict.update({"grad_norm": grad_norm})
# log run-time stats # log run-time stats
log_dict.update( loss_dict.update(
{ {
"step_time": round(step_time, 4), "step_time": round(step_time, 4),
"loader_time": round(loader_time, 4), "loader_time": round(loader_time, 4),
} }
) )
self.c_logger.print_train_step( self.c_logger.print_train_step(
batch_n_steps, step, self.total_steps_done, log_dict, loss_dict, self.keep_avg_train.avg_values batch_n_steps, step, self.total_steps_done, loss_dict, self.keep_avg_train.avg_values
) )
if self.args.rank == 0: if self.args.rank == 0:
# Plot Training Iter Stats # Plot Training Iter Stats
# reduce TB load and don't log every step # reduce TB load and don't log every step
if self.total_steps_done % self.config.tb_plot_step == 0: if self.total_steps_done % self.config.plot_step == 0:
iter_stats = log_dict self.dashboard_logger.train_step_stats(self.total_steps_done, loss_dict)
iter_stats.update(loss_dict)
self.tb_logger.tb_train_step_stats(self.total_steps_done, iter_stats)
if self.total_steps_done % self.config.save_step == 0 and self.total_steps_done != 0: if self.total_steps_done % self.config.save_step == 0 and self.total_steps_done != 0:
if self.config.checkpoint: if self.config.checkpoint:
# checkpoint the model # checkpoint the model
model_loss = ( target_avg_loss = self._pick_target_avg_loss(self.keep_avg_train)
loss_dict[self.config.target_loss] if "target_loss" in self.config else loss_dict["loss"]
)
save_checkpoint( save_checkpoint(
self.config, self.config,
self.model, self.model,
@ -625,8 +643,14 @@ class Trainer:
self.total_steps_done, self.total_steps_done,
self.epochs_done, self.epochs_done,
self.output_path, self.output_path,
model_loss=model_loss, model_loss=target_avg_loss,
) )
if self.total_steps_done % self.config.log_model_step == 0:
# log checkpoint as artifact
aliases = [f"epoch-{self.epochs_done}", f"step-{self.total_steps_done}"]
self.dashboard_logger.log_artifact(self.output_path, "checkpoint", "model", aliases)
# training visualizations # training visualizations
figures, audios = None, None figures, audios = None, None
if hasattr(self.model, "module") and hasattr(self.model.module, "train_log"): if hasattr(self.model, "module") and hasattr(self.model.module, "train_log"):
@ -634,11 +658,13 @@ class Trainer:
elif hasattr(self.model, "train_log"): elif hasattr(self.model, "train_log"):
figures, audios = self.model.train_log(self.ap, batch, outputs) figures, audios = self.model.train_log(self.ap, batch, outputs)
if figures is not None: if figures is not None:
self.tb_logger.tb_train_figures(self.total_steps_done, figures) self.dashboard_logger.train_figures(self.total_steps_done, figures)
if audios is not None: if audios is not None:
self.tb_logger.tb_train_audios(self.total_steps_done, audios, self.ap.sample_rate) self.dashboard_logger.train_audios(self.total_steps_done, audios, self.ap.sample_rate)
self.total_steps_done += 1 self.total_steps_done += 1
self.callbacks.on_train_step_end() self.callbacks.on_train_step_end()
self.dashboard_logger.flush()
return outputs, loss_dict return outputs, loss_dict
def train_epoch(self) -> None: def train_epoch(self) -> None:
@ -663,9 +689,17 @@ class Trainer:
if self.args.rank == 0: if self.args.rank == 0:
epoch_stats = {"epoch_time": epoch_time} epoch_stats = {"epoch_time": epoch_time}
epoch_stats.update(self.keep_avg_train.avg_values) epoch_stats.update(self.keep_avg_train.avg_values)
self.tb_logger.tb_train_epoch_stats(self.total_steps_done, epoch_stats) self.dashboard_logger.train_epoch_stats(self.total_steps_done, epoch_stats)
if self.config.tb_model_param_stats: if self.config.model_param_stats:
self.tb_logger.tb_model_weights(self.model, self.total_steps_done) self.logger.model_weights(self.model, self.total_steps_done)
# scheduler step after the epoch
if self.scheduler is not None and self.config.scheduler_after_epoch:
if isinstance(self.scheduler, list):
for scheduler in self.scheduler:
if scheduler is not None:
scheduler.step()
else:
self.scheduler.step()
@staticmethod @staticmethod
def _model_eval_step( def _model_eval_step(
@ -701,19 +735,22 @@ class Trainer:
Tuple[Dict, Dict]: Model outputs and losses. Tuple[Dict, Dict]: Model outputs and losses.
""" """
with torch.no_grad(): with torch.no_grad():
outputs_per_optimizer = None outputs = []
loss_dict = {} loss_dict = {}
if not isinstance(self.optimizer, list): if not isinstance(self.optimizer, list):
outputs, loss_dict = self._model_eval_step(batch, self.model, self.criterion) outputs, loss_dict = self._model_eval_step(batch, self.model, self.criterion)
else: else:
outputs_per_optimizer = [None] * len(self.optimizer) outputs = [None] * len(self.optimizer)
for idx, _ in enumerate(self.optimizer): for idx, _ in enumerate(self.optimizer):
criterion = self.criterion criterion = self.criterion
outputs, loss_dict_new = self._model_eval_step(batch, self.model, criterion, idx) outputs_, loss_dict_new = self._model_eval_step(batch, self.model, criterion, idx)
outputs_per_optimizer[idx] = outputs outputs[idx] = outputs_
if loss_dict_new is not None: if loss_dict_new is not None:
loss_dict_new[f"loss_{idx}"] = loss_dict_new.pop("loss")
loss_dict.update(loss_dict_new) loss_dict.update(loss_dict_new)
outputs = outputs_per_optimizer
loss_dict = self._detach_loss_dict(loss_dict)
# update avg stats # update avg stats
update_eval_values = dict() update_eval_values = dict()
@ -755,28 +792,35 @@ class Trainer:
elif hasattr(self.model, "eval_log"): elif hasattr(self.model, "eval_log"):
figures, audios = self.model.eval_log(self.ap, batch, outputs) figures, audios = self.model.eval_log(self.ap, batch, outputs)
if figures is not None: if figures is not None:
self.tb_logger.tb_eval_figures(self.total_steps_done, figures) self.dashboard_logger.eval_figures(self.total_steps_done, figures)
if audios is not None: if audios is not None:
self.tb_logger.tb_eval_audios(self.total_steps_done, audios, self.ap.sample_rate) self.dashboard_logger.eval_audios(self.total_steps_done, audios, self.ap.sample_rate)
self.tb_logger.tb_eval_stats(self.total_steps_done, self.keep_avg_eval.avg_values) self.dashboard_logger.eval_stats(self.total_steps_done, self.keep_avg_eval.avg_values)
def test_run(self) -> None: def test_run(self) -> None:
"""Run test and log the results. Test run must be defined by the model. """Run test and log the results. Test run must be defined by the model.
Model must return figures and audios to be logged by the Tensorboard.""" Model must return figures and audios to be logged by the Tensorboard."""
if hasattr(self.model, "test_run"): if hasattr(self.model, "test_run"):
if self.eval_loader is None:
self.eval_loader = self.get_eval_dataloader(
self.ap,
self.data_eval,
verbose=True,
)
if hasattr(self.eval_loader.dataset, "load_test_samples"): if hasattr(self.eval_loader.dataset, "load_test_samples"):
samples = self.eval_loader.dataset.load_test_samples(1) samples = self.eval_loader.dataset.load_test_samples(1)
figures, audios = self.model.test_run(self.ap, samples, None) figures, audios = self.model.test_run(self.ap, samples, None)
else: else:
figures, audios = self.model.test_run(self.ap) figures, audios = self.model.test_run(self.ap)
self.tb_logger.tb_test_audios(self.total_steps_done, audios, self.config.audio["sample_rate"]) self.dashboard_logger.test_audios(self.total_steps_done, audios, self.config.audio["sample_rate"])
self.tb_logger.tb_test_figures(self.total_steps_done, figures) self.dashboard_logger.test_figures(self.total_steps_done, figures)
def _fit(self) -> None: def _fit(self) -> None:
"""🏃 train -> evaluate -> test for the number of epochs.""" """🏃 train -> evaluate -> test for the number of epochs."""
if self.restore_step != 0 or self.args.best_path: if self.restore_step != 0 or self.args.best_path:
print(" > Restoring best loss from " f"{os.path.basename(self.args.best_path)} ...") print(" > Restoring best loss from " f"{os.path.basename(self.args.best_path)} ...")
self.best_loss = torch.load(self.args.best_path, map_location="cpu")["model_loss"] self.best_loss = load_fsspec(self.args.restore_path, map_location="cpu")["model_loss"]
print(f" > Starting with loaded last best loss {self.best_loss}.") print(f" > Starting with loaded last best loss {self.best_loss}.")
self.total_steps_done = self.restore_step self.total_steps_done = self.restore_step
@ -802,10 +846,13 @@ class Trainer:
"""Where the ✨magic✨ happens...""" """Where the ✨magic✨ happens..."""
try: try:
self._fit() self._fit()
self.dashboard_logger.finish()
except KeyboardInterrupt: except KeyboardInterrupt:
self.callbacks.on_keyboard_interrupt() self.callbacks.on_keyboard_interrupt()
# if the output folder is empty remove the run. # if the output folder is empty remove the run.
remove_experiment_folder(self.output_path) remove_experiment_folder(self.output_path)
# finish the wandb run and sync data
self.dashboard_logger.finish()
# stop without error signal # stop without error signal
try: try:
sys.exit(0) sys.exit(0)
@ -816,10 +863,33 @@ class Trainer:
traceback.print_exc() traceback.print_exc()
sys.exit(1) sys.exit(1)
def _pick_target_avg_loss(self, keep_avg_target: KeepAverage) -> Dict:
"""Pick the target loss to compare models"""
target_avg_loss = None
# return if target loss defined in the model config
if "target_loss" in self.config and self.config.target_loss:
return keep_avg_target[f"avg_{self.config.target_loss}"]
# take the average of loss_{optimizer_idx} as the target loss when there are multiple optimizers
if isinstance(self.optimizer, list):
target_avg_loss = 0
for idx in range(len(self.optimizer)):
target_avg_loss += keep_avg_target[f"avg_loss_{idx}"]
target_avg_loss /= len(self.optimizer)
else:
target_avg_loss = keep_avg_target["avg_loss"]
return target_avg_loss
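As an illustration of the selection above (a minimal sketch with made-up values, not part of the diff): with two optimizers, the per-optimizer averages tracked by `KeepAverage` are simply averaged again to obtain the comparison value.

# hypothetical values, mirroring the `avg_loss_{idx}` keys produced in train_step
keep_avg = {"avg_loss_0": 1.2, "avg_loss_1": 0.8}  # e.g. generator / discriminator averages
n_optimizers = 2
target_avg_loss = sum(keep_avg[f"avg_loss_{i}"] for i in range(n_optimizers)) / n_optimizers
print(target_avg_loss)  # 1.0 -> compared against best_loss before saving a best model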
def save_best_model(self) -> None: def save_best_model(self) -> None:
"""Save the best model. It only saves if the current target loss is smaller then the previous.""" """Save the best model. It only saves if the current target loss is smaller then the previous."""
# set the target loss to choose the best model
target_loss_dict = self._pick_target_avg_loss(self.keep_avg_eval if self.keep_avg_eval else self.keep_avg_train)
# save the model and update the best_loss
self.best_loss = save_best_model( self.best_loss = save_best_model(
self.keep_avg_eval["avg_loss"] if self.keep_avg_eval else self.keep_avg_train["avg_loss"], target_loss_dict,
self.best_loss, self.best_loss,
self.config, self.config,
self.model, self.model,
@ -834,9 +904,16 @@ class Trainer:
@staticmethod @staticmethod
def _setup_logger_config(log_file: str) -> None: def _setup_logger_config(log_file: str) -> None:
logging.basicConfig( handlers = [logging.StreamHandler()]
level=logging.INFO, format="", handlers=[logging.FileHandler(log_file), logging.StreamHandler()]
) # Only add a log file if the output location is local due to poor
# support for writing logs to file-like objects.
parsed_url = urlparse(log_file)
if not parsed_url.scheme or parsed_url.scheme == "file":
schemeless_path = os.path.join(parsed_url.netloc, parsed_url.path)
handlers.append(logging.FileHandler(schemeless_path))
logging.basicConfig(level=logging.INFO, format="", handlers=handlers)
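A quick sanity check of the scheme test above (paths are illustrative): plain and `file://` paths keep the `FileHandler`, while object-store URLs fall back to console-only logging.

from urllib.parse import urlparse

for log_file in ["/tmp/run/trainer_0_log.txt", "file:///tmp/run/trainer_0_log.txt", "s3://bucket/run/log.txt"]:
    scheme = urlparse(log_file).scheme
    print(log_file, "->", "file + console" if not scheme or scheme == "file" else "console only")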
@staticmethod @staticmethod
def _is_apex_available() -> bool: def _is_apex_available() -> bool:
@ -920,28 +997,33 @@ class Trainer:
return criterion return criterion
def init_arguments(): def getarguments():
train_config = TrainingArgs() train_config = TrainingArgs()
parser = train_config.init_argparse(arg_prefix="") parser = train_config.init_argparse(arg_prefix="")
return parser return parser
def get_last_checkpoint(path): def get_last_checkpoint(path: str) -> Tuple[str, str]:
"""Get latest checkpoint or/and best model in path. """Get latest checkpoint or/and best model in path.
It is based on globbing for `*.pth.tar` and the RegEx It is based on globbing for `*.pth.tar` and the RegEx
`(checkpoint|best_model)_([0-9]+)`. `(checkpoint|best_model)_([0-9]+)`.
Args: Args:
path (list): Path to files to be compared. path: Path to files to be compared.
Raises: Raises:
ValueError: If no checkpoint or best_model files are found. ValueError: If no checkpoint or best_model files are found.
Returns: Returns:
last_checkpoint (str): Last checkpoint filename. Path to the last checkpoint
Path to best checkpoint
""" """
file_names = glob.glob(os.path.join(path, "*.pth.tar")) fs = fsspec.get_mapper(path).fs
file_names = fs.glob(os.path.join(path, "*.pth.tar"))
scheme = urlparse(path).scheme
if scheme: # scheme is not preserved in fs.glob, add it back
file_names = [scheme + "://" + file_name for file_name in file_names]
last_models = {} last_models = {}
last_model_nums = {} last_model_nums = {}
for key in ["checkpoint", "best_model"]: for key in ["checkpoint", "best_model"]:
@ -963,7 +1045,7 @@ def get_last_checkpoint(path):
key_file_names = [fn for fn in file_names if key in fn] key_file_names = [fn for fn in file_names if key in fn]
if last_model is None and len(key_file_names) > 0: if last_model is None and len(key_file_names) > 0:
last_model = max(key_file_names, key=os.path.getctime) last_model = max(key_file_names, key=os.path.getctime)
last_model_num = torch.load(last_model)["step"] last_model_num = load_fsspec(last_model)["step"]
if last_model is not None: if last_model is not None:
last_models[key] = last_model last_models[key] = last_model
@ -997,8 +1079,8 @@ def process_args(args, config=None):
audio_path (str): Path to save generated test audios. audio_path (str): Path to save generated test audios.
c_logger (TTS.utils.console_logger.ConsoleLogger): Class that does c_logger (TTS.utils.console_logger.ConsoleLogger): Class that does
logging to the console. logging to the console.
tb_logger (TTS.utils.tensorboard.TensorboardLogger): Class that does
the TensorBoard logging. dashboard_logger (WandbLogger or TensorboardLogger): Class that does the dashboard Logging
TODO: TODO:
- Interactive config definition. - Interactive config definition.
@ -1012,6 +1094,7 @@ def process_args(args, config=None):
args.restore_path, best_model = get_last_checkpoint(args.continue_path) args.restore_path, best_model = get_last_checkpoint(args.continue_path)
if not args.best_path: if not args.best_path:
args.best_path = best_model args.best_path = best_model
# init config if not already defined # init config if not already defined
if config is None: if config is None:
if args.config_path: if args.config_path:
@ -1030,12 +1113,12 @@ def process_args(args, config=None):
print(" > Mixed precision mode is ON") print(" > Mixed precision mode is ON")
experiment_path = args.continue_path experiment_path = args.continue_path
if not experiment_path: if not experiment_path:
experiment_path = create_experiment_folder(config.output_path, config.run_name) experiment_path = get_experiment_folder_path(config.output_path, config.run_name)
audio_path = os.path.join(experiment_path, "test_audios") audio_path = os.path.join(experiment_path, "test_audios")
config.output_log_path = experiment_path
# setup rank 0 process in distributed training # setup rank 0 process in distributed training
tb_logger = None dashboard_logger = None
if args.rank == 0: if args.rank == 0:
os.makedirs(audio_path, exist_ok=True)
new_fields = {} new_fields = {}
if args.restore_path: if args.restore_path:
new_fields["restore_path"] = args.restore_path new_fields["restore_path"] = args.restore_path
@ -1043,17 +1126,20 @@ def process_args(args, config=None):
# if model characters are not set in the config file # if model characters are not set in the config file
# save the default set to the config file for future # save the default set to the config file for future
# compatibility. # compatibility.
if config.has("characters_config"): if config.has("characters") and config.characters is None:
used_characters = parse_symbols() used_characters = parse_symbols()
new_fields["characters"] = used_characters new_fields["characters"] = used_characters
copy_model_files(config, experiment_path, new_fields) copy_model_files(config, experiment_path, new_fields)
os.chmod(audio_path, 0o775)
os.chmod(experiment_path, 0o775) dashboard_logger = init_logger(config)
tb_logger = TensorboardLogger(experiment_path, model_name=config.model)
# write model desc to tensorboard
tb_logger.tb_add_text("model-config", f"<pre>{config.to_json()}</pre>", 0)
c_logger = ConsoleLogger() c_logger = ConsoleLogger()
return config, experiment_path, audio_path, c_logger, tb_logger return config, experiment_path, audio_path, c_logger, dashboard_logger
def init_arguments():
train_config = TrainingArgs()
parser = train_config.init_argparse(arg_prefix="")
return parser
def init_training(argv: Union[List, Coqpit], config: Coqpit = None): def init_training(argv: Union[List, Coqpit], config: Coqpit = None):
@ -1063,5 +1149,5 @@ def init_training(argv: Union[List, Coqpit], config: Coqpit = None):
else: else:
parser = init_arguments() parser = init_arguments()
args = parser.parse_known_args() args = parser.parse_known_args()
config, OUT_PATH, AUDIO_PATH, c_logger, tb_logger = process_args(args, config) config, OUT_PATH, AUDIO_PATH, c_logger, dashboard_logger = process_args(args, config)
return args[0], config, OUT_PATH, AUDIO_PATH, c_logger, tb_logger return args[0], config, OUT_PATH, AUDIO_PATH, c_logger, dashboard_logger

View File

@ -13,12 +13,16 @@ class GSTConfig(Coqpit):
Args: Args:
gst_style_input_wav (str): gst_style_input_wav (str):
Path to the wav file used to define the style of the output speech at inference. Defaults to None. Path to the wav file used to define the style of the output speech at inference. Defaults to None.
gst_style_input_weights (dict): gst_style_input_weights (dict):
Defines the weights for each style token used at inference. Defaults to None. Defines the weights for each style token used at inference. Defaults to None.
gst_embedding_dim (int): gst_embedding_dim (int):
Defines the size of the GST embedding vector dimensions. Defaults to 256. Defines the size of the GST embedding vector dimensions. Defaults to 256.
gst_num_heads (int): gst_num_heads (int):
Number of attention heads used by the multi-head attention. Defaults to 4. Number of attention heads used by the multi-head attention. Defaults to 4.
gst_num_style_tokens (int): gst_num_style_tokens (int):
Number of style token vectors. Defaults to 10. Number of style token vectors. Defaults to 10.
""" """
@ -51,17 +55,23 @@ class CharactersConfig(Coqpit):
Args: Args:
pad (str): pad (str):
characters in place of empty padding. Defaults to None. characters in place of empty padding. Defaults to None.
eos (str): eos (str):
characters showing the end of a sentence. Defaults to None. characters showing the end of a sentence. Defaults to None.
bos (str): bos (str):
characters showing the beginning of a sentence. Defaults to None. characters showing the beginning of a sentence. Defaults to None.
characters (str): characters (str):
character set used by the model. Characters not in this list are ignored when converting input text to character set used by the model. Characters not in this list are ignored when converting input text to
a list of sequence IDs. Defaults to None. a list of sequence IDs. Defaults to None.
punctuations (str): punctuations (str):
characters considered as punctuation when parsing the input sentence. Defaults to None. characters considered as punctuation when parsing the input sentence. Defaults to None.
phonemes (str): phonemes (str):
characters considered as phonemes when parsing. Defaults to None. characters considered as phonemes when parsing. Defaults to None.
unique (bool): unique (bool):
remove any duplicate characters in the character lists. It is a bandaid for compatibility with the old remove any duplicate characters in the character lists. It is a bandaid for compatibility with the old
models trained with character lists with duplicates. models trained with character lists with duplicates.
@ -95,54 +105,78 @@ class BaseTTSConfig(BaseTrainingConfig):
Args: Args:
audio (BaseAudioConfig): audio (BaseAudioConfig):
Audio processor config object instance. Audio processor config object instance.
use_phonemes (bool): use_phonemes (bool):
enable / disable phoneme use. enable / disable phoneme use.
use_espeak_phonemes (bool): use_espeak_phonemes (bool):
enable / disable eSpeak-compatible phonemes (only if use_phonemes = `True`). enable / disable eSpeak-compatible phonemes (only if use_phonemes = `True`).
compute_input_seq_cache (bool): compute_input_seq_cache (bool):
enable / disable precomputation of the phoneme sequences. At the expense of some delay at the beginning of enable / disable precomputation of the phoneme sequences. At the expense of some delay at the beginning of
the training, it allows faster data loading and precise limitation with `max_seq_len` and the training, it allows faster data loading and precise limitation with `max_seq_len` and
`min_seq_len`. `min_seq_len`.
text_cleaner (str): text_cleaner (str):
Name of the text cleaner used for cleaning and formatting transcripts. Name of the text cleaner used for cleaning and formatting transcripts.
enable_eos_bos_chars (bool): enable_eos_bos_chars (bool):
enable / disable the use of eos and bos characters. enable / disable the use of eos and bos characters.
test_senteces_file (str): test_senteces_file (str):
Path to a txt file that has sentences used at test time. The file must have a sentence per line. Path to a txt file that has sentences used at test time. The file must have a sentence per line.
phoneme_cache_path (str): phoneme_cache_path (str):
Path to the output folder caching the computed phonemes for each sample. Path to the output folder caching the computed phonemes for each sample.
characters (CharactersConfig): characters (CharactersConfig):
Instance of a CharactersConfig class. Instance of a CharactersConfig class.
batch_group_size (int): batch_group_size (int):
Size of the batch groups used for bucketing. By default, the dataloader orders samples by the sequence Size of the batch groups used for bucketing. By default, the dataloader orders samples by the sequence
length for a more efficient and stable training. If `batch_group_size > 1` then it performs bucketing to length for a more efficient and stable training. If `batch_group_size > 1` then it performs bucketing to
prevent using the same batches for each epoch. prevent using the same batches for each epoch.
loss_masking (bool): loss_masking (bool):
enable / disable masking loss values against padded segments of samples in a batch. enable / disable masking loss values against padded segments of samples in a batch.
min_seq_len (int): min_seq_len (int):
Minimum input sequence length to be used at training. Minimum input sequence length to be used at training.
max_seq_len (int): max_seq_len (int):
Maximum input sequence length to be used at training. Larger values result in more VRAM usage. Maximum input sequence length to be used at training. Larger values result in more VRAM usage.
compute_f0 (int): compute_f0 (int):
(Not in use yet). (Not in use yet).
compute_linear_spec (bool):
If True data loader computes and returns linear spectrograms alongside the other data.
use_noise_augment (bool): use_noise_augment (bool):
Augment the input audio with random noise. Augment the input audio with random noise.
add_blank (bool): add_blank (bool):
Add blank characters between every two characters. It improves performance for some models at the expense Add blank characters between every two characters. It improves performance for some models at the expense
of slower run-time due to the longer input sequence. of slower run-time due to the longer input sequence.
datasets (List[BaseDatasetConfig]): datasets (List[BaseDatasetConfig]):
List of datasets used for training. If multiple datasets are provided, they are merged and used together List of datasets used for training. If multiple datasets are provided, they are merged and used together
for training. for training.
optimizer (str): optimizer (str):
Optimizer used for the training. Set one from `torch.optim.Optimizer` or `TTS.utils.training`. Optimizer used for the training. Set one from `torch.optim.Optimizer` or `TTS.utils.training`.
Defaults to ``. Defaults to ``.
optimizer_params (dict): optimizer_params (dict):
Optimizer kwargs. Defaults to `{"betas": [0.8, 0.99], "weight_decay": 0.0}` Optimizer kwargs. Defaults to `{"betas": [0.8, 0.99], "weight_decay": 0.0}`
lr_scheduler (str): lr_scheduler (str):
Learning rate scheduler for the training. Use one from `torch.optim.Scheduler` schedulers or Learning rate scheduler for the training. Use one from `torch.optim.Scheduler` schedulers or
`TTS.utils.training`. Defaults to ``. `TTS.utils.training`. Defaults to ``.
lr_scheduler_params (dict): lr_scheduler_params (dict):
Parameters for the generator learning rate scheduler. Defaults to `{"warmup": 4000}`. Parameters for the generator learning rate scheduler. Defaults to `{"warmup": 4000}`.
test_sentences (List[str]): test_sentences (List[str]):
List of sentences to be used at testing. Defaults to '[]' List of sentences to be used at testing. Defaults to '[]'
""" """
@ -166,6 +200,7 @@ class BaseTTSConfig(BaseTrainingConfig):
min_seq_len: int = 1 min_seq_len: int = 1
max_seq_len: int = float("inf") max_seq_len: int = float("inf")
compute_f0: bool = False compute_f0: bool = False
compute_linear_spec: bool = False
use_noise_augment: bool = False use_noise_augment: bool = False
add_blank: bool = False add_blank: bool = False
# dataset # dataset

View File

@ -0,0 +1,136 @@
from dataclasses import dataclass, field
from typing import List
from TTS.tts.configs.shared_configs import BaseTTSConfig
from TTS.tts.models.vits import VitsArgs
@dataclass
class VitsConfig(BaseTTSConfig):
"""Defines parameters for VITS End2End TTS model.
Args:
model (str):
Model name. Do not change unless you know what you are doing.
model_args (VitsArgs):
Model architecture arguments. Defaults to `VitsArgs()`.
grad_clip (List):
Gradient clipping thresholds for each optimizer. Defaults to `[5.0, 5.0]`.
lr_gen (float):
Initial learning rate for the generator. Defaults to 0.0002.
lr_disc (float):
Initial learning rate for the discriminator. Defaults to 0.0002.
lr_scheduler_gen (str):
Name of the learning rate scheduler for the generator. One of the `torch.optim.lr_scheduler.*`. Defaults to
`ExponentialLR`.
lr_scheduler_gen_params (dict):
Parameters for the learning rate scheduler of the generator. Defaults to `{'gamma': 0.999875, "last_epoch":-1}`.
lr_scheduler_disc (str):
Name of the learning rate scheduler for the discriminator. One of the `torch.optim.lr_scheduler.*`. Defaults to
`ExponentialLR`.
lr_scheduler_disc_params (dict):
Parameters for the learning rate scheduler of the discriminator. Defaults to `{'gamma': 0.999875, "last_epoch":-1}`.
scheduler_after_epoch (bool):
If true, step the schedulers after each epoch; otherwise step them after each iteration. Defaults to `True`.
optimizer (str):
Name of the optimizer to use with both the generator and the discriminator networks. One of the
`torch.optim.*`. Defaults to `AdamW`.
kl_loss_alpha (float):
Loss weight for KL loss. Defaults to 1.0.
disc_loss_alpha (float):
Loss weight for the discriminator loss. Defaults to 1.0.
gen_loss_alpha (float):
Loss weight for the generator loss. Defaults to 1.0.
feat_loss_alpha (float):
Loss weight for the feature matching loss. Defaults to 1.0.
mel_loss_alpha (float):
Loss weight for the mel loss. Defaults to 45.0.
return_wav (bool):
If true, data loader returns the waveform as well as the other outputs. Do not change. Defaults to `True`.
compute_linear_spec (bool):
If true, the linear spectrogram is computed and returned alongside the mel output. Do not change. Defaults to `True`.
min_seq_len (int):
Minimum text length to be considered for training. Defaults to `13`.
max_seq_len (int):
Maximum text length to be considered for training. Defaults to `500`.
r (int):
Number of spectrogram frames to be generated at a time. Do not change. Defaults to `1`.
add_blank (bool):
If true, a blank token is added in between every character. Defaults to `True`.
test_sentences (List[str]):
List of sentences to be used for testing.
Note:
Check :class:`TTS.tts.configs.shared_configs.BaseTTSConfig` for the inherited parameters.
Example:
>>> from TTS.tts.configs import VitsConfig
>>> config = VitsConfig()
"""
model: str = "vits"
# model specific params
model_args: VitsArgs = field(default_factory=VitsArgs)
# optimizer
grad_clip: List[float] = field(default_factory=lambda: [5, 5])
lr_gen: float = 0.0002
lr_disc: float = 0.0002
lr_scheduler_gen: str = "ExponentialLR"
lr_scheduler_gen_params: dict = field(default_factory=lambda: {"gamma": 0.999875, "last_epoch": -1})
lr_scheduler_disc: str = "ExponentialLR"
lr_scheduler_disc_params: dict = field(default_factory=lambda: {"gamma": 0.999875, "last_epoch": -1})
scheduler_after_epoch: bool = True
optimizer: str = "AdamW"
optimizer_params: dict = field(default_factory=lambda: {"betas": [0.8, 0.99], "eps": 1e-9, "weight_decay": 0.01})
# loss params
kl_loss_alpha: float = 1.0
disc_loss_alpha: float = 1.0
gen_loss_alpha: float = 1.0
feat_loss_alpha: float = 1.0
mel_loss_alpha: float = 45.0
# data loader params
return_wav: bool = True
compute_linear_spec: bool = True
# overrides
min_seq_len: int = 13
max_seq_len: int = 500
r: int = 1 # DO NOT CHANGE
add_blank: bool = True
# testing
test_sentences: List[str] = field(
default_factory=lambda: [
"It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
"Be a voice, not an echo.",
"I'm sorry Dave. I'm afraid I can't do that.",
"This cake is great. It's so delicious and moist.",
"Prior to November 22, 1963.",
]
)
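As a usage sketch (not part of the diff; `batch_size` is assumed to come from the inherited training config), the class behaves like any other Coqpit dataclass and can be overridden field by field:

from TTS.tts.configs import VitsConfig  # same import path as the docstring example above

config = VitsConfig()
config.batch_size = 32      # inherited training field (assumed name)
config.lr_gen = 1e-4        # generator learning rate
config.lr_disc = 1e-4       # discriminator learning rate
print(config.model, config.add_blank, config.r)  # vits True 1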

View File

@ -23,7 +23,9 @@ class TTSDataset(Dataset):
ap: AudioProcessor, ap: AudioProcessor,
meta_data: List[List], meta_data: List[List],
characters: Dict = None, characters: Dict = None,
custom_symbols: List = None,
add_blank: bool = False, add_blank: bool = False,
return_wav: bool = False,
batch_group_size: int = 0, batch_group_size: int = 0,
min_seq_len: int = 0, min_seq_len: int = 0,
max_seq_len: int = float("inf"), max_seq_len: int = float("inf"),
@ -54,9 +56,14 @@ class TTSDataset(Dataset):
characters (dict): `dict` of custom text characters used for converting texts to sequences. characters (dict): `dict` of custom text characters used for converting texts to sequences.
custom_symbols (list): List of custom symbols used for converting texts to sequences. Models using their own
set of symbols need to pass it here. Defaults to `None`.
add_blank (bool): Add a special `blank` character after every other character. It helps some add_blank (bool): Add a special `blank` character after every other character. It helps some
models achieve better results. Defaults to false. models achieve better results. Defaults to false.
return_wav (bool): Return the waveform of the sample. Defaults to False.
batch_group_size (int): Range of batch randomization after sorting batch_group_size (int): Range of batch randomization after sorting
sequences by length. It shuffles each batch with bucketing to gather similar lenght sequences in a sequences by length. It shuffles each batch with bucketing to gather similar lenght sequences in a
batch. Set 0 to disable. Defaults to 0. batch. Set 0 to disable. Defaults to 0.
@ -95,10 +102,12 @@ class TTSDataset(Dataset):
self.sample_rate = ap.sample_rate self.sample_rate = ap.sample_rate
self.cleaners = text_cleaner self.cleaners = text_cleaner
self.compute_linear_spec = compute_linear_spec self.compute_linear_spec = compute_linear_spec
self.return_wav = return_wav
self.min_seq_len = min_seq_len self.min_seq_len = min_seq_len
self.max_seq_len = max_seq_len self.max_seq_len = max_seq_len
self.ap = ap self.ap = ap
self.characters = characters self.characters = characters
self.custom_symbols = custom_symbols
self.add_blank = add_blank self.add_blank = add_blank
self.use_phonemes = use_phonemes self.use_phonemes = use_phonemes
self.phoneme_cache_path = phoneme_cache_path self.phoneme_cache_path = phoneme_cache_path
@ -109,6 +118,7 @@ class TTSDataset(Dataset):
self.use_noise_augment = use_noise_augment self.use_noise_augment = use_noise_augment
self.verbose = verbose self.verbose = verbose
self.input_seq_computed = False self.input_seq_computed = False
self.rescue_item_idx = 1
if use_phonemes and not os.path.isdir(phoneme_cache_path): if use_phonemes and not os.path.isdir(phoneme_cache_path):
os.makedirs(phoneme_cache_path, exist_ok=True) os.makedirs(phoneme_cache_path, exist_ok=True)
if self.verbose: if self.verbose:
@ -128,13 +138,21 @@ class TTSDataset(Dataset):
return data return data
@staticmethod @staticmethod
def _generate_and_cache_phoneme_sequence(text, cache_path, cleaners, language, characters, add_blank): def _generate_and_cache_phoneme_sequence(
text, cache_path, cleaners, language, custom_symbols, characters, add_blank
):
"""generate a phoneme sequence from text. """generate a phoneme sequence from text.
since the usage is for subsequent caching, we never add bos and since the usage is for subsequent caching, we never add bos and
eos chars here. Instead we add those dynamically later; based on the eos chars here. Instead we add those dynamically later; based on the
config option.""" config option."""
phonemes = phoneme_to_sequence( phonemes = phoneme_to_sequence(
text, [cleaners], language=language, enable_eos_bos=False, tp=characters, add_blank=add_blank text,
[cleaners],
language=language,
enable_eos_bos=False,
custom_symbols=custom_symbols,
tp=characters,
add_blank=add_blank,
) )
phonemes = np.asarray(phonemes, dtype=np.int32) phonemes = np.asarray(phonemes, dtype=np.int32)
np.save(cache_path, phonemes) np.save(cache_path, phonemes)
@ -142,7 +160,7 @@ class TTSDataset(Dataset):
@staticmethod @staticmethod
def _load_or_generate_phoneme_sequence( def _load_or_generate_phoneme_sequence(
wav_file, text, phoneme_cache_path, enable_eos_bos, cleaners, language, characters, add_blank wav_file, text, phoneme_cache_path, enable_eos_bos, cleaners, language, custom_symbols, characters, add_blank
): ):
file_name = os.path.splitext(os.path.basename(wav_file))[0] file_name = os.path.splitext(os.path.basename(wav_file))[0]
@ -153,12 +171,12 @@ class TTSDataset(Dataset):
phonemes = np.load(cache_path) phonemes = np.load(cache_path)
except FileNotFoundError: except FileNotFoundError:
phonemes = TTSDataset._generate_and_cache_phoneme_sequence( phonemes = TTSDataset._generate_and_cache_phoneme_sequence(
text, cache_path, cleaners, language, characters, add_blank text, cache_path, cleaners, language, custom_symbols, characters, add_blank
) )
except (ValueError, IOError): except (ValueError, IOError):
print(" [!] failed loading phonemes for {}. " "Recomputing.".format(wav_file)) print(" [!] failed loading phonemes for {}. " "Recomputing.".format(wav_file))
phonemes = TTSDataset._generate_and_cache_phoneme_sequence( phonemes = TTSDataset._generate_and_cache_phoneme_sequence(
text, cache_path, cleaners, language, characters, add_blank text, cache_path, cleaners, language, custom_symbols, characters, add_blank
) )
if enable_eos_bos: if enable_eos_bos:
phonemes = pad_with_eos_bos(phonemes, tp=characters) phonemes = pad_with_eos_bos(phonemes, tp=characters)
@ -173,6 +191,7 @@ class TTSDataset(Dataset):
else: else:
text, wav_file, speaker_name = item text, wav_file, speaker_name = item
attn = None attn = None
raw_text = text
wav = np.asarray(self.load_wav(wav_file), dtype=np.float32) wav = np.asarray(self.load_wav(wav_file), dtype=np.float32)
@ -189,13 +208,19 @@ class TTSDataset(Dataset):
self.enable_eos_bos, self.enable_eos_bos,
self.cleaners, self.cleaners,
self.phoneme_language, self.phoneme_language,
self.custom_symbols,
self.characters, self.characters,
self.add_blank, self.add_blank,
) )
else: else:
text = np.asarray( text = np.asarray(
text_to_sequence(text, [self.cleaners], tp=self.characters, add_blank=self.add_blank), text_to_sequence(
text,
[self.cleaners],
custom_symbols=self.custom_symbols,
tp=self.characters,
add_blank=self.add_blank,
),
dtype=np.int32, dtype=np.int32,
) )
@ -209,9 +234,10 @@ class TTSDataset(Dataset):
# return a different sample if the phonemized # return a different sample if the phonemized
# text is longer than the threshold # text is longer than the threshold
# TODO: find a better fix # TODO: find a better fix
return self.load_data(100) return self.load_data(self.rescue_item_idx)
sample = { sample = {
"raw_text": raw_text,
"text": text, "text": text,
"wav": wav, "wav": wav,
"attn": attn, "attn": attn,
@ -238,7 +264,13 @@ class TTSDataset(Dataset):
for idx, item in enumerate(tqdm.tqdm(self.items)): for idx, item in enumerate(tqdm.tqdm(self.items)):
text, *_ = item text, *_ = item
sequence = np.asarray( sequence = np.asarray(
text_to_sequence(text, [self.cleaners], tp=self.characters, add_blank=self.add_blank), text_to_sequence(
text,
[self.cleaners],
custom_symbols=self.custom_symbols,
tp=self.characters,
add_blank=self.add_blank,
),
dtype=np.int32, dtype=np.int32,
) )
self.items[idx][0] = sequence self.items[idx][0] = sequence
@ -249,6 +281,7 @@ class TTSDataset(Dataset):
self.enable_eos_bos, self.enable_eos_bos,
self.cleaners, self.cleaners,
self.phoneme_language, self.phoneme_language,
self.custom_symbols,
self.characters, self.characters,
self.add_blank, self.add_blank,
] ]
@ -329,6 +362,7 @@ class TTSDataset(Dataset):
wav = [batch[idx]["wav"] for idx in ids_sorted_decreasing] wav = [batch[idx]["wav"] for idx in ids_sorted_decreasing]
item_idxs = [batch[idx]["item_idx"] for idx in ids_sorted_decreasing] item_idxs = [batch[idx]["item_idx"] for idx in ids_sorted_decreasing]
text = [batch[idx]["text"] for idx in ids_sorted_decreasing] text = [batch[idx]["text"] for idx in ids_sorted_decreasing]
raw_text = [batch[idx]["raw_text"] for idx in ids_sorted_decreasing]
speaker_names = [batch[idx]["speaker_name"] for idx in ids_sorted_decreasing] speaker_names = [batch[idx]["speaker_name"] for idx in ids_sorted_decreasing]
# get pre-computed d-vectors # get pre-computed d-vectors
@ -347,6 +381,14 @@ class TTSDataset(Dataset):
mel_lengths = [m.shape[1] for m in mel] mel_lengths = [m.shape[1] for m in mel]
# lengths adjusted by the reduction factor
mel_lengths_adjusted = [
m.shape[1] + (self.outputs_per_step - (m.shape[1] % self.outputs_per_step))
if m.shape[1] % self.outputs_per_step
else m.shape[1]
for m in mel
]
# compute 'stop token' targets # compute 'stop token' targets
stop_targets = [np.array([0.0] * (mel_len - 1) + [1.0]) for mel_len in mel_lengths] stop_targets = [np.array([0.0] * (mel_len - 1) + [1.0]) for mel_len in mel_lengths]
@ -385,6 +427,20 @@ class TTSDataset(Dataset):
else: else:
linear = None linear = None
# format waveforms
wav_padded = None
if self.return_wav:
wav_lengths = [w.shape[0] for w in wav]
max_wav_len = max(mel_lengths_adjusted) * self.ap.hop_length
wav_lengths = torch.LongTensor(wav_lengths)
wav_padded = torch.zeros(len(batch), 1, max_wav_len)
for i, w in enumerate(wav):
mel_length = mel_lengths_adjusted[i]
w = np.pad(w, (0, self.ap.hop_length * self.outputs_per_step), mode="edge")
w = w[: mel_length * self.ap.hop_length]
wav_padded[i, :, : w.shape[0]] = torch.from_numpy(w)
wav_padded.transpose_(1, 2)
# collate attention alignments # collate attention alignments
if batch[0]["attn"] is not None: if batch[0]["attn"] is not None:
attns = [batch[idx]["attn"].T for idx in ids_sorted_decreasing] attns = [batch[idx]["attn"].T for idx in ids_sorted_decreasing]
@ -397,6 +453,7 @@ class TTSDataset(Dataset):
attns = torch.FloatTensor(attns).unsqueeze(1) attns = torch.FloatTensor(attns).unsqueeze(1)
else: else:
attns = None attns = None
# TODO: return dictionary
return ( return (
text, text,
text_lenghts, text_lenghts,
@ -409,6 +466,8 @@ class TTSDataset(Dataset):
d_vectors, d_vectors,
speaker_ids, speaker_ids,
attns, attns,
wav_padded,
raw_text,
) )
raise TypeError( raise TypeError(

View File

@ -28,6 +28,31 @@ class LayerNorm(nn.Module):
return x return x
class LayerNorm2(nn.Module):
"""Layer norm for the 2nd dimension of the input using torch primitive.
Args:
channels (int): number of channels (2nd dimension) of the input.
eps (float): small constant to prevent division by zero.
Shapes:
- input: (B, C, T)
- output: (B, C, T)
"""
def __init__(self, channels, eps=1e-5):
super().__init__()
self.channels = channels
self.eps = eps
self.gamma = nn.Parameter(torch.ones(channels))
self.beta = nn.Parameter(torch.zeros(channels))
def forward(self, x):
x = x.transpose(1, -1)
x = torch.nn.functional.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
return x.transpose(1, -1)
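A small shape check for the module above (tensor sizes are illustrative): normalization runs over the channel dimension and the `[B, C, T]` shape is preserved.

import torch

layer = LayerNorm2(channels=192)
x = torch.randn(4, 192, 50)   # [B, C, T]
y = layer(x)
print(y.shape)                # torch.Size([4, 192, 50])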
class TemporalBatchNorm1d(nn.BatchNorm1d): class TemporalBatchNorm1d(nn.BatchNorm1d):
"""Normalize each channel separately over time and batch.""" """Normalize each channel separately over time and batch."""

View File

@ -18,7 +18,7 @@ class DurationPredictor(nn.Module):
dropout_p (float): Dropout rate used after each conv layer. dropout_p (float): Dropout rate used after each conv layer.
""" """
def __init__(self, in_channels, hidden_channels, kernel_size, dropout_p): def __init__(self, in_channels, hidden_channels, kernel_size, dropout_p, cond_channels=None):
super().__init__() super().__init__()
# class arguments # class arguments
self.in_channels = in_channels self.in_channels = in_channels
@ -33,13 +33,18 @@ class DurationPredictor(nn.Module):
self.norm_2 = LayerNorm(hidden_channels) self.norm_2 = LayerNorm(hidden_channels)
# output layer # output layer
self.proj = nn.Conv1d(hidden_channels, 1, 1) self.proj = nn.Conv1d(hidden_channels, 1, 1)
if cond_channels is not None and cond_channels != 0:
self.cond = nn.Conv1d(cond_channels, in_channels, 1)
def forward(self, x, x_mask): def forward(self, x, x_mask, g=None):
""" """
Shapes: Shapes:
- x: :math:`[B, C, T]` - x: :math:`[B, C, T]`
- x_mask: :math:`[B, 1, T]` - x_mask: :math:`[B, 1, T]`
- g: :math:`[B, C, 1]`
""" """
if g is not None:
x = x + self.cond(g)
x = self.conv_1(x * x_mask) x = self.conv_1(x * x_mask)
x = torch.relu(x) x = torch.relu(x)
x = self.norm_1(x) x = self.norm_1(x)

View File

@ -16,7 +16,7 @@ class ResidualConv1dLayerNormBlock(nn.Module):
:: ::
x |-> conv1d -> layer_norm -> relu -> dropout -> + -> o x |-> conv1d -> layer_norm -> relu -> dropout -> + -> o
|---------------> conv1d_1x1 -----------------------| |---------------> conv1d_1x1 ------------------|
Args: Args:
in_channels (int): number of input tensor channels. in_channels (int): number of input tensor channels.

View File

@ -4,7 +4,7 @@ import torch
from torch import nn from torch import nn
from torch.nn import functional as F from torch.nn import functional as F
from TTS.tts.layers.glow_tts.glow import LayerNorm from TTS.tts.layers.generic.normalization import LayerNorm, LayerNorm2
class RelativePositionMultiHeadAttention(nn.Module): class RelativePositionMultiHeadAttention(nn.Module):
@ -271,7 +271,7 @@ class FeedForwardNetwork(nn.Module):
dropout_p (float, optional): dropout rate. Defaults to 0. dropout_p (float, optional): dropout rate. Defaults to 0.
""" """
def __init__(self, in_channels, out_channels, hidden_channels, kernel_size, dropout_p=0.0): def __init__(self, in_channels, out_channels, hidden_channels, kernel_size, dropout_p=0.0, causal=False):
super().__init__() super().__init__()
self.in_channels = in_channels self.in_channels = in_channels
@ -280,17 +280,46 @@ class FeedForwardNetwork(nn.Module):
self.kernel_size = kernel_size self.kernel_size = kernel_size
self.dropout_p = dropout_p self.dropout_p = dropout_p
self.conv_1 = nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size // 2) if causal:
self.conv_2 = nn.Conv1d(hidden_channels, out_channels, kernel_size, padding=kernel_size // 2) self.padding = self._causal_padding
else:
self.padding = self._same_padding
self.conv_1 = nn.Conv1d(in_channels, hidden_channels, kernel_size)
self.conv_2 = nn.Conv1d(hidden_channels, out_channels, kernel_size)
self.dropout = nn.Dropout(dropout_p) self.dropout = nn.Dropout(dropout_p)
def forward(self, x, x_mask): def forward(self, x, x_mask):
x = self.conv_1(x * x_mask) x = self.conv_1(self.padding(x * x_mask))
x = torch.relu(x) x = torch.relu(x)
x = self.dropout(x) x = self.dropout(x)
x = self.conv_2(x * x_mask) x = self.conv_2(self.padding(x * x_mask))
return x * x_mask return x * x_mask
def _causal_padding(self, x):
if self.kernel_size == 1:
return x
pad_l = self.kernel_size - 1
pad_r = 0
padding = [[0, 0], [0, 0], [pad_l, pad_r]]
x = F.pad(x, self._pad_shape(padding))
return x
def _same_padding(self, x):
if self.kernel_size == 1:
return x
pad_l = (self.kernel_size - 1) // 2
pad_r = self.kernel_size // 2
padding = [[0, 0], [0, 0], [pad_l, pad_r]]
x = F.pad(x, self._pad_shape(padding))
return x
@staticmethod
def _pad_shape(padding):
l = padding[::-1]
pad_shape = [item for sublist in l for item in sublist]
return pad_shape
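To make the two padding modes above concrete (assuming `kernel_size=3`): causal padding puts all `kernel_size - 1` zeros on the left so a frame never sees the future, while same padding splits them around the input; both keep the time length unchanged after the un-padded Conv1d.

import torch
import torch.nn.functional as F

x = torch.randn(1, 8, 10)         # [B, C, T]
causal = F.pad(x, [2, 0])         # pad_l=2, pad_r=0 for kernel_size=3
same = F.pad(x, [1, 1])           # pad_l=1, pad_r=1 for kernel_size=3
print(causal.shape, same.shape)   # both [1, 8, 12]; a conv with k=3 brings T back to 10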
class RelativePositionTransformer(nn.Module): class RelativePositionTransformer(nn.Module):
"""Transformer with Relative Potional Encoding. """Transformer with Relative Potional Encoding.
@ -310,20 +339,23 @@ class RelativePositionTransformer(nn.Module):
If default, relative encoding is disabled and it is a regular transformer. If default, relative encoding is disabled and it is a regular transformer.
Defaults to None. Defaults to None.
input_length (int, optional): input length to limit position encoding. Defaults to None. input_length (int, optional): input length to limit position encoding. Defaults to None.
layer_norm_type (str, optional): type "1" uses torch tensor operations and type "2" uses torch layer_norm
primitive. Use type "2", type "1: is for backward compat. Defaults to "1".
""" """
def __init__( def __init__(
self, self,
in_channels, in_channels: int,
out_channels, out_channels: int,
hidden_channels, hidden_channels: int,
hidden_channels_ffn, hidden_channels_ffn: int,
num_heads, num_heads: int,
num_layers, num_layers: int,
kernel_size=1, kernel_size=1,
dropout_p=0.0, dropout_p=0.0,
rel_attn_window_size=None, rel_attn_window_size: int = None,
input_length=None, input_length: int = None,
layer_norm_type: str = "1",
): ):
super().__init__() super().__init__()
self.hidden_channels = hidden_channels self.hidden_channels = hidden_channels
@ -351,7 +383,12 @@ class RelativePositionTransformer(nn.Module):
input_length=input_length, input_length=input_length,
) )
) )
self.norm_layers_1.append(LayerNorm(hidden_channels)) if layer_norm_type == "1":
self.norm_layers_1.append(LayerNorm(hidden_channels))
elif layer_norm_type == "2":
self.norm_layers_1.append(LayerNorm2(hidden_channels))
else:
raise ValueError(" [!] Unknown layer norm type")
if hidden_channels != out_channels and (idx + 1) == self.num_layers: if hidden_channels != out_channels and (idx + 1) == self.num_layers:
self.proj = nn.Conv1d(hidden_channels, out_channels, 1) self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
@ -366,7 +403,12 @@ class RelativePositionTransformer(nn.Module):
) )
) )
self.norm_layers_2.append(LayerNorm(hidden_channels if (idx + 1) != self.num_layers else out_channels)) if layer_norm_type == "1":
self.norm_layers_2.append(LayerNorm(hidden_channels if (idx + 1) != self.num_layers else out_channels))
elif layer_norm_type == "2":
self.norm_layers_2.append(LayerNorm2(hidden_channels if (idx + 1) != self.num_layers else out_channels))
else:
raise ValueError(" [!] Unknown layer norm type")
def forward(self, x, x_mask): def forward(self, x, x_mask):
""" """

View File

@ -2,11 +2,13 @@ import math
import numpy as np import numpy as np
import torch import torch
from coqpit import Coqpit
from torch import nn from torch import nn
from torch.nn import functional from torch.nn import functional
from TTS.tts.utils.data import sequence_mask from TTS.tts.utils.data import sequence_mask
from TTS.tts.utils.ssim import ssim from TTS.tts.utils.ssim import ssim
from TTS.utils.audio import TorchSTFT
# pylint: disable=abstract-method # pylint: disable=abstract-method
@ -514,3 +516,142 @@ class AlignTTSLoss(nn.Module):
+ self.mdn_alpha * mdn_loss + self.mdn_alpha * mdn_loss
) )
return {"loss": loss, "loss_l1": spec_loss, "loss_ssim": ssim_loss, "loss_dur": dur_loss, "mdn_loss": mdn_loss} return {"loss": loss, "loss_l1": spec_loss, "loss_ssim": ssim_loss, "loss_dur": dur_loss, "mdn_loss": mdn_loss}
class VitsGeneratorLoss(nn.Module):
def __init__(self, c: Coqpit):
super().__init__()
self.kl_loss_alpha = c.kl_loss_alpha
self.gen_loss_alpha = c.gen_loss_alpha
self.feat_loss_alpha = c.feat_loss_alpha
self.mel_loss_alpha = c.mel_loss_alpha
self.stft = TorchSTFT(
c.audio.fft_size,
c.audio.hop_length,
c.audio.win_length,
sample_rate=c.audio.sample_rate,
mel_fmin=c.audio.mel_fmin,
mel_fmax=c.audio.mel_fmax,
n_mels=c.audio.num_mels,
use_mel=True,
do_amp_to_db=True,
)
@staticmethod
def feature_loss(feats_real, feats_generated):
loss = 0
for dr, dg in zip(feats_real, feats_generated):
for rl, gl in zip(dr, dg):
rl = rl.float().detach()
gl = gl.float()
loss += torch.mean(torch.abs(rl - gl))
return loss * 2
@staticmethod
def generator_loss(scores_fake):
loss = 0
gen_losses = []
for dg in scores_fake:
dg = dg.float()
l = torch.mean((1 - dg) ** 2)
gen_losses.append(l)
loss += l
return loss, gen_losses
@staticmethod
def kl_loss(z_p, logs_q, m_p, logs_p, z_mask):
"""
z_p, logs_q: [b, h, t_t]
m_p, logs_p: [b, h, t_t]
"""
z_p = z_p.float()
logs_q = logs_q.float()
m_p = m_p.float()
logs_p = logs_p.float()
z_mask = z_mask.float()
kl = logs_p - logs_q - 0.5
kl += 0.5 * ((z_p - m_p) ** 2) * torch.exp(-2.0 * logs_p)
kl = torch.sum(kl * z_mask)
l = kl / torch.sum(z_mask)
return l
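For reference, my reading of the term above (not text from the diff): per element it is a Monte-Carlo estimate of the KL divergence between the posterior q = N(mu_q, sigma_q^2) (with logs_q = log sigma_q) and the prior p = N(mu_p, sigma_p^2), evaluated at the flow-mapped sample z_p and then masked and averaged over valid frames:

\mathrm{KL}(q \,\|\, p)
  = \log\sigma_p - \log\sigma_q - \tfrac{1}{2}
    + \frac{\sigma_q^2 + (\mu_q - \mu_p)^2}{2\sigma_p^2}
  \;\approx\;
    \log\sigma_p - \log\sigma_q - \tfrac{1}{2}
    + \frac{(z_p - \mu_p)^2}{2\sigma_p^2},
  \qquad z_p \sim q .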
def forward(
self,
waveform,
waveform_hat,
z_p,
logs_q,
m_p,
logs_p,
z_len,
scores_disc_fake,
feats_disc_fake,
feats_disc_real,
):
"""
Shapes:
- waveform: :math:`[B, 1, T]`
- waveform_hat: :math:`[B, 1, T]`
- z_p: :math:`[B, C, T]`
- logs_q: :math:`[B, C, T]`
- m_p: :math:`[B, C, T]`
- logs_p: :math:`[B, C, T]`
- z_len: :math:`[B]`
- scores_disc_fake[i]: :math:`[B, C]`
- feats_disc_fake[i][j]: :math:`[B, C, T', P]`
- feats_disc_real[i][j]: :math:`[B, C, T', P]`
"""
loss = 0.0
return_dict = {}
z_mask = sequence_mask(z_len).float()
# compute mel spectrograms from the waveforms
mel = self.stft(waveform)
mel_hat = self.stft(waveform_hat)
# compute losses
loss_feat = self.feature_loss(feats_disc_fake, feats_disc_real) * self.feat_loss_alpha
loss_gen = self.generator_loss(scores_disc_fake)[0] * self.gen_loss_alpha
loss_kl = self.kl_loss(z_p, logs_q, m_p, logs_p, z_mask.unsqueeze(1)) * self.kl_loss_alpha
loss_mel = torch.nn.functional.l1_loss(mel, mel_hat) * self.mel_loss_alpha
loss = loss_kl + loss_feat + loss_mel + loss_gen
# pass losses to the dict
return_dict["loss_gen"] = loss_gen
return_dict["loss_kl"] = loss_kl
return_dict["loss_feat"] = loss_feat
return_dict["loss_mel"] = loss_mel
return_dict["loss"] = loss
return return_dict
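In compact form (my summary of the sum above), with the weights taken from the VitsConfig defaults listed earlier (kl = 1.0, feat = 1.0, mel = 45.0, gen = 1.0):

L_G = \alpha_{kl}\,L_{kl} + \alpha_{feat}\,L_{feat} + \alpha_{mel}\,L_{mel} + \alpha_{gen}\,L_{gen}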
class VitsDiscriminatorLoss(nn.Module):
def __init__(self, c: Coqpit):
super().__init__()
self.disc_loss_alpha = c.disc_loss_alpha
@staticmethod
def discriminator_loss(scores_real, scores_fake):
loss = 0
real_losses = []
fake_losses = []
for dr, dg in zip(scores_real, scores_fake):
dr = dr.float()
dg = dg.float()
real_loss = torch.mean((1 - dr) ** 2)
fake_loss = torch.mean(dg ** 2)
loss += real_loss + fake_loss
real_losses.append(real_loss.item())
fake_losses.append(fake_loss.item())
return loss, real_losses, fake_losses
def forward(self, scores_disc_real, scores_disc_fake):
loss = 0.0
return_dict = {}
loss_disc, _, _ = self.discriminator_loss(scores_disc_real, scores_disc_fake)
return_dict["loss_disc"] = loss_disc * self.disc_loss_alpha
loss = loss + loss_disc
return_dict["loss_disc"] = loss_disc
return_dict["loss"] = loss
return return_dict
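Both loss modules implement the least-squares GAN objective (my restatement); summing over the scale and period sub-discriminators D_k, with x the real and \hat{x} the generated waveform:

L_D = \sum_k \Big( \mathbb{E}\big[(1 - D_k(x))^2\big] + \mathbb{E}\big[D_k(\hat{x})^2\big] \Big),
\qquad
L_G^{adv} = \sum_k \mathbb{E}\big[(1 - D_k(\hat{x}))^2\big] .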

View File

@ -8,10 +8,10 @@ class GST(nn.Module):
See https://arxiv.org/pdf/1803.09017""" See https://arxiv.org/pdf/1803.09017"""
def __init__(self, num_mel, num_heads, num_style_tokens, gst_embedding_dim, d_vector_dim=None): def __init__(self, num_mel, num_heads, num_style_tokens, gst_embedding_dim, embedded_speaker_dim=None):
super().__init__() super().__init__()
self.encoder = ReferenceEncoder(num_mel, gst_embedding_dim) self.encoder = ReferenceEncoder(num_mel, gst_embedding_dim)
self.style_token_layer = StyleTokenLayer(num_heads, num_style_tokens, gst_embedding_dim, d_vector_dim) self.style_token_layer = StyleTokenLayer(num_heads, num_style_tokens, gst_embedding_dim, embedded_speaker_dim)
def forward(self, inputs, speaker_embedding=None): def forward(self, inputs, speaker_embedding=None):
enc_out = self.encoder(inputs) enc_out = self.encoder(inputs)
@ -83,19 +83,19 @@ class ReferenceEncoder(nn.Module):
class StyleTokenLayer(nn.Module): class StyleTokenLayer(nn.Module):
"""NN Module attending to style tokens based on prosody encodings.""" """NN Module attending to style tokens based on prosody encodings."""
def __init__(self, num_heads, num_style_tokens, embedding_dim, d_vector_dim=None): def __init__(self, num_heads, num_style_tokens, gst_embedding_dim, d_vector_dim=None):
super().__init__() super().__init__()
self.query_dim = embedding_dim // 2 self.query_dim = gst_embedding_dim // 2
if d_vector_dim: if d_vector_dim:
self.query_dim += d_vector_dim self.query_dim += d_vector_dim
self.key_dim = embedding_dim // num_heads self.key_dim = gst_embedding_dim // num_heads
self.style_tokens = nn.Parameter(torch.FloatTensor(num_style_tokens, self.key_dim)) self.style_tokens = nn.Parameter(torch.FloatTensor(num_style_tokens, self.key_dim))
nn.init.normal_(self.style_tokens, mean=0, std=0.5) nn.init.normal_(self.style_tokens, mean=0, std=0.5)
self.attention = MultiHeadAttention( self.attention = MultiHeadAttention(
query_dim=self.query_dim, key_dim=self.key_dim, num_units=embedding_dim, num_heads=num_heads query_dim=self.query_dim, key_dim=self.key_dim, num_units=gst_embedding_dim, num_heads=num_heads
) )
def forward(self, inputs): def forward(self, inputs):

View File

@ -0,0 +1,77 @@
import torch
from torch import nn
from torch.nn.modules.conv import Conv1d
from TTS.vocoder.models.hifigan_discriminator import MultiPeriodDiscriminator
class DiscriminatorS(torch.nn.Module):
"""HiFiGAN Scale Discriminator. Channel sizes are different from the original HiFiGAN.
Args:
use_spectral_norm (bool): if `True`, switch to spectral norm instead of weight norm.
"""
def __init__(self, use_spectral_norm=False):
super().__init__()
norm_f = nn.utils.spectral_norm if use_spectral_norm else nn.utils.weight_norm
self.convs = nn.ModuleList(
[
norm_f(Conv1d(1, 16, 15, 1, padding=7)),
norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
]
)
self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
def forward(self, x):
"""
Args:
x (Tensor): input waveform.
Returns:
Tensor: discriminator scores.
List[Tensor]: list of features from the convolutional layers.
"""
feat = []
for l in self.convs:
x = l(x)
x = torch.nn.functional.leaky_relu(x, 0.1)
feat.append(x)
x = self.conv_post(x)
feat.append(x)
x = torch.flatten(x, 1, -1)
return x, feat
class VitsDiscriminator(nn.Module):
"""VITS discriminator wrapping one Scale Discriminator and a stack of Period Discriminator.
::
waveform -> ScaleDiscriminator() -> scores_sd, feats_sd --> append() -> scores, feats
|--> MultiPeriodDiscriminator() -> scores_mpd, feats_mpd ^
Args:
use_spectral_norm (bool): if `True` swith to spectral norm instead of weight norm.
"""
def __init__(self, use_spectral_norm=False):
super().__init__()
self.sd = DiscriminatorS(use_spectral_norm=use_spectral_norm)
self.mpd = MultiPeriodDiscriminator(use_spectral_norm=use_spectral_norm)
def forward(self, x):
"""
Args:
x (Tensor): input waveform.
Returns:
List[Tensor]: discriminator scores.
List[List[Tensor]]: list of list of features from each layers of each discriminator.
"""
scores, feats = self.mpd(x)
score_sd, feats_sd = self.sd(x)
return scores + [score_sd], feats + [feats_sd]
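A brief usage sketch (shapes assumed; the number of entries depends on the periods configured in MultiPeriodDiscriminator):

import torch

disc = VitsDiscriminator()
wav = torch.randn(2, 1, 8192)   # [B, 1, T] waveform
scores, feats = disc(wav)
print(len(scores), len(feats))  # one entry per period discriminator plus one for the scale discriminator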

View File

@ -0,0 +1,271 @@
import math
import torch
from torch import nn
from TTS.tts.layers.glow_tts.glow import WN
from TTS.tts.layers.glow_tts.transformer import RelativePositionTransformer
from TTS.tts.utils.data import sequence_mask
LRELU_SLOPE = 0.1
def convert_pad_shape(pad_shape):
l = pad_shape[::-1]
pad_shape = [item for sublist in l for item in sublist]
return pad_shape
def init_weights(m, mean=0.0, std=0.01):
classname = m.__class__.__name__
if classname.find("Conv") != -1:
m.weight.data.normal_(mean, std)
def get_padding(kernel_size, dilation=1):
return int((kernel_size * dilation - dilation) / 2)
class TextEncoder(nn.Module):
def __init__(
self,
n_vocab: int,
out_channels: int,
hidden_channels: int,
hidden_channels_ffn: int,
num_heads: int,
num_layers: int,
kernel_size: int,
dropout_p: float,
):
"""Text Encoder for VITS model.
Args:
n_vocab (int): Number of characters for the embedding layer.
out_channels (int): Number of channels for the output.
hidden_channels (int): Number of channels for the hidden layers.
hidden_channels_ffn (int): Number of channels for the convolutional layers.
num_heads (int): Number of attention heads for the Transformer layers.
num_layers (int): Number of Transformer layers.
kernel_size (int): Kernel size for the FFN layers in Transformer network.
dropout_p (float): Dropout rate for the Transformer layers.
"""
super().__init__()
self.out_channels = out_channels
self.hidden_channels = hidden_channels
self.emb = nn.Embedding(n_vocab, hidden_channels)
nn.init.normal_(self.emb.weight, 0.0, hidden_channels ** -0.5)
self.encoder = RelativePositionTransformer(
in_channels=hidden_channels,
out_channels=hidden_channels,
hidden_channels=hidden_channels,
hidden_channels_ffn=hidden_channels_ffn,
num_heads=num_heads,
num_layers=num_layers,
kernel_size=kernel_size,
dropout_p=dropout_p,
layer_norm_type="2",
rel_attn_window_size=4,
)
self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
def forward(self, x, x_lengths):
"""
Shapes:
- x: :math:`[B, T]`
- x_length: :math:`[B]`
"""
x = self.emb(x) * math.sqrt(self.hidden_channels) # [b, t, h]
x = torch.transpose(x, 1, -1) # [b, h, t]
x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
x = self.encoder(x * x_mask, x_mask)
stats = self.proj(x) * x_mask
m, logs = torch.split(stats, self.out_channels, dim=1)
return x, m, logs, x_mask
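
An illustrative shape check for the text encoder (not part of this patch); the arguments mirror the defaults in `VitsArgs` further down:

import torch

from TTS.tts.layers.vits.networks import TextEncoder

enc = TextEncoder(n_vocab=100, out_channels=192, hidden_channels=192, hidden_channels_ffn=768,
                  num_heads=2, num_layers=6, kernel_size=3, dropout_p=0.1)
tokens = torch.randint(0, 100, (2, 50))  # [B, T] token ids
lengths = torch.tensor([50, 30])
x, m, logs, x_mask = enc(tokens, lengths)
# x, m, logs: [2, 192, 50] hidden states and prior stats; x_mask: [2, 1, 50]
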
class ResidualCouplingBlock(nn.Module):
def __init__(
self,
channels,
hidden_channels,
kernel_size,
dilation_rate,
num_layers,
dropout_p=0,
cond_channels=0,
mean_only=False,
):
assert channels % 2 == 0, "channels should be divisible by 2"
super().__init__()
self.half_channels = channels // 2
self.mean_only = mean_only
# input layer
self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
# coupling layers
self.enc = WN(
hidden_channels,
hidden_channels,
kernel_size,
dilation_rate,
num_layers,
dropout_p=dropout_p,
c_in_channels=cond_channels,
)
# output layer
# Initializing last layer to 0 makes the affine coupling layers
# do nothing at first. This helps with training stability
self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
self.post.weight.data.zero_()
self.post.bias.data.zero_()
def forward(self, x, x_mask, g=None, reverse=False):
"""
Shapes:
- x: :math:`[B, C, T]`
- x_mask: :math:`[B, 1, T]`
- g: :math:`[B, C, 1]`
"""
x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
h = self.pre(x0) * x_mask
h = self.enc(h, x_mask, g=g)
stats = self.post(h) * x_mask
if not self.mean_only:
m, log_scale = torch.split(stats, [self.half_channels] * 2, 1)
else:
m = stats
log_scale = torch.zeros_like(m)
if not reverse:
x1 = m + x1 * torch.exp(log_scale) * x_mask
x = torch.cat([x0, x1], 1)
logdet = torch.sum(log_scale, [1, 2])
return x, logdet
else:
x1 = (x1 - m) * torch.exp(-log_scale) * x_mask
x = torch.cat([x0, x1], 1)
return x
class ResidualCouplingBlocks(nn.Module):
def __init__(
self,
channels: int,
hidden_channels: int,
kernel_size: int,
dilation_rate: int,
num_layers: int,
num_flows=4,
cond_channels=0,
):
"""Redisual Coupling blocks for VITS flow layers.
Args:
channels (int): Number of input and output tensor channels.
hidden_channels (int): Number of hidden network channels.
kernel_size (int): Kernel size of the WaveNet layers.
dilation_rate (int): Dilation rate of the WaveNet layers.
num_layers (int): Number of the WaveNet layers.
num_flows (int, optional): Number of Residual Coupling blocks. Defaults to 4.
cond_channels (int, optional): Number of channels of the conditioning tensor. Defaults to 0.
"""
super().__init__()
self.channels = channels
self.hidden_channels = hidden_channels
self.kernel_size = kernel_size
self.dilation_rate = dilation_rate
self.num_layers = num_layers
self.num_flows = num_flows
self.cond_channels = cond_channels
self.flows = nn.ModuleList()
for _ in range(num_flows):
self.flows.append(
ResidualCouplingBlock(
channels,
hidden_channels,
kernel_size,
dilation_rate,
num_layers,
cond_channels=cond_channels,
mean_only=True,
)
)
def forward(self, x, x_mask, g=None, reverse=False):
"""
Shapes:
- x: :math:`[B, C, T]`
- x_mask: :math:`[B, 1, T]`
- g: :math:`[B, C, 1]`
"""
if not reverse:
for flow in self.flows:
x, _ = flow(x, x_mask, g=g, reverse=reverse)
x = torch.flip(x, [1])
else:
for flow in reversed(self.flows):
x = torch.flip(x, [1])
x = flow(x, x_mask, g=g, reverse=reverse)
return x
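
Since each coupling block only shifts one half of the channels, the whole stack is exactly invertible; a small round-trip sketch (illustrative only, not part of this patch):

import torch

from TTS.tts.layers.vits.networks import ResidualCouplingBlocks

flow = ResidualCouplingBlocks(channels=192, hidden_channels=192, kernel_size=5, dilation_rate=1, num_layers=4)
z = torch.randn(2, 192, 40)
mask = torch.ones(2, 1, 40)
z_fwd = flow(z, mask)                    # forward pass through all flows
z_rec = flow(z_fwd, mask, reverse=True)  # inverse pass recovers the input
print(torch.allclose(z, z_rec, atol=1e-4))  # expected: True
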
class PosteriorEncoder(nn.Module):
def __init__(
self,
in_channels: int,
out_channels: int,
hidden_channels: int,
kernel_size: int,
dilation_rate: int,
num_layers: int,
cond_channels=0,
):
"""Posterior Encoder of VITS model.
::
x -> conv1x1() -> WaveNet() (non-causal) -> conv1x1() -> split() -> [m, s] -> sample(m, s) -> z
Args:
in_channels (int): Number of input tensor channels.
out_channels (int): Number of output tensor channels.
hidden_channels (int): Number of hidden channels.
kernel_size (int): Kernel size of the WaveNet convolution layers.
dilation_rate (int): Dilation rate of the WaveNet layers.
num_layers (int): Number of the WaveNet layers.
cond_channels (int, optional): Number of conditioning tensor channels. Defaults to 0.
"""
super().__init__()
self.in_channels = in_channels
self.out_channels = out_channels
self.hidden_channels = hidden_channels
self.kernel_size = kernel_size
self.dilation_rate = dilation_rate
self.num_layers = num_layers
self.cond_channels = cond_channels
self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
self.enc = WN(
hidden_channels, hidden_channels, kernel_size, dilation_rate, num_layers, c_in_channels=cond_channels
)
self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
def forward(self, x, x_lengths, g=None):
"""
Shapes:
- x: :math:`[B, C, T]`
- x_lengths: :math:`[B, 1]`
- g: :math:`[B, C, 1]`
"""
x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
x = self.pre(x) * x_mask
x = self.enc(x, x_mask, g=g)
stats = self.proj(x) * x_mask
mean, log_scale = torch.split(stats, self.out_channels, dim=1)
z = (mean + torch.randn_like(mean) * torch.exp(log_scale)) * x_mask
return z, mean, log_scale, x_mask
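
A shape-oriented sketch of the posterior encoder (illustrative only, not part of this patch); `in_channels=513` matches the linear-spectrogram size that `VitsArgs.out_channels` implies:

import torch

from TTS.tts.layers.vits.networks import PosteriorEncoder

pe = PosteriorEncoder(in_channels=513, out_channels=192, hidden_channels=192,
                      kernel_size=5, dilation_rate=1, num_layers=16)
spec = torch.randn(2, 513, 120)         # [B, C_spec, T] linear spectrograms
spec_lengths = torch.tensor([120, 90])
z, mean, log_scale, y_mask = pe(spec, spec_lengths)
# z, mean, log_scale: [2, 192, 120]; y_mask: [2, 1, 120]
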

View File

@ -0,0 +1,276 @@
import math
import torch
from torch import nn
from torch.nn import functional as F
from TTS.tts.layers.generic.normalization import LayerNorm2
from TTS.tts.layers.vits.transforms import piecewise_rational_quadratic_transform
class DilatedDepthSeparableConv(nn.Module):
def __init__(self, channels, kernel_size, num_layers, dropout_p=0.0):
"""Dilated Depth-wise Separable Convolution module.
::
x |-> DDSConv(x) -> LayerNorm(x) -> GeLU(x) -> Conv1x1(x) -> LayerNorm(x) -> GeLU(x) -> + -> o
|-------------------------------------------------------------------------------------^
Args:
channels (int): Number of input and output tensor channels.
kernel_size (int): Kernel size of the depth-wise separable convolution layers.
num_layers (int): Number of convolutional layers.
dropout_p (float, optional): Dropout rate. Defaults to 0.0.
Returns:
torch.Tensor: `forward()` output masked by the input sequence mask.
"""
super().__init__()
self.num_layers = num_layers
self.convs_sep = nn.ModuleList()
self.convs_1x1 = nn.ModuleList()
self.norms_1 = nn.ModuleList()
self.norms_2 = nn.ModuleList()
for i in range(num_layers):
dilation = kernel_size ** i
padding = (kernel_size * dilation - dilation) // 2
self.convs_sep.append(
nn.Conv1d(channels, channels, kernel_size, groups=channels, dilation=dilation, padding=padding)
)
self.convs_1x1.append(nn.Conv1d(channels, channels, 1))
self.norms_1.append(LayerNorm2(channels))
self.norms_2.append(LayerNorm2(channels))
self.dropout = nn.Dropout(dropout_p)
def forward(self, x, x_mask, g=None):
"""
Shapes:
- x: :math:`[B, C, T]`
- x_mask: :math:`[B, 1, T]`
"""
if g is not None:
x = x + g
for i in range(self.num_layers):
y = self.convs_sep[i](x * x_mask)
y = self.norms_1[i](y)
y = F.gelu(y)
y = self.convs_1x1[i](y)
y = self.norms_2[i](y)
y = F.gelu(y)
y = self.dropout(y)
x = x + y
return x * x_mask
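
The dilation grows as `kernel_size ** i` per layer, so the receptive field expands exponentially with depth; a quick sketch (illustrative only, with the module path assumed from how `vits.py` imports this file):

import torch

from TTS.tts.layers.vits.stochastic_duration_predictor import DilatedDepthSeparableConv

conv = DilatedDepthSeparableConv(channels=192, kernel_size=3, num_layers=3, dropout_p=0.1)
x = torch.randn(2, 192, 50)
mask = torch.ones(2, 1, 50)
y = conv(x, mask)  # [2, 192, 50], residual output masked by `mask`
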
class ElementwiseAffine(nn.Module):
"""Element-wise affine transform like no-population stats BatchNorm alternative.
Args:
channels (int): Number of input tensor channels.
"""
def __init__(self, channels):
super().__init__()
self.translation = nn.Parameter(torch.zeros(channels, 1))
self.log_scale = nn.Parameter(torch.zeros(channels, 1))
def forward(self, x, x_mask, reverse=False, **kwargs): # pylint: disable=unused-argument
if not reverse:
y = (x * torch.exp(self.log_scale) + self.translation) * x_mask
logdet = torch.sum(self.log_scale * x_mask, [1, 2])
return y, logdet
x = (x - self.translation) * torch.exp(-self.log_scale) * x_mask
return x
class ConvFlow(nn.Module):
"""Dilated depth separable convolutional based spline flow.
Args:
in_channels (int): Number of input tensor channels.
hidden_channels (int): Number of in network channels.
kernel_size (int): Convolutional kernel size.
num_layers (int): Number of convolutional layers.
num_bins (int, optional): Number of spline bins. Defaults to 10.
tail_bound (float, optional): Tail bound for PRQT. Defaults to 5.0.
"""
def __init__(
self,
in_channels: int,
hidden_channels: int,
kernel_size: int,
num_layers: int,
num_bins=10,
tail_bound=5.0,
):
super().__init__()
self.num_bins = num_bins
self.tail_bound = tail_bound
self.hidden_channels = hidden_channels
self.half_channels = in_channels // 2
self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
self.convs = DilatedDepthSeparableConv(hidden_channels, kernel_size, num_layers, dropout_p=0.0)
self.proj = nn.Conv1d(hidden_channels, self.half_channels * (num_bins * 3 - 1), 1)
self.proj.weight.data.zero_()
self.proj.bias.data.zero_()
def forward(self, x, x_mask, g=None, reverse=False):
x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
h = self.pre(x0)
h = self.convs(h, x_mask, g=g)
h = self.proj(h) * x_mask
b, c, t = x0.shape
h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2) # [b, cx?, t] -> [b, c, t, ?]
unnormalized_widths = h[..., : self.num_bins] / math.sqrt(self.hidden_channels)
unnormalized_heights = h[..., self.num_bins : 2 * self.num_bins] / math.sqrt(self.hidden_channels)
unnormalized_derivatives = h[..., 2 * self.num_bins :]
x1, logabsdet = piecewise_rational_quadratic_transform(
x1,
unnormalized_widths,
unnormalized_heights,
unnormalized_derivatives,
inverse=reverse,
tails="linear",
tail_bound=self.tail_bound,
)
x = torch.cat([x0, x1], 1) * x_mask
logdet = torch.sum(logabsdet * x_mask, [1, 2])
if not reverse:
return x, logdet
return x
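
Like the coupling blocks, the spline flow is a bijection over the second half of its channels; a round-trip sketch (illustrative only, module path assumed as above):

import torch

from TTS.tts.layers.vits.stochastic_duration_predictor import ConvFlow

flow = ConvFlow(in_channels=2, hidden_channels=64, kernel_size=3, num_layers=3)
x = torch.randn(2, 2, 40)
mask = torch.ones(2, 1, 40)
y, logdet = flow(x, mask)            # forward transform and its log-determinant
x_rec = flow(y, mask, reverse=True)  # inverse transform
print(torch.allclose(x, x_rec, atol=1e-4))  # expected: True
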
class StochasticDurationPredictor(nn.Module):
"""Stochastic duration predictor with Spline Flows.
It applies Variational Dequantization and Variational Data Augmentation.
Paper:
SDP: https://arxiv.org/pdf/2106.06103.pdf
Spline Flow: https://arxiv.org/abs/1906.04032
::
## Inference
x -> TextCondEncoder() -> Flow() -> dr_hat
noise ----------------------^
## Training
|---------------------|
x -> TextCondEncoder() -> + -> PosteriorEncoder() -> split() -> z_u, z_v -> (d - z_u) -> concat() -> Flow() -> noise
d -> DurCondEncoder() -> ^ |
|------------------------------------------------------------------------------|
Args:
in_channels (int): Number of input tensor channels.
hidden_channels (int): Number of hidden channels.
kernel_size (int): Kernel size of convolutional layers.
dropout_p (float): Dropout rate.
num_flows (int, optional): Number of flow blocks. Defaults to 4.
cond_channels (int, optional): Number of channels of conditioning tensor. Defaults to 0.
"""
def __init__(
self, in_channels: int, hidden_channels: int, kernel_size: int, dropout_p: float, num_flows=4, cond_channels=0
):
super().__init__()
# condition encoder text
self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
self.convs = DilatedDepthSeparableConv(hidden_channels, kernel_size, num_layers=3, dropout_p=dropout_p)
self.proj = nn.Conv1d(hidden_channels, hidden_channels, 1)
# posterior encoder
self.flows = nn.ModuleList()
self.flows.append(ElementwiseAffine(2))
self.flows += [ConvFlow(2, hidden_channels, kernel_size, num_layers=3) for _ in range(num_flows)]
# condition encoder duration
self.post_pre = nn.Conv1d(1, hidden_channels, 1)
self.post_convs = DilatedDepthSeparableConv(hidden_channels, kernel_size, num_layers=3, dropout_p=dropout_p)
self.post_proj = nn.Conv1d(hidden_channels, hidden_channels, 1)
# flow layers
self.post_flows = nn.ModuleList()
self.post_flows.append(ElementwiseAffine(2))
self.post_flows += [ConvFlow(2, hidden_channels, kernel_size, num_layers=3) for _ in range(num_flows)]
if cond_channels != 0 and cond_channels is not None:
self.cond = nn.Conv1d(cond_channels, hidden_channels, 1)
def forward(self, x, x_mask, dr=None, g=None, reverse=False, noise_scale=1.0):
"""
Shapes:
- x: :math:`[B, C, T]`
- x_mask: :math:`[B, 1, T]`
- dr: :math:`[B, 1, T]`
- g: :math:`[B, C]`
"""
# condition encoder text
x = self.pre(x)
if g is not None:
x = x + self.cond(g)
x = self.convs(x, x_mask)
x = self.proj(x) * x_mask
if not reverse:
flows = self.flows
assert dr is not None
# condition encoder duration
h = self.post_pre(dr)
h = self.post_convs(h, x_mask)
h = self.post_proj(h) * x_mask
noise = torch.rand(dr.size(0), 2, dr.size(2)).to(device=x.device, dtype=x.dtype) * x_mask
z_q = noise
# posterior encoder
logdet_tot_q = 0.0
for idx, flow in enumerate(self.post_flows):
z_q, logdet_q = flow(z_q, x_mask, g=(x + h))
logdet_tot_q = logdet_tot_q + logdet_q
if idx > 0:
z_q = torch.flip(z_q, [1])
z_u, z_v = torch.split(z_q, [1, 1], 1)
u = torch.sigmoid(z_u) * x_mask
z0 = (dr - u) * x_mask
# posterior encoder - neg log likelihood
logdet_tot_q += torch.sum((F.logsigmoid(z_u) + F.logsigmoid(-z_u)) * x_mask, [1, 2])
nll_posterior_encoder = (
torch.sum(-0.5 * (math.log(2 * math.pi) + (noise ** 2)) * x_mask, [1, 2]) - logdet_tot_q
)
z0 = torch.log(torch.clamp_min(z0, 1e-5)) * x_mask
logdet_tot = torch.sum(-z0, [1, 2])
z = torch.cat([z0, z_v], 1)
# flow layers
for idx, flow in enumerate(flows):
z, logdet = flow(z, x_mask, g=x, reverse=reverse)
logdet_tot = logdet_tot + logdet
if idx > 0:
z = torch.flip(z, [1])
# flow layers - neg log likelihood
nll_flow_layers = torch.sum(0.5 * (math.log(2 * math.pi) + (z ** 2)) * x_mask, [1, 2]) - logdet_tot
return nll_flow_layers + nll_posterior_encoder
flows = list(reversed(self.flows))
flows = flows[:-2] + [flows[-1]] # remove a useless vflow
z = torch.rand(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype) * noise_scale
for flow in flows:
z = torch.flip(z, [1])
z = flow(z, x_mask, g=x, reverse=reverse)
z0, _ = torch.split(z, [1, 1], 1)
logw = z0
return logw
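
A usage sketch of the two modes pictured in the diagram above (illustrative only, not part of this patch): training returns a per-sample negative log-likelihood of the ground-truth durations, while inference samples log-durations from noise:

import torch

from TTS.tts.layers.vits.stochastic_duration_predictor import StochasticDurationPredictor

sdp = StochasticDurationPredictor(in_channels=192, hidden_channels=192, kernel_size=3, dropout_p=0.5)
x = torch.randn(2, 192, 50)                           # text-encoder outputs [B, C, T]
mask = torch.ones(2, 1, 50)
durations = torch.randint(1, 10, (2, 1, 50)).float()  # ground-truth durations per input token

nll = sdp(x, mask, dr=durations)                     # [B], training objective
log_w = sdp(x, mask, reverse=True, noise_scale=0.8)  # [B, 1, T], predicted log-durations
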

View File

@ -0,0 +1,203 @@
# adapted from https://github.com/bayesiains/nflows
import numpy as np
import torch
from torch.nn import functional as F
DEFAULT_MIN_BIN_WIDTH = 1e-3
DEFAULT_MIN_BIN_HEIGHT = 1e-3
DEFAULT_MIN_DERIVATIVE = 1e-3
def piecewise_rational_quadratic_transform(
inputs,
unnormalized_widths,
unnormalized_heights,
unnormalized_derivatives,
inverse=False,
tails=None,
tail_bound=1.0,
min_bin_width=DEFAULT_MIN_BIN_WIDTH,
min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
min_derivative=DEFAULT_MIN_DERIVATIVE,
):
if tails is None:
spline_fn = rational_quadratic_spline
spline_kwargs = {}
else:
spline_fn = unconstrained_rational_quadratic_spline
spline_kwargs = {"tails": tails, "tail_bound": tail_bound}
outputs, logabsdet = spline_fn(
inputs=inputs,
unnormalized_widths=unnormalized_widths,
unnormalized_heights=unnormalized_heights,
unnormalized_derivatives=unnormalized_derivatives,
inverse=inverse,
min_bin_width=min_bin_width,
min_bin_height=min_bin_height,
min_derivative=min_derivative,
**spline_kwargs,
)
return outputs, logabsdet
def searchsorted(bin_locations, inputs, eps=1e-6):
bin_locations[..., -1] += eps
return torch.sum(inputs[..., None] >= bin_locations, dim=-1) - 1
def unconstrained_rational_quadratic_spline(
inputs,
unnormalized_widths,
unnormalized_heights,
unnormalized_derivatives,
inverse=False,
tails="linear",
tail_bound=1.0,
min_bin_width=DEFAULT_MIN_BIN_WIDTH,
min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
min_derivative=DEFAULT_MIN_DERIVATIVE,
):
inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound)
outside_interval_mask = ~inside_interval_mask
outputs = torch.zeros_like(inputs)
logabsdet = torch.zeros_like(inputs)
if tails == "linear":
unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1))
constant = np.log(np.exp(1 - min_derivative) - 1)
unnormalized_derivatives[..., 0] = constant
unnormalized_derivatives[..., -1] = constant
outputs[outside_interval_mask] = inputs[outside_interval_mask]
logabsdet[outside_interval_mask] = 0
else:
raise RuntimeError("{} tails are not implemented.".format(tails))
outputs[inside_interval_mask], logabsdet[inside_interval_mask] = rational_quadratic_spline(
inputs=inputs[inside_interval_mask],
unnormalized_widths=unnormalized_widths[inside_interval_mask, :],
unnormalized_heights=unnormalized_heights[inside_interval_mask, :],
unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :],
inverse=inverse,
left=-tail_bound,
right=tail_bound,
bottom=-tail_bound,
top=tail_bound,
min_bin_width=min_bin_width,
min_bin_height=min_bin_height,
min_derivative=min_derivative,
)
return outputs, logabsdet
def rational_quadratic_spline(
inputs,
unnormalized_widths,
unnormalized_heights,
unnormalized_derivatives,
inverse=False,
left=0.0,
right=1.0,
bottom=0.0,
top=1.0,
min_bin_width=DEFAULT_MIN_BIN_WIDTH,
min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
min_derivative=DEFAULT_MIN_DERIVATIVE,
):
if torch.min(inputs) < left or torch.max(inputs) > right:
raise ValueError("Input to a transform is not within its domain")
num_bins = unnormalized_widths.shape[-1]
if min_bin_width * num_bins > 1.0:
raise ValueError("Minimal bin width too large for the number of bins")
if min_bin_height * num_bins > 1.0:
raise ValueError("Minimal bin height too large for the number of bins")
widths = F.softmax(unnormalized_widths, dim=-1)
widths = min_bin_width + (1 - min_bin_width * num_bins) * widths
cumwidths = torch.cumsum(widths, dim=-1)
cumwidths = F.pad(cumwidths, pad=(1, 0), mode="constant", value=0.0)
cumwidths = (right - left) * cumwidths + left
cumwidths[..., 0] = left
cumwidths[..., -1] = right
widths = cumwidths[..., 1:] - cumwidths[..., :-1]
derivatives = min_derivative + F.softplus(unnormalized_derivatives)
heights = F.softmax(unnormalized_heights, dim=-1)
heights = min_bin_height + (1 - min_bin_height * num_bins) * heights
cumheights = torch.cumsum(heights, dim=-1)
cumheights = F.pad(cumheights, pad=(1, 0), mode="constant", value=0.0)
cumheights = (top - bottom) * cumheights + bottom
cumheights[..., 0] = bottom
cumheights[..., -1] = top
heights = cumheights[..., 1:] - cumheights[..., :-1]
if inverse:
bin_idx = searchsorted(cumheights, inputs)[..., None]
else:
bin_idx = searchsorted(cumwidths, inputs)[..., None]
input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0]
input_bin_widths = widths.gather(-1, bin_idx)[..., 0]
input_cumheights = cumheights.gather(-1, bin_idx)[..., 0]
delta = heights / widths
input_delta = delta.gather(-1, bin_idx)[..., 0]
input_derivatives = derivatives.gather(-1, bin_idx)[..., 0]
input_derivatives_plus_one = derivatives[..., 1:].gather(-1, bin_idx)[..., 0]
input_heights = heights.gather(-1, bin_idx)[..., 0]
if inverse:
a = (inputs - input_cumheights) * (
input_derivatives + input_derivatives_plus_one - 2 * input_delta
) + input_heights * (input_delta - input_derivatives)
b = input_heights * input_derivatives - (inputs - input_cumheights) * (
input_derivatives + input_derivatives_plus_one - 2 * input_delta
)
c = -input_delta * (inputs - input_cumheights)
discriminant = b.pow(2) - 4 * a * c
assert (discriminant >= 0).all()
root = (2 * c) / (-b - torch.sqrt(discriminant))
outputs = root * input_bin_widths + input_cumwidths
theta_one_minus_theta = root * (1 - root)
denominator = input_delta + (
(input_derivatives + input_derivatives_plus_one - 2 * input_delta) * theta_one_minus_theta
)
derivative_numerator = input_delta.pow(2) * (
input_derivatives_plus_one * root.pow(2)
+ 2 * input_delta * theta_one_minus_theta
+ input_derivatives * (1 - root).pow(2)
)
logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)
return outputs, -logabsdet
else:
theta = (inputs - input_cumwidths) / input_bin_widths
theta_one_minus_theta = theta * (1 - theta)
numerator = input_heights * (input_delta * theta.pow(2) + input_derivatives * theta_one_minus_theta)
denominator = input_delta + (
(input_derivatives + input_derivatives_plus_one - 2 * input_delta) * theta_one_minus_theta
)
outputs = input_cumheights + numerator / denominator
derivative_numerator = input_delta.pow(2) * (
input_derivatives_plus_one * theta.pow(2)
+ 2 * input_delta * theta_one_minus_theta
+ input_derivatives * (1 - theta).pow(2)
)
logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)
return outputs, logabsdet
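
The two spline functions implement a bijection, so applying the transform with `inverse=True` undoes the forward mapping and the log-determinants cancel; a quick consistency sketch (illustrative only, not part of this patch):

import torch

from TTS.tts.layers.vits.transforms import piecewise_rational_quadratic_transform

num_bins = 10
x = torch.randn(4, 10)
w = torch.randn(4, 10, num_bins)
h = torch.randn(4, 10, num_bins)
d = torch.randn(4, 10, num_bins - 1)

y, logdet = piecewise_rational_quadratic_transform(x, w, h, d, tails="linear", tail_bound=5.0)
x_rec, logdet_inv = piecewise_rational_quadratic_transform(y, w, h, d, inverse=True, tails="linear", tail_bound=5.0)
print(torch.allclose(x, x_rec, atol=1e-4), torch.allclose(logdet, -logdet_inv, atol=1e-4))  # expected: True True
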

View File

@ -4,20 +4,23 @@ from TTS.utils.generic_utils import find_module

 def setup_model(config):
     print(" > Using model: {}".format(config.model))
     MyModel = find_module("TTS.tts.models", config.model.lower())
     # define set of characters used by the model
     if config.characters is not None:
         # set characters from config
-        symbols, phonemes = make_symbols(**config.characters.to_dict())  # pylint: disable=redefined-outer-name
+        if hasattr(MyModel, "make_symbols"):
+            symbols = MyModel.make_symbols(config)
+        else:
+            symbols, phonemes = make_symbols(**config.characters)
     else:
         from TTS.tts.utils.text.symbols import phonemes, symbols  # pylint: disable=import-outside-toplevel

+        if config.use_phonemes:
+            symbols = phonemes
         # use default characters and assign them to config
         config.characters = parse_symbols()
-    num_chars = len(phonemes) if config.use_phonemes else len(symbols)
     # consider special `blank` character if `add_blank` is set True
-    num_chars = num_chars + getattr(config, "add_blank", False)
+    num_chars = len(symbols) + getattr(config, "add_blank", False)
     config.num_chars = num_chars
     # compatibility fix
     if "model_params" in config:

View File

@ -16,6 +16,7 @@ from TTS.tts.utils.data import sequence_mask
 from TTS.tts.utils.measures import alignment_diagonal_score
 from TTS.tts.utils.visual import plot_alignment, plot_spectrogram
 from TTS.utils.audio import AudioProcessor
+from TTS.utils.io import load_fsspec

 @dataclass
@ -389,7 +390,7 @@ class AlignTTS(BaseTTS):
     def load_checkpoint(
         self, config, checkpoint_path, eval=False
     ):  # pylint: disable=unused-argument, redefined-builtin
-        state = torch.load(checkpoint_path, map_location=torch.device("cpu"))
+        state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"))
         self.load_state_dict(state["model"])
         if eval:
             self.eval()

View File

@ -13,6 +13,7 @@ from TTS.tts.utils.data import sequence_mask
 from TTS.tts.utils.speakers import SpeakerManager, get_speaker_manager
 from TTS.tts.utils.text import make_symbols
 from TTS.utils.generic_utils import format_aux_input
+from TTS.utils.io import load_fsspec
 from TTS.utils.training import gradual_training_scheduler
@ -75,9 +76,6 @@ class BaseTacotron(BaseTTS):
         self.decoder_backward = None
         self.coarse_decoder = None

-        # init multi-speaker layers
-        self.init_multispeaker(config)
-
     @staticmethod
     def _format_aux_input(aux_input: Dict) -> Dict:
         return format_aux_input({"d_vectors": None, "speaker_ids": None}, aux_input)
@ -113,7 +111,7 @@ class BaseTacotron(BaseTTS):
     def load_checkpoint(
         self, config, checkpoint_path, eval=False
     ):  # pylint: disable=unused-argument, redefined-builtin
-        state = torch.load(checkpoint_path, map_location=torch.device("cpu"))
+        state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"))
         self.load_state_dict(state["model"])
         if "r" in state:
             self.decoder.set_r(state["r"])
@ -236,6 +234,7 @@ class BaseTacotron(BaseTTS):
     def compute_gst(self, inputs, style_input, speaker_embedding=None):
         """Compute global style token"""
         if isinstance(style_input, dict):
+            # multiply each style token with a weight
             query = torch.zeros(1, 1, self.gst.gst_embedding_dim // 2).type_as(inputs)
             if speaker_embedding is not None:
                 query = torch.cat([query, speaker_embedding.reshape(1, 1, -1)], dim=-1)
@ -247,8 +246,10 @@ class BaseTacotron(BaseTTS):
             gst_outputs_att = self.gst_layer.style_token_layer.attention(query, key)
             gst_outputs = gst_outputs + gst_outputs_att * v_amplifier
         elif style_input is None:
+            # ignore style token and return zero tensor
             gst_outputs = torch.zeros(1, 1, self.gst.gst_embedding_dim).type_as(inputs)
         else:
+            # compute style tokens
             gst_outputs = self.gst_layer(style_input, speaker_embedding)  # pylint: disable=not-callable
         inputs = self._concat_speaker_embedding(inputs, gst_outputs)
         return inputs

View File

@ -1,6 +1,6 @@
-import os
 from typing import Dict, List, Tuple

+import numpy as np
 import torch
 from coqpit import Coqpit
 from torch import nn
@ -48,10 +48,17 @@ class BaseTTS(BaseModel):
         return get_speaker_manager(config, restore_path, data, out_path)

     def init_multispeaker(self, config: Coqpit, data: List = None):
-        """Initialize multi-speaker modules of a model. A model can be trained either with a speaker embedding layer
-        or with external `d_vectors` computed from a speaker encoder model.
-        If you need a different behaviour, override this function for your model.
+        """Initialize a speaker embedding layer if needed and define the expected embedding channel size for setting
+        the `in_channels` size of the connected layers.
+
+        This implementation yields 3 possible outcomes:
+
+        1. If `config.use_speaker_embedding` and `config.use_d_vector_file` are False, do nothing.
+        2. If `config.use_d_vector_file` is True, set the expected embedding channel size to `config.d_vector_dim` or 512.
+        3. If `config.use_speaker_embedding`, initialize a speaker embedding layer with channel size of
+           `config.d_vector_dim` or 512.
+
+        You can override this function for new models.

         Args:
             config (Coqpit): Model configuration.
@ -59,12 +66,24 @@ class BaseTTS(BaseModel):
         """
         # init speaker manager
         self.speaker_manager = get_speaker_manager(config, data=data)
-        self.num_speakers = self.speaker_manager.num_speakers
-        # init speaker embedding layer
-        if config.use_speaker_embedding and not config.use_d_vector_file:
+
+        # set number of speakers - if num_speakers is set in config, use it, otherwise use speaker_manager
+        if data is not None or self.speaker_manager.speaker_ids:
+            self.num_speakers = self.speaker_manager.num_speakers
+        else:
+            self.num_speakers = (
+                config.num_speakers
+                if "num_speakers" in config and config.num_speakers != 0
+                else self.speaker_manager.num_speakers
+            )
+
+        # set ultimate speaker embedding size
+        if config.use_speaker_embedding or config.use_d_vector_file:
             self.embedded_speaker_dim = (
                 config.d_vector_dim if "d_vector_dim" in config and config.d_vector_dim is not None else 512
             )
+        # init speaker embedding layer
+        if config.use_speaker_embedding and not config.use_d_vector_file:
             self.speaker_embedding = nn.Embedding(self.num_speakers, self.embedded_speaker_dim)
             self.speaker_embedding.weight.data.normal_(0, 0.3)
@ -87,7 +106,7 @@ class BaseTTS(BaseModel):
         text_input = batch[0]
         text_lengths = batch[1]
         speaker_names = batch[2]
-        linear_input = batch[3] if self.config.model.lower() in ["tacotron"] else None
+        linear_input = batch[3]
         mel_input = batch[4]
         mel_lengths = batch[5]
         stop_targets = batch[6]
@ -95,6 +114,7 @@ class BaseTTS(BaseModel):
         d_vectors = batch[8]
         speaker_ids = batch[9]
         attn_mask = batch[10]
+        waveform = batch[11]
         max_text_length = torch.max(text_lengths.float())
         max_spec_length = torch.max(mel_lengths.float())
@ -140,6 +160,7 @@ class BaseTTS(BaseModel):
             "max_text_length": float(max_text_length),
             "max_spec_length": float(max_spec_length),
             "item_idx": item_idx,
+            "waveform": waveform,
         }

     def get_data_loader(
@ -160,15 +181,22 @@ class BaseTTS(BaseModel):
                 speaker_id_mapping = None
                 d_vector_mapping = None

+            # setup custom symbols if needed
+            custom_symbols = None
+            if hasattr(self, "make_symbols"):
+                custom_symbols = self.make_symbols(self.config)
+
            # init dataloader
            dataset = TTSDataset(
                outputs_per_step=config.r if "r" in config else 1,
                text_cleaner=config.text_cleaner,
-               compute_linear_spec=config.model.lower() == "tacotron",
+               compute_linear_spec=config.model.lower() == "tacotron" or config.compute_linear_spec,
                meta_data=data_items,
                ap=ap,
                characters=config.characters,
+               custom_symbols=custom_symbols,
                add_blank=config["add_blank"],
+               return_wav=config.return_wav if "return_wav" in config else False,
                batch_group_size=0 if is_eval else config.batch_group_size * config.batch_size,
                min_seq_len=config.min_seq_len,
                max_seq_len=config.max_seq_len,
@ -185,8 +213,21 @@ class BaseTTS(BaseModel):
            )

            if config.use_phonemes and config.compute_input_seq_cache:
-               # precompute phonemes to have a better estimate of sequence lengths.
-               dataset.compute_input_seq(config.num_loader_workers)
+               if hasattr(self, "eval_data_items") and is_eval:
+                   dataset.items = self.eval_data_items
+               elif hasattr(self, "train_data_items") and not is_eval:
+                   dataset.items = self.train_data_items
+               else:
+                   # precompute phonemes to have a better estimate of sequence lengths.
+                   dataset.compute_input_seq(config.num_loader_workers)
+
+                   # TODO: find a more efficient solution
+                   # cheap hack - store items in the model state to avoid recomputing when reinit the dataset
+                   if is_eval:
+                       self.eval_data_items = dataset.items
+                   else:
+                       self.train_data_items = dataset.items
+
            dataset.sort_items()

            sampler = DistributedSampler(dataset) if num_gpus > 1 else None
@ -216,7 +257,7 @@ class BaseTTS(BaseModel):
         test_sentences = self.config.test_sentences
         aux_inputs = self.get_aux_input()
         for idx, sen in enumerate(test_sentences):
-            wav, alignment, model_outputs, _ = synthesis(
+            outputs_dict = synthesis(
                 self,
                 sen,
                 self.config,
@ -228,9 +269,12 @@ class BaseTTS(BaseModel):
                 enable_eos_bos_chars=self.config.enable_eos_bos_chars,
                 use_griffin_lim=True,
                 do_trim_silence=False,
-            ).values()
-
-            test_audios["{}-audio".format(idx)] = wav
-            test_figures["{}-prediction".format(idx)] = plot_spectrogram(model_outputs, ap, output_fig=False)
-            test_figures["{}-alignment".format(idx)] = plot_alignment(alignment, output_fig=False)
+            )
+            test_audios["{}-audio".format(idx)] = outputs_dict["wav"]
+            test_figures["{}-prediction".format(idx)] = plot_spectrogram(
+                outputs_dict["outputs"]["model_outputs"], ap, output_fig=False
+            )
+            test_figures["{}-alignment".format(idx)] = plot_alignment(
+                outputs_dict["outputs"]["alignments"], output_fig=False
+            )
         return test_figures, test_audios

49
TTS/tts/models/glow_tts.py Executable file → Normal file
View File

@ -12,14 +12,19 @@ from TTS.tts.models.base_tts import BaseTTS
 from TTS.tts.utils.data import sequence_mask
 from TTS.tts.utils.measures import alignment_diagonal_score
 from TTS.tts.utils.speakers import get_speaker_manager
+from TTS.tts.utils.synthesis import synthesis
 from TTS.tts.utils.visual import plot_alignment, plot_spectrogram
 from TTS.utils.audio import AudioProcessor
+from TTS.utils.io import load_fsspec

 class GlowTTS(BaseTTS):
-    """Glow TTS models from https://arxiv.org/abs/2005.11129
+    """GlowTTS model.

-    Paper abstract:
+    Paper::
+        https://arxiv.org/abs/2005.11129
+
+    Paper abstract::
         Recently, text-to-speech (TTS) models such as FastSpeech and ParaNet have been proposed to generate
         mel-spectrograms from text in parallel. Despite the advantage, the parallel TTS models cannot be trained
         without guidance from autoregressive TTS models as their external aligners. In this work, we propose Glow-TTS,
@ -144,7 +149,6 @@ class GlowTTS(BaseTTS):
             g = F.normalize(g).unsqueeze(-1)
         else:
             g = F.normalize(self.emb_g(g)).unsqueeze(-1)  # [b, h, 1]
-
         # embedding pass
         o_mean, o_log_scale, o_dur_log, x_mask = self.encoder(x, x_lengths, g=g)
         # drop residual frames wrt num_squeeze and set y_lengths.
@ -361,12 +365,49 @@ class GlowTTS(BaseTTS):
         train_audio = ap.inv_melspectrogram(pred_spec.T)
         return figures, {"audio": train_audio}

+    @torch.no_grad()
     def eval_step(self, batch: dict, criterion: nn.Module):
         return self.train_step(batch, criterion)

     def eval_log(self, ap: AudioProcessor, batch: dict, outputs: dict):
         return self.train_log(ap, batch, outputs)

+    @torch.no_grad()
+    def test_run(self, ap):
+        """Generic test run for `tts` models used by `Trainer`.
+
+        You can override this for a different behaviour.
+
+        Returns:
+            Tuple[Dict, Dict]: Test figures and audios to be projected to Tensorboard.
+        """
+        print(" | > Synthesizing test sentences.")
+        test_audios = {}
+        test_figures = {}
+        test_sentences = self.config.test_sentences
+        aux_inputs = self.get_aux_input()
+        for idx, sen in enumerate(test_sentences):
+            outputs = synthesis(
+                self,
+                sen,
+                self.config,
+                "cuda" in str(next(self.parameters()).device),
+                ap,
+                speaker_id=aux_inputs["speaker_id"],
+                d_vector=aux_inputs["d_vector"],
+                style_wav=aux_inputs["style_wav"],
+                enable_eos_bos_chars=self.config.enable_eos_bos_chars,
+                use_griffin_lim=True,
+                do_trim_silence=False,
+            )
+            test_audios["{}-audio".format(idx)] = outputs["wav"]
+            test_figures["{}-prediction".format(idx)] = plot_spectrogram(
+                outputs["outputs"]["model_outputs"], ap, output_fig=False
+            )
+            test_figures["{}-alignment".format(idx)] = plot_alignment(outputs["alignments"], output_fig=False)
+        return test_figures, test_audios
+
     def preprocess(self, y, y_lengths, y_max_length, attn=None):
         if y_max_length is not None:
             y_max_length = (y_max_length // self.num_squeeze) * self.num_squeeze
@ -382,7 +423,7 @@ class GlowTTS(BaseTTS):
     def load_checkpoint(
         self, config, checkpoint_path, eval=False
     ):  # pylint: disable=unused-argument, redefined-builtin
-        state = torch.load(checkpoint_path, map_location=torch.device("cpu"))
+        state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"))
         self.load_state_dict(state["model"])
         if eval:
             self.eval()

View File

@ -14,6 +14,7 @@ from TTS.tts.utils.data import sequence_mask
 from TTS.tts.utils.measures import alignment_diagonal_score
 from TTS.tts.utils.visual import plot_alignment, plot_spectrogram
 from TTS.utils.audio import AudioProcessor
+from TTS.utils.io import load_fsspec

 @dataclass
@ -105,7 +106,7 @@
             if isinstance(config.model_args.length_scale, int)
             else config.model_args.length_scale
         )
-        self.emb = nn.Embedding(config.model_args.num_chars, config.model_args.hidden_channels)
+        self.emb = nn.Embedding(self.num_chars, config.model_args.hidden_channels)
         self.encoder = Encoder(
             config.model_args.hidden_channels,
             config.model_args.hidden_channels,
@ -227,6 +228,7 @@
         outputs = {"model_outputs": o_de.transpose(1, 2), "durations_log": o_dr_log.squeeze(1), "alignments": attn}
         return outputs

+    @torch.no_grad()
     def inference(self, x, aux_input={"d_vectors": None, "speaker_ids": None}):  # pylint: disable=unused-argument
         """
         Shapes:
@ -306,7 +308,7 @@
     def load_checkpoint(
         self, config, checkpoint_path, eval=False
     ):  # pylint: disable=unused-argument, redefined-builtin
-        state = torch.load(checkpoint_path, map_location=torch.device("cpu"))
+        state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"))
         self.load_state_dict(state["model"])
         if eval:
             self.eval()

View File

@ -23,19 +23,19 @@ class Tacotron(BaseTacotron):
     def __init__(self, config: Coqpit):
         super().__init__(config)

-        self.num_chars, self.config = self.get_characters(config)
+        chars, self.config = self.get_characters(config)
+        config.num_chars = self.num_chars = len(chars)

         # pass all config fields to `self`
         # for fewer code change
         for key in config:
             setattr(self, key, config[key])

-        # speaker embedding layer
-        if self.num_speakers > 1:
+        # set speaker embedding channel size for determining `in_channels` for the connected layers.
+        # `init_multispeaker` needs to be called once more in training to initialize the speaker embedding layer based
+        # on the number of speakers inferred from the dataset.
+        if self.use_speaker_embedding or self.use_d_vector_file:
             self.init_multispeaker(config)
-        # speaker and gst embeddings is concat in decoder input
-        if self.num_speakers > 1:
             self.decoder_in_features += self.embedded_speaker_dim  # add speaker embedding dim

         if self.use_gst:
@ -75,13 +75,11 @@
         if self.gst and self.use_gst:
             self.gst_layer = GST(
                 num_mel=self.decoder_output_dim,
-                d_vector_dim=self.d_vector_dim
-                if self.config.gst.gst_use_speaker_embedding and self.use_speaker_embedding
-                else None,
                 num_heads=self.gst.gst_num_heads,
                 num_style_tokens=self.gst.gst_num_style_tokens,
                 gst_embedding_dim=self.gst.gst_embedding_dim,
             )
+
         # backward pass decoder
         if self.bidirectional_decoder:
             self._init_backward_decoder()
@ -106,7 +104,9 @@
             self.max_decoder_steps,
         )

-    def forward(self, text, text_lengths, mel_specs=None, mel_lengths=None, aux_input=None):
+    def forward(  # pylint: disable=dangerous-default-value
+        self, text, text_lengths, mel_specs=None, mel_lengths=None, aux_input={"speaker_ids": None, "d_vectors": None}
+    ):
         """
         Shapes:
             text: [B, T_in]
@ -115,6 +115,7 @@
             mel_lengths: [B]
             aux_input: 'speaker_ids': [B, 1] and 'd_vectors': [B, C]
         """
+        aux_input = self._format_aux_input(aux_input)
         outputs = {"alignments_backward": None, "decoder_outputs_backward": None}
         inputs = self.embedding(text)
         input_mask, output_mask = self.compute_masks(text_lengths, mel_lengths)
@ -125,12 +126,10 @@
         # global style token
         if self.gst and self.use_gst:
             # B x gst_dim
-            encoder_outputs = self.compute_gst(
-                encoder_outputs, mel_specs, aux_input["d_vectors"] if "d_vectors" in aux_input else None
-            )
+            encoder_outputs = self.compute_gst(encoder_outputs, mel_specs)
         # speaker embedding
-        if self.num_speakers > 1:
-            if not self.use_d_vectors:
+        if self.use_speaker_embedding or self.use_d_vector_file:
+            if not self.use_d_vector_file:
                 # B x 1 x speaker_embed_dim
                 embedded_speakers = self.speaker_embedding(aux_input["speaker_ids"])[:, None]
             else:
@ -182,7 +181,7 @@
             # B x gst_dim
             encoder_outputs = self.compute_gst(encoder_outputs, aux_input["style_mel"], aux_input["d_vectors"])
         if self.num_speakers > 1:
-            if not self.use_d_vectors:
+            if not self.use_d_vector_file:
                 # B x 1 x speaker_embed_dim
                 embedded_speakers = self.speaker_embedding(aux_input["speaker_ids"])
                 # reshape embedded_speakers

View File

@ -23,7 +23,7 @@ class Tacotron2(BaseTacotron):
         super().__init__(config)

         chars, self.config = self.get_characters(config)
-        self.num_chars = len(chars)
+        config.num_chars = len(chars)
         self.decoder_output_dim = config.out_channels

         # pass all config fields to `self`
@ -31,12 +31,11 @@
         for key in config:
             setattr(self, key, config[key])

-        # speaker embedding layer
-        if self.num_speakers > 1:
+        # set speaker embedding channel size for determining `in_channels` for the connected layers.
+        # `init_multispeaker` needs to be called once more in training to initialize the speaker embedding layer based
+        # on the number of speakers inferred from the dataset.
+        if self.use_speaker_embedding or self.use_d_vector_file:
             self.init_multispeaker(config)
-        # speaker and gst embeddings is concat in decoder input
-        if self.num_speakers > 1:
             self.decoder_in_features += self.embedded_speaker_dim  # add speaker embedding dim

         if self.use_gst:
@ -47,6 +46,7 @@
         # base model layers
         self.encoder = Encoder(self.encoder_in_features)
+
         self.decoder = Decoder(
             self.decoder_in_features,
             self.decoder_output_dim,
@ -73,9 +73,6 @@
         if self.gst and self.use_gst:
             self.gst_layer = GST(
                 num_mel=self.decoder_output_dim,
-                d_vector_dim=self.d_vector_dim
-                if self.config.gst.gst_use_speaker_embedding and self.use_speaker_embedding
-                else None,
                 num_heads=self.gst.gst_num_heads,
                 num_style_tokens=self.gst.gst_num_style_tokens,
                 gst_embedding_dim=self.gst.gst_embedding_dim,
@ -110,7 +107,9 @@
         mel_outputs_postnet = mel_outputs_postnet.transpose(1, 2)
         return mel_outputs, mel_outputs_postnet, alignments

-    def forward(self, text, text_lengths, mel_specs=None, mel_lengths=None, aux_input=None):
+    def forward(  # pylint: disable=dangerous-default-value
+        self, text, text_lengths, mel_specs=None, mel_lengths=None, aux_input={"speaker_ids": None, "d_vectors": None}
+    ):
         """
         Shapes:
             text: [B, T_in]
@ -130,11 +129,10 @@
         encoder_outputs = self.encoder(embedded_inputs, text_lengths)
         if self.gst and self.use_gst:
             # B x gst_dim
-            encoder_outputs = self.compute_gst(
-                encoder_outputs, mel_specs, aux_input["d_vectors"] if "d_vectors" in aux_input else None
-            )
-        if self.num_speakers > 1:
-            if not self.use_d_vectors:
+            encoder_outputs = self.compute_gst(encoder_outputs, mel_specs)
+        if self.use_speaker_embedding or self.use_d_vector_file:
+            if not self.use_d_vector_file:
                 # B x 1 x speaker_embed_dim
                 embedded_speakers = self.speaker_embedding(aux_input["speaker_ids"])[:, None]
             else:
@ -186,8 +184,9 @@
         if self.gst and self.use_gst:
             # B x gst_dim
             encoder_outputs = self.compute_gst(encoder_outputs, aux_input["style_mel"], aux_input["d_vectors"])
+
         if self.num_speakers > 1:
-            if not self.use_d_vectors:
+            if not self.use_d_vector_file:
                 embedded_speakers = self.speaker_embedding(aux_input["speaker_ids"])[None]
                 # reshape embedded_speakers
                 if embedded_speakers.ndim == 1:

767
TTS/tts/models/vits.py Normal file
View File

@ -0,0 +1,767 @@
from dataclasses import dataclass, field
from typing import Dict, List, Tuple
import torch
from coqpit import Coqpit
from torch import nn
from torch.cuda.amp.autocast_mode import autocast
from TTS.tts.layers.glow_tts.duration_predictor import DurationPredictor
from TTS.tts.layers.glow_tts.monotonic_align import generate_path, maximum_path
from TTS.tts.layers.vits.discriminator import VitsDiscriminator
from TTS.tts.layers.vits.networks import PosteriorEncoder, ResidualCouplingBlocks, TextEncoder
from TTS.tts.layers.vits.stochastic_duration_predictor import StochasticDurationPredictor
# from TTS.tts.layers.vits.sdp import StochasticDurationPredictor
from TTS.tts.models.base_tts import BaseTTS
from TTS.tts.utils.data import sequence_mask
from TTS.tts.utils.speakers import get_speaker_manager
from TTS.tts.utils.synthesis import synthesis
from TTS.tts.utils.visual import plot_alignment
from TTS.utils.audio import AudioProcessor
from TTS.utils.trainer_utils import get_optimizer, get_scheduler
from TTS.vocoder.models.hifigan_generator import HifiganGenerator
from TTS.vocoder.utils.generic_utils import plot_results
def segment(x: torch.tensor, segment_indices: torch.tensor, segment_size=4):
"""Segment each sample in a batch based on the provided segment indices"""
segments = torch.zeros_like(x[:, :, :segment_size])
for i in range(x.size(0)):
index_start = segment_indices[i]
index_end = index_start + segment_size
segments[i] = x[i, :, index_start:index_end]
return segments
def rand_segment(x: torch.tensor, x_lengths: torch.tensor = None, segment_size=4):
"""Create random segments based on the input lengths."""
B, _, T = x.size()
if x_lengths is None:
x_lengths = T
max_idxs = x_lengths - segment_size + 1
assert all(max_idxs > 0), " [!] At least one sample is shorter than the segment size."
ids_str = (torch.rand([B]).type_as(x) * max_idxs).long()
ret = segment(x, ids_str, segment_size)
return ret, ids_str
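
An illustrative sketch of how these two helpers relate (not part of this patch; the `hop_length` of 256 is an assumed value for the example): `rand_segment` picks a random window per sample, and `segment` can re-slice any aligned tensor, such as the waveform, at the same indices:

import torch

from TTS.tts.models.vits import rand_segment, segment

spec = torch.randn(2, 80, 100)           # [B, C, T_spec]
spec_lengths = torch.tensor([100, 60])
spec_seg, idxs = rand_segment(spec, spec_lengths, segment_size=32)  # [2, 80, 32]
hop_length = 256                         # assumed hop length for the sketch
wav = torch.randn(2, 1, 100 * hop_length)
wav_seg = segment(wav, idxs * hop_length, segment_size=32 * hop_length)  # matching waveform windows
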
@dataclass
class VitsArgs(Coqpit):
"""VITS model arguments.
Args:
num_chars (int):
Number of characters in the vocabulary. Defaults to 100.
out_channels (int):
Number of output channels. Defaults to 513.
spec_segment_size (int):
Decoder input segment size. Defaults to 32 `(32 * hoplength = waveform length)`.
hidden_channels (int):
Number of hidden channels of the model. Defaults to 192.
hidden_channels_ffn_text_encoder (int):
Number of hidden channels of the feed-forward layers of the text encoder transformer. Defaults to 768.
num_heads_text_encoder (int):
Number of attention heads of the text encoder transformer. Defaults to 2.
num_layers_text_encoder (int):
Number of transformer layers in the text encoder. Defaults to 6.
kernel_size_text_encoder (int):
Kernel size of the text encoder transformer FFN layers. Defaults to 3.
dropout_p_text_encoder (float):
Dropout rate of the text encoder. Defaults to 0.1.
dropout_p_duration_predictor (float):
Dropout rate of the duration predictor. Defaults to 0.1.
kernel_size_posterior_encoder (int):
Kernel size of the posterior encoder's WaveNet layers. Defaults to 5.
dilation_rate_posterior_encoder (int):
Dilation rate of the posterior encoder's WaveNet layers. Defaults to 1.
num_layers_posterior_encoder (int):
Number of posterior encoder's WaveNet layers. Defaults to 16.
kernel_size_flow (int):
Kernel size of the Residual Coupling layers of the flow network. Defaults to 5.
dilation_rate_flow (int):
Dilation rate of the Residual Coupling WaveNet layers of the flow network. Defaults to 1.
num_layers_flow (int):
Number of Residual Coupling WaveNet layers of the flow network. Defaults to 4.
resblock_type_decoder (str):
Type of the residual block in the decoder network. Defaults to "1".
resblock_kernel_sizes_decoder (List[int]):
Kernel sizes of the residual blocks in the decoder network. Defaults to `[3, 7, 11]`.
resblock_dilation_sizes_decoder (List[List[int]]):
Dilation sizes of the residual blocks in the decoder network. Defaults to `[[1, 3, 5], [1, 3, 5], [1, 3, 5]]`.
upsample_rates_decoder (List[int]):
Upsampling rates for each consecutive upsampling layer in the decoder network. The product of these
values must be equal to the hop length used for computing spectrograms. Defaults to `[8, 8, 2, 2]`.
upsample_initial_channel_decoder (int):
Number of hidden channels of the first upsampling convolution layer of the decoder network. Defaults to 512.
upsample_kernel_sizes_decoder (List[int]):
Kernel sizes for each upsampling layer of the decoder network. Defaults to `[16, 16, 4, 4]`.
use_sdp (bool):
Use Stochastic Duration Predictor. Defaults to True.
noise_scale (float):
Noise scale used for the sample noise tensor in training. Defaults to 1.0.
inference_noise_scale (float):
Noise scale used for the sample noise tensor in inference. Defaults to 0.667.
length_scale (int):
Scale factor for the predicted duration values. Smaller values result in faster speech. Defaults to 1.
noise_scale_dp (float):
Noise scale used by the Stochastic Duration Predictor sample noise in training. Defaults to 1.0.
inference_noise_scale_dp (float):
Noise scale for the Stochastic Duration Predictor in inference. Defaults to 0.8.
max_inference_len (int):
Maximum inference length to limit the memory use. Defaults to None.
init_discriminator (bool):
Initialize the discriminator network if set True. Set False for inference. Defaults to True.
use_spectral_norm_disriminator (bool):
Use spectral normalization over weight norm in the discriminator. Defaults to False.
use_speaker_embedding (bool):
Enable/Disable speaker embedding for multi-speaker models. Defaults to False.
num_speakers (int):
Number of speakers for the speaker embedding layer. Defaults to 0.
speakers_file (str):
Path to the speaker mapping file for the Speaker Manager. Defaults to None.
speaker_embedding_channels (int):
Number of speaker embedding channels. Defaults to 256.
use_d_vector_file (bool):
Enable/Disable the use of d-vectors for multi-speaker training. Defaults to False.
d_vector_dim (int):
Number of d-vector channels. Defaults to 0.
detach_dp_input (bool):
Detach duration predictor's input from the network for stopping the gradients. Defaults to True.
"""
num_chars: int = 100
out_channels: int = 513
spec_segment_size: int = 32
hidden_channels: int = 192
hidden_channels_ffn_text_encoder: int = 768
num_heads_text_encoder: int = 2
num_layers_text_encoder: int = 6
kernel_size_text_encoder: int = 3
dropout_p_text_encoder: float = 0.1
dropout_p_duration_predictor: float = 0.1
kernel_size_posterior_encoder: int = 5
dilation_rate_posterior_encoder: int = 1
num_layers_posterior_encoder: int = 16
kernel_size_flow: int = 5
dilation_rate_flow: int = 1
num_layers_flow: int = 4
resblock_type_decoder: str = "1"
resblock_kernel_sizes_decoder: List[int] = field(default_factory=lambda: [3, 7, 11])
resblock_dilation_sizes_decoder: List[List[int]] = field(default_factory=lambda: [[1, 3, 5], [1, 3, 5], [1, 3, 5]])
upsample_rates_decoder: List[int] = field(default_factory=lambda: [8, 8, 2, 2])
upsample_initial_channel_decoder: int = 512
upsample_kernel_sizes_decoder: List[int] = field(default_factory=lambda: [16, 16, 4, 4])
use_sdp: bool = True
noise_scale: float = 1.0
inference_noise_scale: float = 0.667
length_scale: int = 1
noise_scale_dp: float = 1.0
inference_noise_scale_dp: float = 0.8
max_inference_len: int = None
init_discriminator: bool = True
use_spectral_norm_disriminator: bool = False
use_speaker_embedding: bool = False
num_speakers: int = 0
speakers_file: str = None
speaker_embedding_channels: int = 256
use_d_vector_file: bool = False
d_vector_dim: int = 0
detach_dp_input: bool = True
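
These fields feed the `Vits` constructor below. As in the class docstring further down, a model can be built from a default `VitsConfig`, which wraps a `VitsArgs` instance; a small sketch (illustrative only, not part of this patch) for checking the resulting model size:

from TTS.tts.configs import VitsConfig
from TTS.tts.models.vits import Vits

config = VitsConfig()  # default single-speaker architecture built from the VitsArgs defaults above
model = Vits(config)
n_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"{n_params / 1e6:.1f}M trainable parameters")
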
class Vits(BaseTTS):
"""VITS TTS model
Paper::
https://arxiv.org/pdf/2106.06103.pdf
Paper Abstract::
Several recent end-to-end text-to-speech (TTS) models enabling single-stage training and parallel
sampling have been proposed, but their sample quality does not match that of two-stage TTS systems.
In this work, we present a parallel end-to-end TTS method that generates more natural sounding audio than
current two-stage models. Our method adopts variational inference augmented with normalizing flows and
an adversarial training process, which improves the expressive power of generative modeling. We also propose a
stochastic duration predictor to synthesize speech with diverse rhythms from input text. With the
uncertainty modeling over latent variables and the stochastic duration predictor, our method expresses the
natural one-to-many relationship in which a text input can be spoken in multiple ways
with different pitches and rhythms. A subjective human evaluation (mean opinion score, or MOS)
on the LJ Speech, a single speaker dataset, shows that our method outperforms the best publicly
available TTS systems and achieves a MOS comparable to ground truth.
Check :class:`TTS.tts.configs.vits_config.VitsConfig` for class arguments.
Examples:
>>> from TTS.tts.configs import VitsConfig
>>> from TTS.tts.models.vits import Vits
>>> config = VitsConfig()
>>> model = Vits(config)
"""
# pylint: disable=dangerous-default-value
def __init__(self, config: Coqpit):
super().__init__()
self.END2END = True
if config.__class__.__name__ == "VitsConfig":
# loading from VitsConfig
if "num_chars" not in config:
_, self.config, num_chars = self.get_characters(config)
config.model_args.num_chars = num_chars
else:
self.config = config
config.model_args.num_chars = config.num_chars
args = self.config.model_args
elif isinstance(config, VitsArgs):
# loading from VitsArgs
self.config = config
args = config
else:
raise ValueError("config must be either a VitsConfig or VitsArgs")
self.args = args
self.init_multispeaker(config)
self.length_scale = args.length_scale
self.noise_scale = args.noise_scale
self.inference_noise_scale = args.inference_noise_scale
self.inference_noise_scale_dp = args.inference_noise_scale_dp
self.noise_scale_dp = args.noise_scale_dp
self.max_inference_len = args.max_inference_len
self.spec_segment_size = args.spec_segment_size
self.text_encoder = TextEncoder(
args.num_chars,
args.hidden_channels,
args.hidden_channels,
args.hidden_channels_ffn_text_encoder,
args.num_heads_text_encoder,
args.num_layers_text_encoder,
args.kernel_size_text_encoder,
args.dropout_p_text_encoder,
)
self.posterior_encoder = PosteriorEncoder(
args.out_channels,
args.hidden_channels,
args.hidden_channels,
kernel_size=args.kernel_size_posterior_encoder,
dilation_rate=args.dilation_rate_posterior_encoder,
num_layers=args.num_layers_posterior_encoder,
cond_channels=self.embedded_speaker_dim,
)
self.flow = ResidualCouplingBlocks(
args.hidden_channels,
args.hidden_channels,
kernel_size=args.kernel_size_flow,
dilation_rate=args.dilation_rate_flow,
num_layers=args.num_layers_flow,
cond_channels=self.embedded_speaker_dim,
)
if args.use_sdp:
self.duration_predictor = StochasticDurationPredictor(
args.hidden_channels,
192,
3,
args.dropout_p_duration_predictor,
4,
cond_channels=self.embedded_speaker_dim,
)
else:
self.duration_predictor = DurationPredictor(
args.hidden_channels, 256, 3, args.dropout_p_duration_predictor, cond_channels=self.embedded_speaker_dim
)
self.waveform_decoder = HifiganGenerator(
args.hidden_channels,
1,
args.resblock_type_decoder,
args.resblock_dilation_sizes_decoder,
args.resblock_kernel_sizes_decoder,
args.upsample_kernel_sizes_decoder,
args.upsample_initial_channel_decoder,
args.upsample_rates_decoder,
inference_padding=0,
cond_channels=self.embedded_speaker_dim,
conv_pre_weight_norm=False,
conv_post_weight_norm=False,
conv_post_bias=False,
)
if args.init_discriminator:
self.disc = VitsDiscriminator(use_spectral_norm=args.use_spectral_norm_disriminator)
def init_multispeaker(self, config: Coqpit, data: List = None):
"""Initialize multi-speaker modules of a model. A model can be trained either with a speaker embedding layer
or with external `d_vectors` computed from a speaker encoder model.
If you need a different behaviour, override this function for your model.
Args:
config (Coqpit): Model configuration.
data (List, optional): Dataset items to infer number of speakers. Defaults to None.
"""
if hasattr(config, "model_args"):
config = config.model_args
self.embedded_speaker_dim = 0
# init speaker manager
self.speaker_manager = get_speaker_manager(config, data=data)
if config.num_speakers > 0 and self.speaker_manager.num_speakers == 0:
self.speaker_manager.num_speakers = config.num_speakers
self.num_speakers = self.speaker_manager.num_speakers
# init speaker embedding layer
if config.use_speaker_embedding and not config.use_d_vector_file:
self.embedded_speaker_dim = config.speaker_embedding_channels
self.emb_g = nn.Embedding(config.num_speakers, config.speaker_embedding_channels)
# init d-vector usage
if config.use_d_vector_file:
self.embedded_speaker_dim = config.d_vector_dim
@staticmethod
def _set_cond_input(aux_input: Dict):
"""Set the speaker conditioning input based on the multi-speaker mode."""
sid, g = None, None
if "speaker_ids" in aux_input and aux_input["speaker_ids"] is not None:
sid = aux_input["speaker_ids"]
if sid.ndim == 0:
sid = sid.unsqueeze_(0)
if "d_vectors" in aux_input and aux_input["d_vectors"] is not None:
g = aux_input["d_vectors"]
return sid, g
def forward(
self,
x: torch.tensor,
x_lengths: torch.tensor,
y: torch.tensor,
y_lengths: torch.tensor,
aux_input={"d_vectors": None, "speaker_ids": None},
) -> Dict:
"""Forward pass of the model.
Args:
x (torch.tensor): Batch of input character sequence IDs.
x_lengths (torch.tensor): Batch of input character sequence lengths.
y (torch.tensor): Batch of input spectrograms.
y_lengths (torch.tensor): Batch of input spectrogram lengths.
aux_input (dict, optional): Auxiliary inputs for multi-speaker training. Defaults to {"d_vectors": None, "speaker_ids": None}.
Returns:
Dict: model outputs keyed by the output name.
Shapes:
- x: :math:`[B, T_seq]`
- x_lengths: :math:`[B]`
- y: :math:`[B, C, T_spec]`
- y_lengths: :math:`[B]`
- d_vectors: :math:`[B, C, 1]`
- speaker_ids: :math:`[B]`
"""
outputs = {}
sid, g = self._set_cond_input(aux_input)
x, m_p, logs_p, x_mask = self.text_encoder(x, x_lengths)
# speaker embedding
if self.num_speakers > 1 and sid is not None:
g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1]
# posterior encoder
z, m_q, logs_q, y_mask = self.posterior_encoder(y, y_lengths, g=g)
# flow layers
z_p = self.flow(z, y_mask, g=g)
# find the alignment path
attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(y_mask, 2)
with torch.no_grad():
o_scale = torch.exp(-2 * logs_p)
# logp1 = torch.sum(-0.5 * math.log(2 * math.pi) - logs_p, [1]).unsqueeze(-1) # [b, t, 1]
logp2 = torch.einsum("klm, kln -> kmn", [o_scale, -0.5 * (z_p ** 2)])
logp3 = torch.einsum("klm, kln -> kmn", [m_p * o_scale, z_p])
# logp4 = torch.sum(-0.5 * (m_p ** 2) * o_scale, [1]).unsqueeze(-1) # [b, t, 1]
logp = logp2 + logp3
attn = maximum_path(logp, attn_mask.squeeze(1)).unsqueeze(1).detach()
# duration predictor
attn_durations = attn.sum(3)
if self.args.use_sdp:
nll_duration = self.duration_predictor(
x.detach() if self.args.detach_dp_input else x,
x_mask,
attn_durations,
g=g.detach() if self.args.detach_dp_input and g is not None else g,
)
nll_duration = torch.sum(nll_duration.float() / torch.sum(x_mask))
outputs["nll_duration"] = nll_duration
else:
attn_log_durations = torch.log(attn_durations + 1e-6) * x_mask
log_durations = self.duration_predictor(
x.detach() if self.args.detach_dp_input else x,
x_mask,
g=g.detach() if self.args.detach_dp_input and g is not None else g,
)
loss_duration = torch.sum((log_durations - attn_log_durations) ** 2, [1, 2]) / torch.sum(x_mask)
outputs["loss_duration"] = loss_duration
# expand prior
m_p = torch.einsum("klmn, kjm -> kjn", [attn, m_p])
logs_p = torch.einsum("klmn, kjm -> kjn", [attn, logs_p])
# select a random feature segment for the waveform decoder
z_slice, slice_ids = rand_segment(z, y_lengths, self.spec_segment_size)
o = self.waveform_decoder(z_slice, g=g)
outputs.update(
{
"model_outputs": o,
"alignments": attn.squeeze(1),
"slice_ids": slice_ids,
"z": z,
"z_p": z_p,
"m_p": m_p,
"logs_p": logs_p,
"m_q": m_q,
"logs_q": logs_q,
}
)
return outputs
def inference(self, x, aux_input={"d_vectors": None, "speaker_ids": None}):
"""
Shapes:
- x: :math:`[B, T_seq]`
- d_vectors: :math:`[B, C, 1]`
- speaker_ids: :math:`[B]`
"""
sid, g = self._set_cond_input(aux_input)
x_lengths = torch.tensor(x.shape[1:2]).to(x.device)
x, m_p, logs_p, x_mask = self.text_encoder(x, x_lengths)
if self.num_speakers > 0 and sid:
g = self.emb_g(sid).unsqueeze(-1)
if self.args.use_sdp:
logw = self.duration_predictor(x, x_mask, g=g, reverse=True, noise_scale=self.inference_noise_scale_dp)
else:
logw = self.duration_predictor(x, x_mask, g=g)
w = torch.exp(logw) * x_mask * self.length_scale
w_ceil = torch.ceil(w)
y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long()
y_mask = sequence_mask(y_lengths, None).to(x_mask.dtype)
attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1)
attn = generate_path(w_ceil.squeeze(1), attn_mask.squeeze(1).transpose(1, 2))
m_p = torch.matmul(attn.transpose(1, 2), m_p.transpose(1, 2)).transpose(1, 2)
logs_p = torch.matmul(attn.transpose(1, 2), logs_p.transpose(1, 2)).transpose(1, 2)
z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * self.inference_noise_scale
z = self.flow(z_p, y_mask, g=g, reverse=True)
o = self.waveform_decoder((z * y_mask)[:, :, : self.max_inference_len], g=g)
outputs = {"model_outputs": o, "alignments": attn.squeeze(1), "z": z, "z_p": z_p, "m_p": m_p, "logs_p": logs_p}
return outputs
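# Inference usage sketch (hedged, not part of the class): convert text to character IDs
# with the text utilities and call `inference()` directly. `model` is assumed to be a
# trained Vits instance and the cleaner name is illustrative.
#
# import torch
# from TTS.tts.utils.text import text_to_sequence
#
# token_ids = torch.LongTensor(text_to_sequence("Hello world.", ["basic_cleaners"])).unsqueeze(0)
# with torch.no_grad():
#     outputs = model.inference(token_ids)
# wav = outputs["model_outputs"].squeeze().cpu().numpy()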
def voice_conversion(self, y, y_lengths, sid_src, sid_tgt):
"""TODO: create an end-point for voice conversion"""
assert self.num_speakers > 0, "num_speakers has to be larger than 0."
g_src = self.emb_g(sid_src).unsqueeze(-1)
g_tgt = self.emb_g(sid_tgt).unsqueeze(-1)
z, _, _, y_mask = self.posterior_encoder(y, y_lengths, g=g_src)
z_p = self.flow(z, y_mask, g=g_src)
z_hat = self.flow(z_p, y_mask, g=g_tgt, reverse=True)
o_hat = self.waveform_decoder(z_hat * y_mask, g=g_tgt)
return o_hat, y_mask, (z, z_p, z_hat)
def train_step(self, batch: dict, criterion: nn.Module, optimizer_idx: int) -> Tuple[Dict, Dict]:
"""Perform a single training step. Run the model forward pass and compute losses.
Args:
batch (Dict): Input tensors.
criterion (nn.Module): Loss layer designed for the model.
optimizer_idx (int): Index of optimizer to use. 0 for the generator and 1 for the discriminator networks.
Returns:
Tuple[Dict, Dict]: Model outputs and computed losses.
"""
# pylint: disable=attribute-defined-outside-init
if optimizer_idx not in [0, 1]:
raise ValueError(" [!] Unexpected `optimizer_idx`.")
if optimizer_idx == 0:
text_input = batch["text_input"]
text_lengths = batch["text_lengths"]
mel_lengths = batch["mel_lengths"]
linear_input = batch["linear_input"]
d_vectors = batch["d_vectors"]
speaker_ids = batch["speaker_ids"]
waveform = batch["waveform"]
# generator pass
outputs = self.forward(
text_input,
text_lengths,
linear_input.transpose(1, 2),
mel_lengths,
aux_input={"d_vectors": d_vectors, "speaker_ids": speaker_ids},
)
# cache tensors for the discriminator
self.y_disc_cache = None
self.wav_seg_disc_cache = None
self.y_disc_cache = outputs["model_outputs"]
wav_seg = segment(
waveform.transpose(1, 2),
outputs["slice_ids"] * self.config.audio.hop_length,
self.args.spec_segment_size * self.config.audio.hop_length,
)
self.wav_seg_disc_cache = wav_seg
outputs["waveform_seg"] = wav_seg
# compute discriminator scores and features
outputs["scores_disc_fake"], outputs["feats_disc_fake"] = self.disc(outputs["model_outputs"])
_, outputs["feats_disc_real"] = self.disc(wav_seg)
# compute losses
with autocast(enabled=False): # use float32 for the criterion
loss_dict = criterion[optimizer_idx](
waveform_hat=outputs["model_outputs"].float(),
waveform=wav_seg.float(),
z_p=outputs["z_p"].float(),
logs_q=outputs["logs_q"].float(),
m_p=outputs["m_p"].float(),
logs_p=outputs["logs_p"].float(),
z_len=mel_lengths,
scores_disc_fake=outputs["scores_disc_fake"],
feats_disc_fake=outputs["feats_disc_fake"],
feats_disc_real=outputs["feats_disc_real"],
)
# handle the duration loss
if self.args.use_sdp:
loss_dict["nll_duration"] = outputs["nll_duration"]
loss_dict["loss"] += outputs["nll_duration"]
else:
loss_dict["loss_duration"] = outputs["loss_duration"]
loss_dict["loss"] += outputs["nll_duration"]
elif optimizer_idx == 1:
# discriminator pass
outputs = {}
# compute scores and features
outputs["scores_disc_fake"], outputs["feats_disc_fake"] = self.disc(self.y_disc_cache.detach())
outputs["scores_disc_real"], outputs["feats_disc_real"] = self.disc(self.wav_seg_disc_cache)
# compute loss
with autocast(enabled=False): # use float32 for the criterion
loss_dict = criterion[optimizer_idx](
outputs["scores_disc_real"],
outputs["scores_disc_fake"],
)
return outputs, loss_dict
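# Trainer-side sketch (assumed loop, not the actual Trainer code): optimizer index 0
# updates the generator, index 1 the discriminator, each with its own criterion.
#
# criteria = model.get_criterion()
# optimizers = model.get_optimizer()
# for idx in (0, 1):
#     optimizers[idx].zero_grad()
#     _, loss_dict = model.train_step(batch, criteria, idx)
#     loss_dict["loss"].backward()
#     optimizers[idx].step()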
def train_log(
self, ap: AudioProcessor, batch: Dict, outputs: List, name_prefix="train"
): # pylint: disable=no-self-use
"""Create visualizations and waveform examples.
For example, here you can plot spectrograms and generate sample waveforms from these spectrograms to
be projected onto Tensorboard.
Args:
ap (AudioProcessor): audio processor used at training.
batch (Dict): Model inputs used at the previous training step.
outputs (Dict): Model outputs generated at the previous training step.
Returns:
Tuple[Dict, np.ndarray]: training plots and output waveform.
"""
y_hat = outputs[0]["model_outputs"]
y = outputs[0]["waveform_seg"]
figures = plot_results(y_hat, y, ap, name_prefix)
sample_voice = y_hat[0].squeeze(0).detach().cpu().numpy()
audios = {f"{name_prefix}/audio": sample_voice}
alignments = outputs[0]["alignments"]
align_img = alignments[0].data.cpu().numpy().T
figures.update(
{
"alignment": plot_alignment(align_img, output_fig=False),
}
)
return figures, audios
@torch.no_grad()
def eval_step(self, batch: dict, criterion: nn.Module, optimizer_idx: int):
return self.train_step(batch, criterion, optimizer_idx)
def eval_log(self, ap: AudioProcessor, batch: dict, outputs: dict):
return self.train_log(ap, batch, outputs, "eval")
@torch.no_grad()
def test_run(self, ap) -> Tuple[Dict, Dict]:
"""Generic test run for `tts` models used by `Trainer`.
You can override this for a different behaviour.
Returns:
Tuple[Dict, Dict]: Test figures and audios to be projected to Tensorboard.
"""
print(" | > Synthesizing test sentences.")
test_audios = {}
test_figures = {}
test_sentences = self.config.test_sentences
aux_inputs = self.get_aux_input()
for idx, sen in enumerate(test_sentences):
wav, alignment, _, _ = synthesis(
self,
sen,
self.config,
"cuda" in str(next(self.parameters()).device),
ap,
speaker_id=aux_inputs["speaker_id"],
d_vector=aux_inputs["d_vector"],
style_wav=aux_inputs["style_wav"],
enable_eos_bos_chars=self.config.enable_eos_bos_chars,
use_griffin_lim=True,
do_trim_silence=False,
).values()
test_audios["{}-audio".format(idx)] = wav
test_figures["{}-alignment".format(idx)] = plot_alignment(alignment.T, output_fig=False)
return test_figures, test_audios
def get_optimizer(self) -> List:
"""Initiate and return the GAN optimizers based on the config parameters.
It returns two optimizers in a list; the first is for the generator and the second is for the discriminator.
Returns:
List: optimizers.
"""
self.disc.requires_grad_(False)
gen_parameters = filter(lambda p: p.requires_grad, self.parameters())
self.disc.requires_grad_(True)
optimizer1 = get_optimizer(
self.config.optimizer, self.config.optimizer_params, self.config.lr_gen, parameters=gen_parameters
)
optimizer2 = get_optimizer(self.config.optimizer, self.config.optimizer_params, self.config.lr_disc, self.disc)
return [optimizer1, optimizer2]
def get_lr(self) -> List:
"""Set the initial learning rates for each optimizer.
Returns:
List: learning rates for each optimizer.
"""
return [self.config.lr_gen, self.config.lr_disc]
def get_scheduler(self, optimizer) -> List:
"""Set the schedulers for each optimizer.
Args:
optimizer (List[`torch.optim.Optimizer`]): List of optimizers.
Returns:
List: Schedulers, one for each optimizer.
"""
scheduler1 = get_scheduler(self.config.lr_scheduler_gen, self.config.lr_scheduler_gen_params, optimizer[0])
scheduler2 = get_scheduler(self.config.lr_scheduler_disc, self.config.lr_scheduler_disc_params, optimizer[1])
return [scheduler1, scheduler2]
def get_criterion(self):
"""Get criterions for each optimizer. The index in the output list matches the optimizer idx used in
`train_step()`"""
from TTS.tts.layers.losses import ( # pylint: disable=import-outside-toplevel
VitsDiscriminatorLoss,
VitsGeneratorLoss,
)
return [VitsGeneratorLoss(self.config), VitsDiscriminatorLoss(self.config)]
@staticmethod
def make_symbols(config):
"""Create a custom arrangement of symbols used by the model. The output list of symbols propagate along the
whole training and inference steps."""
_pad = config.characters["pad"]
_punctuations = config.characters["punctuations"]
_letters = config.characters["characters"]
_letters_ipa = config.characters["phonemes"]
symbols = [_pad] + list(_punctuations) + list(_letters)
if config.use_phonemes:
symbols += list(_letters_ipa)
return symbols
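# Example (hypothetical config values, used only to show the resulting ordering):
#
# from types import SimpleNamespace
#
# cfg = SimpleNamespace(
#     characters={"pad": "_", "punctuations": "!,. ", "characters": "abc", "phonemes": "ɑɐ"},
#     use_phonemes=True,
# )
# Vits.make_symbols(cfg)
# # -> ['_', '!', ',', '.', ' ', 'a', 'b', 'c', 'ɑ', 'ɐ']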
@staticmethod
def get_characters(config: Coqpit):
if config.characters is not None:
symbols = Vits.make_symbols(config)
else:
from TTS.tts.utils.text.symbols import ( # pylint: disable=import-outside-toplevel
parse_symbols,
phonemes,
symbols,
)
config.characters = parse_symbols()
if config.use_phonemes:
symbols = phonemes
num_chars = len(symbols) + getattr(config, "add_blank", False)
return symbols, config, num_chars
def load_checkpoint(
self, config, checkpoint_path, eval=False
): # pylint: disable=unused-argument, redefined-builtin
"""Load the model checkpoint and setup for training or inference"""
state = torch.load(checkpoint_path, map_location=torch.device("cpu"))
self.load_state_dict(state["model"])
if eval:
self.eval()
assert not self.training
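A minimal end-to-end sketch of the workflow above, assuming a trained checkpoint and its `config.json` on disk; the paths are placeholders and `VitsConfig.load_json()` is assumed from Coqpit:

from TTS.tts.configs import VitsConfig
from TTS.tts.models.vits import Vits

config = VitsConfig()
config.load_json("/path/to/config.json")          # placeholder path
model = Vits(config)
model.load_checkpoint(config, "/path/to/best_model.pth.tar", eval=True)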

View File

@ -2,6 +2,7 @@ import datetime
import importlib import importlib
import pickle import pickle
import fsspec
import numpy as np import numpy as np
import tensorflow as tf import tensorflow as tf
@ -16,11 +17,13 @@ def save_checkpoint(model, optimizer, current_step, epoch, r, output_path, **kwa
"r": r, "r": r,
} }
state.update(kwargs) state.update(kwargs)
pickle.dump(state, open(output_path, "wb")) with fsspec.open(output_path, "wb") as f:
pickle.dump(state, f)
def load_checkpoint(model, checkpoint_path): def load_checkpoint(model, checkpoint_path):
checkpoint = pickle.load(open(checkpoint_path, "rb")) with fsspec.open(checkpoint_path, "rb") as f:
checkpoint = pickle.load(f)
chkp_var_dict = {var.name: var.numpy() for var in checkpoint["model"]} chkp_var_dict = {var.name: var.numpy() for var in checkpoint["model"]}
tf_vars = model.weights tf_vars = model.weights
for tf_var in tf_vars: for tf_var in tf_vars:

View File

@ -1,6 +1,7 @@
import datetime import datetime
import pickle import pickle
import fsspec
import tensorflow as tf import tensorflow as tf
@ -14,11 +15,13 @@ def save_checkpoint(model, optimizer, current_step, epoch, r, output_path, **kwa
"r": r, "r": r,
} }
state.update(kwargs) state.update(kwargs)
pickle.dump(state, open(output_path, "wb")) with fsspec.open(output_path, "wb") as f:
pickle.dump(state, f)
def load_checkpoint(model, checkpoint_path): def load_checkpoint(model, checkpoint_path):
checkpoint = pickle.load(open(checkpoint_path, "rb")) with fsspec.open(checkpoint_path, "rb") as f:
checkpoint = pickle.load(f)
chkp_var_dict = {var.name: var.numpy() for var in checkpoint["model"]} chkp_var_dict = {var.name: var.numpy() for var in checkpoint["model"]}
tf_vars = model.weights tf_vars = model.weights
for tf_var in tf_vars: for tf_var in tf_vars:

View File

@ -1,3 +1,4 @@
import fsspec
import tensorflow as tf import tensorflow as tf
@ -14,7 +15,7 @@ def convert_tacotron2_to_tflite(model, output_path=None, experimental_converter=
print(f"Tflite Model size is {len(tflite_model) / (1024.0 * 1024.0)} MBs.") print(f"Tflite Model size is {len(tflite_model) / (1024.0 * 1024.0)} MBs.")
if output_path is not None: if output_path is not None:
# same model binary if outputpath is provided # same model binary if outputpath is provided
with open(output_path, "wb") as f: with fsspec.open(output_path, "wb") as f:
f.write(tflite_model) f.write(tflite_model)
return None return None
return tflite_model return tflite_model

23
TTS/tts/utils/speakers.py Executable file → Normal file
View File

@ -3,6 +3,7 @@ import os
import random import random
from typing import Any, Dict, List, Tuple, Union from typing import Any, Dict, List, Tuple, Union
import fsspec
import numpy as np import numpy as np
import torch import torch
from coqpit import Coqpit from coqpit import Coqpit
@ -84,12 +85,12 @@ class SpeakerManager:
@staticmethod @staticmethod
def _load_json(json_file_path: str) -> Dict: def _load_json(json_file_path: str) -> Dict:
with open(json_file_path) as f: with fsspec.open(json_file_path, "r") as f:
return json.load(f) return json.load(f)
@staticmethod @staticmethod
def _save_json(json_file_path: str, data: dict) -> None: def _save_json(json_file_path: str, data: dict) -> None:
with open(json_file_path, "w") as f: with fsspec.open(json_file_path, "w") as f:
json.dump(data, f, indent=4) json.dump(data, f, indent=4)
@property @property
@ -294,9 +295,10 @@ def _set_file_path(path):
Intended to band aid the different paths returned in restored and continued training.""" Intended to band aid the different paths returned in restored and continued training."""
path_restore = os.path.join(os.path.dirname(path), "speakers.json") path_restore = os.path.join(os.path.dirname(path), "speakers.json")
path_continue = os.path.join(path, "speakers.json") path_continue = os.path.join(path, "speakers.json")
if os.path.exists(path_restore): fs = fsspec.get_mapper(path).fs
if fs.exists(path_restore):
return path_restore return path_restore
if os.path.exists(path_continue): if fs.exists(path_continue):
return path_continue return path_continue
raise FileNotFoundError(f" [!] `speakers.json` not found in {path}") raise FileNotFoundError(f" [!] `speakers.json` not found in {path}")
@ -307,7 +309,7 @@ def load_speaker_mapping(out_path):
json_file = out_path json_file = out_path
else: else:
json_file = _set_file_path(out_path) json_file = _set_file_path(out_path)
with open(json_file) as f: with fsspec.open(json_file, "r") as f:
return json.load(f) return json.load(f)
@ -315,7 +317,7 @@ def save_speaker_mapping(out_path, speaker_mapping):
"""Saves speaker mapping if not yet present.""" """Saves speaker mapping if not yet present."""
if out_path is not None: if out_path is not None:
speakers_json_path = _set_file_path(out_path) speakers_json_path = _set_file_path(out_path)
with open(speakers_json_path, "w") as f: with fsspec.open(speakers_json_path, "w") as f:
json.dump(speaker_mapping, f, indent=4) json.dump(speaker_mapping, f, indent=4)
@ -358,10 +360,13 @@ def get_speaker_manager(c: Coqpit, data: List = None, restore_path: str = None,
elif c.use_d_vector_file and c.d_vector_file: elif c.use_d_vector_file and c.d_vector_file:
# new speaker manager with external speaker embeddings. # new speaker manager with external speaker embeddings.
speaker_manager.set_d_vectors_from_file(c.d_vector_file) speaker_manager.set_d_vectors_from_file(c.d_vector_file)
elif c.use_d_vector_file and not c.d_vector_file: # new speaker manager with speaker IDs file. elif c.use_d_vector_file and not c.d_vector_file:
raise "use_d_vector_file is True, so you need pass a external speaker embedding file, run GE2E-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb or AngularPrototypical-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb notebook in notebooks/ folder" raise "use_d_vector_file is True, so you need pass a external speaker embedding file."
elif c.use_speaker_embedding and "speakers_file" in c and c.speakers_file:
# new speaker manager with speaker IDs file.
speaker_manager.set_speaker_ids_from_file(c.speakers_file)
print( print(
" > Training with {} speakers: {}".format( " > Speaker manager is loaded with {} speakers: {}".format(
speaker_manager.num_speakers, ", ".join(speaker_manager.speaker_ids) speaker_manager.num_speakers, ", ".join(speaker_manager.speaker_ids)
) )
) )
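A hedged sketch of the new `speakers_file` branch above; the config fields come from the code, while the path and `train_samples` are illustrative:

config.use_speaker_embedding = True
config.use_d_vector_file = False
config.speakers_file = "/path/to/speakers.json"                    # placeholder mapping file
speaker_manager = get_speaker_manager(config, data=train_samples)  # `train_samples` assumed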

View File

@ -15,7 +15,7 @@ if "tensorflow" in installed or "tensorflow-gpu" in installed:
import tensorflow as tf import tensorflow as tf
def text_to_seq(text, CONFIG): def text_to_seq(text, CONFIG, custom_symbols=None):
text_cleaner = [CONFIG.text_cleaner] text_cleaner = [CONFIG.text_cleaner]
# text or phonemes to sequence vector # text or phonemes to sequence vector
if CONFIG.use_phonemes: if CONFIG.use_phonemes:
@ -28,16 +28,14 @@ def text_to_seq(text, CONFIG):
tp=CONFIG.characters, tp=CONFIG.characters,
add_blank=CONFIG.add_blank, add_blank=CONFIG.add_blank,
use_espeak_phonemes=CONFIG.use_espeak_phonemes, use_espeak_phonemes=CONFIG.use_espeak_phonemes,
custom_symbols=custom_symbols,
), ),
dtype=np.int32, dtype=np.int32,
) )
else: else:
seq = np.asarray( seq = np.asarray(
text_to_sequence( text_to_sequence(
text, text, text_cleaner, tp=CONFIG.characters, add_blank=CONFIG.add_blank, custom_symbols=custom_symbols
text_cleaner,
tp=CONFIG.characters,
add_blank=CONFIG.add_blank,
), ),
dtype=np.int32, dtype=np.int32,
) )
@ -229,13 +227,16 @@ def synthesis(
""" """
# GST processing # GST processing
style_mel = None style_mel = None
custom_symbols = None
if CONFIG.has("gst") and CONFIG.gst and style_wav is not None: if CONFIG.has("gst") and CONFIG.gst and style_wav is not None:
if isinstance(style_wav, dict): if isinstance(style_wav, dict):
style_mel = style_wav style_mel = style_wav
else: else:
style_mel = compute_style_mel(style_wav, ap, cuda=use_cuda) style_mel = compute_style_mel(style_wav, ap, cuda=use_cuda)
if hasattr(model, "make_symbols"):
custom_symbols = model.make_symbols(CONFIG)
# preprocess the given text # preprocess the given text
text_inputs = text_to_seq(text, CONFIG) text_inputs = text_to_seq(text, CONFIG, custom_symbols=custom_symbols)
# pass tensors to backend # pass tensors to backend
if backend == "torch": if backend == "torch":
if speaker_id is not None: if speaker_id is not None:
@ -274,15 +275,18 @@ def synthesis(
# convert outputs to numpy # convert outputs to numpy
# plot results # plot results
wav = None wav = None
if use_griffin_lim: if hasattr(model, "END2END") and model.END2END:
wav = inv_spectrogram(model_outputs, ap, CONFIG) wav = model_outputs.squeeze(0)
# trim silence else:
if do_trim_silence: if use_griffin_lim:
wav = trim_silence(wav, ap) wav = inv_spectrogram(model_outputs, ap, CONFIG)
# trim silence
if do_trim_silence:
wav = trim_silence(wav, ap)
return_dict = { return_dict = {
"wav": wav, "wav": wav,
"alignments": alignments, "alignments": alignments,
"model_outputs": model_outputs,
"text_inputs": text_inputs, "text_inputs": text_inputs,
"outputs": outputs,
} }
return return_dict return return_dict
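A hedged usage sketch of the new end-to-end branch: models that set `END2END = True` (such as Vits above) return the decoder output directly as the waveform, bypassing Griffin-Lim. `model`, `config` and `ap` are assumed to be set up already:

return_dict = synthesis(model, "This is a test.", config, False, ap)  # use_cuda=False
wav = return_dict["wav"]              # waveform for end-to-end models, Griffin-Lim output otherwise
alignments = return_dict["alignments"]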

View File

@ -2,10 +2,9 @@
# adapted from https://github.com/keithito/tacotron # adapted from https://github.com/keithito/tacotron
import re import re
import unicodedata from typing import Dict, List
import gruut import gruut
from packaging import version
from TTS.tts.utils.text import cleaners from TTS.tts.utils.text import cleaners
from TTS.tts.utils.text.chinese_mandarin.phonemizer import chinese_text_to_phonemes from TTS.tts.utils.text.chinese_mandarin.phonemizer import chinese_text_to_phonemes
@ -22,6 +21,7 @@ _id_to_phonemes = {i: s for i, s in enumerate(phonemes)}
_symbols = symbols _symbols = symbols
_phonemes = phonemes _phonemes = phonemes
# Regular expression matching text enclosed in curly braces: # Regular expression matching text enclosed in curly braces:
_CURLY_RE = re.compile(r"(.*?)\{(.+?)\}(.*)") _CURLY_RE = re.compile(r"(.*?)\{(.+?)\}(.*)")
@ -81,7 +81,6 @@ def text2phone(text, language, use_espeak_phonemes=False):
# Fix a few phonemes # Fix a few phonemes
ph = ph.translate(GRUUT_TRANS_TABLE) ph = ph.translate(GRUUT_TRANS_TABLE)
print(" > Phonemes: {}".format(ph))
return ph return ph
raise ValueError(f" [!] Language {language} is not supported for phonemization.") raise ValueError(f" [!] Language {language} is not supported for phonemization.")
@ -106,13 +105,38 @@ def pad_with_eos_bos(phoneme_sequence, tp=None):
def phoneme_to_sequence( def phoneme_to_sequence(
text, cleaner_names, language, enable_eos_bos=False, tp=None, add_blank=False, use_espeak_phonemes=False text: str,
): cleaner_names: List[str],
language: str,
enable_eos_bos: bool = False,
custom_symbols: List[str] = None,
tp: Dict = None,
add_blank: bool = False,
use_espeak_phonemes: bool = False,
) -> List[int]:
"""Converts a string of phonemes to a sequence of IDs.
If `custom_symbols` is provided, it will override the default symbols.
Args:
text (str): string to convert to a sequence
cleaner_names (List[str]): names of the cleaner functions to run the text through
language (str): text language key for phonemization.
enable_eos_bos (bool): whether to append the end-of-sentence and beginning-of-sentence tokens.
tp (Dict): dictionary of character parameters to use a custom character set.
add_blank (bool): option to add a blank token between each token.
use_espeak_phonemes (bool): use espeak-based lexicons to convert phonemes to a sequence of IDs.
Returns:
List[int]: List of integers corresponding to the symbols in the text
"""
# pylint: disable=global-statement # pylint: disable=global-statement
global _phonemes_to_id, _phonemes global _phonemes_to_id, _phonemes
if tp:
if custom_symbols is not None:
_phonemes = custom_symbols
elif tp:
_, _phonemes = make_symbols(**tp) _, _phonemes = make_symbols(**tp)
_phonemes_to_id = {s: i for i, s in enumerate(_phonemes)} _phonemes_to_id = {s: i for i, s in enumerate(_phonemes)}
sequence = [] sequence = []
clean_text = _clean_text(text, cleaner_names) clean_text = _clean_text(text, cleaner_names)
@ -127,20 +151,22 @@ def phoneme_to_sequence(
sequence = pad_with_eos_bos(sequence, tp=tp) sequence = pad_with_eos_bos(sequence, tp=tp)
if add_blank: if add_blank:
sequence = intersperse(sequence, len(_phonemes)) # add a blank token (new), whose id number is len(_phonemes) sequence = intersperse(sequence, len(_phonemes)) # add a blank token (new), whose id number is len(_phonemes)
return sequence return sequence
def sequence_to_phoneme(sequence, tp=None, add_blank=False): def sequence_to_phoneme(sequence: List, tp: Dict = None, add_blank=False, custom_symbols: List["str"] = None):
# pylint: disable=global-statement # pylint: disable=global-statement
"""Converts a sequence of IDs back to a string""" """Converts a sequence of IDs back to a string"""
global _id_to_phonemes, _phonemes global _id_to_phonemes, _phonemes
if add_blank: if add_blank:
sequence = list(filter(lambda x: x != len(_phonemes), sequence)) sequence = list(filter(lambda x: x != len(_phonemes), sequence))
result = "" result = ""
if tp:
if custom_symbols is not None:
_phonemes = custom_symbols
elif tp:
_, _phonemes = make_symbols(**tp) _, _phonemes = make_symbols(**tp)
_id_to_phonemes = {i: s for i, s in enumerate(_phonemes)} _id_to_phonemes = {i: s for i, s in enumerate(_phonemes)}
for symbol_id in sequence: for symbol_id in sequence:
if symbol_id in _id_to_phonemes: if symbol_id in _id_to_phonemes:
@ -149,27 +175,32 @@ def sequence_to_phoneme(sequence, tp=None, add_blank=False):
return result.replace("}{", " ") return result.replace("}{", " ")
def text_to_sequence(text, cleaner_names, tp=None, add_blank=False): def text_to_sequence(
text: str, cleaner_names: List[str], custom_symbols: List[str] = None, tp: Dict = None, add_blank: bool = False
) -> List[int]:
"""Converts a string of text to a sequence of IDs corresponding to the symbols in the text. """Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
If `custom_symbols` is provided, it will override the default symbols.
The text can optionally have ARPAbet sequences enclosed in curly braces embedded
in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street."
Args: Args:
text: string to convert to a sequence text (str): string to convert to a sequence
cleaner_names: names of the cleaner functions to run the text through cleaner_names (List[str]): names of the cleaner functions to run the text through
tp: dictionary of character parameters to use a custom character set. tp (Dict): dictionary of character parameters to use a custom character set.
add_blank (bool): option to add a blank token between each token.
Returns: Returns:
List of integers corresponding to the symbols in the text List[int]: List of integers corresponding to the symbols in the text
""" """
# pylint: disable=global-statement # pylint: disable=global-statement
global _symbol_to_id, _symbols global _symbol_to_id, _symbols
if tp:
if custom_symbols is not None:
_symbols = custom_symbols
elif tp:
_symbols, _ = make_symbols(**tp) _symbols, _ = make_symbols(**tp)
_symbol_to_id = {s: i for i, s in enumerate(_symbols)} _symbol_to_id = {s: i for i, s in enumerate(_symbols)}
sequence = [] sequence = []
# Check for curly braces and treat their contents as ARPAbet: # Check for curly braces and treat their contents as ARPAbet:
while text: while text:
m = _CURLY_RE.match(text) m = _CURLY_RE.match(text)
@ -185,16 +216,18 @@ def text_to_sequence(text, cleaner_names, tp=None, add_blank=False):
return sequence return sequence
def sequence_to_text(sequence, tp=None, add_blank=False): def sequence_to_text(sequence: List, tp: Dict = None, add_blank=False, custom_symbols: List[str] = None):
"""Converts a sequence of IDs back to a string""" """Converts a sequence of IDs back to a string"""
# pylint: disable=global-statement # pylint: disable=global-statement
global _id_to_symbol, _symbols global _id_to_symbol, _symbols
if add_blank: if add_blank:
sequence = list(filter(lambda x: x != len(_symbols), sequence)) sequence = list(filter(lambda x: x != len(_symbols), sequence))
if tp: if custom_symbols is not None:
_symbols = custom_symbols
elif tp:
_symbols, _ = make_symbols(**tp) _symbols, _ = make_symbols(**tp)
_id_to_symbol = {i: s for i, s in enumerate(_symbols)} _id_to_symbol = {i: s for i, s in enumerate(_symbols)}
result = "" result = ""
for symbol_id in sequence: for symbol_id in sequence:
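A hedged example of the new `custom_symbols` argument: a model-specific symbol list (for instance from `Vits.make_symbols()` above) overrides the default symbol table for both encoding and decoding; `model` and `config` are assumed:

custom_symbols = model.make_symbols(config)
seq = text_to_sequence("hello world", ["basic_cleaners"], custom_symbols=custom_symbols)
text = sequence_to_text(seq, custom_symbols=custom_symbols)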

View File

@ -28,10 +28,10 @@ def make_symbols(
sorted(list(set(phonemes))) if unique else sorted(list(phonemes)) sorted(list(set(phonemes))) if unique else sorted(list(phonemes))
) # this is to keep previous models compatible. ) # this is to keep previous models compatible.
# Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters): # Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters):
_arpabet = ["@" + s for s in _phonemes_sorted] # _arpabet = ["@" + s for s in _phonemes_sorted]
# Export all symbols: # Export all symbols:
_phonemes = [pad, eos, bos] + list(_phonemes_sorted) + list(punctuations) _phonemes = [pad, eos, bos] + list(_phonemes_sorted) + list(punctuations)
_symbols += _arpabet # _symbols += _arpabet
return _symbols, _phonemes return _symbols, _phonemes

View File

@ -14,7 +14,10 @@ from TTS.tts.utils.data import StandardScaler
class TorchSTFT(nn.Module): # pylint: disable=abstract-method class TorchSTFT(nn.Module): # pylint: disable=abstract-method
"""TODO: Merge this with audio.py""" """Some of the audio processing funtions using Torch for faster batch processing.
TODO: Merge this with audio.py
"""
def __init__( def __init__(
self, self,
@ -28,6 +31,8 @@ class TorchSTFT(nn.Module): # pylint: disable=abstract-method
mel_fmax=None, mel_fmax=None,
n_mels=80, n_mels=80,
use_mel=False, use_mel=False,
do_amp_to_db=False,
spec_gain=1.0,
): ):
super().__init__() super().__init__()
self.n_fft = n_fft self.n_fft = n_fft
@ -39,6 +44,8 @@ class TorchSTFT(nn.Module): # pylint: disable=abstract-method
self.mel_fmax = mel_fmax self.mel_fmax = mel_fmax
self.n_mels = n_mels self.n_mels = n_mels
self.use_mel = use_mel self.use_mel = use_mel
self.do_amp_to_db = do_amp_to_db
self.spec_gain = spec_gain
self.window = nn.Parameter(getattr(torch, window)(win_length), requires_grad=False) self.window = nn.Parameter(getattr(torch, window)(win_length), requires_grad=False)
self.mel_basis = None self.mel_basis = None
if use_mel: if use_mel:
@ -79,6 +86,8 @@ class TorchSTFT(nn.Module): # pylint: disable=abstract-method
S = torch.sqrt(torch.clamp(M ** 2 + P ** 2, min=1e-8)) S = torch.sqrt(torch.clamp(M ** 2 + P ** 2, min=1e-8))
if self.use_mel: if self.use_mel:
S = torch.matmul(self.mel_basis.to(x), S) S = torch.matmul(self.mel_basis.to(x), S)
if self.do_amp_to_db:
S = self._amp_to_db(S, spec_gain=self.spec_gain)
return S return S
def _build_mel_basis(self): def _build_mel_basis(self):
@ -87,6 +96,14 @@ class TorchSTFT(nn.Module): # pylint: disable=abstract-method
) )
self.mel_basis = torch.from_numpy(mel_basis).float() self.mel_basis = torch.from_numpy(mel_basis).float()
@staticmethod
def _amp_to_db(x, spec_gain=1.0):
return torch.log(torch.clamp(x, min=1e-5) * spec_gain)
@staticmethod
def _db_to_amp(x, spec_gain=1.0):
return torch.exp(x) / spec_gain
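# Hedged usage sketch (not part of the diff above): compute a log-mel spectrogram in one
# call with the new flags. All values and the call shape are illustrative assumptions,
# not taken from a specific config.
#
# stft = TorchSTFT(
#     n_fft=1024, hop_length=256, win_length=1024,
#     sample_rate=22050, n_mels=80, use_mel=True,
#     do_amp_to_db=True, spec_gain=1.0,
# )
# log_mel = stft(torch.randn(1, 22050))   # assumed output shape: [B, n_mels, T]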
# pylint: disable=too-many-public-methods # pylint: disable=too-many-public-methods
class AudioProcessor(object): class AudioProcessor(object):
@ -97,33 +114,93 @@ class AudioProcessor(object):
of the class with the model config. They are not meaningful for all the arguments. of the class with the model config. They are not meaningful for all the arguments.
Args: Args:
sample_rate (int, optional): target audio sampling rate. Defaults to None. sample_rate (int, optional):
resample (bool, optional): enable/disable resampling of the audio clips when the target sampling rate does not match the original sampling rate. Defaults to False. target audio sampling rate. Defaults to None.
num_mels (int, optional): number of melspectrogram dimensions. Defaults to None.
log_func (int, optional): log exponent used for converting spectrogram aplitude to DB. resample (bool, optional):
min_level_db (int, optional): minimum db threshold for the computed melspectrograms. Defaults to None. enable/disable resampling of the audio clips when the target sampling rate does not match the original sampling rate. Defaults to False.
frame_shift_ms (int, optional): milliseconds of frames between STFT columns. Defaults to None.
frame_length_ms (int, optional): milliseconds of STFT window length. Defaults to None. num_mels (int, optional):
hop_length (int, optional): number of frames between STFT columns. Used if ```frame_shift_ms``` is None. Defaults to None. number of melspectrogram dimensions. Defaults to None.
win_length (int, optional): STFT window length. Used if ```frame_length_ms``` is None. Defaults to None.
ref_level_db (int, optional): reference DB level to avoid background noise. In general <20DB corresponds to the air noise. Defaults to None. log_func (int, optional):
fft_size (int, optional): FFT window size for STFT. Defaults to 1024. log exponent used for converting spectrogram amplitude to DB.
power (int, optional): Exponent value applied to the spectrogram before GriffinLim. Defaults to None.
preemphasis (float, optional): Preemphasis coefficient. Preemphasis is disabled if == 0.0. Defaults to 0.0. min_level_db (int, optional):
signal_norm (bool, optional): enable/disable signal normalization. Defaults to None. minimum db threshold for the computed melspectrograms. Defaults to None.
symmetric_norm (bool, optional): enable/disable symmetric normalization. If set True normalization is performed in the range [-k, k] else [0, k], Defaults to None.
max_norm (float, optional): ```k``` defining the normalization range. Defaults to None. frame_shift_ms (int, optional):
mel_fmin (int, optional): minimum filter frequency for computing melspectrograms. Defaults to None. milliseconds of frames between STFT columns. Defaults to None.
mel_fmax (int, optional): maximum filter frequency for computing melspectrograms.. Defaults to None.
spec_gain (int, optional): gain applied when converting amplitude to DB. Defaults to 20. frame_length_ms (int, optional):
stft_pad_mode (str, optional): Padding mode for STFT. Defaults to 'reflect'. milliseconds of STFT window length. Defaults to None.
clip_norm (bool, optional): enable/disable clipping the our of range values in the normalized audio signal. Defaults to True.
griffin_lim_iters (int, optional): Number of GriffinLim iterations. Defaults to None. hop_length (int, optional):
do_trim_silence (bool, optional): enable/disable silence trimming when loading the audio signal. Defaults to False. number of frames between STFT columns. Used if ```frame_shift_ms``` is None. Defaults to None.
trim_db (int, optional): DB threshold used for silence trimming. Defaults to 60.
do_sound_norm (bool, optional): enable/disable signal normalization. Defaults to False. win_length (int, optional):
stats_path (str, optional): Path to the computed stats file. Defaults to None. STFT window length. Used if ```frame_length_ms``` is None. Defaults to None.
verbose (bool, optional): enable/disable logging. Defaults to True.
ref_level_db (int, optional):
reference DB level to avoid background noise. In general <20DB corresponds to the air noise. Defaults to None.
fft_size (int, optional):
FFT window size for STFT. Defaults to 1024.
power (int, optional):
Exponent value applied to the spectrogram before GriffinLim. Defaults to None.
preemphasis (float, optional):
Preemphasis coefficient. Preemphasis is disabled if == 0.0. Defaults to 0.0.
signal_norm (bool, optional):
enable/disable signal normalization. Defaults to None.
symmetric_norm (bool, optional):
enable/disable symmetric normalization. If set True normalization is performed in the range [-k, k] else [0, k], Defaults to None.
max_norm (float, optional):
```k``` defining the normalization range. Defaults to None.
mel_fmin (int, optional):
minimum filter frequency for computing melspectrograms. Defaults to None.
mel_fmax (int, optional):
maximum filter frequency for computing melspectrograms. Defaults to None.
spec_gain (int, optional):
gain applied when converting amplitude to DB. Defaults to 20.
stft_pad_mode (str, optional):
Padding mode for STFT. Defaults to 'reflect'.
clip_norm (bool, optional):
enable/disable clipping the out-of-range values in the normalized audio signal. Defaults to True.
griffin_lim_iters (int, optional):
Number of GriffinLim iterations. Defaults to None.
do_trim_silence (bool, optional):
enable/disable silence trimming when loading the audio signal. Defaults to False.
trim_db (int, optional):
DB threshold used for silence trimming. Defaults to 60.
do_sound_norm (bool, optional):
enable/disable signal normalization. Defaults to False.
do_amp_to_db_linear (bool, optional):
enable/disable amplitude to dB conversion of linear spectrograms. Defaults to True.
do_amp_to_db_mel (bool, optional):
enable/disable amplitude to dB conversion of mel spectrograms. Defaults to True.
stats_path (str, optional):
Path to the computed stats file. Defaults to None.
verbose (bool, optional):
enable/disable logging. Defaults to True.
""" """
def __init__( def __init__(
@ -153,6 +230,8 @@ class AudioProcessor(object):
do_trim_silence=False, do_trim_silence=False,
trim_db=60, trim_db=60,
do_sound_norm=False, do_sound_norm=False,
do_amp_to_db_linear=True,
do_amp_to_db_mel=True,
stats_path=None, stats_path=None,
verbose=True, verbose=True,
**_, **_,
@ -181,6 +260,8 @@ class AudioProcessor(object):
self.do_trim_silence = do_trim_silence self.do_trim_silence = do_trim_silence
self.trim_db = trim_db self.trim_db = trim_db
self.do_sound_norm = do_sound_norm self.do_sound_norm = do_sound_norm
self.do_amp_to_db_linear = do_amp_to_db_linear
self.do_amp_to_db_mel = do_amp_to_db_mel
self.stats_path = stats_path self.stats_path = stats_path
# setup exp_func for db to amp conversion # setup exp_func for db to amp conversion
if log_func == "np.log": if log_func == "np.log":
@ -381,7 +462,6 @@ class AudioProcessor(object):
Returns: Returns:
np.ndarray: Decibels spectrogram. np.ndarray: Decibels spectrogram.
""" """
return self.spec_gain * _log(np.maximum(1e-5, x), self.base) return self.spec_gain * _log(np.maximum(1e-5, x), self.base)
# pylint: disable=no-self-use # pylint: disable=no-self-use
@ -448,7 +528,10 @@ class AudioProcessor(object):
D = self._stft(self.apply_preemphasis(y)) D = self._stft(self.apply_preemphasis(y))
else: else:
D = self._stft(y) D = self._stft(y)
S = self._amp_to_db(np.abs(D)) if self.do_amp_to_db_linear:
S = self._amp_to_db(np.abs(D))
else:
S = np.abs(D)
return self.normalize(S).astype(np.float32) return self.normalize(S).astype(np.float32)
def melspectrogram(self, y: np.ndarray) -> np.ndarray: def melspectrogram(self, y: np.ndarray) -> np.ndarray:
@ -457,7 +540,10 @@ class AudioProcessor(object):
D = self._stft(self.apply_preemphasis(y)) D = self._stft(self.apply_preemphasis(y))
else: else:
D = self._stft(y) D = self._stft(y)
S = self._amp_to_db(self._linear_to_mel(np.abs(D))) if self.do_amp_to_db_mel:
S = self._amp_to_db(self._linear_to_mel(np.abs(D)))
else:
S = self._linear_to_mel(np.abs(D))
return self.normalize(S).astype(np.float32) return self.normalize(S).astype(np.float32)
def inv_spectrogram(self, spectrogram: np.ndarray) -> np.ndarray: def inv_spectrogram(self, spectrogram: np.ndarray) -> np.ndarray:
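A hedged sketch of the new `do_amp_to_db_*` flags: with `do_amp_to_db_mel=False` the processor returns linear-amplitude mel spectrograms instead of dB. All constructor values and the wav path are illustrative:

ap = AudioProcessor(
    sample_rate=22050, fft_size=1024, hop_length=256, win_length=1024,
    num_mels=80, mel_fmin=0, mel_fmax=8000,
    do_amp_to_db_linear=True, do_amp_to_db_mel=False,
)
wav = ap.load_wav("/path/to/sample.wav")   # placeholder path
mel = ap.melspectrogram(wav)               # amplitude mel (no dB conversion)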

View File

@ -1,15 +1,14 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import datetime import datetime
import glob
import importlib import importlib
import os import os
import re import re
import shutil
import subprocess import subprocess
import sys import sys
from pathlib import Path from pathlib import Path
from typing import Dict from typing import Dict
import fsspec
import torch import torch
@ -58,23 +57,22 @@ def get_commit_hash():
return commit return commit
def create_experiment_folder(root_path, model_name): def get_experiment_folder_path(root_path, model_name):
"""Create a folder with the current date and time""" """Get an experiment folder path with the current date and time"""
date_str = datetime.datetime.now().strftime("%B-%d-%Y_%I+%M%p") date_str = datetime.datetime.now().strftime("%B-%d-%Y_%I+%M%p")
commit_hash = get_commit_hash() commit_hash = get_commit_hash()
output_folder = os.path.join(root_path, model_name + "-" + date_str + "-" + commit_hash) output_folder = os.path.join(root_path, model_name + "-" + date_str + "-" + commit_hash)
os.makedirs(output_folder, exist_ok=True)
print(" > Experiment folder: {}".format(output_folder)) print(" > Experiment folder: {}".format(output_folder))
return output_folder return output_folder
def remove_experiment_folder(experiment_path): def remove_experiment_folder(experiment_path):
"""Check folder if there is a checkpoint, otherwise remove the folder""" """Check folder if there is a checkpoint, otherwise remove the folder"""
fs = fsspec.get_mapper(experiment_path).fs
checkpoint_files = glob.glob(experiment_path + "/*.pth.tar") checkpoint_files = fs.glob(experiment_path + "/*.pth.tar")
if not checkpoint_files: if not checkpoint_files:
if os.path.exists(experiment_path): if fs.exists(experiment_path):
shutil.rmtree(experiment_path, ignore_errors=True) fs.rm(experiment_path, recursive=True)
print(" ! Run is removed from {}".format(experiment_path)) print(" ! Run is removed from {}".format(experiment_path))
else: else:
print(" ! Run is kept in {}".format(experiment_path)) print(" ! Run is kept in {}".format(experiment_path))

View File

@ -1,9 +1,11 @@
import datetime import datetime
import glob import json
import os import os
import pickle as pickle_tts import pickle as pickle_tts
from shutil import copyfile import shutil
from typing import Any
import fsspec
import torch import torch
from coqpit import Coqpit from coqpit import Coqpit
@ -24,7 +26,7 @@ class AttrDict(dict):
self.__dict__ = self self.__dict__ = self
def copy_model_files(config, out_path, new_fields): def copy_model_files(config: Coqpit, out_path, new_fields):
"""Copy config.json and other model files to training folder and add """Copy config.json and other model files to training folder and add
new fields. new fields.
@ -37,23 +39,40 @@ def copy_model_files(config, out_path, new_fields):
copy_config_path = os.path.join(out_path, "config.json") copy_config_path = os.path.join(out_path, "config.json")
# add extra information fields # add extra information fields
config.update(new_fields, allow_new=True) config.update(new_fields, allow_new=True)
config.save_json(copy_config_path) # TODO: Revert to config.save_json() once Coqpit supports arbitrary paths.
with fsspec.open(copy_config_path, "w", encoding="utf8") as f:
json.dump(config.to_dict(), f, indent=4)
# copy model stats file if available # copy model stats file if available
if config.audio.stats_path is not None: if config.audio.stats_path is not None:
copy_stats_path = os.path.join(out_path, "scale_stats.npy") copy_stats_path = os.path.join(out_path, "scale_stats.npy")
if not os.path.exists(copy_stats_path): filesystem = fsspec.get_mapper(copy_stats_path).fs
copyfile( if not filesystem.exists(copy_stats_path):
config.audio.stats_path, with fsspec.open(config.audio.stats_path, "rb") as source_file:
copy_stats_path, with fsspec.open(copy_stats_path, "wb") as target_file:
) shutil.copyfileobj(source_file, target_file)
def load_fsspec(path: str, **kwargs) -> Any:
"""Like torch.load but can load from other locations (e.g. s3:// , gs://).
Args:
path: Any path or url supported by fsspec.
**kwargs: Keyword arguments forwarded to torch.load.
Returns:
Object stored in path.
"""
with fsspec.open(path, "rb") as f:
return torch.load(f, **kwargs)
def load_checkpoint(model, checkpoint_path, use_cuda=False, eval=False): # pylint: disable=redefined-builtin def load_checkpoint(model, checkpoint_path, use_cuda=False, eval=False): # pylint: disable=redefined-builtin
try: try:
state = torch.load(checkpoint_path, map_location=torch.device("cpu")) state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"))
except ModuleNotFoundError: except ModuleNotFoundError:
pickle_tts.Unpickler = RenamingUnpickler pickle_tts.Unpickler = RenamingUnpickler
state = torch.load(checkpoint_path, map_location=torch.device("cpu"), pickle_module=pickle_tts) state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), pickle_module=pickle_tts)
model.load_state_dict(state["model"]) model.load_state_dict(state["model"])
if use_cuda: if use_cuda:
model.cuda() model.cuda()
@ -62,6 +81,18 @@ def load_checkpoint(model, checkpoint_path, use_cuda=False, eval=False): # pyli
return model, state return model, state
def save_fsspec(state: Any, path: str, **kwargs):
"""Like torch.save but can save to other locations (e.g. s3:// , gs://).
Args:
state: State object to save
path: Any path or url supported by fsspec.
**kwargs: Keyword arguments forwarded to torch.save.
"""
with fsspec.open(path, "wb") as f:
torch.save(state, f, **kwargs)
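# Hedged usage sketch (not part of the diff): the helpers above accept any fsspec URL,
# e.g. an S3 bucket (placeholder path; requires the matching filesystem package such as s3fs).
#
# state = {"model": model.state_dict(), "step": 1000}
# save_fsspec(state, "s3://my-bucket/run-1/checkpoint_1000.pth.tar")
# restored = load_fsspec("s3://my-bucket/run-1/checkpoint_1000.pth.tar", map_location="cpu")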
def save_model(config, model, optimizer, scaler, current_step, epoch, output_path, **kwargs): def save_model(config, model, optimizer, scaler, current_step, epoch, output_path, **kwargs):
if hasattr(model, "module"): if hasattr(model, "module"):
model_state = model.module.state_dict() model_state = model.module.state_dict()
@ -90,7 +121,7 @@ def save_model(config, model, optimizer, scaler, current_step, epoch, output_pat
"date": datetime.date.today().strftime("%B %d, %Y"), "date": datetime.date.today().strftime("%B %d, %Y"),
} }
state.update(kwargs) state.update(kwargs)
torch.save(state, output_path) save_fsspec(state, output_path)
def save_checkpoint( def save_checkpoint(
@ -147,18 +178,16 @@ def save_best_model(
model_loss=current_loss, model_loss=current_loss,
**kwargs, **kwargs,
) )
fs = fsspec.get_mapper(out_path).fs
# only delete previous if current is saved successfully # only delete previous if current is saved successfully
if not keep_all_best or (current_step < keep_after): if not keep_all_best or (current_step < keep_after):
model_names = glob.glob(os.path.join(out_path, "best_model*.pth.tar")) model_names = fs.glob(os.path.join(out_path, "best_model*.pth.tar"))
for model_name in model_names: for model_name in model_names:
if os.path.basename(model_name) == best_model_name: if os.path.basename(model_name) != best_model_name:
continue fs.rm(model_name)
os.remove(model_name) # create a shortcut which always points to the currently best model
# create symlink to best model for convinience shortcut_name = "best_model.pth.tar"
link_name = "best_model.pth.tar" shortcut_path = os.path.join(out_path, shortcut_name)
link_path = os.path.join(out_path, link_name) fs.copy(checkpoint_path, shortcut_path)
if os.path.islink(link_path) or os.path.isfile(link_path):
os.remove(link_path)
os.symlink(best_model_name, os.path.join(out_path, link_name))
best_loss = current_loss best_loss = current_loss
return best_loss return best_loss

View File

@ -1,2 +1,24 @@
from TTS.utils.logging.console_logger import ConsoleLogger from TTS.utils.logging.console_logger import ConsoleLogger
from TTS.utils.logging.tensorboard_logger import TensorboardLogger from TTS.utils.logging.tensorboard_logger import TensorboardLogger
from TTS.utils.logging.wandb_logger import WandbLogger
def init_logger(config):
if config.dashboard_logger == "tensorboard":
dashboard_logger = TensorboardLogger(config.output_log_path, model_name=config.model)
elif config.dashboard_logger == "wandb":
project_name = config.model
if config.project_name:
project_name = config.project_name
dashboard_logger = WandbLogger(
project=project_name,
name=config.run_name,
config=config,
entity=config.wandb_entity,
)
dashboard_logger.add_text("model-config", f"<pre>{config.to_json()}</pre>", 0)
return dashboard_logger
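A hedged sketch of the config fields `init_logger()` reads; the field names are taken from the code above and are assumed to exist on the training config, while the values are illustrative:

config.dashboard_logger = "wandb"        # or "tensorboard"
config.project_name = "vits-ljspeech"    # hypothetical project
config.run_name = "run-1"
config.wandb_entity = None
dashboard_logger = init_logger(config)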

View File

@ -38,7 +38,7 @@ class ConsoleLogger:
def print_train_start(self): def print_train_start(self):
print(f"\n{tcolors.BOLD} > TRAINING ({self.get_time()}) {tcolors.ENDC}") print(f"\n{tcolors.BOLD} > TRAINING ({self.get_time()}) {tcolors.ENDC}")
def print_train_step(self, batch_steps, step, global_step, log_dict, loss_dict, avg_loss_dict): def print_train_step(self, batch_steps, step, global_step, loss_dict, avg_loss_dict):
indent = " | > " indent = " | > "
print() print()
log_text = "{} --> STEP: {}/{} -- GLOBAL_STEP: {}{}\n".format( log_text = "{} --> STEP: {}/{} -- GLOBAL_STEP: {}{}\n".format(
@ -50,13 +50,6 @@ class ConsoleLogger:
log_text += "{}{}: {:.5f} ({:.5f})\n".format(indent, key, value, avg_loss_dict[f"avg_{key}"]) log_text += "{}{}: {:.5f} ({:.5f})\n".format(indent, key, value, avg_loss_dict[f"avg_{key}"])
else: else:
log_text += "{}{}: {:.5f} \n".format(indent, key, value) log_text += "{}{}: {:.5f} \n".format(indent, key, value)
for idx, (key, value) in enumerate(log_dict.items()):
if isinstance(value, list):
log_text += f"{indent}{key}: {value[0]:.{value[1]}f}"
else:
log_text += f"{indent}{key}: {value}"
if idx < len(log_dict) - 1:
log_text += "\n"
print(log_text, flush=True) print(log_text, flush=True)
# pylint: disable=unused-argument # pylint: disable=unused-argument

View File

@ -7,10 +7,8 @@ class TensorboardLogger(object):
def __init__(self, log_dir, model_name): def __init__(self, log_dir, model_name):
self.model_name = model_name self.model_name = model_name
self.writer = SummaryWriter(log_dir) self.writer = SummaryWriter(log_dir)
self.train_stats = {}
self.eval_stats = {}
def tb_model_weights(self, model, step): def model_weights(self, model, step):
layer_num = 1 layer_num = 1
for name, param in model.named_parameters(): for name, param in model.named_parameters():
if param.numel() == 1: if param.numel() == 1:
@ -41,32 +39,41 @@ class TensorboardLogger(object):
except RuntimeError: except RuntimeError:
traceback.print_exc() traceback.print_exc()
def tb_train_step_stats(self, step, stats): def train_step_stats(self, step, stats):
self.dict_to_tb_scalar(f"{self.model_name}_TrainIterStats", stats, step) self.dict_to_tb_scalar(f"{self.model_name}_TrainIterStats", stats, step)
def tb_train_epoch_stats(self, step, stats): def train_epoch_stats(self, step, stats):
self.dict_to_tb_scalar(f"{self.model_name}_TrainEpochStats", stats, step) self.dict_to_tb_scalar(f"{self.model_name}_TrainEpochStats", stats, step)
def tb_train_figures(self, step, figures): def train_figures(self, step, figures):
self.dict_to_tb_figure(f"{self.model_name}_TrainFigures", figures, step) self.dict_to_tb_figure(f"{self.model_name}_TrainFigures", figures, step)
def tb_train_audios(self, step, audios, sample_rate): def train_audios(self, step, audios, sample_rate):
self.dict_to_tb_audios(f"{self.model_name}_TrainAudios", audios, step, sample_rate) self.dict_to_tb_audios(f"{self.model_name}_TrainAudios", audios, step, sample_rate)
def tb_eval_stats(self, step, stats): def eval_stats(self, step, stats):
self.dict_to_tb_scalar(f"{self.model_name}_EvalStats", stats, step) self.dict_to_tb_scalar(f"{self.model_name}_EvalStats", stats, step)
def tb_eval_figures(self, step, figures): def eval_figures(self, step, figures):
self.dict_to_tb_figure(f"{self.model_name}_EvalFigures", figures, step) self.dict_to_tb_figure(f"{self.model_name}_EvalFigures", figures, step)
def tb_eval_audios(self, step, audios, sample_rate): def eval_audios(self, step, audios, sample_rate):
self.dict_to_tb_audios(f"{self.model_name}_EvalAudios", audios, step, sample_rate) self.dict_to_tb_audios(f"{self.model_name}_EvalAudios", audios, step, sample_rate)
def tb_test_audios(self, step, audios, sample_rate): def test_audios(self, step, audios, sample_rate):
self.dict_to_tb_audios(f"{self.model_name}_TestAudios", audios, step, sample_rate) self.dict_to_tb_audios(f"{self.model_name}_TestAudios", audios, step, sample_rate)
def tb_test_figures(self, step, figures): def test_figures(self, step, figures):
self.dict_to_tb_figure(f"{self.model_name}_TestFigures", figures, step) self.dict_to_tb_figure(f"{self.model_name}_TestFigures", figures, step)
def tb_add_text(self, title, text, step): def add_text(self, title, text, step):
self.writer.add_text(title, text, step) self.writer.add_text(title, text, step)
def log_artifact(self, file_or_dir, name, artifact_type, aliases=None): # pylint: disable=W0613, R0201
yield
def flush(self):
self.writer.flush()
def finish(self):
self.writer.close()

View File

@ -0,0 +1,111 @@
# pylint: disable=W0613
import traceback
from pathlib import Path
try:
import wandb
from wandb import finish, init # pylint: disable=W0611
except ImportError:
wandb = None
class WandbLogger:
def __init__(self, **kwargs):
if not wandb:
raise Exception("install wandb using `pip install wandb` to use WandbLogger")
self.run = None
self.run = wandb.init(**kwargs) if not wandb.run else wandb.run
self.model_name = self.run.config.model
self.log_dict = {}
def model_weights(self, model):
layer_num = 1
for name, param in model.named_parameters():
if param.numel() == 1:
self.dict_to_scalar("weights", {"layer{}-{}/value".format(layer_num, name): param.max()})
else:
self.dict_to_scalar("weights", {"layer{}-{}/max".format(layer_num, name): param.max()})
self.dict_to_scalar("weights", {"layer{}-{}/min".format(layer_num, name): param.min()})
self.dict_to_scalar("weights", {"layer{}-{}/mean".format(layer_num, name): param.mean()})
self.dict_to_scalar("weights", {"layer{}-{}/std".format(layer_num, name): param.std()})
self.log_dict["weights/layer{}-{}/param".format(layer_num, name)] = wandb.Histogram(param)
self.log_dict["weights/layer{}-{}/grad".format(layer_num, name)] = wandb.Histogram(param.grad)
layer_num += 1
def dict_to_scalar(self, scope_name, stats):
for key, value in stats.items():
self.log_dict["{}/{}".format(scope_name, key)] = value
def dict_to_figure(self, scope_name, figures):
for key, value in figures.items():
self.log_dict["{}/{}".format(scope_name, key)] = wandb.Image(value)
def dict_to_audios(self, scope_name, audios, sample_rate):
for key, value in audios.items():
if value.dtype == "float16":
value = value.astype("float32")
try:
self.log_dict["{}/{}".format(scope_name, key)] = wandb.Audio(value, sample_rate=sample_rate)
except RuntimeError:
traceback.print_exc()
def log(self, log_dict, prefix="", flush=False):
for key, value in log_dict.items():
self.log_dict[prefix + key] = value
if flush: # for cases where you don't want to accumulate data
self.flush()
def train_step_stats(self, step, stats):
self.dict_to_scalar(f"{self.model_name}_TrainIterStats", stats)
def train_epoch_stats(self, step, stats):
self.dict_to_scalar(f"{self.model_name}_TrainEpochStats", stats)
def train_figures(self, step, figures):
self.dict_to_figure(f"{self.model_name}_TrainFigures", figures)
def train_audios(self, step, audios, sample_rate):
self.dict_to_audios(f"{self.model_name}_TrainAudios", audios, sample_rate)
def eval_stats(self, step, stats):
self.dict_to_scalar(f"{self.model_name}_EvalStats", stats)
def eval_figures(self, step, figures):
self.dict_to_figure(f"{self.model_name}_EvalFigures", figures)
def eval_audios(self, step, audios, sample_rate):
self.dict_to_audios(f"{self.model_name}_EvalAudios", audios, sample_rate)
def test_audios(self, step, audios, sample_rate):
self.dict_to_audios(f"{self.model_name}_TestAudios", audios, sample_rate)
def test_figures(self, step, figures):
self.dict_to_figure(f"{self.model_name}_TestFigures", figures)
def add_text(self, title, text, step):
pass
def flush(self):
if self.run:
wandb.log(self.log_dict)
self.log_dict = {}
def finish(self):
if self.run:
self.run.finish()
def log_artifact(self, file_or_dir, name, artifact_type, aliases=None):
if not self.run:
return
name = "_".join([self.run.id, name])
artifact = wandb.Artifact(name, type=artifact_type)
data_path = Path(file_or_dir)
if data_path.is_dir():
artifact.add_dir(str(data_path))
elif data_path.is_file():
artifact.add_file(str(data_path))
self.run.log_artifact(artifact, aliases=aliases)

View File

@ -64,6 +64,7 @@ class ModelManager(object):
def list_models(self): def list_models(self):
print(" Name format: type/language/dataset/model") print(" Name format: type/language/dataset/model")
models_name_list = [] models_name_list = []
model_count = 1
for model_type in self.models_dict: for model_type in self.models_dict:
for lang in self.models_dict[model_type]: for lang in self.models_dict[model_type]:
for dataset in self.models_dict[model_type][lang]: for dataset in self.models_dict[model_type][lang]:
@ -71,10 +72,11 @@ class ModelManager(object):
model_full_name = f"{model_type}--{lang}--{dataset}--{model}" model_full_name = f"{model_type}--{lang}--{dataset}--{model}"
output_path = os.path.join(self.output_prefix, model_full_name) output_path = os.path.join(self.output_prefix, model_full_name)
if os.path.exists(output_path): if os.path.exists(output_path):
print(f" >: {model_type}/{lang}/{dataset}/{model} [already downloaded]") print(f" {model_count}: {model_type}/{lang}/{dataset}/{model} [already downloaded]")
else: else:
print(f" >: {model_type}/{lang}/{dataset}/{model}") print(f" {model_count}: {model_type}/{lang}/{dataset}/{model}")
models_name_list.append(f"{model_type}/{lang}/{dataset}/{model}") models_name_list.append(f"{model_type}/{lang}/{dataset}/{model}")
model_count += 1
return models_name_list return models_name_list
def download_model(self, model_name): def download_model(self, model_name):

View File

@ -12,7 +12,6 @@ from TTS.tts.utils.speakers import SpeakerManager
# pylint: disable=unused-wildcard-import # pylint: disable=unused-wildcard-import
# pylint: disable=wildcard-import # pylint: disable=wildcard-import
from TTS.tts.utils.synthesis import synthesis, trim_silence from TTS.tts.utils.synthesis import synthesis, trim_silence
from TTS.tts.utils.text import make_symbols, phonemes, symbols
from TTS.utils.audio import AudioProcessor from TTS.utils.audio import AudioProcessor
from TTS.vocoder.models import setup_model as setup_vocoder_model from TTS.vocoder.models import setup_model as setup_vocoder_model
from TTS.vocoder.utils.generic_utils import interpolate_vocoder_input from TTS.vocoder.utils.generic_utils import interpolate_vocoder_input
@ -103,6 +102,34 @@ class Synthesizer(object):
self.num_speakers = self.speaker_manager.num_speakers self.num_speakers = self.speaker_manager.num_speakers
self.d_vector_dim = self.speaker_manager.d_vector_dim self.d_vector_dim = self.speaker_manager.d_vector_dim
def _set_tts_speaker_file(self):
"""Set the TTS speaker file used by a multi-speaker model."""
# setup if multi-speaker settings are in the global model config
if hasattr(self.tts_config, "use_speaker_embedding") and self.tts_config.use_speaker_embedding is True:
if self.tts_config.use_d_vector_file:
self.tts_speakers_file = (
self.tts_speakers_file if self.tts_speakers_file else self.tts_config["d_vector_file"]
)
self.tts_config["d_vector_file"] = self.tts_speakers_file
else:
self.tts_speakers_file = (
self.tts_speakers_file if self.tts_speakers_file else self.tts_config["speakers_file"]
)
# setup if multi-speaker settings are in the model args config
if (
self.tts_speakers_file is None
and hasattr(self.tts_config, "model_args")
and hasattr(self.tts_config.model_args, "use_speaker_embedding")
and self.tts_config.model_args.use_speaker_embedding
):
_args = self.tts_config.model_args
if _args.use_d_vector_file:
self.tts_speakers_file = self.tts_speakers_file if self.tts_speakers_file else _args["d_vector_file"]
_args["d_vector_file"] = self.tts_speakers_file
else:
self.tts_speakers_file = self.tts_speakers_file if self.tts_speakers_file else _args["speakers_file"]
def _load_tts(self, tts_checkpoint: str, tts_config_path: str, use_cuda: bool) -> None: def _load_tts(self, tts_checkpoint: str, tts_config_path: str, use_cuda: bool) -> None:
"""Load the TTS model. """Load the TTS model.
@ -113,29 +140,15 @@ class Synthesizer(object):
""" """
# pylint: disable=global-statement # pylint: disable=global-statement
global symbols, phonemes
self.tts_config = load_config(tts_config_path) self.tts_config = load_config(tts_config_path)
self.use_phonemes = self.tts_config.use_phonemes self.use_phonemes = self.tts_config.use_phonemes
self.ap = AudioProcessor(verbose=False, **self.tts_config.audio) self.ap = AudioProcessor(verbose=False, **self.tts_config.audio)
if self.tts_config.has("characters") and self.tts_config.characters:
symbols, phonemes = make_symbols(**self.tts_config.characters)
if self.use_phonemes:
self.input_size = len(phonemes)
else:
self.input_size = len(symbols)
if self.tts_config.use_speaker_embedding is True:
self.tts_speakers_file = (
self.tts_speakers_file if self.tts_speakers_file else self.tts_config["d_vector_file"]
)
self.tts_config["d_vector_file"] = self.tts_speakers_file
self.tts_model = setup_tts_model(config=self.tts_config) self.tts_model = setup_tts_model(config=self.tts_config)
self.tts_model.load_checkpoint(self.tts_config, tts_checkpoint, eval=True) self.tts_model.load_checkpoint(self.tts_config, tts_checkpoint, eval=True)
if use_cuda: if use_cuda:
self.tts_model.cuda() self.tts_model.cuda()
self._set_tts_speaker_file()
def _load_vocoder(self, model_file: str, model_config: str, use_cuda: bool) -> None: def _load_vocoder(self, model_file: str, model_config: str, use_cuda: bool) -> None:
"""Load the vocoder model. """Load the vocoder model.
@ -187,15 +200,22 @@ class Synthesizer(object):
""" """
start_time = time.time() start_time = time.time()
wavs = [] wavs = []
speaker_embedding = None
sens = self.split_into_sentences(text) sens = self.split_into_sentences(text)
print(" > Text splitted to sentences.") print(" > Text splitted to sentences.")
print(sens) print(sens)
# handle multi-speaker
speaker_embedding = None
speaker_id = None
if self.tts_speakers_file: if self.tts_speakers_file:
# get the speaker embedding from the saved d_vectors.
if speaker_idx and isinstance(speaker_idx, str): if speaker_idx and isinstance(speaker_idx, str):
speaker_embedding = self.tts_model.speaker_manager.get_d_vectors_by_speaker(speaker_idx)[0] if self.tts_config.use_d_vector_file:
# get the speaker embedding from the saved d_vectors.
speaker_embedding = self.tts_model.speaker_manager.get_d_vectors_by_speaker(speaker_idx)[0]
else:
# get speaker idx from the speaker name
speaker_id = self.tts_model.speaker_manager.speaker_ids[speaker_idx]
elif not speaker_idx and not speaker_wav: elif not speaker_idx and not speaker_wav:
raise ValueError( raise ValueError(
" [!] Look like you use a multi-speaker model. " " [!] Look like you use a multi-speaker model. "
@ -224,14 +244,14 @@ class Synthesizer(object):
CONFIG=self.tts_config, CONFIG=self.tts_config,
use_cuda=self.use_cuda, use_cuda=self.use_cuda,
ap=self.ap, ap=self.ap,
speaker_id=None, speaker_id=speaker_id,
style_wav=style_wav, style_wav=style_wav,
enable_eos_bos_chars=self.tts_config.enable_eos_bos_chars, enable_eos_bos_chars=self.tts_config.enable_eos_bos_chars,
use_griffin_lim=use_gl, use_griffin_lim=use_gl,
d_vector=speaker_embedding, d_vector=speaker_embedding,
) )
waveform = outputs["wav"] waveform = outputs["wav"]
mel_postnet_spec = outputs["model_outputs"] mel_postnet_spec = outputs["outputs"]["model_outputs"][0].detach().numpy()
if not use_gl: if not use_gl:
# denormalize tts output based on tts audio config # denormalize tts output based on tts audio config
mel_postnet_spec = self.ap.denormalize(mel_postnet_spec.T).T mel_postnet_spec = self.ap.denormalize(mel_postnet_spec.T).T

View File

@ -1,5 +1,5 @@
import importlib import importlib
from typing import Dict from typing import Dict, List
import torch import torch
@ -48,7 +48,7 @@ def get_scheduler(
def get_optimizer( def get_optimizer(
optimizer_name: str, optimizer_params: dict, lr: float, model: torch.nn.Module optimizer_name: str, optimizer_params: dict, lr: float, model: torch.nn.Module = None, parameters: List = None
) -> torch.optim.Optimizer: ) -> torch.optim.Optimizer:
"""Find, initialize and return a optimizer. """Find, initialize and return a optimizer.
@ -66,4 +66,6 @@ def get_optimizer(
optimizer = getattr(module, "RAdam") optimizer = getattr(module, "RAdam")
else: else:
optimizer = getattr(torch.optim, optimizer_name) optimizer = getattr(torch.optim, optimizer_name)
return optimizer(model.parameters(), lr=lr, **optimizer_params) if model is not None:
parameters = model.parameters()
return optimizer(parameters, lr=lr, **optimizer_params)
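As the hunk above shows, `get_optimizer` now also accepts an explicit parameter list instead of a whole model. The sketch below shows both call styles; the stand-in module and hyper-parameters are illustrative only.

```python
import torch

from TTS.utils.trainer_utils import get_optimizer

model = torch.nn.Linear(80, 80)  # stand-in module, not a TTS model

# Old style: pass the model, get_optimizer() calls model.parameters() itself.
opt_full = get_optimizer("AdamW", {"betas": (0.8, 0.99), "weight_decay": 0.01}, lr=1e-4, model=model)

# New style: pass an explicit parameter list, e.g. to optimize only a subset of weights.
opt_subset = get_optimizer("AdamW", {"weight_decay": 0.0}, lr=1e-4, parameters=[model.weight])
```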

View File

@ -24,8 +24,10 @@ def setup_model(config: Coqpit):
elif config.model.lower() == "wavegrad": elif config.model.lower() == "wavegrad":
MyModel = getattr(MyModel, "Wavegrad") MyModel = getattr(MyModel, "Wavegrad")
else: else:
MyModel = getattr(MyModel, to_camel(config.model)) try:
raise ValueError(f"Model {config.model} not exist!") MyModel = getattr(MyModel, to_camel(config.model))
except ModuleNotFoundError as e:
raise ValueError(f"Model {config.model} not exist!") from e
model = MyModel(config) model = MyModel(config)
return model return model

View File

@ -9,6 +9,7 @@ from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler from torch.utils.data.distributed import DistributedSampler
from TTS.utils.audio import AudioProcessor from TTS.utils.audio import AudioProcessor
from TTS.utils.io import load_fsspec
from TTS.utils.trainer_utils import get_optimizer, get_scheduler from TTS.utils.trainer_utils import get_optimizer, get_scheduler
from TTS.vocoder.datasets.gan_dataset import GANDataset from TTS.vocoder.datasets.gan_dataset import GANDataset
from TTS.vocoder.layers.losses import DiscriminatorLoss, GeneratorLoss from TTS.vocoder.layers.losses import DiscriminatorLoss, GeneratorLoss
@ -222,7 +223,7 @@ class GAN(BaseVocoder):
checkpoint_path (str): Checkpoint file path. checkpoint_path (str): Checkpoint file path.
eval (bool, optional): If true, load the model for inference. Defaults to False. eval (bool, optional): If true, load the model for inference. Defaults to False.
""" """
state = torch.load(checkpoint_path, map_location=torch.device("cpu")) state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"))
# band-aid for older than v0.0.15 GAN models # band-aid for older than v0.0.15 GAN models
if "model_disc" in state: if "model_disc" in state:
self.model_g.load_checkpoint(config, checkpoint_path, eval) self.model_g.load_checkpoint(config, checkpoint_path, eval)

View File

@ -33,10 +33,10 @@ class DiscriminatorP(torch.nn.Module):
norm_f = nn.utils.spectral_norm if use_spectral_norm else nn.utils.weight_norm norm_f = nn.utils.spectral_norm if use_spectral_norm else nn.utils.weight_norm
self.convs = nn.ModuleList( self.convs = nn.ModuleList(
[ [
norm_f(nn.Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), norm_f(nn.Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
norm_f(nn.Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), norm_f(nn.Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
norm_f(nn.Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), norm_f(nn.Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
norm_f(nn.Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))), norm_f(nn.Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
norm_f(nn.Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(2, 0))), norm_f(nn.Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(2, 0))),
] ]
) )
@ -81,15 +81,15 @@ class MultiPeriodDiscriminator(torch.nn.Module):
Periods are suggested to be prime numbers to reduce the overlap between each discriminator. Periods are suggested to be prime numbers to reduce the overlap between each discriminator.
""" """
def __init__(self): def __init__(self, use_spectral_norm=False):
super().__init__() super().__init__()
self.discriminators = nn.ModuleList( self.discriminators = nn.ModuleList(
[ [
DiscriminatorP(2), DiscriminatorP(2, use_spectral_norm=use_spectral_norm),
DiscriminatorP(3), DiscriminatorP(3, use_spectral_norm=use_spectral_norm),
DiscriminatorP(5), DiscriminatorP(5, use_spectral_norm=use_spectral_norm),
DiscriminatorP(7), DiscriminatorP(7, use_spectral_norm=use_spectral_norm),
DiscriminatorP(11), DiscriminatorP(11, use_spectral_norm=use_spectral_norm),
] ]
) )
@ -99,7 +99,7 @@ class MultiPeriodDiscriminator(torch.nn.Module):
x (Tensor): input waveform. x (Tensor): input waveform.
Returns: Returns:
[List[Tensor]]: list of scores from each discriminator. [List[Tensor]]: list of scores from each discriminator.
[List[List[Tensor]]]: list of list of features from each discriminator's each convolutional layer. [List[List[Tensor]]]: list of list of features from each discriminator's each convolutional layer.
Shapes: Shapes:

View File

@ -5,6 +5,8 @@ import torch.nn.functional as F
from torch.nn import Conv1d, ConvTranspose1d from torch.nn import Conv1d, ConvTranspose1d
from torch.nn.utils import remove_weight_norm, weight_norm from torch.nn.utils import remove_weight_norm, weight_norm
from TTS.utils.io import load_fsspec
LRELU_SLOPE = 0.1 LRELU_SLOPE = 0.1
@ -168,6 +170,10 @@ class HifiganGenerator(torch.nn.Module):
upsample_initial_channel, upsample_initial_channel,
upsample_factors, upsample_factors,
inference_padding=5, inference_padding=5,
cond_channels=0,
conv_pre_weight_norm=True,
conv_post_weight_norm=True,
conv_post_bias=True,
): ):
r"""HiFiGAN Generator with Multi-Receptive Field Fusion (MRF) r"""HiFiGAN Generator with Multi-Receptive Field Fusion (MRF)
@ -216,12 +222,21 @@ class HifiganGenerator(torch.nn.Module):
for _, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)): for _, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
self.resblocks.append(resblock(ch, k, d)) self.resblocks.append(resblock(ch, k, d))
# post convolution layer # post convolution layer
self.conv_post = weight_norm(Conv1d(ch, out_channels, 7, 1, padding=3)) self.conv_post = weight_norm(Conv1d(ch, out_channels, 7, 1, padding=3, bias=conv_post_bias))
if cond_channels > 0:
self.cond_layer = nn.Conv1d(cond_channels, upsample_initial_channel, 1)
def forward(self, x): if not conv_pre_weight_norm:
remove_weight_norm(self.conv_pre)
if not conv_post_weight_norm:
remove_weight_norm(self.conv_post)
def forward(self, x, g=None):
""" """
Args: Args:
x (Tensor): conditioning input tensor. x (Tensor): feature input tensor.
g (Tensor): global conditioning input tensor.
Returns: Returns:
Tensor: output waveform. Tensor: output waveform.
@ -231,6 +246,8 @@ class HifiganGenerator(torch.nn.Module):
Tensor: [B, 1, T] Tensor: [B, 1, T]
""" """
o = self.conv_pre(x) o = self.conv_pre(x)
if hasattr(self, "cond_layer"):
o = o + self.cond_layer(g)
for i in range(self.num_upsamples): for i in range(self.num_upsamples):
o = F.leaky_relu(o, LRELU_SLOPE) o = F.leaky_relu(o, LRELU_SLOPE)
o = self.ups[i](o) o = self.ups[i](o)
@ -275,7 +292,7 @@ class HifiganGenerator(torch.nn.Module):
def load_checkpoint( def load_checkpoint(
self, config, checkpoint_path, eval=False self, config, checkpoint_path, eval=False
): # pylint: disable=unused-argument, redefined-builtin ): # pylint: disable=unused-argument, redefined-builtin
state = torch.load(checkpoint_path, map_location=torch.device("cpu")) state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"))
self.load_state_dict(state["model"]) self.load_state_dict(state["model"])
if eval: if eval:
self.eval() self.eval()

View File

@ -2,6 +2,7 @@ import torch
from torch import nn from torch import nn
from torch.nn.utils import weight_norm from torch.nn.utils import weight_norm
from TTS.utils.io import load_fsspec
from TTS.vocoder.layers.melgan import ResidualStack from TTS.vocoder.layers.melgan import ResidualStack
@ -86,7 +87,7 @@ class MelganGenerator(nn.Module):
def load_checkpoint( def load_checkpoint(
self, config, checkpoint_path, eval=False self, config, checkpoint_path, eval=False
): # pylint: disable=unused-argument, redefined-builtin ): # pylint: disable=unused-argument, redefined-builtin
state = torch.load(checkpoint_path, map_location=torch.device("cpu")) state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"))
self.load_state_dict(state["model"]) self.load_state_dict(state["model"])
if eval: if eval:
self.eval() self.eval()

View File

@ -3,6 +3,7 @@ import math
import numpy as np import numpy as np
import torch import torch
from TTS.utils.io import load_fsspec
from TTS.vocoder.layers.parallel_wavegan import ResidualBlock from TTS.vocoder.layers.parallel_wavegan import ResidualBlock
from TTS.vocoder.layers.upsample import ConvUpsample from TTS.vocoder.layers.upsample import ConvUpsample
@ -154,7 +155,7 @@ class ParallelWaveganGenerator(torch.nn.Module):
def load_checkpoint( def load_checkpoint(
self, config, checkpoint_path, eval=False self, config, checkpoint_path, eval=False
): # pylint: disable=unused-argument, redefined-builtin ): # pylint: disable=unused-argument, redefined-builtin
state = torch.load(checkpoint_path, map_location=torch.device("cpu")) state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"))
self.load_state_dict(state["model"]) self.load_state_dict(state["model"])
if eval: if eval:
self.eval() self.eval()

View File

@ -1,3 +1,5 @@
from typing import List
import numpy as np import numpy as np
import torch import torch
import torch.nn.functional as F import torch.nn.functional as F
@ -10,18 +12,35 @@ LRELU_SLOPE = 0.1
class UnivnetGenerator(torch.nn.Module): class UnivnetGenerator(torch.nn.Module):
def __init__( def __init__(
self, self,
in_channels, in_channels: int,
out_channels, out_channels: int,
hidden_channels, hidden_channels: int,
cond_channels, cond_channels: int,
upsample_factors, upsample_factors: List[int],
lvc_layers_each_block, lvc_layers_each_block: int,
lvc_kernel_size, lvc_kernel_size: int,
kpnet_hidden_channels, kpnet_hidden_channels: int,
kpnet_conv_size, kpnet_conv_size: int,
dropout, dropout: float,
use_weight_norm=True, use_weight_norm=True,
): ):
"""Univnet Generator network.
Paper: https://arxiv.org/pdf/2106.07889.pdf
Args:
in_channels (int): Number of input tensor channels.
out_channels (int): Number of channels of the output tensor.
hidden_channels (int): Number of hidden network channels.
cond_channels (int): Number of channels of the conditioning tensors.
upsample_factors (List[int]): List of upsample factors for the upsampling layers.
lvc_layers_each_block (int): Number of LVC layers in each block.
lvc_kernel_size (int): Kernel size of the LVC layers.
kpnet_hidden_channels (int): Number of hidden channels in the key-point network.
kpnet_conv_size (int): Number of convolution channels in the key-point network.
dropout (float): Dropout rate.
use_weight_norm (bool, optional): Enable/disable weight norm. Defaults to True.
"""
super().__init__() super().__init__()
self.in_channels = in_channels self.in_channels = in_channels

View File

@ -11,6 +11,7 @@ from torch.utils.data.distributed import DistributedSampler
from TTS.model import BaseModel from TTS.model import BaseModel
from TTS.utils.audio import AudioProcessor from TTS.utils.audio import AudioProcessor
from TTS.utils.io import load_fsspec
from TTS.utils.trainer_utils import get_optimizer, get_scheduler from TTS.utils.trainer_utils import get_optimizer, get_scheduler
from TTS.vocoder.datasets import WaveGradDataset from TTS.vocoder.datasets import WaveGradDataset
from TTS.vocoder.layers.wavegrad import Conv1d, DBlock, FiLM, UBlock from TTS.vocoder.layers.wavegrad import Conv1d, DBlock, FiLM, UBlock
@ -220,7 +221,7 @@ class Wavegrad(BaseModel):
def load_checkpoint( def load_checkpoint(
self, config, checkpoint_path, eval=False self, config, checkpoint_path, eval=False
): # pylint: disable=unused-argument, redefined-builtin ): # pylint: disable=unused-argument, redefined-builtin
state = torch.load(checkpoint_path, map_location=torch.device("cpu")) state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"))
self.load_state_dict(state["model"]) self.load_state_dict(state["model"])
if eval: if eval:
self.eval() self.eval()

View File

@ -13,6 +13,7 @@ from torch.utils.data.distributed import DistributedSampler
from TTS.tts.utils.visual import plot_spectrogram from TTS.tts.utils.visual import plot_spectrogram
from TTS.utils.audio import AudioProcessor from TTS.utils.audio import AudioProcessor
from TTS.utils.io import load_fsspec
from TTS.vocoder.datasets.wavernn_dataset import WaveRNNDataset from TTS.vocoder.datasets.wavernn_dataset import WaveRNNDataset
from TTS.vocoder.layers.losses import WaveRNNLoss from TTS.vocoder.layers.losses import WaveRNNLoss
from TTS.vocoder.models.base_vocoder import BaseVocoder from TTS.vocoder.models.base_vocoder import BaseVocoder
@ -545,7 +546,7 @@ class Wavernn(BaseVocoder):
def load_checkpoint( def load_checkpoint(
self, config, checkpoint_path, eval=False self, config, checkpoint_path, eval=False
): # pylint: disable=unused-argument, redefined-builtin ): # pylint: disable=unused-argument, redefined-builtin
state = torch.load(checkpoint_path, map_location=torch.device("cpu")) state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"))
self.load_state_dict(state["model"]) self.load_state_dict(state["model"])
if eval: if eval:
self.eval() self.eval()

View File

@ -1,6 +1,7 @@
import datetime import datetime
import pickle import pickle
import fsspec
import tensorflow as tf import tensorflow as tf
@ -13,12 +14,14 @@ def save_checkpoint(model, current_step, epoch, output_path, **kwargs):
"date": datetime.date.today().strftime("%B %d, %Y"), "date": datetime.date.today().strftime("%B %d, %Y"),
} }
state.update(kwargs) state.update(kwargs)
pickle.dump(state, open(output_path, "wb")) with fsspec.open(output_path, "wb") as f:
pickle.dump(state, f)
def load_checkpoint(model, checkpoint_path): def load_checkpoint(model, checkpoint_path):
"""Load TF Vocoder model""" """Load TF Vocoder model"""
checkpoint = pickle.load(open(checkpoint_path, "rb")) with fsspec.open(checkpoint_path, "rb") as f:
checkpoint = pickle.load(f)
chkp_var_dict = {var.name: var.numpy() for var in checkpoint["model"]} chkp_var_dict = {var.name: var.numpy() for var in checkpoint["model"]}
tf_vars = model.weights tf_vars = model.weights
for tf_var in tf_vars: for tf_var in tf_vars:

View File

@ -1,3 +1,4 @@
import fsspec
import tensorflow as tf import tensorflow as tf
@ -14,7 +15,7 @@ def convert_melgan_to_tflite(model, output_path=None, experimental_converter=Tru
print(f"Tflite Model size is {len(tflite_model) / (1024.0 * 1024.0)} MBs.") print(f"Tflite Model size is {len(tflite_model) / (1024.0 * 1024.0)} MBs.")
if output_path is not None: if output_path is not None:
# same model binary if outputpath is provided # same model binary if outputpath is provided
with open(output_path, "wb") as f: with fsspec.open(output_path, "wb") as f:
f.write(tflite_model) f.write(tflite_model)
return None return None
return tflite_model return tflite_model

View File

@ -1,8 +1,11 @@
from typing import Dict
import numpy as np import numpy as np
import torch import torch
from matplotlib import pyplot as plt from matplotlib import pyplot as plt
from TTS.tts.utils.visual import plot_spectrogram from TTS.tts.utils.visual import plot_spectrogram
from TTS.utils.audio import AudioProcessor
def interpolate_vocoder_input(scale_factor, spec): def interpolate_vocoder_input(scale_factor, spec):
@ -26,12 +29,24 @@ def interpolate_vocoder_input(scale_factor, spec):
return spec return spec
def plot_results(y_hat, y, ap, name_prefix): def plot_results(y_hat: torch.tensor, y: torch.tensor, ap: AudioProcessor, name_prefix: str = None) -> Dict:
"""Plot vocoder model results""" """Plot the predicted and the real waveform and their spectrograms.
Args:
y_hat (torch.tensor): Predicted waveform.
y (torch.tensor): Real waveform.
ap (AudioProcessor): Audio processor used to process the waveform.
name_prefix (str, optional): Name prefix used to name the figures. Defaults to None.
Returns:
Dict: output figures keyed by the name of the figures.
""" """Plot vocoder model results"""
if name_prefix is None:
name_prefix = ""
# select an instance from batch # select an instance from batch
y_hat = y_hat[0].squeeze(0).detach().cpu().numpy() y_hat = y_hat[0].squeeze().detach().cpu().numpy()
y = y[0].squeeze(0).detach().cpu().numpy() y = y[0].squeeze().detach().cpu().numpy()
spec_fake = ap.melspectrogram(y_hat).T spec_fake = ap.melspectrogram(y_hat).T
spec_real = ap.melspectrogram(y).T spec_real = ap.melspectrogram(y).T

View File

@ -2,4 +2,5 @@ furo
myst-parser == 0.15.1 myst-parser == 0.15.1
sphinx == 4.0.2 sphinx == 4.0.2
sphinx_inline_tabs sphinx_inline_tabs
sphinx_copybutton sphinx_copybutton
linkify-it-py

View File

@ -68,6 +68,8 @@ extensions = [
"sphinx_inline_tabs", "sphinx_inline_tabs",
] ]
myst_enable_extensions = ['linkify',]
# 'sphinxcontrib.katex', # 'sphinxcontrib.katex',
# 'sphinx.ext.autosectionlabel', # 'sphinx.ext.autosectionlabel',

View File

@ -44,6 +44,7 @@
:caption: `tts` Models :caption: `tts` Models
models/glow_tts.md models/glow_tts.md
models/vits.md
.. toctree:: .. toctree::
:maxdepth: 2 :maxdepth: 2

View File

@ -0,0 +1,22 @@
# Glow TTS
Glow TTS is a normalizing flow model for text-to-speech. It is built on the generic Glow model previously
used in computer vision and vocoder models. It uses "monotonic alignment search" (MAS) to find the text-to-speech alignment
and uses the MAS output to train a separate duration predictor network for faster inference run-time.
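The snippet below is a condensed training sketch mirroring the LJSpeech GlowTTS recipe included in this release; the dataset path and batch sizes are illustrative and assume the LJSpeech layout used by that recipe.

```python
import os

from TTS.trainer import Trainer, TrainingArgs, init_training
from TTS.tts.configs import BaseDatasetConfig, GlowTTSConfig

output_path = os.path.dirname(os.path.abspath(__file__))

# Illustrative dataset location; point `path` at your LJSpeech copy.
dataset_config = BaseDatasetConfig(
    name="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/")
)

config = GlowTTSConfig(
    batch_size=32,
    eval_batch_size=16,
    print_eval=True,
    mixed_precision=False,
    output_path=output_path,
    datasets=[dataset_config],
)

# init_training() returns the console and dashboard loggers used by the Trainer.
args, config, output_path, _, c_logger, dashboard_logger = init_training(TrainingArgs(), config)
trainer = Trainer(args, config, output_path, c_logger, dashboard_logger)
trainer.fit()
```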
## Important resources & papers
- GlowTTS: https://arxiv.org/abs/2005.11129
- Glow (Generative Flow with invertible 1x1 Convolutions): https://arxiv.org/abs/1807.03039
- Normalizing Flows: https://blog.evjang.com/2018/01/nf1.html
## GlowTTS Config
```{eval-rst}
.. autoclass:: TTS.tts.configs.glow_tts_config.GlowTTSConfig
:members:
```
## GlowTTS Model
```{eval-rst}
.. autoclass:: TTS.tts.models.glow_tts.GlowTTS
:members:
```

View File

@ -0,0 +1,33 @@
# VITS
VITS (Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech) is an end-to-end
(encoder -> vocoder together) TTS model that takes advantage of SOTA deep-learning techniques such as GANs, VAEs, and
Normalizing Flows. It does not require external alignment annotations and learns the text-to-audio alignment
using MAS, as explained in the paper. The model architecture is a combination of the GlowTTS encoder and the HiFiGAN vocoder.
It is a feed-forward model with an x67.12 real-time factor on a GPU.
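The snippet below condenses the LJSpeech VITS recipe added in this release into a minimal training sketch; the audio settings and batch size mirror that recipe and are illustrative rather than tuned values.

```python
import os

from TTS.config.shared_configs import BaseAudioConfig
from TTS.trainer import Trainer, TrainingArgs, init_training
from TTS.tts.configs import BaseDatasetConfig, VitsConfig

output_path = os.path.dirname(os.path.abspath(__file__))
dataset_config = BaseDatasetConfig(
    name="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/")
)

# Feature extraction settings from the recipe; normalization is disabled as VITS works on raw-scale features.
audio_config = BaseAudioConfig(
    sample_rate=22050,
    win_length=1024,
    hop_length=256,
    num_mels=80,
    mel_fmin=0,
    mel_fmax=None,
    signal_norm=False,
    do_amp_to_db_linear=False,
)

config = VitsConfig(
    audio=audio_config,
    run_name="vits_ljspeech",
    batch_size=48,
    eval_batch_size=16,
    text_cleaner="english_cleaners",
    use_phonemes=True,
    phoneme_language="en-us",
    phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
    mixed_precision=True,
    output_path=output_path,
    datasets=[dataset_config],
)

args, config, output_path, _, c_logger, tb_logger = init_training(TrainingArgs(), config)
trainer = Trainer(args, config, output_path, c_logger, tb_logger, cudnn_benchmark=True)
trainer.fit()
```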
## Important resources & papers
- VITS: https://arxiv.org/pdf/2106.06103.pdf
- Neural Spline Flows: https://arxiv.org/abs/1906.04032
- Variational Autoencoder: https://arxiv.org/pdf/1312.6114.pdf
- Generative Adversarial Networks: https://arxiv.org/abs/1406.2661
- HiFiGAN: https://arxiv.org/abs/2010.05646
- Normalizing Flows: https://blog.evjang.com/2018/01/nf1.html
## VitsConfig
```{eval-rst}
.. autoclass:: TTS.tts.configs.vits_config.VitsConfig
:members:
```
## VitsArgs
```{eval-rst}
.. autoclass:: TTS.tts.models.vits.VitsArgs
:members:
```
## Vits Model
```{eval-rst}
.. autoclass:: TTS.tts.models.vits.Vits
:members:
```

View File

@ -85,6 +85,7 @@ We still support running training from CLI like in the old days. The same traini
```json ```json
{ {
"run_name": "my_run",
"model": "glow_tts", "model": "glow_tts",
"batch_size": 32, "batch_size": 32,
"eval_batch_size": 16, "eval_batch_size": 16,

View File

@ -25,9 +25,7 @@
"import umap\n", "import umap\n",
"\n", "\n",
"from TTS.speaker_encoder.model import SpeakerEncoder\n", "from TTS.speaker_encoder.model import SpeakerEncoder\n",
"from TTS.utils.audio import AudioProcessor "from TTS.utils.audio import AudioProcessor\n",
\n",
"from TTS.tts.utils.generic_utils import load_config\n", "from TTS.tts.utils.generic_utils import load_config\n",
"\n", "\n",
"from bokeh.io import output_notebook, show\n", "from bokeh.io import output_notebook, show\n",

View File

@ -1,12 +1,12 @@
import os import os
from TTS.tts.configs import AlignTTSConfig from TTS.trainer import Trainer, TrainingArgs, init_training
from TTS.tts.configs import BaseDatasetConfig from TTS.tts.configs import AlignTTSConfig, BaseDatasetConfig
from TTS.trainer import init_training, Trainer, TrainingArgs
output_path = os.path.dirname(os.path.abspath(__file__)) output_path = os.path.dirname(os.path.abspath(__file__))
dataset_config = BaseDatasetConfig(name="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/")) dataset_config = BaseDatasetConfig(
name="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/")
)
config = AlignTTSConfig( config = AlignTTSConfig(
batch_size=32, batch_size=32,
eval_batch_size=16, eval_batch_size=16,
@ -23,8 +23,8 @@ config = AlignTTSConfig(
print_eval=True, print_eval=True,
mixed_precision=False, mixed_precision=False,
output_path=output_path, output_path=output_path,
datasets=[dataset_config] datasets=[dataset_config],
) )
args, config, output_path, _, c_logger, tb_logger = init_training(TrainingArgs(), config) args, config, output_path, _, c_logger, dashboard_logger = init_training(TrainingArgs(), config)
trainer = Trainer(args, config, output_path, c_logger, tb_logger) trainer = Trainer(args, config, output_path, c_logger, dashboard_logger)
trainer.fit() trainer.fit()

View File

@ -1,12 +1,12 @@
import os import os
from TTS.tts.configs import GlowTTSConfig from TTS.trainer import Trainer, TrainingArgs, init_training
from TTS.tts.configs import BaseDatasetConfig from TTS.tts.configs import BaseDatasetConfig, GlowTTSConfig
from TTS.trainer import init_training, Trainer, TrainingArgs
output_path = os.path.dirname(os.path.abspath(__file__)) output_path = os.path.dirname(os.path.abspath(__file__))
dataset_config = BaseDatasetConfig(name="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/")) dataset_config = BaseDatasetConfig(
name="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/")
)
config = GlowTTSConfig( config = GlowTTSConfig(
batch_size=32, batch_size=32,
eval_batch_size=16, eval_batch_size=16,
@ -23,8 +23,8 @@ config = GlowTTSConfig(
print_eval=True, print_eval=True,
mixed_precision=False, mixed_precision=False,
output_path=output_path, output_path=output_path,
datasets=[dataset_config] datasets=[dataset_config],
) )
args, config, output_path, _, c_logger, tb_logger = init_training(TrainingArgs(), config) args, config, output_path, _, c_logger, dashboard_logger = init_training(TrainingArgs(), config)
trainer = Trainer(args, config, output_path, c_logger, tb_logger) trainer = Trainer(args, config, output_path, c_logger, dashboard_logger)
trainer.fit() trainer.fit()

View File

@ -24,6 +24,6 @@ config = HifiganConfig(
data_path=os.path.join(output_path, "../LJSpeech-1.1/wavs/"), data_path=os.path.join(output_path, "../LJSpeech-1.1/wavs/"),
output_path=output_path, output_path=output_path,
) )
args, config, output_path, _, c_logger, tb_logger = init_training(TrainingArgs(), config) args, config, output_path, _, c_logger, dashboard_logger = init_training(TrainingArgs(), config)
trainer = Trainer(args, config, output_path, c_logger, tb_logger) trainer = Trainer(args, config, output_path, c_logger, dashboard_logger)
trainer.fit() trainer.fit()

View File

@ -1,8 +1,7 @@
import os import os
from TTS.trainer import Trainer, TrainingArgs, init_training
from TTS.vocoder.configs import MultibandMelganConfig from TTS.vocoder.configs import MultibandMelganConfig
from TTS.trainer import init_training, Trainer, TrainingArgs
output_path = os.path.dirname(os.path.abspath(__file__)) output_path = os.path.dirname(os.path.abspath(__file__))
config = MultibandMelganConfig( config = MultibandMelganConfig(
@ -25,6 +24,6 @@ config = MultibandMelganConfig(
data_path=os.path.join(output_path, "../LJSpeech-1.1/wavs/"), data_path=os.path.join(output_path, "../LJSpeech-1.1/wavs/"),
output_path=output_path, output_path=output_path,
) )
args, config, output_path, _, c_logger, tb_logger = init_training(TrainingArgs(), config) args, config, output_path, _, c_logger, dashboard_logger = init_training(TrainingArgs(), config)
trainer = Trainer(args, config, output_path, c_logger, tb_logger) trainer = Trainer(args, config, output_path, c_logger, dashboard_logger)
trainer.fit() trainer.fit()

View File

@ -1,6 +1,5 @@
import os import os
from TTS.config.shared_configs import BaseAudioConfig
from TTS.trainer import Trainer, TrainingArgs, init_training from TTS.trainer import Trainer, TrainingArgs, init_training
from TTS.vocoder.configs import UnivnetConfig from TTS.vocoder.configs import UnivnetConfig
@ -25,6 +24,6 @@ config = UnivnetConfig(
data_path=os.path.join(output_path, "../LJSpeech-1.1/wavs/"), data_path=os.path.join(output_path, "../LJSpeech-1.1/wavs/"),
output_path=output_path, output_path=output_path,
) )
args, config, output_path, _, c_logger, tb_logger = init_training(TrainingArgs(), config) args, config, output_path, _, c_logger, dashboard_logger = init_training(TrainingArgs(), config)
trainer = Trainer(args, config, output_path, c_logger, tb_logger) trainer = Trainer(args, config, output_path, c_logger, dashboard_logger)
trainer.fit() trainer.fit()

View File

@ -0,0 +1,52 @@
import os
from TTS.config.shared_configs import BaseAudioConfig
from TTS.trainer import Trainer, TrainingArgs, init_training
from TTS.tts.configs import BaseDatasetConfig, VitsConfig
output_path = os.path.dirname(os.path.abspath(__file__))
dataset_config = BaseDatasetConfig(
name="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/")
)
audio_config = BaseAudioConfig(
sample_rate=22050,
win_length=1024,
hop_length=256,
num_mels=80,
preemphasis=0.0,
ref_level_db=20,
log_func="np.log",
do_trim_silence=True,
trim_db=45,
mel_fmin=0,
mel_fmax=None,
spec_gain=1.0,
signal_norm=False,
do_amp_to_db_linear=False,
)
config = VitsConfig(
audio=audio_config,
run_name="vits_ljspeech",
batch_size=48,
eval_batch_size=16,
batch_group_size=0,
num_loader_workers=4,
num_eval_loader_workers=4,
run_eval=True,
test_delay_epochs=-1,
epochs=1000,
text_cleaner="english_cleaners",
use_phonemes=True,
phoneme_language="en-us",
phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
compute_input_seq_cache=True,
print_step=25,
print_eval=True,
mixed_precision=True,
max_seq_len=5000,
output_path=output_path,
datasets=[dataset_config],
)
args, config, output_path, _, c_logger, tb_logger = init_training(TrainingArgs(), config)
trainer = Trainer(args, config, output_path, c_logger, tb_logger, cudnn_benchmark=True)
trainer.fit()

View File

@ -1,10 +1,8 @@
import os import os
from TTS.trainer import Trainer, init_training from TTS.trainer import Trainer, TrainingArgs, init_training
from TTS.trainer import TrainingArgs
from TTS.vocoder.configs import WavegradConfig from TTS.vocoder.configs import WavegradConfig
output_path = os.path.dirname(os.path.abspath(__file__)) output_path = os.path.dirname(os.path.abspath(__file__))
config = WavegradConfig( config = WavegradConfig(
batch_size=32, batch_size=32,
@ -24,6 +22,6 @@ config = WavegradConfig(
data_path=os.path.join(output_path, "../LJSpeech-1.1/wavs/"), data_path=os.path.join(output_path, "../LJSpeech-1.1/wavs/"),
output_path=output_path, output_path=output_path,
) )
args, config, output_path, _, c_logger, tb_logger = init_training(TrainingArgs(), config) args, config, output_path, _, c_logger, dashboard_logger = init_training(TrainingArgs(), config)
trainer = Trainer(args, config, output_path, c_logger, tb_logger) trainer = Trainer(args, config, output_path, c_logger, dashboard_logger)
trainer.fit() trainer.fit()

View File

@ -1,9 +1,8 @@
import os import os
from TTS.trainer import Trainer, init_training, TrainingArgs from TTS.trainer import Trainer, TrainingArgs, init_training
from TTS.vocoder.configs import WavernnConfig from TTS.vocoder.configs import WavernnConfig
output_path = os.path.dirname(os.path.abspath(__file__)) output_path = os.path.dirname(os.path.abspath(__file__))
config = WavernnConfig( config = WavernnConfig(
batch_size=64, batch_size=64,
@ -25,6 +24,6 @@ config = WavernnConfig(
data_path=os.path.join(output_path, "../LJSpeech-1.1/wavs/"), data_path=os.path.join(output_path, "../LJSpeech-1.1/wavs/"),
output_path=output_path, output_path=output_path,
) )
args, config, output_path, _, c_logger, tb_logger = init_training(TrainingArgs(), config) args, config, output_path, _, c_logger, dashboard_logger = init_training(TrainingArgs(), config)
trainer = Trainer(args, config, output_path, c_logger, tb_logger, cudnn_benchmark=True) trainer = Trainer(args, config, output_path, c_logger, dashboard_logger, cudnn_benchmark=True)
trainer.fit() trainer.fit()

View File

@ -24,3 +24,4 @@ mecab-python3==1.0.3
unidic-lite==1.0.8 unidic-lite==1.0.8
# gruut+supported langs # gruut+supported langs
gruut[cs,de,es,fr,it,nl,pt,ru,sv]~=1.2.0 gruut[cs,de,es,fr,it,nl,pt,ru,sv]~=1.2.0
fsspec>=2021.04.0

View File

@ -42,6 +42,7 @@ class TestTTSDataset(unittest.TestCase):
r, r,
c.text_cleaner, c.text_cleaner,
compute_linear_spec=True, compute_linear_spec=True,
return_wav=True,
ap=self.ap, ap=self.ap,
meta_data=items, meta_data=items,
characters=c.characters, characters=c.characters,
@ -75,16 +76,26 @@ class TestTTSDataset(unittest.TestCase):
mel_lengths = data[5] mel_lengths = data[5]
stop_target = data[6] stop_target = data[6]
item_idx = data[7] item_idx = data[7]
wavs = data[11]
neg_values = text_input[text_input < 0] neg_values = text_input[text_input < 0]
check_count = len(neg_values) check_count = len(neg_values)
assert check_count == 0, " !! Negative values in text_input: {}".format(check_count) assert check_count == 0, " !! Negative values in text_input: {}".format(check_count)
# TODO: more assertion here
assert isinstance(speaker_name[0], str) assert isinstance(speaker_name[0], str)
assert linear_input.shape[0] == c.batch_size assert linear_input.shape[0] == c.batch_size
assert linear_input.shape[2] == self.ap.fft_size // 2 + 1 assert linear_input.shape[2] == self.ap.fft_size // 2 + 1
assert mel_input.shape[0] == c.batch_size assert mel_input.shape[0] == c.batch_size
assert mel_input.shape[2] == c.audio["num_mels"] assert mel_input.shape[2] == c.audio["num_mels"]
assert (
wavs.shape[1] == mel_input.shape[1] * c.audio.hop_length
), f"wavs.shape: {wavs.shape[1]}, mel_input.shape: {mel_input.shape[1] * c.audio.hop_length}"
# make sure that the computed mels and the waveform match and correctly computed
mel_new = self.ap.melspectrogram(wavs[0].squeeze().numpy())
ignore_seg = -(1 + c.audio.win_length // c.audio.hop_length)
mel_diff = (mel_new[:, : mel_input.shape[1]] - mel_input[0].T.numpy())[:, 0:ignore_seg]
assert abs(mel_diff.sum()) < 1e-5
# check normalization ranges # check normalization ranges
if self.ap.symmetric_norm: if self.ap.symmetric_norm:
assert mel_input.max() <= self.ap.max_norm assert mel_input.max() <= self.ap.max_norm

View File

@ -27,6 +27,7 @@ config = AlignTTSConfig(
"Be a voice, not an echo.", "Be a voice, not an echo.",
], ],
) )
config.audio.do_trim_silence = True config.audio.do_trim_silence = True
config.audio.trim_db = 60 config.audio.trim_db = 60
config.save_json(config_path) config.save_json(config_path)

View File

@ -29,6 +29,7 @@ config = Tacotron2Config(
"Be a voice, not an echo.", "Be a voice, not an echo.",
], ],
d_vector_file="tests/data/ljspeech/speakers.json", d_vector_file="tests/data/ljspeech/speakers.json",
d_vector_dim=256,
max_decoder_steps=50, max_decoder_steps=50,
) )

View File

@ -25,8 +25,68 @@ WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav")
class TacotronTrainTest(unittest.TestCase): class TacotronTrainTest(unittest.TestCase):
"""Test vanilla Tacotron2 model."""
def test_train_step(self): # pylint: disable=no-self-use def test_train_step(self): # pylint: disable=no-self-use
config = config_global.copy() config = config_global.copy()
config.use_speaker_embedding = False
config.num_speakers = 1
input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
input_lengths = torch.randint(100, 128, (8,)).long().to(device)
input_lengths = torch.sort(input_lengths, descending=True)[0]
mel_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device)
mel_postnet_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device)
mel_lengths = torch.randint(20, 30, (8,)).long().to(device)
mel_lengths[0] = 30
stop_targets = torch.zeros(8, 30, 1).float().to(device)
for idx in mel_lengths:
stop_targets[:, int(idx.item()) :, 0] = 1.0
stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // config.r, -1)
stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()
criterion = MSELossMasked(seq_len_norm=False).to(device)
criterion_st = nn.BCEWithLogitsLoss().to(device)
model = Tacotron2(config).to(device)
model.train()
model_ref = copy.deepcopy(model)
count = 0
for param, param_ref in zip(model.parameters(), model_ref.parameters()):
assert (param - param_ref).sum() == 0, param
count += 1
optimizer = optim.Adam(model.parameters(), lr=config.lr)
for i in range(5):
outputs = model.forward(input_dummy, input_lengths, mel_spec, mel_lengths)
assert torch.sigmoid(outputs["stop_tokens"]).data.max() <= 1.0
assert torch.sigmoid(outputs["stop_tokens"]).data.min() >= 0.0
optimizer.zero_grad()
loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths)
stop_loss = criterion_st(outputs["stop_tokens"], stop_targets)
loss = loss + criterion(outputs["model_outputs"], mel_postnet_spec, mel_lengths) + stop_loss
loss.backward()
optimizer.step()
# check parameter changes
count = 0
for param, param_ref in zip(model.parameters(), model_ref.parameters()):
# ignore pre-higway layer since it works conditional
# if count not in [145, 59]:
assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format(
count, param.shape, param, param_ref
)
count += 1
class MultiSpeakerTacotronTrainTest(unittest.TestCase):
"""Test multi-speaker Tacotron2 with speaker embedding layer"""
@staticmethod
def test_train_step():
config = config_global.copy()
config.use_speaker_embedding = True
config.num_speakers = 5
input_dummy = torch.randint(0, 24, (8, 128)).long().to(device) input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
input_lengths = torch.randint(100, 128, (8,)).long().to(device) input_lengths = torch.randint(100, 128, (8,)).long().to(device)
input_lengths = torch.sort(input_lengths, descending=True)[0] input_lengths = torch.sort(input_lengths, descending=True)[0]
@ -45,6 +105,7 @@ class TacotronTrainTest(unittest.TestCase):
criterion = MSELossMasked(seq_len_norm=False).to(device) criterion = MSELossMasked(seq_len_norm=False).to(device)
criterion_st = nn.BCEWithLogitsLoss().to(device) criterion_st = nn.BCEWithLogitsLoss().to(device)
config.d_vector_dim = 55
model = Tacotron2(config).to(device) model = Tacotron2(config).to(device)
model.train() model.train()
model_ref = copy.deepcopy(model) model_ref = copy.deepcopy(model)
@ -76,65 +137,18 @@ class TacotronTrainTest(unittest.TestCase):
count += 1 count += 1
class MultiSpeakeTacotronTrainTest(unittest.TestCase):
@staticmethod
def test_train_step():
config = config_global.copy()
input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
input_lengths = torch.randint(100, 128, (8,)).long().to(device)
input_lengths = torch.sort(input_lengths, descending=True)[0]
mel_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device)
mel_postnet_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device)
mel_lengths = torch.randint(20, 30, (8,)).long().to(device)
mel_lengths[0] = 30
stop_targets = torch.zeros(8, 30, 1).float().to(device)
speaker_ids = torch.rand(8, 55).to(device)
for idx in mel_lengths:
stop_targets[:, int(idx.item()) :, 0] = 1.0
stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // config.r, -1)
stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()
criterion = MSELossMasked(seq_len_norm=False).to(device)
criterion_st = nn.BCEWithLogitsLoss().to(device)
config.d_vector_dim = 55
model = Tacotron2(config).to(device)
model.train()
model_ref = copy.deepcopy(model)
count = 0
for param, param_ref in zip(model.parameters(), model_ref.parameters()):
assert (param - param_ref).sum() == 0, param
count += 1
optimizer = optim.Adam(model.parameters(), lr=config.lr)
for i in range(5):
outputs = model.forward(
input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"d_vectors": speaker_ids}
)
assert torch.sigmoid(outputs["stop_tokens"]).data.max() <= 1.0
assert torch.sigmoid(outputs["stop_tokens"]).data.min() >= 0.0
optimizer.zero_grad()
loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths)
stop_loss = criterion_st(outputs["stop_tokens"], stop_targets)
loss = loss + criterion(outputs["model_outputs"], mel_postnet_spec, mel_lengths) + stop_loss
loss.backward()
optimizer.step()
# check parameter changes
count = 0
for param, param_ref in zip(model.parameters(), model_ref.parameters()):
# ignore pre-higway layer since it works conditional
# if count not in [145, 59]:
assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format(
count, param.shape, param, param_ref
)
count += 1
class TacotronGSTTrainTest(unittest.TestCase): class TacotronGSTTrainTest(unittest.TestCase):
"""Test multi-speaker Tacotron2 with Global Style Token and Speaker Embedding"""
# pylint: disable=no-self-use # pylint: disable=no-self-use
def test_train_step(self): def test_train_step(self):
# with random gst mel style # with random gst mel style
config = config_global.copy() config = config_global.copy()
config.use_speaker_embedding = True
config.num_speakers = 10
config.use_gst = True
config.gst = GSTConfig()
input_dummy = torch.randint(0, 24, (8, 128)).long().to(device) input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
input_lengths = torch.randint(100, 128, (8,)).long().to(device) input_lengths = torch.randint(100, 128, (8,)).long().to(device)
input_lengths = torch.sort(input_lengths, descending=True)[0] input_lengths = torch.sort(input_lengths, descending=True)[0]
@ -247,9 +261,17 @@ class TacotronGSTTrainTest(unittest.TestCase):
class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase): class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase):
"""Test multi-speaker Tacotron2 with Global Style Tokens and d-vector inputs."""
@staticmethod @staticmethod
def test_train_step(): def test_train_step():
config = config_global.copy() config = config_global.copy()
config.use_d_vector_file = True
config.use_gst = True
config.gst = GSTConfig()
input_dummy = torch.randint(0, 24, (8, 128)).long().to(device) input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
input_lengths = torch.randint(100, 128, (8,)).long().to(device) input_lengths = torch.randint(100, 128, (8,)).long().to(device)
input_lengths = torch.sort(input_lengths, descending=True)[0] input_lengths = torch.sort(input_lengths, descending=True)[0]

View File

@ -0,0 +1,55 @@
import glob
import os
import shutil
from tests import get_device_id, get_tests_output_path, run_cli
from TTS.tts.configs import Tacotron2Config
config_path = os.path.join(get_tests_output_path(), "test_model_config.json")
output_path = os.path.join(get_tests_output_path(), "train_outputs")
config = Tacotron2Config(
r=5,
batch_size=8,
eval_batch_size=8,
num_loader_workers=0,
num_eval_loader_workers=0,
text_cleaner="english_cleaners",
use_phonemes=False,
phoneme_language="en-us",
phoneme_cache_path=os.path.join(get_tests_output_path(), "train_outputs/phoneme_cache/"),
run_eval=True,
test_delay_epochs=-1,
epochs=1,
print_step=1,
test_sentences=[
"Be a voice, not an echo.",
],
print_eval=True,
max_decoder_steps=50,
)
config.audio.do_trim_silence = True
config.audio.trim_db = 60
config.save_json(config_path)
# train the model for one epoch
command_train = (
f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path file://{config_path} "
f"--coqpit.output_path file://{output_path} "
"--coqpit.datasets.0.name ljspeech "
"--coqpit.datasets.0.meta_file_train metadata.csv "
"--coqpit.datasets.0.meta_file_val metadata.csv "
"--coqpit.datasets.0.path tests/data/ljspeech "
"--coqpit.test_delay_epochs 0 "
)
run_cli(command_train)
# Find latest folder
continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime)
# restore the model and continue training for one more epoch
command_train = (
f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path file://{continue_path} "
)
run_cli(command_train)
shutil.rmtree(continue_path)

View File

@ -32,6 +32,61 @@ class TacotronTrainTest(unittest.TestCase):
@staticmethod @staticmethod
def test_train_step(): def test_train_step():
config = config_global.copy() config = config_global.copy()
config.use_speaker_embedding = False
config.num_speakers = 1
input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
input_lengths = torch.randint(100, 129, (8,)).long().to(device)
input_lengths[-1] = 128
mel_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device)
linear_spec = torch.rand(8, 30, config.audio["fft_size"] // 2 + 1).to(device)
mel_lengths = torch.randint(20, 30, (8,)).long().to(device)
mel_lengths[-1] = mel_spec.size(1)
stop_targets = torch.zeros(8, 30, 1).float().to(device)
for idx in mel_lengths:
stop_targets[:, int(idx.item()) :, 0] = 1.0
stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // config.r, -1)
stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()
criterion = L1LossMasked(seq_len_norm=False).to(device)
criterion_st = nn.BCEWithLogitsLoss().to(device)
model = Tacotron(config).to(device) # FIXME: missing num_speakers parameter to Tacotron ctor
model.train()
print(" > Num parameters for Tacotron model:%s" % (count_parameters(model)))
model_ref = copy.deepcopy(model)
count = 0
for param, param_ref in zip(model.parameters(), model_ref.parameters()):
assert (param - param_ref).sum() == 0, param
count += 1
optimizer = optim.Adam(model.parameters(), lr=config.lr)
for _ in range(5):
outputs = model.forward(input_dummy, input_lengths, mel_spec, mel_lengths)
optimizer.zero_grad()
loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths)
stop_loss = criterion_st(outputs["stop_tokens"], stop_targets)
loss = loss + criterion(outputs["model_outputs"], linear_spec, mel_lengths) + stop_loss
loss.backward()
optimizer.step()
# check parameter changes
count = 0
for param, param_ref in zip(model.parameters(), model_ref.parameters()):
# ignore pre-higway layer since it works conditional
# if count not in [145, 59]:
assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format(
count, param.shape, param, param_ref
)
count += 1
class MultiSpeakeTacotronTrainTest(unittest.TestCase):
@staticmethod
def test_train_step():
config = config_global.copy()
config.use_speaker_embedding = True
config.num_speakers = 5
input_dummy = torch.randint(0, 24, (8, 128)).long().to(device) input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
input_lengths = torch.randint(100, 129, (8,)).long().to(device) input_lengths = torch.randint(100, 129, (8,)).long().to(device)
input_lengths[-1] = 128 input_lengths[-1] = 128
@ -50,6 +105,7 @@ class TacotronTrainTest(unittest.TestCase):
criterion = L1LossMasked(seq_len_norm=False).to(device) criterion = L1LossMasked(seq_len_norm=False).to(device)
criterion_st = nn.BCEWithLogitsLoss().to(device) criterion_st = nn.BCEWithLogitsLoss().to(device)
config.d_vector_dim = 55
        model = Tacotron(config).to(device)  # FIXME: missing num_speakers parameter to Tacotron ctor
        model.train()
        print(" > Num parameters for Tacotron model:%s" % (count_parameters(model)))
@ -80,63 +136,14 @@ class TacotronTrainTest(unittest.TestCase):
            count += 1
class MultiSpeakeTacotronTrainTest(unittest.TestCase):
@staticmethod
def test_train_step():
config = config_global.copy()
input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
input_lengths = torch.randint(100, 129, (8,)).long().to(device)
input_lengths[-1] = 128
mel_spec = torch.rand(8, 30, config.audio["num_mels"]).to(device)
linear_spec = torch.rand(8, 30, config.audio["fft_size"] // 2 + 1).to(device)
mel_lengths = torch.randint(20, 30, (8,)).long().to(device)
mel_lengths[-1] = mel_spec.size(1)
stop_targets = torch.zeros(8, 30, 1).float().to(device)
speaker_embeddings = torch.rand(8, 55).to(device)
for idx in mel_lengths:
stop_targets[:, int(idx.item()) :, 0] = 1.0
stop_targets = stop_targets.view(input_dummy.shape[0], stop_targets.size(1) // config.r, -1)
stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()
criterion = L1LossMasked(seq_len_norm=False).to(device)
criterion_st = nn.BCEWithLogitsLoss().to(device)
config.d_vector_dim = 55
model = Tacotron(config).to(device) # FIXME: missing num_speakers parameter to Tacotron ctor
model.train()
print(" > Num parameters for Tacotron model:%s" % (count_parameters(model)))
model_ref = copy.deepcopy(model)
count = 0
for param, param_ref in zip(model.parameters(), model_ref.parameters()):
assert (param - param_ref).sum() == 0, param
count += 1
optimizer = optim.Adam(model.parameters(), lr=config.lr)
for _ in range(5):
outputs = model.forward(
input_dummy, input_lengths, mel_spec, mel_lengths, aux_input={"d_vectors": speaker_embeddings}
)
optimizer.zero_grad()
loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths)
stop_loss = criterion_st(outputs["stop_tokens"], stop_targets)
loss = loss + criterion(outputs["model_outputs"], linear_spec, mel_lengths) + stop_loss
loss.backward()
optimizer.step()
# check parameter changes
count = 0
for param, param_ref in zip(model.parameters(), model_ref.parameters()):
            # ignore the pre-highway layer since it is applied conditionally
# if count not in [145, 59]:
assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format(
count, param.shape, param, param_ref
)
count += 1
class TacotronGSTTrainTest(unittest.TestCase):
    @staticmethod
    def test_train_step():
        config = config_global.copy()
config.use_speaker_embedding = True
config.num_speakers = 10
config.use_gst = True
config.gst = GSTConfig()
        # with random gst mel style
        input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
        input_lengths = torch.randint(100, 129, (8,)).long().to(device)
@ -244,6 +251,11 @@ class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase):
    @staticmethod
    def test_train_step():
        config = config_global.copy()
config.use_d_vector_file = True
config.use_gst = True
config.gst = GSTConfig()
        input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
        input_lengths = torch.randint(100, 129, (8,)).long().to(device)
        input_lengths[-1] = 128
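The least obvious step in the Tacotron test hunks above is how the frame-level stop targets are collapsed to one flag per decoder step. A minimal, self-contained sketch of that reshaping, assuming a hypothetical reduction factor r=5 (the real value comes from `config.r` in the test config):

import torch

batch_size, max_frames, r = 8, 30, 5  # hypothetical sizes; config.r drives the real test
mel_lengths = torch.randint(20, 30, (batch_size,)).long()
mel_lengths[-1] = max_frames

# 1 marks frames past the end of each utterance
stop_targets = torch.zeros(batch_size, max_frames, 1)
for b, length in enumerate(mel_lengths):
    stop_targets[b, int(length.item()):, 0] = 1.0

# group r frames per decoder step; a step is a "stop" step if any of its grouped frames is past the end
stop_targets = stop_targets.view(batch_size, max_frames // r, -1)  # (B, steps, r)
stop_targets = (stop_targets.sum(2) > 0.0).float()                 # (B, steps)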

View File

@ -0,0 +1,54 @@
import glob
import os
import shutil
from tests import get_device_id, get_tests_output_path, run_cli
from TTS.tts.configs import VitsConfig
config_path = os.path.join(get_tests_output_path(), "test_model_config.json")
output_path = os.path.join(get_tests_output_path(), "train_outputs")
config = VitsConfig(
batch_size=2,
eval_batch_size=2,
num_loader_workers=0,
num_eval_loader_workers=0,
text_cleaner="english_cleaners",
use_phonemes=True,
use_espeak_phonemes=True,
phoneme_language="en-us",
phoneme_cache_path="tests/data/ljspeech/phoneme_cache/",
run_eval=True,
test_delay_epochs=-1,
epochs=1,
print_step=1,
print_eval=True,
test_sentences=[
"Be a voice, not an echo.",
],
)
config.audio.do_trim_silence = True
config.audio.trim_db = 60
config.save_json(config_path)
# train the model for one epoch
command_train = (
f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} "
f"--coqpit.output_path {output_path} "
"--coqpit.datasets.0.name ljspeech "
"--coqpit.datasets.0.meta_file_train metadata.csv "
"--coqpit.datasets.0.meta_file_val metadata.csv "
"--coqpit.datasets.0.path tests/data/ljspeech "
"--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt "
"--coqpit.test_delay_epochs 0"
)
run_cli(command_train)
# Find latest folder
continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime)
# restore the model and continue training for one more epoch
command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} "
run_cli(command_train)
shutil.rmtree(continue_path)
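The `--coqpit.datasets.0.*` flags above override the dataset fields of the saved config at the command line. An equivalent, purely programmatic setup might look roughly like the sketch below; the `BaseDatasetConfig` import path and its field names are assumptions inferred from the flag names, not taken from this diff.

import os

from TTS.config.shared_configs import BaseDatasetConfig  # assumed import path
from TTS.tts.configs import VitsConfig

# placeholder output location for the sketch
output_path = os.path.join("tests", "outputs", "train_outputs")

dataset_config = BaseDatasetConfig(
    name="ljspeech",
    meta_file_train="metadata.csv",
    meta_file_val="metadata.csv",
    path="tests/data/ljspeech",
    meta_file_attn_mask="tests/data/ljspeech/metadata_attn_mask.txt",
)

config = VitsConfig(
    batch_size=2,
    eval_batch_size=2,
    epochs=1,
    test_delay_epochs=0,
    output_path=output_path,
    datasets=[dataset_config],
)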