Mirror of https://github.com/coqui-ai/TTS.git
Commit c7cca4135d
@@ -18,7 +18,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        python-version: [3.6, 3.7, 3.8, 3.9]
+        python-version: [3.7, 3.8, 3.9, "3.10"]
        experimental: [false]
    steps:
      - uses: actions/checkout@v2
@@ -38,6 +38,9 @@ jobs:
          make system-deps
+      - name: Install/upgrade Python setup deps
+        run: python3 -m pip install --upgrade pip setuptools wheel
      - name: Replace scarf urls
        run: |
          sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json
      - name: Install TTS
        run: |
          python3 -m pip install .[all]
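The `sed` step above rewrites the model download URLs in `TTS/.models.json` so CI fetches release archives directly from GitHub instead of the Scarf gateway. A minimal Python sketch of the same substitution (an illustrative equivalent, not part of the workflow):

```python
# Rewrite Scarf gateway URLs to direct GitHub release URLs in TTS/.models.json.
from pathlib import Path

models_json = Path("TTS/.models.json")
text = models_json.read_text(encoding="utf-8")
text = text.replace(
    "https://coqui.gateway.scarf.sh/",
    "https://github.com/coqui-ai/TTS/releases/download/",
)
models_json.write_text(text, encoding="utf-8")
```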
|
||||
|
|
|
@@ -18,7 +18,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        python-version: [3.6, 3.7, 3.8, 3.9]
+        python-version: [3.7, 3.8, 3.9, "3.10"]
        experimental: [false]
    steps:
      - uses: actions/checkout@v2
@@ -38,6 +38,9 @@ jobs:
          make system-deps
+      - name: Install/upgrade Python setup deps
+        run: python3 -m pip install --upgrade pip setuptools wheel
      - name: Replace scarf urls
        run: |
          sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json
      - name: Install TTS
        run: |
          python3 -m pip install .[all]
|
||||
|
|
|
@@ -14,6 +14,9 @@ jobs:
    strategy:
      matrix:
        arch: ["amd64"]
+        base:
+          - "nvcr.io/nvidia/pytorch:22.03-py3"  # GPU enabled
+          - "ubuntu:20.04"                      # CPU only
    steps:
      - uses: actions/checkout@v2
      - name: Log in to the Container registry
@@ -28,6 +31,11 @@ jobs:
          set -ex
          base="ghcr.io/coqui-ai/tts"
          tags=""  # PR build

+          if [[ ${{ matrix.base }} = "ubuntu:20.04" ]]; then
+            base="ghcr.io/coqui-ai/tts-cpu"
+          fi
+
          if [[ "${{ startsWith(github.ref, 'refs/heads/') }}" = "true" ]]; then
            # Push to branch
            github_ref="${{ github.ref }}"
@@ -53,4 +61,5 @@ jobs:
          context: .
          platforms: linux/${{ matrix.arch }}
          push: ${{ github.event_name == 'push' }}
+          build-args: "BASE=${{ matrix.base }}"
          tags: ${{ steps.compute-tag.outputs.tags }}
|
||||
|
|
|
@@ -18,7 +18,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        python-version: [3.6, 3.7, 3.8, 3.9]
+        python-version: [3.7, 3.8, 3.9, "3.10"]
        experimental: [false]
    steps:
      - uses: actions/checkout@v2
@@ -38,6 +38,9 @@ jobs:
          make system-deps
+      - name: Install/upgrade Python setup deps
+        run: python3 -m pip install --upgrade pip setuptools wheel
      - name: Replace scarf urls
        run: |
          sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json
      - name: Install TTS
        run: |
          python3 -m pip install .[all]
|
||||
|
|
|
@@ -36,7 +36,7 @@ jobs:
    runs-on: ubuntu-20.04
    strategy:
      matrix:
-        python-version: ["3.6", "3.7", "3.8", "3.9"]
+        python-version: ["3.7", "3.8", "3.9", "3.10"]
    steps:
      - uses: actions/checkout@v2
      - uses: actions/setup-python@v2
@@ -62,10 +62,6 @@ jobs:
        with:
          name: "sdist"
          path: "dist/"
-      - uses: actions/download-artifact@v2
-        with:
-          name: "wheel-3.6"
-          path: "dist/"
      - uses: actions/download-artifact@v2
        with:
          name: "wheel-3.7"
@@ -78,6 +74,10 @@ jobs:
        with:
          name: "wheel-3.9"
          path: "dist/"
+      - uses: actions/download-artifact@v2
+        with:
+          name: "wheel-3.10"
+          path: "dist/"
      - run: |
          ls -lh dist/
      - name: Setup PyPI config
|
||||
|
|
|
@@ -18,7 +18,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        python-version: [3.6, 3.7, 3.8, 3.9]
+        python-version: [3.7, 3.8, 3.9, "3.10"]
        experimental: [false]
    steps:
      - uses: actions/checkout@v2
|
||||
|
|
|
@@ -18,7 +18,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        python-version: [3.6, 3.7, 3.8, 3.9]
+        python-version: [3.7, 3.8, 3.9, "3.10"]
        experimental: [false]
    steps:
      - uses: actions/checkout@v2
@@ -40,6 +40,9 @@ jobs:
          make system-deps
+      - name: Install/upgrade Python setup deps
+        run: python3 -m pip install --upgrade pip setuptools wheel
      - name: Replace scarf urls
        run: |
          sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json
      - name: Install TTS
        run: |
          python3 -m pip install .[all]
|
||||
|
|
|
@@ -18,7 +18,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        python-version: [3.6, 3.7, 3.8, 3.9]
+        python-version: [3.7, 3.8, 3.9, "3.10"]
        experimental: [false]
    steps:
      - uses: actions/checkout@v2
|
||||
|
|
|
@@ -18,7 +18,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        python-version: [3.6, 3.7, 3.8, 3.9]
+        python-version: [3.7, 3.8, 3.9, "3.10"]
        experimental: [false]
    steps:
      - uses: actions/checkout@v2
@@ -39,6 +39,9 @@ jobs:
          make system-deps
+      - name: Install/upgrade Python setup deps
+        run: python3 -m pip install --upgrade pip setuptools wheel
      - name: Replace scarf urls
        run: |
          sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json
      - name: Install TTS
        run: |
          python3 -m pip install .[all]
|
||||
|
|
|
@@ -117,6 +117,7 @@ venv.bak/
# pytorch models
*.pth
*.pth.tar
!dummy_speakers.pth
result/

# setup.py
|
||||
|
|
Dockerfile (15 changes)
@@ -1,10 +1,19 @@
-FROM nvcr.io/nvidia/pytorch:22.03-py3
-RUN apt-get update && apt-get install -y --no-install-recommends espeak && rm -rf /var/lib/apt/lists/*
+ARG BASE=nvcr.io/nvidia/pytorch:22.03-py3
+FROM ${BASE}
+RUN apt-get update && apt-get install -y --no-install-recommends gcc g++ make python3 python3-dev python3-pip python3-venv python3-wheel espeak espeak-ng libsndfile1-dev && rm -rf /var/lib/apt/lists/*
+RUN pip install llvmlite --ignore-installed
+
+# Create and activate virtual env
+ENV VIRTUAL_ENV=/venv
+RUN python3 -m venv $VIRTUAL_ENV
+ENV PATH="$VIRTUAL_ENV/bin:$PATH"
+RUN pip install -U pip setuptools wheel
+
WORKDIR /root
COPY requirements.txt /root
COPY requirements.dev.txt /root
COPY requirements.notebooks.txt /root
-RUN pip install -r <(cat requirements.txt requirements.dev.txt requirements.notebooks.txt)
+RUN ["/bin/bash", "-c", "pip install -r <(cat requirements.txt requirements.dev.txt requirements.notebooks.txt)"]
COPY . /root
RUN make install
ENTRYPOINT ["tts"]
|
||||
|
|
Makefile (20 changes)
@@ -7,36 +7,36 @@ help:
target_dirs := tests TTS notebooks recipes

test_all:	## run tests and don't stop on an error.
-    nosetests --with-cov -cov --cover-erase --cover-package TTS tests --nologcapture --with-id
+    nose2 --with-coverage --coverage TTS tests
    ./run_bash_tests.sh

test:	## run tests.
-    nosetests -x --with-cov -cov --cover-erase --cover-package TTS tests --nologcapture --with-id
+    nose2 -F -v -B --with-coverage --coverage TTS tests

test_vocoder:	## run vocoder tests.
-    nosetests tests.vocoder_tests -x --with-cov -cov --cover-erase --cover-package TTS tests.vocoder_tests --nologcapture --with-id
+    nose2 -F -v -B --with-coverage --coverage TTS tests.vocoder_tests

test_tts:	## run tts tests.
-    nosetests tests.tts_tests -x --with-cov -cov --cover-erase --cover-package TTS tests.tts_tests --nologcapture --with-id
+    nose2 -F -v -B --with-coverage --coverage TTS tests.tts_tests

test_aux:	## run aux tests.
-    nosetests tests.aux_tests -x --with-cov -cov --cover-erase --cover-package TTS tests.aux_tests --nologcapture --with-id
+    nose2 -F -v -B --with-coverage --coverage TTS tests.aux_tests
    ./run_bash_tests.sh

test_zoo:	## run zoo tests.
-    nosetests tests.zoo_tests -x --with-cov -cov --cover-erase --cover-package TTS tests.zoo_tests --nologcapture --with-id
+    nose2 -F -v -B --with-coverage --coverage TTS tests.zoo_tests

inference_tests:	## run inference tests.
-    nosetests tests.inference_tests -x --with-cov -cov --cover-erase --cover-package TTS tests.inference_tests --nologcapture --with-id
+    nose2 -F -v -B --with-coverage --coverage TTS tests.inference_tests

data_tests:	## run data tests.
-    nosetests tests.data_tests -x --with-cov -cov --cover-erase --cover-package TTS tests.data_tests --nologcapture --with-id
+    nose2 -F -v -B --with-coverage --coverage TTS tests.data_tests

test_text:	## run text tests.
-    nosetests tests.text_tests -x --with-cov -cov --cover-erase --cover-package TTS tests.text_tests --nologcapture --with-id
+    nose2 -F -v -B --with-coverage --coverage TTS tests.text_tests

test_failed:	## only run tests failed the last time.
-    nosetests -x --with-cov -cov --cover-erase --cover-package TTS tests --nologcapture --failed
+    nose2 -F -v -B --with-coverage --coverage TTS tests

style:	## update code style.
    black ${target_dirs}
|
||||
|
|
README.md (16 changes)
@@ -3,15 +3,23 @@
🐸TTS is a library for advanced Text-to-Speech generation. It's built on the latest research and was designed to achieve the best trade-off among ease of training, speed, and quality.
🐸TTS comes with pretrained models, tools for measuring dataset quality, and is already used in **20+ languages** for products and research projects.

-[](https://github.com/coqui-ai/TTS/actions)
-[](https://gitter.im/coqui-ai/TTS?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge)
-[](https://opensource.org/licenses/MPL-2.0)
-[](https://badge.fury.io/py/TTS)
-[](https://github.com/coqui-ai/TTS/blob/master/CODE_OF_CONDUCT.md)
-[](https://pepy.tech/project/tts)
-[](https://zenodo.org/badge/latestdoi/265612440)

+[badge images]
+[](https://tts.readthedocs.io/en/latest/)
+[](https://gitter.im/coqui-ai/TTS?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge)
+[](https://opensource.org/licenses/MPL-2.0)

📰 [**Subscribe to 🐸Coqui.ai Newsletter**](https://coqui.ai/?subscription=true)

@@ -104,7 +112,7 @@ Underlined "TTS*" and "Judy*" are 🐸TTS models
You can also help us implement more models.

## Install TTS
-🐸TTS is tested on Ubuntu 18.04 with **python >= 3.6, < 3.9**.
+🐸TTS is tested on Ubuntu 18.04 with **python >= 3.7, < 3.11**.

If you are only interested in [synthesizing speech](https://tts.readthedocs.io/en/latest/inference.html) with the released 🐸TTS models, installing from PyPI is the easiest option.
|
||||
|
||||
|
|
|
@ -119,6 +119,26 @@
|
|||
"license": "apache 2.0",
|
||||
"contact": "egolge@coqui.com"
|
||||
}
|
||||
},
|
||||
"blizzard2013": {
|
||||
"capacitron-t2-c50": {
|
||||
"description": "Capacitron additions to Tacotron 2 with Capacity at 50 as in https://arxiv.org/pdf/1906.03402.pdf",
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.0_models/tts_models--en--blizzard2013--capacitron-t2-c50.zip",
|
||||
"commit": "d6284e7",
|
||||
"default_vocoder": "vocoder_models/en/blizzard2013/hifigan_v2",
|
||||
"author": "Adam Froghyar @a-froghyar",
|
||||
"license": "apache 2.0",
|
||||
"contact": "adamfroghyar@gmail.com"
|
||||
},
|
||||
"capacitron-t2-c150": {
|
||||
"description": "Capacitron additions to Tacotron 2 with Capacity at 150 as in https://arxiv.org/pdf/1906.03402.pdf",
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.0_models/tts_models--en--blizzard2013--capacitron-t2-c150.zip",
|
||||
"commit": "d6284e7",
|
||||
"default_vocoder": "vocoder_models/en/blizzard2013/hifigan_v2",
|
||||
"author": "Adam Froghyar @a-froghyar",
|
||||
"license": "apache 2.0",
|
||||
"contact": "adamfroghyar@gmail.com"
|
||||
}
|
||||
}
|
||||
},
|
||||
"es": {
|
||||
|
@ -379,6 +399,16 @@
|
|||
"contact": "egolge@coqui.ai"
|
||||
}
|
||||
},
|
||||
"blizzard2013": {
|
||||
"hifigan_v2": {
|
||||
"description": "HiFiGAN_v2 LJSpeech vocoder from https://arxiv.org/abs/2010.05646.",
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.0_models/vocoder_models--en--blizzard2013--hifigan_v2.zip",
|
||||
"commit": "d6284e7",
|
||||
"author": "Adam Froghyar @a-froghyar",
|
||||
"license": "apache 2.0",
|
||||
"contact": "adamfroghyar@gmail.com"
|
||||
}
|
||||
},
|
||||
"vctk": {
|
||||
"hifigan_v2": {
|
||||
"description": "Finetuned and intended to be used with tts_models/en/vctk/sc-glow-tts",
|
||||
|
|
|
@@ -1 +1 @@
-0.6.2
+0.7.0
|
|
@ -2,51 +2,48 @@ import argparse
|
|||
import os
|
||||
from argparse import RawTextHelpFormatter
|
||||
|
||||
import torch
|
||||
from tqdm import tqdm
|
||||
|
||||
from TTS.config import load_config
|
||||
from TTS.tts.datasets import load_tts_samples
|
||||
from TTS.tts.utils.managers import save_file
|
||||
from TTS.tts.utils.speakers import SpeakerManager
|
||||
|
||||
parser = argparse.ArgumentParser(
|
||||
description="""Compute embedding vectors for each wav file in a dataset.\n\n"""
|
||||
"""
|
||||
Example runs:
|
||||
python TTS/bin/compute_embeddings.py speaker_encoder_model.pth speaker_encoder_config.json dataset_config.json embeddings_output_path/
|
||||
python TTS/bin/compute_embeddings.py speaker_encoder_model.pth speaker_encoder_config.json dataset_config.json
|
||||
""",
|
||||
formatter_class=RawTextHelpFormatter,
|
||||
)
|
||||
parser.add_argument("model_path", type=str, help="Path to model checkpoint file.")
|
||||
parser.add_argument(
|
||||
"config_path",
|
||||
type=str,
|
||||
help="Path to model config file.",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"config_dataset_path",
|
||||
type=str,
|
||||
help="Path to dataset config file.",
|
||||
)
|
||||
parser.add_argument("output_path", type=str, help="path for output speakers.json and/or speakers.npy.")
|
||||
parser.add_argument(
|
||||
"--old_file", type=str, help="Previous speakers.json file, only compute for new audios.", default=None
|
||||
)
|
||||
parser.add_argument("--use_cuda", type=bool, help="flag to set cuda.", default=True)
|
||||
parser.add_argument("--eval", type=bool, help="compute eval.", default=True)
|
||||
parser.add_argument("config_path", type=str, help="Path to model config file.")
|
||||
parser.add_argument("config_dataset_path", type=str, help="Path to dataset config file.")
|
||||
parser.add_argument("--output_path", type=str, help="Path for output `pth` or `json` file.", default="speakers.pth")
|
||||
parser.add_argument("--old_file", type=str, help="Previous embedding file to only compute new audios.", default=None)
|
||||
parser.add_argument("--disable_cuda", type=bool, help="Flag to disable cuda.", default=False)
|
||||
parser.add_argument("--no_eval", type=bool, help="Do not compute eval?. Default False", default=False)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
use_cuda = torch.cuda.is_available() and not args.disable_cuda
|
||||
|
||||
c_dataset = load_config(args.config_dataset_path)
|
||||
|
||||
meta_data_train, meta_data_eval = load_tts_samples(c_dataset.datasets, eval_split=args.eval)
|
||||
meta_data_train, meta_data_eval = load_tts_samples(c_dataset.datasets, eval_split=not args.no_eval)
|
||||
|
||||
if meta_data_eval is None:
|
||||
wav_files = meta_data_train
|
||||
else:
|
||||
wav_files = meta_data_train + meta_data_eval
|
||||
|
||||
encoder_manager = SpeakerManager(
|
||||
encoder_model_path=args.model_path,
|
||||
encoder_config_path=args.config_path,
|
||||
d_vectors_file_path=args.old_file,
|
||||
use_cuda=args.use_cuda,
|
||||
use_cuda=use_cuda,
|
||||
)
|
||||
|
||||
class_name_key = encoder_manager.encoder_config.class_name_key
|
||||
|
@ -75,13 +72,13 @@ for idx, wav_file in enumerate(tqdm(wav_files)):
|
|||
|
||||
if speaker_mapping:
|
||||
# save speaker_mapping if target dataset is defined
|
||||
if ".json" not in args.output_path:
|
||||
mapping_file_path = os.path.join(args.output_path, "speakers.json")
|
||||
if os.path.isdir(args.output_path):
|
||||
mapping_file_path = os.path.join(args.output_path, "speakers.pth")
|
||||
else:
|
||||
mapping_file_path = args.output_path
|
||||
|
||||
if os.path.dirname(mapping_file_path) != "":
|
||||
os.makedirs(os.path.dirname(mapping_file_path), exist_ok=True)
|
||||
|
||||
# pylint: disable=W0212
|
||||
encoder_manager._save_json(mapping_file_path, speaker_mapping)
|
||||
save_file(speaker_mapping, mapping_file_path)
|
||||
print("Speaker embeddings saved at:", mapping_file_path)
|
||||
|
|
|
@ -39,6 +39,18 @@ If you don't specify any models, then it uses LJSpeech based English model.
|
|||
$ tts --list_models
|
||||
```
|
||||
|
||||
- Query model info by idx:
|
||||
|
||||
```
|
||||
$ tts --model_info_by_idx "<model_type>/<model_query_idx>"
|
||||
```
|
||||
|
||||
- Query model info by full name:
|
||||
|
||||
```
|
||||
$ tts --model_info_by_name "<model_type>/<language>/<dataset>/<model_name>"
|
||||
```
|
||||
|
||||
- Run TTS with default models:
|
||||
|
||||
```
|
||||
|
@ -48,7 +60,7 @@ If you don't specify any models, then it uses LJSpeech based English model.
|
|||
- Run a TTS model with its default vocoder model:
|
||||
|
||||
```
|
||||
$ tts --text "Text for TTS" --model_name "<language>/<dataset>/<model_name>
|
||||
$ tts --text "Text for TTS" --model_name "<language>/<dataset>/<model_name>"
|
||||
```
|
||||
|
||||
- Run with specific TTS and vocoder models from the list:
|
||||
|
@ -104,6 +116,21 @@ If you don't specify any models, then it uses LJSpeech based English model.
|
|||
default=False,
|
||||
help="list available pre-trained TTS and vocoder models.",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--model_info_by_idx",
|
||||
type=str,
|
||||
default=None,
|
||||
help="model info using query format: <model_type>/<model_query_idx>",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--model_info_by_name",
|
||||
type=str,
|
||||
default=None,
|
||||
help="model info using query format: <model_type>/<language>/<dataset>/<model_name>",
|
||||
)
|
||||
|
||||
parser.add_argument("--text", type=str, default=None, help="Text to generate speech.")
|
||||
|
||||
# Args for running pre-trained TTS models.
|
||||
|
@ -172,6 +199,10 @@ If you don't specify any models, then it uses LJSpeech based English model.
|
|||
default=None,
|
||||
)
|
||||
parser.add_argument("--gst_style", help="Wav path file for GST style reference.", default=None)
|
||||
parser.add_argument(
|
||||
"--capacitron_style_wav", type=str, help="Wav path file for Capacitron prosody reference.", default=None
|
||||
)
|
||||
parser.add_argument("--capacitron_style_text", type=str, help="Transcription of the reference.", default=None)
|
||||
parser.add_argument(
|
||||
"--list_speaker_idxs",
|
||||
help="List available speaker ids for the defined multi-speaker model.",
|
||||
|
@ -210,13 +241,16 @@ If you don't specify any models, then it uses LJSpeech based English model.
|
|||
args = parser.parse_args()
|
||||
|
||||
# print the description if either text or list_models is not set
|
||||
if (
|
||||
not args.text
|
||||
and not args.list_models
|
||||
and not args.list_speaker_idxs
|
||||
and not args.list_language_idxs
|
||||
and not args.reference_wav
|
||||
):
|
||||
check_args = [
|
||||
args.text,
|
||||
args.list_models,
|
||||
args.list_speaker_idxs,
|
||||
args.list_language_idxs,
|
||||
args.reference_wav,
|
||||
args.model_info_by_idx,
|
||||
args.model_info_by_name,
|
||||
]
|
||||
if not any(check_args):
|
||||
parser.parse_args(["-h"])
|
||||
|
||||
# load model manager
|
||||
|
@ -232,12 +266,23 @@ If you don't specify any models, then it uses LJSpeech based English model.
|
|||
encoder_path = None
|
||||
encoder_config_path = None
|
||||
|
||||
# CASE1: list pre-trained TTS models
|
||||
# CASE1 #list : list pre-trained TTS models
|
||||
if args.list_models:
|
||||
manager.list_models()
|
||||
sys.exit()
|
||||
|
||||
# CASE2: load pre-trained model paths
|
||||
# CASE2 #info : model info of pre-trained TTS models
|
||||
if args.model_info_by_idx:
|
||||
model_query = args.model_info_by_idx
|
||||
manager.model_info_by_idx(model_query)
|
||||
sys.exit()
|
||||
|
||||
if args.model_info_by_name:
|
||||
model_query_full_name = args.model_info_by_name
|
||||
manager.model_info_by_full_name(model_query_full_name)
|
||||
sys.exit()
|
||||
|
||||
# CASE3: load pre-trained model paths
|
||||
if args.model_name is not None and not args.model_path:
|
||||
model_path, config_path, model_item = manager.download_model(args.model_name)
|
||||
args.vocoder_name = model_item["default_vocoder"] if args.vocoder_name is None else args.vocoder_name
|
||||
|
@ -245,7 +290,7 @@ If you don't specify any models, then it uses LJSpeech based English model.
|
|||
if args.vocoder_name is not None and not args.vocoder_path:
|
||||
vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name)
|
||||
|
||||
# CASE3: set custom model paths
|
||||
# CASE4: set custom model paths
|
||||
if args.model_path is not None:
|
||||
model_path = args.model_path
|
||||
config_path = args.config_path
|
||||
|
@ -308,6 +353,8 @@ If you don't specify any models, then it uses LJSpeech based English model.
|
|||
args.language_idx,
|
||||
args.speaker_wav,
|
||||
reference_wav=args.reference_wav,
|
||||
style_wav=args.capacitron_style_wav,
|
||||
style_text=args.capacitron_style_text,
|
||||
reference_speaker_name=args.reference_speaker_idx,
|
||||
)
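The new `--model_info_by_idx` and `--model_info_by_name` flags map to the `ModelManager` calls visible above (`model_info_by_idx`, `model_info_by_full_name`). A hedged sketch of the same queries from Python; the import path, constructor argument, and model name below are assumptions, only the method names appear in this diff:

```python
# Sketch: query model metadata the way the new CLI flags do.
from TTS.utils.manage import ModelManager  # assumed import path

manager = ModelManager("TTS/.models.json")  # assumed constructor argument
manager.model_info_by_idx("tts_models/3")                       # like --model_info_by_idx
manager.model_info_by_full_name("tts_models/en/ljspeech/vits")  # like --model_info_by_name
```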
|
||||
|
||||
|
|
|
@@ -1,9 +1,3 @@
-<!-- ## TTS example web-server
-
-You'll need a model package (Zip file, includes TTS Python wheel, model files, server configuration, and optional nginx/uwsgi configs). Publicly available models are listed [here](https://github.com/mozilla/TTS/wiki/Released-Models#simple-packaging---self-contained-package-that-runs-an-http-api-for-a-pre-trained-tts-model).
-
-Instructions below are based on a Ubuntu 18.04 machine, but it should be simple to adapt the package names to other distros if needed. Python 3.6 is recommended, as some of the dependencies' versions predate Python 3.7 and will force building from source, which requires extra dependencies and is not guaranteed to work. -->
-
# :frog: TTS demo server
Before you use the server, make sure you [install](https://github.com/coqui-ai/TTS/tree/dev#install-tts) :frog: TTS properly. Then, you can follow the steps below.
|
||||
|
||||
|
|
|
@@ -111,7 +111,10 @@ synthesizer = Synthesizer(
    use_cuda=args.use_cuda,
)

-use_multi_speaker = hasattr(synthesizer.tts_model, "num_speakers") and synthesizer.tts_model.num_speakers > 1
+use_multi_speaker = hasattr(synthesizer.tts_model, "num_speakers") and (
+    synthesizer.tts_model.num_speakers > 1 or synthesizer.tts_speakers_file is not None
+)
+
speaker_manager = getattr(synthesizer.tts_model, "speaker_manager", None)
# TODO: set this from SpeakerManager
use_gst = synthesizer.tts_config.get("use_gst", False)
|
||||
|
|
|
@ -48,6 +48,50 @@ class GSTConfig(Coqpit):
|
|||
check_argument("gst_num_style_tokens", c, restricted=True, min_val=1, max_val=1000)
|
||||
|
||||
|
||||
@dataclass
|
||||
class CapacitronVAEConfig(Coqpit):
|
||||
"""Defines the capacitron VAE Module
|
||||
Args:
|
||||
capacitron_capacity (int):
|
||||
Defines the variational capacity limit of the prosody embeddings. Defaults to 150.
|
||||
capacitron_VAE_embedding_dim (int):
|
||||
Defines the size of the Capacitron embedding vector dimension. Defaults to 128.
|
||||
capacitron_use_text_summary_embeddings (bool):
|
||||
If True, use a text summary embedding in Capacitron. Defaults to True.
|
||||
capacitron_text_summary_embedding_dim (int):
|
||||
Defines the size of the capacitron text embedding vector dimension. Defaults to 128.
|
||||
capacitron_use_speaker_embedding (bool):
|
||||
if True use speaker embeddings in Capacitron. Defaults to False.
|
||||
capacitron_VAE_loss_alpha (float):
|
||||
Weight for the VAE loss of the Tacotron model. If set less than or equal to zero, it disables the
|
||||
corresponding loss function. Defaults to 0.25
|
||||
capacitron_grad_clip (float):
|
||||
Gradient clipping value for all gradients except beta. Defaults to 5.0
|
||||
"""
|
||||
|
||||
capacitron_loss_alpha: int = 1
|
||||
capacitron_capacity: int = 150
|
||||
capacitron_VAE_embedding_dim: int = 128
|
||||
capacitron_use_text_summary_embeddings: bool = True
|
||||
capacitron_text_summary_embedding_dim: int = 128
|
||||
capacitron_use_speaker_embedding: bool = False
|
||||
capacitron_VAE_loss_alpha: float = 0.25
|
||||
capacitron_grad_clip: float = 5.0
|
||||
|
||||
def check_values(
|
||||
self,
|
||||
):
|
||||
"""Check config fields"""
|
||||
c = asdict(self)
|
||||
super().check_values()
|
||||
check_argument("capacitron_capacity", c, restricted=True, min_val=10, max_val=500)
|
||||
check_argument("capacitron_VAE_embedding_dim", c, restricted=True, min_val=16, max_val=1024)
|
||||
check_argument("capacitron_use_speaker_embedding", c, restricted=False)
|
||||
check_argument("capacitron_text_summary_embedding_dim", c, restricted=False, min_val=16, max_val=512)
|
||||
check_argument("capacitron_VAE_loss_alpha", c, restricted=False)
|
||||
check_argument("capacitron_grad_clip", c, restricted=False)
|
||||
|
||||
|
||||
@dataclass
|
||||
class CharactersConfig(Coqpit):
|
||||
"""Defines arguments for the `BaseCharacters` or `BaseVocabulary` and their subclasses.
|
||||
|
@ -232,6 +276,14 @@ class BaseTTSConfig(BaseTrainingConfig):
|
|||
|
||||
language_weighted_sampler_alpha (float):
|
||||
Number that control the influence of the language sampler weights. Defaults to ```1.0```.
|
||||
|
||||
use_length_weighted_sampler (bool):
|
||||
Enable / Disable the batch balancer by audio length. If enabled the dataset will be divided
|
||||
into 10 buckets considering the min and max audio of the dataset. The sampler weights will be
|
||||
computed forcing to have the same quantity of data for each bucket in each training batch. Defaults to ```False```.
|
||||
|
||||
length_weighted_sampler_alpha (float):
|
||||
Number that control the influence of the length sampler weights. Defaults to ```1.0```.
|
||||
"""
|
||||
|
||||
audio: BaseAudioConfig = field(default_factory=BaseAudioConfig)
|
||||
|
@ -279,3 +331,5 @@ class BaseTTSConfig(BaseTrainingConfig):
|
|||
speaker_weighted_sampler_alpha: float = 1.0
|
||||
use_language_weighted_sampler: bool = False
|
||||
language_weighted_sampler_alpha: float = 1.0
|
||||
use_length_weighted_sampler: bool = False
|
||||
length_weighted_sampler_alpha: float = 1.0
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
from dataclasses import dataclass, field
|
||||
from typing import List
|
||||
|
||||
from TTS.tts.configs.shared_configs import BaseTTSConfig, GSTConfig
|
||||
from TTS.tts.configs.shared_configs import BaseTTSConfig, CapacitronVAEConfig, GSTConfig
|
||||
|
||||
|
||||
@dataclass
|
||||
|
@ -23,6 +23,10 @@ class TacotronConfig(BaseTTSConfig):
|
|||
gst_style_input (str):
|
||||
Path to the wav file used at inference to set the speech style through GST. If `GST` is enabled and
|
||||
this is not defined, the model uses a zero vector as an input. Defaults to None.
|
||||
use_capacitron_vae (bool):
|
||||
enable / disable the use of Capacitron modules. Defaults to False.
|
||||
capacitron_vae (CapacitronConfig):
|
||||
Instance of `CapacitronConfig` class.
|
||||
num_chars (int):
|
||||
Number of characters used by the model. It must be defined before initializing the model. Defaults to None.
|
||||
num_speakers (int):
|
||||
|
@ -143,6 +147,9 @@ class TacotronConfig(BaseTTSConfig):
|
|||
gst: GSTConfig = None
|
||||
gst_style_input: str = None
|
||||
|
||||
use_capacitron_vae: bool = False
|
||||
capacitron_vae: CapacitronVAEConfig = None
|
||||
|
||||
# model specific params
|
||||
num_speakers: int = 1
|
||||
num_chars: int = 0
|
||||
|
|
|
@ -5,6 +5,7 @@ from glob import glob
|
|||
from pathlib import Path
|
||||
from typing import List
|
||||
|
||||
import pandas as pd
|
||||
from tqdm import tqdm
|
||||
|
||||
########################
|
||||
|
@ -12,6 +13,34 @@ from tqdm import tqdm
|
|||
########################
|
||||
|
||||
|
||||
def coqui(root_path, meta_file, ignored_speakers=None):
|
||||
"""Interal dataset formatter."""
|
||||
metadata = pd.read_csv(os.path.join(root_path, meta_file), sep="|")
|
||||
assert all(x in metadata.columns for x in ["audio_file", "text"])
|
||||
speaker_name = None if "speaker_name" in metadata.columns else "coqui"
|
||||
emotion_name = None if "emotion_name" in metadata.columns else "neutral"
|
||||
items = []
|
||||
not_found_counter = 0
|
||||
for row in metadata.itertuples():
|
||||
if speaker_name is None and ignored_speakers is not None and row.speaker_name in ignored_speakers:
|
||||
continue
|
||||
audio_path = os.path.join(root_path, row.audio_file)
|
||||
if not os.path.exists(audio_path):
|
||||
not_found_counter += 1
|
||||
continue
|
||||
items.append(
|
||||
{
|
||||
"text": row.text,
|
||||
"audio_file": audio_path,
|
||||
"speaker_name": speaker_name if speaker_name is not None else row.speaker_name,
|
||||
"emotion_name": emotion_name if emotion_name is not None else row.emotion_name,
|
||||
}
|
||||
)
|
||||
if not_found_counter > 0:
|
||||
print(f" | > [!] {not_found_counter} files not found")
|
||||
return items
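The `coqui` formatter above reads a pipe-separated metadata file with at least `audio_file` and `text` columns (optional `speaker_name` and `emotion_name`). A small sketch of writing a compatible file; the paths and sentences are made up for illustration:

```python
# Sketch: write a metadata file the `coqui` formatter can read.
# Column names and the "|" separator come from the formatter above;
# file names and text are illustrative only.
import pandas as pd

rows = [
    {"audio_file": "wavs/clip_0001.wav", "text": "Hello there.", "speaker_name": "spk_a"},
    {"audio_file": "wavs/clip_0002.wav", "text": "How are you?", "speaker_name": "spk_a"},
]
pd.DataFrame(rows).to_csv("metadata.csv", sep="|", index=False)
```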
|
||||
|
||||
|
||||
def tweb(root_path, meta_file, **kwargs): # pylint: disable=unused-argument
|
||||
"""Normalize TWEB dataset.
|
||||
https://www.kaggle.com/bryanpark/the-world-english-bible-speech-dataset
|
||||
|
@ -141,6 +170,21 @@ def ljspeech_test(root_path, meta_file, **kwargs): # pylint: disable=unused-arg
|
|||
return items
|
||||
|
||||
|
||||
def thorsten(root_path, meta_file, **kwargs): # pylint: disable=unused-argument
|
||||
"""Normalizes the thorsten meta data file to TTS format
|
||||
https://github.com/thorstenMueller/deep-learning-german-tts/"""
|
||||
txt_file = os.path.join(root_path, meta_file)
|
||||
items = []
|
||||
speaker_name = "thorsten"
|
||||
with open(txt_file, "r", encoding="utf-8") as ttf:
|
||||
for line in ttf:
|
||||
cols = line.split("|")
|
||||
wav_file = os.path.join(root_path, "wavs", cols[0] + ".wav")
|
||||
text = cols[1]
|
||||
items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name})
|
||||
return items
|
||||
|
||||
|
||||
def sam_accenture(root_path, meta_file, **kwargs): # pylint: disable=unused-argument
|
||||
"""Normalizes the sam-accenture meta data file to TTS format
|
||||
https://github.com/Sam-Accenture-Non-Binary-Voice/non-binary-voice-files"""
|
||||
|
@ -352,6 +396,25 @@ def vctk_old(root_path, meta_files=None, wavs_path="wav48", ignored_speakers=Non
|
|||
return items
|
||||
|
||||
|
||||
def synpaflex(root_path, metafiles=None, **kwargs): # pylint: disable=unused-argument
|
||||
items = []
|
||||
speaker_name = "synpaflex"
|
||||
root_path = os.path.join(root_path, "")
|
||||
wav_files = glob(f"{root_path}**/*.wav", recursive=True)
|
||||
for wav_file in wav_files:
|
||||
if os.sep + "wav" + os.sep in wav_file:
|
||||
txt_file = wav_file.replace("wav", "txt")
|
||||
else:
|
||||
txt_file = os.path.join(
|
||||
os.path.dirname(wav_file), "txt", os.path.basename(wav_file).replace(".wav", ".txt")
|
||||
)
|
||||
if os.path.exists(txt_file) and os.path.exists(wav_file):
|
||||
with open(txt_file, "r", encoding="utf-8") as file_text:
|
||||
text = file_text.readlines()[0]
|
||||
items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name})
|
||||
return items
|
||||
|
||||
|
||||
def open_bible(root_path, meta_files="train", ignore_digits_sentences=True, ignored_speakers=None):
|
||||
"""ToDo: Refer the paper when available"""
|
||||
items = []
|
||||
|
|
|
@ -281,6 +281,10 @@ class TacotronLoss(torch.nn.Module):
|
|||
def __init__(self, c, ga_sigma=0.4):
|
||||
super().__init__()
|
||||
self.stopnet_pos_weight = c.stopnet_pos_weight
|
||||
self.use_capacitron_vae = c.use_capacitron_vae
|
||||
if self.use_capacitron_vae:
|
||||
self.capacitron_capacity = c.capacitron_vae.capacitron_capacity
|
||||
self.capacitron_vae_loss_alpha = c.capacitron_vae.capacitron_VAE_loss_alpha
|
||||
self.ga_alpha = c.ga_alpha
|
||||
self.decoder_diff_spec_alpha = c.decoder_diff_spec_alpha
|
||||
self.postnet_diff_spec_alpha = c.postnet_diff_spec_alpha
|
||||
|
@ -308,6 +312,9 @@ class TacotronLoss(torch.nn.Module):
|
|||
# pylint: disable=not-callable
|
||||
self.criterion_st = BCELossMasked(pos_weight=torch.tensor(self.stopnet_pos_weight)) if c.stopnet else None
|
||||
|
||||
# For dev purposes only
|
||||
self.criterion_capacitron_reconstruction_loss = nn.L1Loss(reduction="sum")
|
||||
|
||||
def forward(
|
||||
self,
|
||||
postnet_output,
|
||||
|
@ -317,6 +324,7 @@ class TacotronLoss(torch.nn.Module):
|
|||
stopnet_output,
|
||||
stopnet_target,
|
||||
stop_target_length,
|
||||
capacitron_vae_outputs,
|
||||
output_lens,
|
||||
decoder_b_output,
|
||||
alignments,
|
||||
|
@ -348,6 +356,55 @@ class TacotronLoss(torch.nn.Module):
|
|||
return_dict["decoder_loss"] = decoder_loss
|
||||
return_dict["postnet_loss"] = postnet_loss
|
||||
|
||||
if self.use_capacitron_vae:
|
||||
# extract capacitron vae infos
|
||||
posterior_distribution, prior_distribution, beta = capacitron_vae_outputs
|
||||
|
||||
# KL divergence term between the posterior and the prior
|
||||
kl_term = torch.mean(torch.distributions.kl_divergence(posterior_distribution, prior_distribution))
|
||||
|
||||
# Limit the mutual information between the data and latent space by the variational capacity limit
|
||||
kl_capacity = kl_term - self.capacitron_capacity
|
||||
|
||||
# pass beta through softplus to keep it positive
|
||||
beta = torch.nn.functional.softplus(beta)[0]
|
||||
|
||||
# This is the term going to the main ADAM optimiser, we detach beta because
|
||||
# beta is optimised by a separate, SGD optimiser below
|
||||
capacitron_vae_loss = beta.detach() * kl_capacity
|
||||
|
||||
# normalize the capacitron_vae_loss as in L1Loss or MSELoss.
|
||||
# After this, both the standard loss and capacitron_vae_loss will be in the same scale.
|
||||
# For this reason we don't need to use L1Loss and MSELoss in "sum" reduction mode.
|
||||
# Note: the batch is not considered because the L1Loss was calculated in "sum" mode
|
||||
# divided by the batch size, so not dividing the capacitron_vae_loss by B is legitimate.
|
||||
|
||||
# get B T D dimension from input
|
||||
B, T, D = mel_input.size()
|
||||
# normalize
|
||||
if self.config.loss_masking:
|
||||
# if mask loss get T using the mask
|
||||
T = output_lens.sum() / B
|
||||
|
||||
# Only for dev purposes to be able to compare the reconstruction loss with the values in the
|
||||
# original Capacitron paper
|
||||
return_dict["capaciton_reconstruction_loss"] = (
|
||||
self.criterion_capacitron_reconstruction_loss(decoder_output, mel_input) / decoder_output.size(0)
|
||||
) + kl_capacity
|
||||
|
||||
capacitron_vae_loss = capacitron_vae_loss / (T * D)
|
||||
capacitron_vae_loss = capacitron_vae_loss * self.capacitron_vae_loss_alpha
|
||||
|
||||
# This is the term to purely optimise beta and to pass into the SGD optimizer
|
||||
beta_loss = torch.negative(beta) * kl_capacity.detach()
|
||||
|
||||
loss += capacitron_vae_loss
|
||||
|
||||
return_dict["capacitron_vae_loss"] = capacitron_vae_loss
|
||||
return_dict["capacitron_vae_beta_loss"] = beta_loss
|
||||
return_dict["capacitron_vae_kl_term"] = kl_term
|
||||
return_dict["capacitron_beta"] = beta
|
||||
|
||||
stop_loss = (
|
||||
self.criterion_st(stopnet_output, stopnet_target, stop_target_length)
|
||||
if self.config.stopnet
|
||||
|
|
|
@@ -484,4 +484,4 @@ def init_attn(
        beta=0.9,
    )

-    raise RuntimeError(" [!] Given Attention Type '{attn_type}' is not exist.")
+    raise RuntimeError(f" [!] Given Attention Type '{attn_type}' is not exist.")
|
||||
|
|
|
@ -0,0 +1,205 @@
|
|||
import torch
|
||||
from torch import nn
|
||||
from torch.distributions.multivariate_normal import MultivariateNormal as MVN
|
||||
from torch.nn import functional as F
|
||||
|
||||
|
||||
class CapacitronVAE(nn.Module):
|
||||
"""Effective Use of Variational Embedding Capacity for prosody transfer.
|
||||
|
||||
See https://arxiv.org/abs/1906.03402"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
num_mel,
|
||||
capacitron_VAE_embedding_dim,
|
||||
encoder_output_dim=256,
|
||||
reference_encoder_out_dim=128,
|
||||
speaker_embedding_dim=None,
|
||||
text_summary_embedding_dim=None,
|
||||
):
|
||||
super().__init__()
|
||||
# Init distributions
|
||||
self.prior_distribution = MVN(
|
||||
torch.zeros(capacitron_VAE_embedding_dim), torch.eye(capacitron_VAE_embedding_dim)
|
||||
)
|
||||
self.approximate_posterior_distribution = None
|
||||
# define output ReferenceEncoder dim to the capacitron_VAE_embedding_dim
|
||||
self.encoder = ReferenceEncoder(num_mel, out_dim=reference_encoder_out_dim)
|
||||
|
||||
# Init beta, the lagrange-like term for the KL distribution
|
||||
self.beta = torch.nn.Parameter(torch.log(torch.exp(torch.Tensor([1.0])) - 1), requires_grad=True)
|
||||
mlp_input_dimension = reference_encoder_out_dim
|
||||
|
||||
if text_summary_embedding_dim is not None:
|
||||
self.text_summary_net = TextSummary(text_summary_embedding_dim, encoder_output_dim=encoder_output_dim)
|
||||
mlp_input_dimension += text_summary_embedding_dim
|
||||
if speaker_embedding_dim is not None:
|
||||
# TODO: Test a multispeaker model!
|
||||
mlp_input_dimension += speaker_embedding_dim
|
||||
self.post_encoder_mlp = PostEncoderMLP(mlp_input_dimension, capacitron_VAE_embedding_dim)
|
||||
|
||||
def forward(self, reference_mel_info=None, text_info=None, speaker_embedding=None):
|
||||
# Use reference
|
||||
if reference_mel_info is not None:
|
||||
reference_mels = reference_mel_info[0] # [batch_size, num_frames, num_mels]
|
||||
mel_lengths = reference_mel_info[1] # [batch_size]
|
||||
enc_out = self.encoder(reference_mels, mel_lengths)
|
||||
|
||||
# concat speaker_embedding and/or text summary embedding
|
||||
if text_info is not None:
|
||||
text_inputs = text_info[0] # [batch_size, num_characters, num_embedding]
|
||||
input_lengths = text_info[1]
|
||||
text_summary_out = self.text_summary_net(text_inputs, input_lengths).to(reference_mels.device)
|
||||
enc_out = torch.cat([enc_out, text_summary_out], dim=-1)
|
||||
if speaker_embedding is not None:
|
||||
enc_out = torch.cat([enc_out, speaker_embedding], dim=-1)
|
||||
|
||||
# Feed the output of the ref encoder and information about text/speaker into
|
||||
# an MLP to produce the parameters for the approximate posterior distribution
|
||||
mu, sigma = self.post_encoder_mlp(enc_out)
|
||||
# convert to cpu because prior_distribution was created on cpu
|
||||
mu = mu.cpu()
|
||||
sigma = sigma.cpu()
|
||||
|
||||
# Sample from the posterior: z ~ q(z|x)
|
||||
self.approximate_posterior_distribution = MVN(mu, torch.diag_embed(sigma))
|
||||
VAE_embedding = self.approximate_posterior_distribution.rsample()
|
||||
# Infer from the model, bypasses encoding
|
||||
else:
|
||||
# Sample from the prior: z ~ p(z)
|
||||
VAE_embedding = self.prior_distribution.sample().unsqueeze(0)
|
||||
|
||||
# reshape to [batch_size, 1, capacitron_VAE_embedding_dim]
|
||||
return VAE_embedding.unsqueeze(1), self.approximate_posterior_distribution, self.prior_distribution, self.beta
|
||||
|
||||
|
||||
class ReferenceEncoder(nn.Module):
|
||||
"""NN module creating a fixed size prosody embedding from a spectrogram.
|
||||
|
||||
inputs: mel spectrograms [batch_size, num_spec_frames, num_mel]
|
||||
outputs: [batch_size, embedding_dim]
|
||||
"""
|
||||
|
||||
def __init__(self, num_mel, out_dim):
|
||||
|
||||
super().__init__()
|
||||
self.num_mel = num_mel
|
||||
filters = [1] + [32, 32, 64, 64, 128, 128]
|
||||
num_layers = len(filters) - 1
|
||||
convs = [
|
||||
nn.Conv2d(
|
||||
in_channels=filters[i], out_channels=filters[i + 1], kernel_size=(3, 3), stride=(2, 2), padding=(2, 2)
|
||||
)
|
||||
for i in range(num_layers)
|
||||
]
|
||||
self.convs = nn.ModuleList(convs)
|
||||
self.training = False
|
||||
self.bns = nn.ModuleList([nn.BatchNorm2d(num_features=filter_size) for filter_size in filters[1:]])
|
||||
|
||||
post_conv_height = self.calculate_post_conv_height(num_mel, 3, 2, 2, num_layers)
|
||||
self.recurrence = nn.LSTM(
|
||||
input_size=filters[-1] * post_conv_height, hidden_size=out_dim, batch_first=True, bidirectional=False
|
||||
)
|
||||
|
||||
def forward(self, inputs, input_lengths):
|
||||
batch_size = inputs.size(0)
|
||||
x = inputs.view(batch_size, 1, -1, self.num_mel) # [batch_size, num_channels==1, num_frames, num_mel]
|
||||
valid_lengths = input_lengths.float() # [batch_size]
|
||||
for conv, bn in zip(self.convs, self.bns):
|
||||
x = conv(x)
|
||||
x = bn(x)
|
||||
x = F.relu(x)
|
||||
|
||||
# Create the post conv width mask based on the valid lengths of the output of the convolution.
|
||||
# The valid lengths for the output of a convolution on varying length inputs is
|
||||
# ceil(input_length/stride) + 1 for stride=3 and padding=2
|
||||
# For example (kernel_size=3, stride=2, padding=2):
|
||||
# 0 0 x x x x x 0 0 -> Input = 5, 0 is zero padding, x is valid values coming from padding=2 in conv2d
|
||||
# _____
|
||||
# x _____
|
||||
# x _____
|
||||
# x ____
|
||||
# x
|
||||
# x x x x -> Output valid length = 4
|
||||
# Since every example in the batch is zero padded and therefore has separate valid_lengths,
|
||||
# we need to mask off all the values AFTER the valid length for each example in the batch.
|
||||
# Otherwise, the convolutions create noise and a lot of spurious information
|
||||
valid_lengths = (valid_lengths / 2).float()
|
||||
valid_lengths = torch.ceil(valid_lengths).to(dtype=torch.int64) + 1 # 2 is stride -- size: [batch_size]
|
||||
post_conv_max_width = x.size(2)
|
||||
|
||||
mask = torch.arange(post_conv_max_width).to(inputs.device).expand(
|
||||
len(valid_lengths), post_conv_max_width
|
||||
) < valid_lengths.unsqueeze(1)
|
||||
mask = mask.expand(1, 1, -1, -1).transpose(2, 0).transpose(-1, 2) # [batch_size, 1, post_conv_max_width, 1]
|
||||
x = x * mask
|
||||
|
||||
x = x.transpose(1, 2)
|
||||
# x: 4D tensor [batch_size, post_conv_width,
|
||||
# num_channels==128, post_conv_height]
|
||||
|
||||
post_conv_width = x.size(1)
|
||||
x = x.contiguous().view(batch_size, post_conv_width, -1)
|
||||
# x: 3D tensor [batch_size, post_conv_width,
|
||||
# num_channels*post_conv_height]
|
||||
|
||||
# Routine for fetching the last valid output of a dynamic LSTM with varying input lengths and padding
|
||||
post_conv_input_lengths = valid_lengths
|
||||
packed_seqs = nn.utils.rnn.pack_padded_sequence(
|
||||
x, post_conv_input_lengths.tolist(), batch_first=True, enforce_sorted=False
|
||||
) # dynamic rnn sequence padding
|
||||
self.recurrence.flatten_parameters()
|
||||
_, (ht, _) = self.recurrence(packed_seqs)
|
||||
last_output = ht[-1]
|
||||
|
||||
return last_output.to(inputs.device) # [B, 128]
|
||||
|
||||
@staticmethod
|
||||
def calculate_post_conv_height(height, kernel_size, stride, pad, n_convs):
|
||||
"""Height of spec after n convolutions with fixed kernel/stride/pad."""
|
||||
for _ in range(n_convs):
|
||||
height = (height - kernel_size + 2 * pad) // stride + 1
|
||||
return height
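As a quick sanity check on `calculate_post_conv_height`: with the six conv layers above (kernel 3, stride 2, padding 2), an 80-band mel input ends up with height 4, so the LSTM input size is `filters[-1] * 4 = 512`. A self-contained check, assuming `num_mel=80`:

```python
# Quick check of the post-conv height used to size the LSTM input above:
# six convs with kernel 3, stride 2, padding 2 applied to an 80-band mel.
def calculate_post_conv_height(height, kernel_size, stride, pad, n_convs):
    for _ in range(n_convs):
        height = (height - kernel_size + 2 * pad) // stride + 1
    return height

print(calculate_post_conv_height(80, 3, 2, 2, 6))  # 4 -> LSTM input size 128 * 4
```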
|
||||
|
||||
|
||||
class TextSummary(nn.Module):
|
||||
def __init__(self, embedding_dim, encoder_output_dim):
|
||||
super().__init__()
|
||||
self.lstm = nn.LSTM(
|
||||
encoder_output_dim, # text embedding dimension from the text encoder
|
||||
embedding_dim, # fixed length output summary the lstm creates from the input
|
||||
batch_first=True,
|
||||
bidirectional=False,
|
||||
)
|
||||
|
||||
def forward(self, inputs, input_lengths):
|
||||
# Routine for fetching the last valid output of a dynamic LSTM with varying input lengths and padding
|
||||
packed_seqs = nn.utils.rnn.pack_padded_sequence(
|
||||
inputs, input_lengths.tolist(), batch_first=True, enforce_sorted=False
|
||||
) # dynamic rnn sequence padding
|
||||
self.lstm.flatten_parameters()
|
||||
_, (ht, _) = self.lstm(packed_seqs)
|
||||
last_output = ht[-1]
|
||||
return last_output
|
||||
|
||||
|
||||
class PostEncoderMLP(nn.Module):
|
||||
def __init__(self, input_size, hidden_size):
|
||||
super().__init__()
|
||||
self.hidden_size = hidden_size
|
||||
modules = [
|
||||
nn.Linear(input_size, hidden_size), # Hidden Layer
|
||||
nn.Tanh(),
|
||||
nn.Linear(hidden_size, hidden_size * 2),
|
||||
] # Output layer twice the size for mean and variance
|
||||
self.net = nn.Sequential(*modules)
|
||||
self.softplus = nn.Softplus()
|
||||
|
||||
def forward(self, _input):
|
||||
mlp_output = self.net(_input)
|
||||
# The mean parameter is unconstrained
|
||||
mu = mlp_output[:, : self.hidden_size]
|
||||
# The standard deviation must be positive. Parameterise with a softplus
|
||||
sigma = self.softplus(mlp_output[:, self.hidden_size :])
|
||||
return mu, sigma
|
|
@@ -58,10 +58,8 @@ class VitsDiscriminator(nn.Module):
        use_spectral_norm (bool): if `True`, switch to spectral norm instead of weight norm.
    """

-    def __init__(self, use_spectral_norm=False):
+    def __init__(self, periods=(2, 3, 5, 7, 11), use_spectral_norm=False):
        super().__init__()
-        periods = [2, 3, 5, 7, 11]

        self.nets = nn.ModuleList()
        self.nets.append(DiscriminatorS(use_spectral_norm=use_spectral_norm))
        self.nets.extend([DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods])
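With the change above, the discriminator periods become a constructor argument instead of a hard-coded list. A one-line usage sketch; the import path and the extra period value are assumptions:

```python
# Sketch: instantiate the discriminator with custom periods (values are examples).
from TTS.tts.models.vits import VitsDiscriminator  # assumed import path

disc = VitsDiscriminator(periods=(2, 3, 5, 7, 11, 17), use_spectral_norm=False)
```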
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
import copy
|
||||
from abc import abstractmethod
|
||||
from typing import Dict
|
||||
from typing import Dict, Tuple
|
||||
|
||||
import torch
|
||||
from coqpit import Coqpit
|
||||
|
@ -10,7 +10,9 @@ from TTS.tts.layers.losses import TacotronLoss
|
|||
from TTS.tts.models.base_tts import BaseTTS
|
||||
from TTS.tts.utils.helpers import sequence_mask
|
||||
from TTS.tts.utils.speakers import SpeakerManager
|
||||
from TTS.tts.utils.synthesis import synthesis
|
||||
from TTS.tts.utils.text.tokenizer import TTSTokenizer
|
||||
from TTS.tts.utils.visual import plot_alignment, plot_spectrogram
|
||||
from TTS.utils.generic_utils import format_aux_input
|
||||
from TTS.utils.io import load_fsspec
|
||||
from TTS.utils.training import gradual_training_scheduler
|
||||
|
@ -47,6 +49,11 @@ class BaseTacotron(BaseTTS):
|
|||
self.decoder_in_features += self.gst.gst_embedding_dim # add gst embedding dim
|
||||
self.gst_layer = None
|
||||
|
||||
# Capacitron
|
||||
if self.capacitron_vae and self.use_capacitron_vae:
|
||||
self.decoder_in_features += self.capacitron_vae.capacitron_VAE_embedding_dim # add capacitron embedding dim
|
||||
self.capacitron_vae_layer = None
|
||||
|
||||
# additional layers
|
||||
self.decoder_backward = None
|
||||
self.coarse_decoder = None
|
||||
|
@ -125,6 +132,53 @@ class BaseTacotron(BaseTTS):
|
|||
speaker_manager = SpeakerManager.init_from_config(config)
|
||||
return BaseTacotron(config, ap, tokenizer, speaker_manager)
|
||||
|
||||
##########################
|
||||
# TEST AND LOG FUNCTIONS #
|
||||
##########################
|
||||
|
||||
def test_run(self, assets: Dict) -> Tuple[Dict, Dict]:
|
||||
"""Generic test run for `tts` models used by `Trainer`.
|
||||
|
||||
You can override this for a different behaviour.
|
||||
|
||||
Args:
|
||||
assets (dict): A dict of training assets. For `tts` models, it must include `{'audio_processor': ap}`.
|
||||
|
||||
Returns:
|
||||
Tuple[Dict, Dict]: Test figures and audios to be projected to Tensorboard.
|
||||
"""
|
||||
print(" | > Synthesizing test sentences.")
|
||||
test_audios = {}
|
||||
test_figures = {}
|
||||
test_sentences = self.config.test_sentences
|
||||
aux_inputs = self._get_test_aux_input()
|
||||
for idx, sen in enumerate(test_sentences):
|
||||
outputs_dict = synthesis(
|
||||
self,
|
||||
sen,
|
||||
self.config,
|
||||
"cuda" in str(next(self.parameters()).device),
|
||||
speaker_id=aux_inputs["speaker_id"],
|
||||
d_vector=aux_inputs["d_vector"],
|
||||
style_wav=aux_inputs["style_wav"],
|
||||
use_griffin_lim=True,
|
||||
do_trim_silence=False,
|
||||
)
|
||||
test_audios["{}-audio".format(idx)] = outputs_dict["wav"]
|
||||
test_figures["{}-prediction".format(idx)] = plot_spectrogram(
|
||||
outputs_dict["outputs"]["model_outputs"], self.ap, output_fig=False
|
||||
)
|
||||
test_figures["{}-alignment".format(idx)] = plot_alignment(
|
||||
outputs_dict["outputs"]["alignments"], output_fig=False
|
||||
)
|
||||
return {"figures": test_figures, "audios": test_audios}
|
||||
|
||||
def test_log(
|
||||
self, outputs: dict, logger: "Logger", assets: dict, steps: int # pylint: disable=unused-argument
|
||||
) -> None:
|
||||
logger.test_audios(steps, outputs["audios"], self.ap.sample_rate)
|
||||
logger.test_figures(steps, outputs["figures"])
|
||||
|
||||
#############################
|
||||
# COMMON COMPUTE FUNCTIONS
|
||||
#############################
|
||||
|
@ -160,7 +214,9 @@ class BaseTacotron(BaseTTS):
|
|||
)
|
||||
# scale_factor = self.decoder.r_init / self.decoder.r
|
||||
alignments_backward = torch.nn.functional.interpolate(
|
||||
alignments_backward.transpose(1, 2), size=alignments.shape[1], mode="nearest"
|
||||
alignments_backward.transpose(1, 2),
|
||||
size=alignments.shape[1],
|
||||
mode="nearest",
|
||||
).transpose(1, 2)
|
||||
decoder_outputs_backward = decoder_outputs_backward.transpose(1, 2)
|
||||
decoder_outputs_backward = decoder_outputs_backward[:, :T, :]
|
||||
|
@ -193,6 +249,25 @@ class BaseTacotron(BaseTTS):
|
|||
inputs = self._concat_speaker_embedding(inputs, gst_outputs)
|
||||
return inputs
|
||||
|
||||
def compute_capacitron_VAE_embedding(self, inputs, reference_mel_info, text_info=None, speaker_embedding=None):
|
||||
"""Capacitron Variational Autoencoder"""
|
||||
(VAE_outputs, posterior_distribution, prior_distribution, capacitron_beta,) = self.capacitron_vae_layer(
|
||||
reference_mel_info,
|
||||
text_info,
|
||||
speaker_embedding, # pylint: disable=not-callable
|
||||
)
|
||||
|
||||
VAE_outputs = VAE_outputs.to(inputs.device)
|
||||
encoder_output = self._concat_speaker_embedding(
|
||||
inputs, VAE_outputs
|
||||
) # concatenate to the output of the basic tacotron encoder
|
||||
return (
|
||||
encoder_output,
|
||||
posterior_distribution,
|
||||
prior_distribution,
|
||||
capacitron_beta,
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def _add_speaker_embedding(outputs, embedded_speakers):
|
||||
embedded_speakers_ = embedded_speakers.expand(outputs.size(0), outputs.size(1), -1)
|
||||
|
|
|
@ -12,6 +12,7 @@ from trainer.torch import DistributedSampler, DistributedSamplerWrapper
|
|||
|
||||
from TTS.model import BaseTrainerModel
|
||||
from TTS.tts.datasets.dataset import TTSDataset
|
||||
from TTS.tts.utils.data import get_length_balancer_weights
|
||||
from TTS.tts.utils.languages import LanguageManager, get_language_balancer_weights
|
||||
from TTS.tts.utils.speakers import SpeakerManager, get_speaker_balancer_weights, get_speaker_manager
|
||||
from TTS.tts.utils.synthesis import synthesis
|
||||
|
@ -250,6 +251,14 @@ class BaseTTS(BaseTrainerModel):
|
|||
else:
|
||||
weights = get_speaker_balancer_weights(data_items) * alpha
|
||||
|
||||
if getattr(config, "use_length_weighted_sampler", False):
|
||||
alpha = getattr(config, "length_weighted_sampler_alpha", 1.0)
|
||||
print(" > Using Length weighted sampler with alpha:", alpha)
|
||||
if weights is not None:
|
||||
weights += get_length_balancer_weights(data_items) * alpha
|
||||
else:
|
||||
weights = get_length_balancer_weights(data_items) * alpha
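The new length balancer is switched on from the config, mirroring the existing language and speaker samplers. A minimal sketch, assuming a `BaseTTSConfig`-derived config object; only the two fields come from this commit:

```python
# Sketch: enable the new length-weighted batch sampler from a TTS config.
# The config class is an example; the two fields are the ones added above.
from TTS.tts.configs.shared_configs import BaseTTSConfig

config = BaseTTSConfig(
    use_length_weighted_sampler=True,   # bucket samples by audio length
    length_weighted_sampler_alpha=1.0,  # influence of the length weights
)
```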
|
||||
|
||||
if weights is not None:
|
||||
sampler = WeightedRandomSampler(weights, len(weights))
|
||||
else:
|
||||
|
@ -398,16 +407,16 @@ class BaseTTS(BaseTrainerModel):
|
|||
return test_figures, test_audios
|
||||
|
||||
def on_init_start(self, trainer):
|
||||
"""Save the speaker.json and language_ids.json at the beginning of the training. Also update both paths."""
|
||||
"""Save the speaker.pth and language_ids.json at the beginning of the training. Also update both paths."""
|
||||
if self.speaker_manager is not None:
|
||||
output_path = os.path.join(trainer.output_path, "speakers.json")
|
||||
output_path = os.path.join(trainer.output_path, "speakers.pth")
|
||||
self.speaker_manager.save_ids_to_file(output_path)
|
||||
trainer.config.speakers_file = output_path
|
||||
# some models don't have `model_args` set
|
||||
if hasattr(trainer.config, "model_args"):
|
||||
trainer.config.model_args.speakers_file = output_path
|
||||
trainer.config.save_json(os.path.join(trainer.output_path, "config.json"))
|
||||
print(f" > `speakers.json` is saved to {output_path}.")
|
||||
print(f" > `speakers.pth` is saved to {output_path}.")
|
||||
print(" > `speakers_file` is updated in the config.json.")
|
||||
|
||||
if hasattr(self, "language_manager") and self.language_manager is not None:
|
||||
|
|
|
@ -1,11 +1,13 @@
|
|||
# coding: utf-8
|
||||
|
||||
from typing import Dict, List, Union
|
||||
from typing import Dict, List, Tuple, Union
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
from torch.cuda.amp.autocast_mode import autocast
|
||||
from trainer.trainer_utils import get_optimizer, get_scheduler
|
||||
|
||||
from TTS.tts.layers.tacotron.capacitron_layers import CapacitronVAE
|
||||
from TTS.tts.layers.tacotron.gst_layers import GST
|
||||
from TTS.tts.layers.tacotron.tacotron import Decoder, Encoder, PostCBHG
|
||||
from TTS.tts.models.base_tacotron import BaseTacotron
|
||||
|
@ -13,6 +15,7 @@ from TTS.tts.utils.measures import alignment_diagonal_score
|
|||
from TTS.tts.utils.speakers import SpeakerManager
|
||||
from TTS.tts.utils.text.tokenizer import TTSTokenizer
|
||||
from TTS.tts.utils.visual import plot_alignment, plot_spectrogram
|
||||
from TTS.utils.capacitron_optimizer import CapacitronOptimizer
|
||||
|
||||
|
||||
class Tacotron(BaseTacotron):
|
||||
|
@ -51,6 +54,9 @@ class Tacotron(BaseTacotron):
|
|||
if self.use_gst:
|
||||
self.decoder_in_features += self.gst.gst_embedding_dim
|
||||
|
||||
if self.use_capacitron_vae:
|
||||
self.decoder_in_features += self.capacitron_vae.capacitron_VAE_embedding_dim
|
||||
|
||||
# embedding layer
|
||||
self.embedding = nn.Embedding(self.num_chars, 256, padding_idx=0)
|
||||
self.embedding.weight.data.normal_(0, 0.3)
|
||||
|
@ -90,6 +96,20 @@ class Tacotron(BaseTacotron):
|
|||
gst_embedding_dim=self.gst.gst_embedding_dim,
|
||||
)
|
||||
|
||||
# Capacitron layers
|
||||
if self.capacitron_vae and self.use_capacitron_vae:
|
||||
self.capacitron_vae_layer = CapacitronVAE(
|
||||
num_mel=self.decoder_output_dim,
|
||||
encoder_output_dim=self.encoder_in_features,
|
||||
capacitron_VAE_embedding_dim=self.capacitron_vae.capacitron_VAE_embedding_dim,
|
||||
speaker_embedding_dim=self.embedded_speaker_dim
|
||||
if self.use_speaker_embedding and self.capacitron_vae.capacitron_use_speaker_embedding
|
||||
else None,
|
||||
text_summary_embedding_dim=self.capacitron_vae.capacitron_text_summary_embedding_dim
|
||||
if self.capacitron_vae.capacitron_use_text_summary_embeddings
|
||||
else None,
|
||||
)
|
||||
|
||||
# backward pass decoder
|
||||
if self.bidirectional_decoder:
|
||||
self._init_backward_decoder()
|
||||
|
@ -146,6 +166,19 @@ class Tacotron(BaseTacotron):
|
|||
# B x 1 x speaker_embed_dim
|
||||
embedded_speakers = torch.unsqueeze(aux_input["d_vectors"], 1)
|
||||
encoder_outputs = self._concat_speaker_embedding(encoder_outputs, embedded_speakers)
|
||||
# Capacitron
|
||||
if self.capacitron_vae and self.use_capacitron_vae:
|
||||
# B x capacitron_VAE_embedding_dim
|
||||
encoder_outputs, *capacitron_vae_outputs = self.compute_capacitron_VAE_embedding(
|
||||
encoder_outputs,
|
||||
reference_mel_info=[mel_specs, mel_lengths],
|
||||
text_info=[inputs, text_lengths]
|
||||
if self.capacitron_vae.capacitron_use_text_summary_embeddings
|
||||
else None,
|
||||
speaker_embedding=embedded_speakers if self.capacitron_vae.capacitron_use_speaker_embedding else None,
|
||||
)
|
||||
else:
|
||||
capacitron_vae_outputs = None
|
||||
# decoder_outputs: B x decoder_in_features x T_out
|
||||
# alignments: B x T_in x encoder_in_features
|
||||
# stop_tokens: B x T_in
|
||||
|
@ -178,6 +211,7 @@ class Tacotron(BaseTacotron):
|
|||
"decoder_outputs": decoder_outputs,
|
||||
"alignments": alignments,
|
||||
"stop_tokens": stop_tokens,
|
||||
"capacitron_vae_outputs": capacitron_vae_outputs,
|
||||
}
|
||||
)
|
||||
return outputs
|
||||
|
@ -190,6 +224,28 @@ class Tacotron(BaseTacotron):
|
|||
if self.gst and self.use_gst:
|
||||
# B x gst_dim
|
||||
encoder_outputs = self.compute_gst(encoder_outputs, aux_input["style_mel"], aux_input["d_vectors"])
|
||||
if self.capacitron_vae and self.use_capacitron_vae:
|
||||
if aux_input["style_text"] is not None:
|
||||
style_text_embedding = self.embedding(aux_input["style_text"])
|
||||
style_text_length = torch.tensor([style_text_embedding.size(1)], dtype=torch.int64).to(
|
||||
encoder_outputs.device
|
||||
) # pylint: disable=not-callable
|
||||
reference_mel_length = (
|
||||
torch.tensor([aux_input["style_mel"].size(1)], dtype=torch.int64).to(encoder_outputs.device)
|
||||
if aux_input["style_mel"] is not None
|
||||
else None
|
||||
) # pylint: disable=not-callable
|
||||
# B x capacitron_VAE_embedding_dim
|
||||
encoder_outputs, *_ = self.compute_capacitron_VAE_embedding(
|
||||
encoder_outputs,
|
||||
reference_mel_info=[aux_input["style_mel"], reference_mel_length]
|
||||
if aux_input["style_mel"] is not None
|
||||
else None,
|
||||
text_info=[style_text_embedding, style_text_length] if aux_input["style_text"] is not None else None,
|
||||
speaker_embedding=aux_input["d_vectors"]
|
||||
if self.capacitron_vae.capacitron_use_speaker_embedding
|
||||
else None,
|
||||
)
|
||||
if self.num_speakers > 1:
|
||||
if not self.use_d_vector_file:
|
||||
# B x 1 x speaker_embed_dim
|
||||
|
@ -215,12 +271,19 @@ class Tacotron(BaseTacotron):
|
|||
}
|
||||
return outputs
|
||||
|
||||
def train_step(self, batch, criterion):
|
||||
"""Perform a single training step by fetching the right set if samples from the batch.
|
||||
def before_backward_pass(self, loss_dict, optimizer) -> None:
|
||||
# Extracting custom training specific operations for capacitron
|
||||
# from the trainer
|
||||
if self.use_capacitron_vae:
|
||||
loss_dict["capacitron_vae_beta_loss"].backward()
|
||||
optimizer.first_step()
|
||||
|
||||
def train_step(self, batch: Dict, criterion: torch.nn.Module) -> Tuple[Dict, Dict]:
|
||||
"""Perform a single training step by fetching the right set of samples from the batch.
|
||||
|
||||
Args:
|
||||
batch ([type]): [description]
|
||||
criterion ([type]): [description]
|
||||
batch ([Dict]): A dictionary of input tensors.
|
||||
criterion ([torch.nn.Module]): Callable criterion to compute model loss.
|
||||
"""
|
||||
text_input = batch["text_input"]
|
||||
text_lengths = batch["text_lengths"]
|
||||
|
@ -232,14 +295,8 @@ class Tacotron(BaseTacotron):
|
|||
speaker_ids = batch["speaker_ids"]
|
||||
d_vectors = batch["d_vectors"]
|
||||
|
||||
# forward pass model
|
||||
outputs = self.forward(
|
||||
text_input,
|
||||
text_lengths,
|
||||
mel_input,
|
||||
mel_lengths,
|
||||
aux_input={"speaker_ids": speaker_ids, "d_vectors": d_vectors},
|
||||
)
|
||||
aux_input = {"speaker_ids": speaker_ids, "d_vectors": d_vectors}
|
||||
outputs = self.forward(text_input, text_lengths, mel_input, mel_lengths, aux_input)
|
||||
|
||||
# set the [alignment] lengths wrt reduction factor for guided attention
|
||||
if mel_lengths.max() % self.decoder.r != 0:
|
||||
|
@ -249,9 +306,6 @@ class Tacotron(BaseTacotron):
|
|||
else:
|
||||
alignment_lengths = mel_lengths // self.decoder.r
|
||||
|
||||
aux_input = {"speaker_ids": speaker_ids, "d_vectors": d_vectors}
|
||||
outputs = self.forward(text_input, text_lengths, mel_input, mel_lengths, aux_input)
|
||||
|
||||
# compute loss
|
||||
with autocast(enabled=False): # use float32 for the criterion
|
||||
loss_dict = criterion(
|
||||
|
@ -262,6 +316,7 @@ class Tacotron(BaseTacotron):
|
|||
outputs["stop_tokens"].float(),
|
||||
stop_targets.float(),
|
||||
stop_target_lengths,
|
||||
outputs["capacitron_vae_outputs"] if self.capacitron_vae else None,
|
||||
mel_lengths,
|
||||
None if outputs["decoder_outputs_backward"] is None else outputs["decoder_outputs_backward"].float(),
|
||||
outputs["alignments"].float(),
|
||||
|
@ -275,6 +330,25 @@ class Tacotron(BaseTacotron):
|
|||
loss_dict["align_error"] = align_error
|
||||
return outputs, loss_dict
|
||||
|
||||
def get_optimizer(self) -> List:
|
||||
if self.use_capacitron_vae:
|
||||
return CapacitronOptimizer(self.config, self.named_parameters())
|
||||
return get_optimizer(self.config.optimizer, self.config.optimizer_params, self.config.lr, self)
|
||||
|
||||
def get_scheduler(self, optimizer: object):
|
||||
opt = optimizer.primary_optimizer if self.use_capacitron_vae else optimizer
|
||||
return get_scheduler(self.config.lr_scheduler, self.config.lr_scheduler_params, opt)
|
||||
|
||||
def before_gradient_clipping(self):
|
||||
if self.use_capacitron_vae:
|
||||
# Capacitron model specific gradient clipping
|
||||
model_params_to_clip = []
|
||||
for name, param in self.named_parameters():
|
||||
if param.requires_grad:
|
||||
if name != "capacitron_vae_layer.beta":
|
||||
model_params_to_clip.append(param)
|
||||
torch.nn.utils.clip_grad_norm_(model_params_to_clip, self.capacitron_vae.capacitron_grad_clip)
|
||||
|
||||
def _create_logs(self, batch, outputs, ap):
|
||||
postnet_outputs = outputs["model_outputs"]
|
||||
decoder_outputs = outputs["decoder_outputs"]
|
||||
|
|
|
@ -5,7 +5,9 @@ from typing import Dict, List, Union
|
|||
import torch
|
||||
from torch import nn
|
||||
from torch.cuda.amp.autocast_mode import autocast
|
||||
from trainer.trainer_utils import get_optimizer, get_scheduler
|
||||
|
||||
from TTS.tts.layers.tacotron.capacitron_layers import CapacitronVAE
|
||||
from TTS.tts.layers.tacotron.gst_layers import GST
|
||||
from TTS.tts.layers.tacotron.tacotron2 import Decoder, Encoder, Postnet
|
||||
from TTS.tts.models.base_tacotron import BaseTacotron
|
||||
|
@ -13,6 +15,7 @@ from TTS.tts.utils.measures import alignment_diagonal_score
|
|||
from TTS.tts.utils.speakers import SpeakerManager
|
||||
from TTS.tts.utils.text.tokenizer import TTSTokenizer
|
||||
from TTS.tts.utils.visual import plot_alignment, plot_spectrogram
|
||||
from TTS.utils.capacitron_optimizer import CapacitronOptimizer
|
||||
|
||||
|
||||
class Tacotron2(BaseTacotron):
|
||||
|
@ -65,6 +68,9 @@ class Tacotron2(BaseTacotron):
|
|||
if self.use_gst:
|
||||
self.decoder_in_features += self.gst.gst_embedding_dim
|
||||
|
||||
if self.use_capacitron_vae:
|
||||
self.decoder_in_features += self.capacitron_vae.capacitron_VAE_embedding_dim
|
||||
|
||||
# embedding layer
|
||||
self.embedding = nn.Embedding(self.num_chars, 512, padding_idx=0)
|
||||
|
||||
|
@ -102,6 +108,20 @@ class Tacotron2(BaseTacotron):
|
|||
gst_embedding_dim=self.gst.gst_embedding_dim,
|
||||
)
|
||||
|
||||
# Capacitron VAE Layers
|
||||
if self.capacitron_vae and self.use_capacitron_vae:
|
||||
self.capacitron_vae_layer = CapacitronVAE(
|
||||
num_mel=self.decoder_output_dim,
|
||||
encoder_output_dim=self.encoder_in_features,
|
||||
capacitron_VAE_embedding_dim=self.capacitron_vae.capacitron_VAE_embedding_dim,
|
||||
speaker_embedding_dim=self.embedded_speaker_dim
|
||||
if self.capacitron_vae.capacitron_use_speaker_embedding
|
||||
else None,
|
||||
text_summary_embedding_dim=self.capacitron_vae.capacitron_text_summary_embedding_dim
|
||||
if self.capacitron_vae.capacitron_use_text_summary_embeddings
|
||||
else None,
|
||||
)
|
||||
|
||||
# backward pass decoder
|
||||
if self.bidirectional_decoder:
|
||||
self._init_backward_decoder()
|
||||
|
@ -166,6 +186,20 @@ class Tacotron2(BaseTacotron):
|
|||
embedded_speakers = torch.unsqueeze(aux_input["d_vectors"], 1)
|
||||
encoder_outputs = self._concat_speaker_embedding(encoder_outputs, embedded_speakers)
|
||||
|
||||
# capacitron
|
||||
if self.capacitron_vae and self.use_capacitron_vae:
|
||||
# B x capacitron_VAE_embedding_dim
|
||||
encoder_outputs, *capacitron_vae_outputs = self.compute_capacitron_VAE_embedding(
|
||||
encoder_outputs,
|
||||
reference_mel_info=[mel_specs, mel_lengths],
|
||||
text_info=[embedded_inputs.transpose(1, 2), text_lengths]
|
||||
if self.capacitron_vae.capacitron_use_text_summary_embeddings
|
||||
else None,
|
||||
speaker_embedding=embedded_speakers if self.capacitron_vae.capacitron_use_speaker_embedding else None,
|
||||
)
|
||||
else:
|
||||
capacitron_vae_outputs = None
|
||||
|
||||
encoder_outputs = encoder_outputs * input_mask.unsqueeze(2).expand_as(encoder_outputs)
|
||||
|
||||
# B x mel_dim x T_out -- B x T_out//r x T_in -- B x T_out//r
|
||||
|
@ -197,6 +231,7 @@ class Tacotron2(BaseTacotron):
|
|||
"decoder_outputs": decoder_outputs,
|
||||
"alignments": alignments,
|
||||
"stop_tokens": stop_tokens,
|
||||
"capacitron_vae_outputs": capacitron_vae_outputs,
|
||||
}
|
||||
)
|
||||
return outputs
|
||||
|
@ -217,6 +252,29 @@ class Tacotron2(BaseTacotron):
|
|||
# B x gst_dim
|
||||
encoder_outputs = self.compute_gst(encoder_outputs, aux_input["style_mel"], aux_input["d_vectors"])
|
||||
|
||||
if self.capacitron_vae and self.use_capacitron_vae:
|
||||
if aux_input["style_text"] is not None:
|
||||
style_text_embedding = self.embedding(aux_input["style_text"])
|
||||
style_text_length = torch.tensor([style_text_embedding.size(1)], dtype=torch.int64).to(
|
||||
encoder_outputs.device
|
||||
) # pylint: disable=not-callable
|
||||
reference_mel_length = (
|
||||
torch.tensor([aux_input["style_mel"].size(1)], dtype=torch.int64).to(encoder_outputs.device)
|
||||
if aux_input["style_mel"] is not None
|
||||
else None
|
||||
) # pylint: disable=not-callable
|
||||
# B x capacitron_VAE_embedding_dim
|
||||
encoder_outputs, *_ = self.compute_capacitron_VAE_embedding(
|
||||
encoder_outputs,
|
||||
reference_mel_info=[aux_input["style_mel"], reference_mel_length]
|
||||
if aux_input["style_mel"] is not None
|
||||
else None,
|
||||
text_info=[style_text_embedding, style_text_length] if aux_input["style_text"] is not None else None,
|
||||
speaker_embedding=aux_input["d_vectors"]
|
||||
if self.capacitron_vae.capacitron_use_speaker_embedding
|
||||
else None,
|
||||
)
|
||||
|
||||
if self.num_speakers > 1:
|
||||
if not self.use_d_vector_file:
|
||||
embedded_speakers = self.speaker_embedding(aux_input["speaker_ids"])[None]
|
||||
|
@ -242,6 +300,13 @@ class Tacotron2(BaseTacotron):
|
|||
}
|
||||
return outputs
|
||||
|
||||
def before_backward_pass(self, loss_dict, optimizer) -> None:
|
||||
# Extracting custom training specific operations for capacitron
|
||||
# from the trainer
|
||||
if self.use_capacitron_vae:
|
||||
loss_dict["capacitron_vae_beta_loss"].backward()
|
||||
optimizer.first_step()
|
||||
|
||||
def train_step(self, batch: Dict, criterion: torch.nn.Module):
|
||||
"""A single training step. Forward pass and loss computation.
|
||||
|
||||
|
@ -258,14 +323,8 @@ class Tacotron2(BaseTacotron):
|
|||
speaker_ids = batch["speaker_ids"]
|
||||
d_vectors = batch["d_vectors"]
|
||||
|
||||
# forward pass model
|
||||
outputs = self.forward(
|
||||
text_input,
|
||||
text_lengths,
|
||||
mel_input,
|
||||
mel_lengths,
|
||||
aux_input={"speaker_ids": speaker_ids, "d_vectors": d_vectors},
|
||||
)
|
||||
aux_input = {"speaker_ids": speaker_ids, "d_vectors": d_vectors}
|
||||
outputs = self.forward(text_input, text_lengths, mel_input, mel_lengths, aux_input)
|
||||
|
||||
# set the [alignment] lengths wrt reduction factor for guided attention
|
||||
if mel_lengths.max() % self.decoder.r != 0:
|
||||
|
@ -275,9 +334,6 @@ class Tacotron2(BaseTacotron):
|
|||
else:
|
||||
alignment_lengths = mel_lengths // self.decoder.r
|
||||
|
||||
aux_input = {"speaker_ids": speaker_ids, "d_vectors": d_vectors}
|
||||
outputs = self.forward(text_input, text_lengths, mel_input, mel_lengths, aux_input)
|
||||
|
||||
# compute loss
|
||||
with autocast(enabled=False): # use float32 for the criterion
|
||||
loss_dict = criterion(
|
||||
|
@ -288,6 +344,7 @@ class Tacotron2(BaseTacotron):
|
|||
outputs["stop_tokens"].float(),
|
||||
stop_targets.float(),
|
||||
stop_target_lengths,
|
||||
outputs["capacitron_vae_outputs"] if self.capacitron_vae else None,
|
||||
mel_lengths,
|
||||
None if outputs["decoder_outputs_backward"] is None else outputs["decoder_outputs_backward"].float(),
|
||||
outputs["alignments"].float(),
|
||||
|
@ -301,6 +358,25 @@ class Tacotron2(BaseTacotron):
|
|||
loss_dict["align_error"] = align_error
|
||||
return outputs, loss_dict
|
||||
|
||||
def get_optimizer(self) -> List:
|
||||
if self.use_capacitron_vae:
|
||||
return CapacitronOptimizer(self.config, self.named_parameters())
|
||||
return get_optimizer(self.config.optimizer, self.config.optimizer_params, self.config.lr, self)
|
||||
|
||||
def get_scheduler(self, optimizer: object):
|
||||
opt = optimizer.primary_optimizer if self.use_capacitron_vae else optimizer
|
||||
return get_scheduler(self.config.lr_scheduler, self.config.lr_scheduler_params, opt)
|
||||
|
||||
def before_gradient_clipping(self):
|
||||
if self.use_capacitron_vae:
|
||||
# Capacitron model specific gradient clipping
|
||||
model_params_to_clip = []
|
||||
for name, param in self.named_parameters():
|
||||
if param.requires_grad:
|
||||
if name != "capacitron_vae_layer.beta":
|
||||
model_params_to_clip.append(param)
|
||||
torch.nn.utils.clip_grad_norm_(model_params_to_clip, self.capacitron_vae.capacitron_grad_clip)
|
||||
|
||||
def _create_logs(self, batch, outputs, ap):
|
||||
"""Create dashboard log information."""
|
||||
postnet_outputs = outputs["model_outputs"]
|
||||
|
|
|
@ -41,6 +41,23 @@ hann_window = {}
|
|||
mel_basis = {}
|
||||
|
||||
|
||||
@torch.no_grad()
|
||||
def weights_reset(m: nn.Module):
|
||||
# check if the current module has reset_parameters and, if so, reset its weights
|
||||
reset_parameters = getattr(m, "reset_parameters", None)
|
||||
if callable(reset_parameters):
|
||||
m.reset_parameters()
|
||||
|
||||
|
||||
def get_module_weights_sum(mdl: nn.Module):
|
||||
dict_sums = {}
|
||||
for name, w in mdl.named_parameters():
|
||||
if "weight" in name:
|
||||
value = w.data.sum().item()
|
||||
dict_sums[name] = value
|
||||
return dict_sums
|
||||
|
||||
|
||||
def load_audio(file_path):
|
||||
"""Load the audio file normalized in [-1, 1]
|
||||
|
||||
|
@ -189,15 +206,20 @@ def wav_to_mel(y, n_fft, num_mels, sample_rate, hop_length, win_length, fmin, fm
|
|||
|
||||
|
||||
class VitsDataset(TTSDataset):
|
||||
def __init__(self, *args, **kwargs):
|
||||
def __init__(self, model_args, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self.pad_id = self.tokenizer.characters.pad_id
|
||||
self.model_args = model_args
|
||||
|
||||
def __getitem__(self, idx):
|
||||
item = self.samples[idx]
|
||||
raw_text = item["text"]
|
||||
|
||||
wav, _ = load_audio(item["audio_file"])
|
||||
if self.model_args.encoder_sample_rate is not None:
|
||||
if wav.size(1) % self.model_args.encoder_sample_rate != 0:
|
||||
wav = wav[:, : -int(wav.size(1) % self.model_args.encoder_sample_rate)]
|
||||
|
||||
wav_filename = os.path.basename(item["audio_file"])
|
||||
|
||||
token_ids = self.get_token_ids(idx, item["text"])
|
||||
|
@ -362,6 +384,9 @@ class VitsArgs(Coqpit):
|
|||
upsample_kernel_sizes_decoder (List[int]):
|
||||
Kernel sizes for each upsampling layer of the decoder network. Defaults to `[16, 16, 4, 4]`.
|
||||
|
||||
periods_multi_period_discriminator (List[int]):
|
||||
Periods values for Vits Multi-Period Discriminator. Defaults to `[2, 3, 5, 7, 11]`.
|
||||
|
||||
use_sdp (bool):
|
||||
Use Stochastic Duration Predictor. Defaults to True.
|
||||
|
||||
|
@ -451,6 +476,18 @@ class VitsArgs(Coqpit):
|
|||
|
||||
freeze_waveform_decoder (bool):
|
||||
Freeze the waveform decoder weights during training. Defaults to False.
|
||||
|
||||
encoder_sample_rate (int):
|
||||
If not None, this sample rate will be used for training the Posterior Encoder,
|
||||
flow, text_encoder and duration predictor. The decoder part (vocoder) will be
|
||||
trained with the `config.audio.sample_rate`. Defaults to None.
|
||||
|
||||
interpolate_z (bool):
|
||||
If `encoder_sample_rate` is not None and this parameter is True, nearest interpolation
|
||||
will be used to upsample the latent variable z from the sampling rate `encoder_sample_rate`
|
||||
to `config.audio.sample_rate`. If it is False, you will need to add extra
|
||||
`upsample_rates_decoder` to match the shape. Defaults to True.
|
||||
|
||||
"""
|
||||
|
||||
num_chars: int = 100
|
||||
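A small numeric sketch of the `encoder_sample_rate` / `interpolate_z` bookkeeping documented above; the sample-rate values are assumptions chosen only for illustration.

```python
# Hypothetical config values, chosen only to illustrate the factor involved.
audio_sample_rate = 44100      # config.audio.sample_rate (assumed)
encoder_sample_rate = 22050    # model_args.encoder_sample_rate (assumed)

# Factor by which the latent z (and the spec frame count) differs between the two rates.
interpolate_factor = audio_sample_rate / encoder_sample_rate   # 2.0

# interpolate_z=True: z is interpolated by this factor before the waveform decoder.
# interpolate_z=False: upsample_rates_decoder must be extended to cover the same factor instead.
```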
|
@ -475,6 +512,7 @@ class VitsArgs(Coqpit):
|
|||
upsample_rates_decoder: List[int] = field(default_factory=lambda: [8, 8, 2, 2])
|
||||
upsample_initial_channel_decoder: int = 512
|
||||
upsample_kernel_sizes_decoder: List[int] = field(default_factory=lambda: [16, 16, 4, 4])
|
||||
periods_multi_period_discriminator: List[int] = field(default_factory=lambda: [2, 3, 5, 7, 11])
|
||||
use_sdp: bool = True
|
||||
noise_scale: float = 1.0
|
||||
inference_noise_scale: float = 0.667
|
||||
|
@ -505,6 +543,10 @@ class VitsArgs(Coqpit):
|
|||
freeze_PE: bool = False
|
||||
freeze_flow_decoder: bool = False
|
||||
freeze_waveform_decoder: bool = False
|
||||
encoder_sample_rate: int = None
|
||||
interpolate_z: bool = True
|
||||
reinit_DP: bool = False
|
||||
reinit_text_encoder: bool = False
|
||||
|
||||
|
||||
class Vits(BaseTTS):
|
||||
|
@ -548,6 +590,7 @@ class Vits(BaseTTS):
|
|||
|
||||
self.init_multispeaker(config)
|
||||
self.init_multilingual(config)
|
||||
self.init_upsampling()
|
||||
|
||||
self.length_scale = self.args.length_scale
|
||||
self.noise_scale = self.args.noise_scale
|
||||
|
@ -625,7 +668,10 @@ class Vits(BaseTTS):
|
|||
)
|
||||
|
||||
if self.args.init_discriminator:
|
||||
self.disc = VitsDiscriminator(use_spectral_norm=self.args.use_spectral_norm_disriminator)
|
||||
self.disc = VitsDiscriminator(
|
||||
periods=self.args.periods_multi_period_discriminator,
|
||||
use_spectral_norm=self.args.use_spectral_norm_disriminator,
|
||||
)
|
||||
|
||||
def init_multispeaker(self, config: Coqpit):
|
||||
"""Initialize multi-speaker modules of a model. A model can be trained either with a speaker embedding layer
|
||||
|
@ -707,6 +753,38 @@ class Vits(BaseTTS):
|
|||
else:
|
||||
self.embedded_language_dim = 0
|
||||
|
||||
def init_upsampling(self):
|
||||
"""
|
||||
Initialize upsampling modules of a model.
|
||||
"""
|
||||
if self.args.encoder_sample_rate:
|
||||
self.interpolate_factor = self.config.audio["sample_rate"] / self.args.encoder_sample_rate
|
||||
self.audio_resampler = torchaudio.transforms.Resample(
|
||||
orig_freq=self.config.audio["sample_rate"], new_freq=self.args.encoder_sample_rate
|
||||
) # pylint: disable=W0201
|
||||
|
||||
def on_init_end(self, trainer): # pylint: disable=W0613
|
||||
"""Reinit layes if needed"""
|
||||
if self.args.reinit_DP:
|
||||
before_dict = get_module_weights_sum(self.duration_predictor)
|
||||
# Applies weights_reset recursively to every submodule of the duration predictor
|
||||
self.duration_predictor.apply(fn=weights_reset)
|
||||
after_dict = get_module_weights_sum(self.duration_predictor)
|
||||
for key, value in after_dict.items():
|
||||
if value == before_dict[key]:
|
||||
raise RuntimeError(" [!] The weights of Duration Predictor was not reinit check it !")
|
||||
print(" > Duration Predictor was reinit.")
|
||||
|
||||
if self.args.reinit_text_encoder:
|
||||
before_dict = get_module_weights_sum(self.text_encoder)
|
||||
# Applies weights_reset recursively to every submodule of the text encoder
|
||||
self.text_encoder.apply(fn=weights_reset)
|
||||
after_dict = get_module_weights_sum(self.text_encoder)
|
||||
for key, value in after_dict.items():
|
||||
if value == before_dict[key]:
|
||||
raise RuntimeError(" [!] The weights of Text Encoder was not reinit check it !")
|
||||
print(" > Text Encoder was reinit.")
|
||||
|
||||
def get_aux_input(self, aux_input: Dict):
|
||||
sid, g, lid = self._set_cond_input(aux_input)
|
||||
return {"speaker_ids": sid, "style_wav": None, "d_vectors": g, "language_ids": lid}
|
||||
|
@ -804,6 +882,23 @@ class Vits(BaseTTS):
|
|||
outputs["loss_duration"] = loss_duration
|
||||
return outputs, attn
|
||||
|
||||
def upsampling_z(self, z, slice_ids=None, y_lengths=None, y_mask=None):
|
||||
spec_segment_size = self.spec_segment_size
|
||||
if self.args.encoder_sample_rate:
|
||||
# recompute the slices and spec_segment_size if needed
|
||||
slice_ids = slice_ids * int(self.interpolate_factor) if slice_ids is not None else slice_ids
|
||||
spec_segment_size = spec_segment_size * int(self.interpolate_factor)
|
||||
# interpolate z if needed
|
||||
if self.args.interpolate_z:
|
||||
z = torch.nn.functional.interpolate(z, scale_factor=[self.interpolate_factor], mode="linear").squeeze(0)
|
||||
# recompute the mask if needed
|
||||
if y_lengths is not None and y_mask is not None:
|
||||
y_mask = (
|
||||
sequence_mask(y_lengths * self.interpolate_factor, None).to(y_mask.dtype).unsqueeze(1)
|
||||
) # [B, 1, T_dec_resampled]
|
||||
|
||||
return z, spec_segment_size, slice_ids, y_mask
|
||||
|
||||
def forward( # pylint: disable=dangerous-default-value
|
||||
self,
|
||||
x: torch.tensor,
|
||||
|
@ -878,12 +973,16 @@ class Vits(BaseTTS):
|
|||
|
||||
# select a random feature segment for the waveform decoder
|
||||
z_slice, slice_ids = rand_segments(z, y_lengths, self.spec_segment_size, let_short_samples=True, pad_short=True)
|
||||
|
||||
# interpolate z if needed
|
||||
z_slice, spec_segment_size, slice_ids, _ = self.upsampling_z(z_slice, slice_ids=slice_ids)
|
||||
|
||||
o = self.waveform_decoder(z_slice, g=g)
|
||||
|
||||
wav_seg = segment(
|
||||
waveform,
|
||||
slice_ids * self.config.audio.hop_length,
|
||||
self.args.spec_segment_size * self.config.audio.hop_length,
|
||||
spec_segment_size * self.config.audio.hop_length,
|
||||
pad_short=True,
|
||||
)
|
||||
|
||||
|
@ -927,6 +1026,7 @@ class Vits(BaseTTS):
|
|||
return aux_input["x_lengths"]
|
||||
return torch.tensor(x.shape[1:2]).to(x.device)
|
||||
|
||||
@torch.no_grad()
|
||||
def inference(
|
||||
self, x, aux_input={"x_lengths": None, "d_vectors": None, "speaker_ids": None, "language_ids": None}
|
||||
): # pylint: disable=dangerous-default-value
|
||||
|
@ -989,9 +1089,22 @@ class Vits(BaseTTS):
|
|||
|
||||
z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * self.inference_noise_scale
|
||||
z = self.flow(z_p, y_mask, g=g, reverse=True)
|
||||
|
||||
# upsampling if needed
|
||||
z, _, _, y_mask = self.upsampling_z(z, y_lengths=y_lengths, y_mask=y_mask)
|
||||
|
||||
o = self.waveform_decoder((z * y_mask)[:, :, : self.max_inference_len], g=g)
|
||||
|
||||
outputs = {"model_outputs": o, "alignments": attn.squeeze(1), "z": z, "z_p": z_p, "m_p": m_p, "logs_p": logs_p}
|
||||
outputs = {
|
||||
"model_outputs": o,
|
||||
"alignments": attn.squeeze(1),
|
||||
"durations": w_ceil,
|
||||
"z": z,
|
||||
"z_p": z_p,
|
||||
"m_p": m_p,
|
||||
"logs_p": logs_p,
|
||||
"y_mask": y_mask,
|
||||
}
|
||||
return outputs
|
||||
|
||||
@torch.no_grad()
|
||||
|
@ -1014,7 +1127,7 @@ class Vits(BaseTTS):
|
|||
self.config.audio.hop_length,
|
||||
self.config.audio.win_length,
|
||||
center=False,
|
||||
).transpose(1, 2)
|
||||
)
|
||||
y_lengths = torch.tensor([y.size(-1)]).to(y.device)
|
||||
speaker_cond_src = reference_speaker_id if reference_speaker_id is not None else reference_d_vector
|
||||
speaker_cond_tgt = speaker_id if speaker_id is not None else d_vector
|
||||
|
@ -1044,7 +1157,7 @@ class Vits(BaseTTS):
|
|||
else:
|
||||
raise RuntimeError(" [!] Voice conversion is only supported on multi-speaker models.")
|
||||
|
||||
z, _, _, y_mask = self.posterior_encoder(y.transpose(1, 2), y_lengths, g=g_src)
|
||||
z, _, _, y_mask = self.posterior_encoder(y, y_lengths, g=g_src)
|
||||
z_p = self.flow(z, y_mask, g=g_src)
|
||||
z_hat = self.flow(z_p, y_mask, g=g_tgt, reverse=True)
|
||||
o_hat = self.waveform_decoder(z_hat * y_mask, g=g_tgt)
|
||||
|
@ -1064,13 +1177,12 @@ class Vits(BaseTTS):
|
|||
|
||||
self._freeze_layers()
|
||||
|
||||
mel_lens = batch["mel_lens"]
|
||||
spec_lens = batch["spec_lens"]
|
||||
|
||||
if optimizer_idx == 0:
|
||||
tokens = batch["tokens"]
|
||||
token_lenghts = batch["token_lens"]
|
||||
spec = batch["spec"]
|
||||
spec_lens = batch["spec_lens"]
|
||||
|
||||
d_vectors = batch["d_vectors"]
|
||||
speaker_ids = batch["speaker_ids"]
|
||||
|
@ -1108,8 +1220,14 @@ class Vits(BaseTTS):
|
|||
|
||||
# compute melspec segment
|
||||
with autocast(enabled=False):
|
||||
|
||||
if self.args.encoder_sample_rate:
|
||||
spec_segment_size = self.spec_segment_size * int(self.interpolate_factor)
|
||||
else:
|
||||
spec_segment_size = self.spec_segment_size
|
||||
|
||||
mel_slice = segment(
|
||||
mel.float(), self.model_outputs_cache["slice_ids"], self.spec_segment_size, pad_short=True
|
||||
mel.float(), self.model_outputs_cache["slice_ids"], spec_segment_size, pad_short=True
|
||||
)
|
||||
mel_slice_hat = wav_to_mel(
|
||||
y=self.model_outputs_cache["model_outputs"].float(),
|
||||
|
@ -1137,7 +1255,7 @@ class Vits(BaseTTS):
|
|||
logs_q=self.model_outputs_cache["logs_q"].float(),
|
||||
m_p=self.model_outputs_cache["m_p"].float(),
|
||||
logs_p=self.model_outputs_cache["logs_p"].float(),
|
||||
z_len=mel_lens,
|
||||
z_len=spec_lens,
|
||||
scores_disc_fake=scores_disc_fake,
|
||||
feats_disc_fake=feats_disc_fake,
|
||||
feats_disc_real=feats_disc_real,
|
||||
|
@ -1318,21 +1436,48 @@ class Vits(BaseTTS):
|
|||
"""Compute spectrograms on the device."""
|
||||
ac = self.config.audio
|
||||
|
||||
if self.args.encoder_sample_rate:
|
||||
wav = self.audio_resampler(batch["waveform"])
|
||||
else:
|
||||
wav = batch["waveform"]
|
||||
|
||||
# compute spectrograms
|
||||
batch["spec"] = wav_to_spec(batch["waveform"], ac.fft_size, ac.hop_length, ac.win_length, center=False)
|
||||
batch["spec"] = wav_to_spec(wav, ac.fft_size, ac.hop_length, ac.win_length, center=False)
|
||||
|
||||
if self.args.encoder_sample_rate:
|
||||
# recompute the spec at the high sampling rate for the loss
|
||||
spec_mel = wav_to_spec(batch["waveform"], ac.fft_size, ac.hop_length, ac.win_length, center=False)
|
||||
# remove extra stft frames if needed
|
||||
if spec_mel.size(2) > int(batch["spec"].size(2) * self.interpolate_factor):
|
||||
spec_mel = spec_mel[:, :, : int(batch["spec"].size(2) * self.interpolate_factor)]
|
||||
else:
|
||||
batch["spec"] = batch["spec"][:, :, : int(spec_mel.size(2) / self.interpolate_factor)]
|
||||
else:
|
||||
spec_mel = batch["spec"]
|
||||
|
||||
batch["mel"] = spec_to_mel(
|
||||
spec=batch["spec"],
|
||||
spec=spec_mel,
|
||||
n_fft=ac.fft_size,
|
||||
num_mels=ac.num_mels,
|
||||
sample_rate=ac.sample_rate,
|
||||
fmin=ac.mel_fmin,
|
||||
fmax=ac.mel_fmax,
|
||||
)
|
||||
|
||||
if self.args.encoder_sample_rate:
|
||||
assert batch["spec"].shape[2] == int(
|
||||
batch["mel"].shape[2] / self.interpolate_factor
|
||||
), f"{batch['spec'].shape[2]}, {batch['mel'].shape[2]}"
|
||||
else:
|
||||
assert batch["spec"].shape[2] == batch["mel"].shape[2], f"{batch['spec'].shape[2]}, {batch['mel'].shape[2]}"
|
||||
|
||||
# compute spectrogram frame lengths
|
||||
batch["spec_lens"] = (batch["spec"].shape[2] * batch["waveform_rel_lens"]).int()
|
||||
batch["mel_lens"] = (batch["mel"].shape[2] * batch["waveform_rel_lens"]).int()
|
||||
|
||||
if self.args.encoder_sample_rate:
|
||||
assert (batch["spec_lens"] - (batch["mel_lens"] / self.interpolate_factor).int()).sum() == 0
|
||||
else:
|
||||
assert (batch["spec_lens"] - batch["mel_lens"]).sum() == 0
|
||||
|
||||
# zero the padding frames
|
||||
|
@ -1355,8 +1500,9 @@ class Vits(BaseTTS):
|
|||
else:
|
||||
# init dataloader
|
||||
dataset = VitsDataset(
|
||||
model_args=self.args,
|
||||
samples=samples,
|
||||
# batch_group_size=0 if is_eval else config.batch_group_size * config.batch_size,
|
||||
batch_group_size=0 if is_eval else config.batch_group_size * config.batch_size,
|
||||
min_text_len=config.min_text_len,
|
||||
max_text_len=config.max_text_len,
|
||||
min_audio_len=config.min_audio_len,
|
||||
|
@ -1449,6 +1595,11 @@ class Vits(BaseTTS):
|
|||
# TODO: consider baking the speaker encoder into the model and call it from there.
|
||||
# as it is probably easier for model distribution.
|
||||
state["model"] = {k: v for k, v in state["model"].items() if "speaker_encoder" not in k}
|
||||
|
||||
if self.args.encoder_sample_rate is not None and eval:
|
||||
# the audio resampler is not used at inference time
|
||||
self.audio_resampler = None
|
||||
|
||||
# handle fine-tuning from a checkpoint with additional speakers
|
||||
if hasattr(self, "emb_g") and state["model"]["emb_g.weight"].shape != self.emb_g.weight.shape:
|
||||
num_new_speakers = self.emb_g.weight.shape[0] - state["model"]["emb_g.weight"].shape[0]
|
||||
|
@ -1476,9 +1627,17 @@ class Vits(BaseTTS):
|
|||
from TTS.utils.audio import AudioProcessor
|
||||
|
||||
upsample_rate = torch.prod(torch.as_tensor(config.model_args.upsample_rates_decoder)).item()
|
||||
|
||||
if not config.model_args.encoder_sample_rate:
|
||||
assert (
|
||||
upsample_rate == config.audio.hop_length
|
||||
), f" [!] Product of upsample rates must be equal to the hop length - {upsample_rate} vs {config.audio.hop_length}"
|
||||
else:
|
||||
encoder_to_vocoder_upsampling_factor = config.audio.sample_rate / config.model_args.encoder_sample_rate
|
||||
effective_hop_length = config.audio.hop_length * encoder_to_vocoder_upsampling_factor
|
||||
assert (
|
||||
upsample_rate == effective_hop_length
|
||||
), f" [!] Product of upsample rates must be equal to the hop length - {upsample_rate} vs {effective_hop_length}"
|
||||
|
||||
ap = AudioProcessor.init_from_config(config, verbose=verbose)
|
||||
tokenizer, new_config = TTSTokenizer.init_from_config(config)
|
||||
|
|
|
@ -1,4 +1,7 @@
|
|||
import bisect
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
|
||||
def _pad_data(x, length):
|
||||
|
@ -51,3 +54,26 @@ def prepare_stop_target(inputs, out_steps):
|
|||
|
||||
def pad_per_step(inputs, pad_len):
|
||||
return np.pad(inputs, [[0, 0], [0, 0], [0, pad_len]], mode="constant", constant_values=0.0)
|
||||
|
||||
|
||||
def get_length_balancer_weights(items: list, num_buckets=10):
|
||||
# get all durations
|
||||
audio_lengths = np.array([item["audio_length"] for item in items])
|
||||
# create $num_buckets bucket classes based on the dataset max and min lengths
|
||||
max_length = int(max(audio_lengths))
|
||||
min_length = int(min(audio_lengths))
|
||||
step = int((max_length - min_length) / num_buckets) + 1
|
||||
buckets_classes = [i + step for i in range(min_length, (max_length - step) + num_buckets + 1, step)]
|
||||
# add each sample in their respective length bucket
|
||||
buckets_names = np.array(
|
||||
[buckets_classes[bisect.bisect_left(buckets_classes, item["audio_length"])] for item in items]
|
||||
)
|
||||
# count and compute the weights_bucket for each sample
|
||||
unique_buckets_names = np.unique(buckets_names).tolist()
|
||||
bucket_ids = [unique_buckets_names.index(l) for l in buckets_names]
|
||||
bucket_count = np.array([len(np.where(buckets_names == l)[0]) for l in unique_buckets_names])
|
||||
weight_bucket = 1.0 / bucket_count
|
||||
dataset_samples_weight = np.array([weight_bucket[l] for l in bucket_ids])
|
||||
# normalize
|
||||
dataset_samples_weight = dataset_samples_weight / np.linalg.norm(dataset_samples_weight)
|
||||
return torch.from_numpy(dataset_samples_weight).float()
|
||||
|
|
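A hedged usage sketch for `get_length_balancer_weights` above, assuming the module path of this commit and the usual sample dicts carrying an `audio_length` key, combined with PyTorch's `WeightedRandomSampler`.

```python
from torch.utils.data import WeightedRandomSampler

from TTS.tts.utils.data import get_length_balancer_weights

# `samples` stands in for the dataset sample list; the values here are placeholders.
samples = [{"audio_length": 1.2}, {"audio_length": 3.4}, {"audio_length": 9.8}]

weights = get_length_balancer_weights(samples, num_buckets=2)
sampler = WeightedRandomSampler(weights, num_samples=len(weights), replacement=True)
# `sampler` can then be passed to a DataLoader to balance batches across length buckets.
```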
|
@ -11,6 +11,28 @@ from TTS.encoder.utils.generic_utils import setup_encoder_model
|
|||
from TTS.utils.audio import AudioProcessor
|
||||
|
||||
|
||||
def load_file(path: str):
|
||||
if path.endswith(".json"):
|
||||
with fsspec.open(path, "r") as f:
|
||||
return json.load(f)
|
||||
elif path.endswith(".pth"):
|
||||
with fsspec.open(path, "rb") as f:
|
||||
return torch.load(f, map_location="cpu")
|
||||
else:
|
||||
raise ValueError("Unsupported file type")
|
||||
|
||||
|
||||
def save_file(obj: Any, path: str):
|
||||
if path.endswith(".json"):
|
||||
with fsspec.open(path, "w") as f:
|
||||
json.dump(obj, f, indent=4)
|
||||
elif path.endswith(".pth"):
|
||||
with fsspec.open(path, "wb") as f:
|
||||
torch.save(obj, f)
|
||||
else:
|
||||
raise ValueError("Unsupported file type")
|
||||
|
||||
|
||||
class BaseIDManager:
|
||||
"""Base `ID` Manager class. Every new `ID` manager must inherit this.
|
||||
It defines common `ID` manager specific functions.
|
||||
|
@ -46,7 +68,7 @@ class BaseIDManager:
|
|||
Args:
|
||||
file_path (str): Path to the file.
|
||||
"""
|
||||
self.ids = self._load_json(file_path)
|
||||
self.ids = load_file(file_path)
|
||||
|
||||
def save_ids_to_file(self, file_path: str) -> None:
|
||||
"""Save IDs to a json file.
|
||||
|
@ -54,7 +76,7 @@ class BaseIDManager:
|
|||
Args:
|
||||
file_path (str): Path to the output file.
|
||||
"""
|
||||
self._save_json(file_path, self.ids)
|
||||
save_file(self.ids, file_path)
|
||||
|
||||
def get_random_id(self) -> Any:
|
||||
"""Get a random embedding.
|
||||
|
@ -110,7 +132,7 @@ class EmbeddingManager(BaseIDManager):
|
|||
self.load_embeddings_from_file(embedding_file_path)
|
||||
|
||||
if encoder_model_path and encoder_config_path:
|
||||
self.init_encoder(encoder_model_path, encoder_config_path)
|
||||
self.init_encoder(encoder_model_path, encoder_config_path, use_cuda)
|
||||
|
||||
@property
|
||||
def embedding_dim(self):
|
||||
|
@ -125,7 +147,7 @@ class EmbeddingManager(BaseIDManager):
|
|||
Args:
|
||||
file_path (str): Path to the output file.
|
||||
"""
|
||||
self._save_json(file_path, self.embeddings)
|
||||
save_file(self.embeddings, file_path)
|
||||
|
||||
def load_embeddings_from_file(self, file_path: str) -> None:
|
||||
"""Load embeddings from a json file.
|
||||
|
@ -133,7 +155,7 @@ class EmbeddingManager(BaseIDManager):
|
|||
Args:
|
||||
file_path (str): Path to the target json file.
|
||||
"""
|
||||
self.embeddings = self._load_json(file_path)
|
||||
self.embeddings = load_file(file_path)
|
||||
|
||||
speakers = sorted({x["name"] for x in self.embeddings.values()})
|
||||
self.ids = {name: i for i, name in enumerate(speakers)}
|
||||
|
@ -216,17 +238,19 @@ class EmbeddingManager(BaseIDManager):
|
|||
def get_clips(self) -> List:
|
||||
return sorted(self.embeddings.keys())
|
||||
|
||||
def init_encoder(self, model_path: str, config_path: str) -> None:
|
||||
def init_encoder(self, model_path: str, config_path: str, use_cuda=False) -> None:
|
||||
"""Initialize a speaker encoder model.
|
||||
|
||||
Args:
|
||||
model_path (str): Model file path.
|
||||
config_path (str): Model config file path.
|
||||
use_cuda (bool, optional): Use CUDA. Defaults to False.
|
||||
"""
|
||||
self.use_cuda = use_cuda
|
||||
self.encoder_config = load_config(config_path)
|
||||
self.encoder = setup_encoder_model(self.encoder_config)
|
||||
self.encoder_criterion = self.encoder.load_checkpoint(
|
||||
self.encoder_config, model_path, eval=True, use_cuda=self.use_cuda
|
||||
self.encoder_config, model_path, eval=True, use_cuda=use_cuda
|
||||
)
|
||||
self.encoder_ap = AudioProcessor(**self.encoder_config.audio)
|
||||
|
||||
|
|
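A minimal round-trip sketch for the `save_file` / `load_file` helpers added above; the file name and embedding payload are hypothetical.

```python
from TTS.tts.utils.managers import load_file, save_file  # module path as in this commit

# A .pth extension routes through torch.save / torch.load; .json goes through the json module.
embeddings = {"clip_0001.wav": {"name": "speaker_a", "embedding": [0.1, 0.2, 0.3]}}
save_file(embeddings, "speakers.pth")

restored = load_file("speakers.pth")  # loaded with map_location="cpu"
assert restored.keys() == embeddings.keys()
```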
|
@ -108,6 +108,7 @@ class SpeakerManager(EmbeddingManager):
|
|||
)
|
||||
|
||||
if get_from_config_or_model_args_with_default(config, "use_d_vector_file", False):
|
||||
speaker_manager = SpeakerManager()
|
||||
if get_from_config_or_model_args_with_default(config, "speakers_file", None):
|
||||
speaker_manager = SpeakerManager(
|
||||
d_vectors_file_path=get_from_config_or_model_args_with_default(config, "speaker_file", None)
|
||||
|
|
|
@ -26,6 +26,7 @@ def run_model_torch(
|
|||
inputs: torch.Tensor,
|
||||
speaker_id: int = None,
|
||||
style_mel: torch.Tensor = None,
|
||||
style_text: str = None,
|
||||
d_vector: torch.Tensor = None,
|
||||
language_id: torch.Tensor = None,
|
||||
) -> Dict:
|
||||
|
@ -53,6 +54,7 @@ def run_model_torch(
|
|||
"speaker_ids": speaker_id,
|
||||
"d_vectors": d_vector,
|
||||
"style_mel": style_mel,
|
||||
"style_text": style_text,
|
||||
"language_ids": language_id,
|
||||
},
|
||||
)
|
||||
|
@ -115,6 +117,7 @@ def synthesis(
|
|||
use_cuda,
|
||||
speaker_id=None,
|
||||
style_wav=None,
|
||||
style_text=None,
|
||||
use_griffin_lim=False,
|
||||
do_trim_silence=False,
|
||||
d_vector=None,
|
||||
|
@ -140,7 +143,12 @@ def synthesis(
|
|||
Speaker ID passed to the speaker embedding layer in multi-speaker model. Defaults to None.
|
||||
|
||||
style_wav (str | Dict[str, float]):
|
||||
Path or tensor to/of a waveform used for computing the style embedding. Defaults to None.
|
||||
Path or tensor to/of a waveform used for computing the style embedding based on GST or Capacitron.
|
||||
Defaults to None, meaning that Capacitron models will sample from the prior distribution to
|
||||
generate random but realistic prosody.
|
||||
|
||||
style_text (str):
|
||||
Transcription of style_wav for Capacitron models. Defaults to None.
|
||||
|
||||
enable_eos_bos_chars (bool):
|
||||
enable special chars for end of sentence and start of sentence. Defaults to False.
|
||||
|
@ -154,13 +162,19 @@ def synthesis(
|
|||
language_id (int):
|
||||
Language ID passed to the language embedding layer in a multi-lingual model. Defaults to None.
|
||||
"""
|
||||
# GST processing
|
||||
# GST or Capacitron processing
|
||||
# TODO: need to handle the case of setting both gst and capacitron to true somewhere
|
||||
style_mel = None
|
||||
if CONFIG.has("gst") and CONFIG.gst and style_wav is not None:
|
||||
if isinstance(style_wav, dict):
|
||||
style_mel = style_wav
|
||||
else:
|
||||
style_mel = compute_style_mel(style_wav, model.ap, cuda=use_cuda)
|
||||
|
||||
if CONFIG.has("capacitron_vae") and CONFIG.use_capacitron_vae and style_wav is not None:
|
||||
style_mel = compute_style_mel(style_wav, model.ap, cuda=use_cuda)
|
||||
style_mel = style_mel.transpose(1, 2) # [1, time, depth]
|
||||
|
||||
# convert text to sequence of token IDs
|
||||
text_inputs = np.asarray(
|
||||
model.tokenizer.text_to_ids(text, language=language_id),
|
||||
|
@ -177,11 +191,28 @@ def synthesis(
|
|||
language_id = id_to_torch(language_id, cuda=use_cuda)
|
||||
|
||||
if not isinstance(style_mel, dict):
|
||||
# GST or Capacitron style mel
|
||||
style_mel = numpy_to_torch(style_mel, torch.float, cuda=use_cuda)
|
||||
if style_text is not None:
|
||||
style_text = np.asarray(
|
||||
model.tokenizer.text_to_ids(style_text, language=language_id),
|
||||
dtype=np.int32,
|
||||
)
|
||||
style_text = numpy_to_torch(style_text, torch.long, cuda=use_cuda)
|
||||
style_text = style_text.unsqueeze(0)
|
||||
|
||||
text_inputs = numpy_to_torch(text_inputs, torch.long, cuda=use_cuda)
|
||||
text_inputs = text_inputs.unsqueeze(0)
|
||||
# synthesize voice
|
||||
outputs = run_model_torch(model, text_inputs, speaker_id, style_mel, d_vector=d_vector, language_id=language_id)
|
||||
outputs = run_model_torch(
|
||||
model,
|
||||
text_inputs,
|
||||
speaker_id,
|
||||
style_mel,
|
||||
style_text,
|
||||
d_vector=d_vector,
|
||||
language_id=language_id,
|
||||
)
|
||||
model_outputs = outputs["model_outputs"]
|
||||
model_outputs = model_outputs[0].data.cpu().numpy()
|
||||
alignments = outputs["alignments"]
|
||||
|
|
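A hedged example of the Capacitron `style_wav` / `style_text` path added to `synthesis()` above; `model` and `config` are assumed to be a loaded Capacitron-enabled Tacotron model and its config, and the reference paths are placeholders.

```python
from TTS.tts.utils.synthesis import synthesis

outputs = synthesis(
    model,                                         # assumed: loaded Capacitron Tacotron model
    "Hello there, how are you today?",
    config,                                        # assumed: the matching model config
    use_cuda=False,
    style_wav="reference_prosody.wav",             # reference prosody clip (placeholder path)
    style_text="Transcription of the reference.",  # transcription of style_wav, used by Capacitron
)
waveform = outputs["wav"]
```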
|
@ -107,15 +107,6 @@ class ESpeak(BasePhonemizer):
|
|||
if backend not in ["espeak", "espeak-ng"]:
|
||||
raise Exception("Unknown backend: %s" % backend)
|
||||
self._ESPEAK_LIB = backend
|
||||
# skip first two characters of the retuned text
|
||||
# "_ p_ɹ_ˈaɪ_ɚ t_ə n_oʊ_v_ˈɛ_m_b_ɚ t_w_ˈɛ_n_t_i t_ˈuː\n"
|
||||
# ^^
|
||||
self.num_skip_chars = 2
|
||||
if backend == "espeak-ng":
|
||||
# skip the first character of the retuned text
|
||||
# "_p_ɹ_ˈaɪ_ɚ t_ə n_oʊ_v_ˈɛ_m_b_ɚ t_w_ˈɛ_n_t_i t_ˈuː\n"
|
||||
# ^
|
||||
self.num_skip_chars = 1
|
||||
|
||||
def auto_set_espeak_lib(self) -> None:
|
||||
if is_tool("espeak-ng"):
|
||||
|
@ -163,7 +154,16 @@ class ESpeak(BasePhonemizer):
|
|||
phonemes = ""
|
||||
for line in _espeak_exe(self._ESPEAK_LIB, args, sync=True):
|
||||
logging.debug("line: %s", repr(line))
|
||||
phonemes += line.decode("utf8").strip()[self.num_skip_chars :] # skip initial redundant characters
|
||||
ph_decoded = line.decode("utf8").strip()
|
||||
# espeak needs to skip the first two characters of the returned text:
|
||||
# version 1.48.03: "_ p_ɹ_ˈaɪ_ɚ t_ə n_oʊ_v_ˈɛ_m_b_ɚ t_w_ˈɛ_n_t_i t_ˈuː\n"
|
||||
# version 1.48.15: " p_ɹ_ˈaɪ_ɚ t_ə n_oʊ_v_ˈɛ_m_b_ɚ t_w_ˈɛ_n_t_i t_ˈuː\n"
|
||||
# espeak-ng needs to skip the first character of the returned text:
|
||||
# "_p_ɹ_ˈaɪ_ɚ t_ə n_oʊ_v_ˈɛ_m_b_ɚ t_w_ˈɛ_n_t_i t_ˈuː\n"
|
||||
|
||||
# deal with the conditions described above
|
||||
ph_decoded = ph_decoded[:1].replace("_", "") + ph_decoded[1:]
|
||||
phonemes += ph_decoded.strip()
|
||||
return phonemes.replace("_", separator)
|
||||
|
||||
def _phonemize(self, text, separator=None):
|
||||
|
|
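A tiny illustration of the leading-character cleanup above, using the example strings from the comments.

```python
# espeak may prepend "_ " (older versions) and espeak-ng prepends "_";
# only a leading "_" in the first character is stripped.
line = "_ p_ɹ_ˈaɪ_ɚ t_ə n_oʊ_v_ˈɛ_m_b_ɚ"
cleaned = line[:1].replace("_", "") + line[1:]
print(cleaned)  # " p_ɹ_ˈaɪ_ɚ t_ə n_oʊ_v_ˈɛ_m_b_ɚ"; remaining "_" are later replaced by `separator`
```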
|
@ -859,7 +859,11 @@ class AudioProcessor(object):
|
|||
path (str): Path to a output file.
|
||||
sr (int, optional): Sampling rate used for saving to the file. Defaults to None.
|
||||
"""
|
||||
if self.do_rms_norm:
|
||||
wav_norm = self.rms_volume_norm(wav, self.db_level) * 32767
|
||||
else:
|
||||
wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav))))
|
||||
|
||||
scipy.io.wavfile.write(path, sr if sr else self.sample_rate, wav_norm.astype(np.int16))
|
||||
|
||||
def get_duration(self, filename: str) -> float:
|
||||
|
|
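A small numeric sketch of the int16 scaling in `save_wav` above, showing the peak-normalization branch (`do_rms_norm=False`); the sample values are arbitrary.

```python
import numpy as np

wav = np.array([0.0, 0.25, -0.5], dtype=np.float32)

# Peak normalization: scale so the loudest sample maps close to the int16 maximum.
wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav))))
print(wav_norm.astype(np.int16))  # [     0  16383 -32767]
```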
|
@ -0,0 +1,65 @@
|
|||
from typing import Generator
|
||||
|
||||
from trainer.trainer_utils import get_optimizer
|
||||
|
||||
|
||||
class CapacitronOptimizer:
|
||||
"""Double optimizer class for the Capacitron model."""
|
||||
|
||||
def __init__(self, config: dict, model_params: Generator) -> None:
|
||||
self.primary_params, self.secondary_params = self.split_model_parameters(model_params)
|
||||
|
||||
optimizer_names = list(config.optimizer_params.keys())
|
||||
optimizer_parameters = list(config.optimizer_params.values())
|
||||
|
||||
self.primary_optimizer = get_optimizer(
|
||||
optimizer_names[0],
|
||||
optimizer_parameters[0],
|
||||
config.lr,
|
||||
parameters=self.primary_params,
|
||||
)
|
||||
|
||||
self.secondary_optimizer = get_optimizer(
|
||||
optimizer_names[1],
|
||||
self.extract_optimizer_parameters(optimizer_parameters[1]),
|
||||
optimizer_parameters[1]["lr"],
|
||||
parameters=self.secondary_params,
|
||||
)
|
||||
|
||||
self.param_groups = self.primary_optimizer.param_groups
|
||||
|
||||
def first_step(self):
|
||||
self.secondary_optimizer.step()
|
||||
self.secondary_optimizer.zero_grad()
|
||||
self.primary_optimizer.zero_grad()
|
||||
|
||||
def step(self):
|
||||
self.primary_optimizer.step()
|
||||
|
||||
def zero_grad(self):
|
||||
self.primary_optimizer.zero_grad()
|
||||
self.secondary_optimizer.zero_grad()
|
||||
|
||||
def load_state_dict(self, state_dict):
|
||||
self.primary_optimizer.load_state_dict(state_dict[0])
|
||||
self.secondary_optimizer.load_state_dict(state_dict[1])
|
||||
|
||||
def state_dict(self):
|
||||
return [self.primary_optimizer.state_dict(), self.secondary_optimizer.state_dict()]
|
||||
|
||||
@staticmethod
|
||||
def split_model_parameters(model_params: Generator) -> list:
|
||||
primary_params = []
|
||||
secondary_params = []
|
||||
for name, param in model_params:
|
||||
if param.requires_grad:
|
||||
if name == "capacitron_vae_layer.beta":
|
||||
secondary_params.append(param)
|
||||
else:
|
||||
primary_params.append(param)
|
||||
return [iter(primary_params), iter(secondary_params)]
|
||||
|
||||
@staticmethod
|
||||
def extract_optimizer_parameters(params: dict) -> dict:
|
||||
"""Extract parameters that are not the learning rate"""
|
||||
return {k: v for k, v in params.items() if k != "lr"}
|
|
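A sketch of the two-phase update this optimizer is built for, mirroring `before_backward_pass()` in the Tacotron diffs above; the surrounding loop is an assumption, not the actual 👟 Trainer code, and `config`, `model`, `batch`, and `criterion` are assumed to exist.

```python
from TTS.utils.capacitron_optimizer import CapacitronOptimizer

optimizer = CapacitronOptimizer(config, model.named_parameters())

outputs, loss_dict = model.train_step(batch, criterion)

# Phase 1 (model.before_backward_pass): the VAE beta term is stepped first.
loss_dict["capacitron_vae_beta_loss"].backward()
optimizer.first_step()  # steps and zeroes the secondary optimizer, zeroes primary grads

# Phase 2 (regular flow): backprop the main loss and step the primary optimizer.
loss_dict["loss"].backward()  # the "loss" key name is an assumption
optimizer.step()
```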
@ -106,6 +106,8 @@ def save_model(config, model, optimizer, scaler, current_step, epoch, output_pat
|
|||
model_state = model.state_dict()
|
||||
if isinstance(optimizer, list):
|
||||
optimizer_state = [optim.state_dict() for optim in optimizer]
|
||||
elif optimizer.__class__.__name__ == "CapacitronOptimizer":
|
||||
optimizer_state = [optimizer.primary_optimizer.state_dict(), optimizer.secondary_optimizer.state_dict()]
|
||||
else:
|
||||
optimizer_state = optimizer.state_dict() if optimizer is not None else None
|
||||
|
||||
|
|
|
@ -90,6 +90,81 @@ class ModelManager(object):
|
|||
models_name_list.extend(model_list)
|
||||
return models_name_list
|
||||
|
||||
def model_info_by_idx(self, model_query):
|
||||
"""Print the description of the model from .models.json file using model_idx
|
||||
|
||||
Args:
|
||||
model_query (str): <model_type>/<model_idx>
|
||||
"""
|
||||
model_name_list = []
|
||||
model_type, model_query_idx = model_query.split("/")
|
||||
try:
|
||||
model_query_idx = int(model_query_idx)
|
||||
if model_query_idx <= 0:
|
||||
print("> model_query_idx should be a positive integer!")
|
||||
return
|
||||
except ValueError:
|
||||
print("> model_query_idx should be an integer!")
|
||||
return
|
||||
model_count = 0
|
||||
if model_type in self.models_dict:
|
||||
for lang in self.models_dict[model_type]:
|
||||
for dataset in self.models_dict[model_type][lang]:
|
||||
for model in self.models_dict[model_type][lang][dataset]:
|
||||
model_name_list.append(f"{model_type}/{lang}/{dataset}/{model}")
|
||||
model_count += 1
|
||||
else:
|
||||
print(f"> model_type {model_type} does not exist in the list.")
|
||||
return
|
||||
if model_query_idx > model_count:
|
||||
print(f"model query idx exceeds the number of available models [{model_count}] ")
|
||||
else:
|
||||
model_type, lang, dataset, model = model_name_list[model_query_idx - 1].split("/")
|
||||
print(f"> model type : {model_type}")
|
||||
print(f"> language supported : {lang}")
|
||||
print(f"> dataset used : {dataset}")
|
||||
print(f"> model name : {model}")
|
||||
if "description" in self.models_dict[model_type][lang][dataset][model]:
|
||||
print(f"> description : {self.models_dict[model_type][lang][dataset][model]['description']}")
|
||||
else:
|
||||
print("> description : coming soon")
|
||||
if "default_vocoder" in self.models_dict[model_type][lang][dataset][model]:
|
||||
print(f"> default_vocoder : {self.models_dict[model_type][lang][dataset][model]['default_vocoder']}")
|
||||
|
||||
def model_info_by_full_name(self, model_query_name):
|
||||
"""Print the description of the model from .models.json file using model_full_name
|
||||
|
||||
Args:
|
||||
model_query_name (str): Format is <model_type>/<language>/<dataset>/<model_name>
|
||||
"""
|
||||
model_type, lang, dataset, model = model_query_name.split("/")
|
||||
if model_type in self.models_dict:
|
||||
if lang in self.models_dict[model_type]:
|
||||
if dataset in self.models_dict[model_type][lang]:
|
||||
if model in self.models_dict[model_type][lang][dataset]:
|
||||
print(f"> model type : {model_type}")
|
||||
print(f"> language supported : {lang}")
|
||||
print(f"> dataset used : {dataset}")
|
||||
print(f"> model name : {model}")
|
||||
if "description" in self.models_dict[model_type][lang][dataset][model]:
|
||||
print(
|
||||
f"> description : {self.models_dict[model_type][lang][dataset][model]['description']}"
|
||||
)
|
||||
else:
|
||||
print("> description : coming soon")
|
||||
if "default_vocoder" in self.models_dict[model_type][lang][dataset][model]:
|
||||
print(
|
||||
f"> default_vocoder : {self.models_dict[model_type][lang][dataset][model]['default_vocoder']}"
|
||||
)
|
||||
else:
|
||||
print(f"> model {model} does not exist for {model_type}/{lang}/{dataset}.")
|
||||
else:
|
||||
print(f"> dataset {dataset} does not exist for {model_type}/{lang}.")
|
||||
else:
|
||||
print(f"> lang {lang} does not exist for {model_type}.")
|
||||
else:
|
||||
print(f"> model_type {model_type} does not exist in the list.")
|
||||
|
||||
def list_tts_models(self):
|
||||
"""Print all `TTS` models and return a list of model names
|
||||
|
||||
|
|
|
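A hedged usage note for the two query helpers added above; the query strings follow the documented formats, and the models-file path is an assumed location.

```python
from TTS.utils.manage import ModelManager

manager = ModelManager("TTS/.models.json")  # path to the models file (assumed location)

# <model_type>/<model_idx>, a 1-based index into the listed models of that type
manager.model_info_by_idx("tts_models/1")

# <model_type>/<language>/<dataset>/<model_name>
manager.model_info_by_full_name("tts_models/en/ljspeech/tacotron2-DDC")
```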
@ -1,5 +1,5 @@
|
|||
import time
|
||||
from typing import List, Union
|
||||
from typing import List
|
||||
|
||||
import numpy as np
|
||||
import pysbd
|
||||
|
@ -97,10 +97,10 @@ class Synthesizer(object):
|
|||
"""Load the TTS model.
|
||||
|
||||
1. Load the model config.
|
||||
2. Init the AudioProcessor.
|
||||
3. Init the model from the config.
|
||||
2. Init the model from the config.
|
||||
3. Load the model weights.
|
||||
4. Move the model to the GPU if CUDA is enabled.
|
||||
5. Init the speaker manager for the model.
|
||||
5. Init the speaker manager in the model.
|
||||
|
||||
Args:
|
||||
tts_checkpoint (str): path to the model checkpoint.
|
||||
|
@ -122,7 +122,7 @@ class Synthesizer(object):
|
|||
self.tts_model.cuda()
|
||||
|
||||
if self.encoder_checkpoint and hasattr(self.tts_model, "speaker_manager"):
|
||||
self.tts_model.speaker_manager.init_encoder(self.encoder_checkpoint, self.encoder_config)
|
||||
self.tts_model.speaker_manager.init_encoder(self.encoder_checkpoint, self.encoder_config, use_cuda)
|
||||
|
||||
def _set_speaker_encoder_paths_from_tts_config(self):
|
||||
"""Set the encoder paths from the tts model config for models with speaker encoders."""
|
||||
|
@ -178,8 +178,9 @@ class Synthesizer(object):
|
|||
text: str = "",
|
||||
speaker_name: str = "",
|
||||
language_name: str = "",
|
||||
speaker_wav: Union[str, List[str]] = None,
|
||||
speaker_wav=None,
|
||||
style_wav=None,
|
||||
style_text=None,
|
||||
reference_wav=None,
|
||||
reference_speaker_name=None,
|
||||
) -> List[int]:
|
||||
|
@ -191,6 +192,7 @@ class Synthesizer(object):
|
|||
language_name (str, optional): language id for multi-language models. Defaults to "".
|
||||
speaker_wav (Union[str, List[str]], optional): path to the speaker wav. Defaults to None.
|
||||
style_wav ([type], optional): style waveform for GST. Defaults to None.
|
||||
style_text ([type], optional): transcription of style_wav for Capacitron. Defaults to None.
|
||||
reference_wav ([type], optional): reference waveform for voice conversion. Defaults to None.
|
||||
reference_speaker_name ([type], optional): speaker id of the reference waveform. Defaults to None.
|
||||
Returns:
|
||||
|
@ -273,10 +275,11 @@ class Synthesizer(object):
|
|||
CONFIG=self.tts_config,
|
||||
use_cuda=self.use_cuda,
|
||||
speaker_id=speaker_id,
|
||||
language_id=language_id,
|
||||
style_wav=style_wav,
|
||||
style_text=style_text,
|
||||
use_griffin_lim=use_gl,
|
||||
d_vector=speaker_embedding,
|
||||
language_id=language_id,
|
||||
)
|
||||
waveform = outputs["wav"]
|
||||
mel_postnet_spec = outputs["outputs"]["model_outputs"][0].detach().cpu().numpy()
|
||||
|
@ -315,7 +318,7 @@ class Synthesizer(object):
|
|||
# get the speaker embedding or speaker id for the reference wav file
|
||||
reference_speaker_embedding = None
|
||||
reference_speaker_id = None
|
||||
if self.tts_speakers_file or hasattr(self.tts_model.speaker_manager, "speaker_ids"):
|
||||
if self.tts_speakers_file or hasattr(self.tts_model.speaker_manager, "ids"):
|
||||
if reference_speaker_name and isinstance(reference_speaker_name, str):
|
||||
if self.tts_config.use_d_vector_file:
|
||||
# get the speaker embedding from the saved d_vectors.
|
||||
|
|
|
@ -115,8 +115,8 @@ class GANDataset(Dataset):
|
|||
audio, mel = self.cache[idx]
|
||||
else:
|
||||
audio = self.ap.load_wav(wavpath)
|
||||
audio, _ = self._pad_short_samples(audio)
|
||||
mel = self.ap.melspectrogram(audio)
|
||||
audio, mel = self._pad_short_samples(audio, mel)
|
||||
else:
|
||||
|
||||
# load precomputed features
|
||||
|
|
|
@ -90,50 +90,26 @@ class GAN(BaseVocoder):
|
|||
raise ValueError(" [!] Unexpected `optimizer_idx`.")
|
||||
|
||||
if optimizer_idx == 0:
|
||||
# GENERATOR
|
||||
# DISCRIMINATOR optimization
|
||||
|
||||
# generator pass
|
||||
y_hat = self.model_g(x)[:, :, : y.size(2)]
|
||||
self.y_hat_g = y_hat # save for discriminator
|
||||
y_hat_sub = None
|
||||
y_sub = None
|
||||
|
||||
# cache for generator loss
|
||||
# pylint: disable=W0201
|
||||
self.y_hat_g = y_hat
|
||||
self.y_hat_sub = None
|
||||
self.y_sub_g = None
|
||||
|
||||
# PQMF formatting
|
||||
if y_hat.shape[1] > 1:
|
||||
y_hat_sub = y_hat
|
||||
self.y_hat_sub = y_hat
|
||||
y_hat = self.model_g.pqmf_synthesis(y_hat)
|
||||
self.y_hat_g = y_hat # save for discriminator
|
||||
y_sub = self.model_g.pqmf_analysis(y)
|
||||
self.y_hat_g = y_hat # save for generator loss
|
||||
self.y_sub_g = self.model_g.pqmf_analysis(y)
|
||||
|
||||
scores_fake, feats_fake, feats_real = None, None, None
|
||||
if self.train_disc:
|
||||
|
||||
if len(signature(self.model_d.forward).parameters) == 2:
|
||||
D_out_fake = self.model_d(y_hat, x)
|
||||
else:
|
||||
D_out_fake = self.model_d(y_hat)
|
||||
D_out_real = None
|
||||
|
||||
if self.config.use_feat_match_loss:
|
||||
with torch.no_grad():
|
||||
D_out_real = self.model_d(y)
|
||||
|
||||
# format D outputs
|
||||
if isinstance(D_out_fake, tuple):
|
||||
scores_fake, feats_fake = D_out_fake
|
||||
if D_out_real is None:
|
||||
feats_real = None
|
||||
else:
|
||||
_, feats_real = D_out_real
|
||||
else:
|
||||
scores_fake = D_out_fake
|
||||
feats_fake, feats_real = None, None
|
||||
|
||||
# compute losses
|
||||
loss_dict = criterion[optimizer_idx](y_hat, y, scores_fake, feats_fake, feats_real, y_hat_sub, y_sub)
|
||||
outputs = {"model_outputs": y_hat}
|
||||
|
||||
if optimizer_idx == 1:
|
||||
# DISCRIMINATOR
|
||||
if self.train_disc:
|
||||
# use different samples for G and D trainings
|
||||
if self.config.diff_samples_for_G_and_D:
|
||||
|
@ -177,6 +153,36 @@ class GAN(BaseVocoder):
|
|||
loss_dict = criterion[optimizer_idx](scores_fake, scores_real)
|
||||
outputs = {"model_outputs": y_hat}
|
||||
|
||||
if optimizer_idx == 1:
|
||||
# GENERATOR loss
|
||||
scores_fake, feats_fake, feats_real = None, None, None
|
||||
if self.train_disc:
|
||||
if len(signature(self.model_d.forward).parameters) == 2:
|
||||
D_out_fake = self.model_d(self.y_hat_g, x)
|
||||
else:
|
||||
D_out_fake = self.model_d(self.y_hat_g)
|
||||
D_out_real = None
|
||||
|
||||
if self.config.use_feat_match_loss:
|
||||
with torch.no_grad():
|
||||
D_out_real = self.model_d(y)
|
||||
|
||||
# format D outputs
|
||||
if isinstance(D_out_fake, tuple):
|
||||
scores_fake, feats_fake = D_out_fake
|
||||
if D_out_real is None:
|
||||
feats_real = None
|
||||
else:
|
||||
_, feats_real = D_out_real
|
||||
else:
|
||||
scores_fake = D_out_fake
|
||||
feats_fake, feats_real = None, None
|
||||
|
||||
# compute losses
|
||||
loss_dict = criterion[optimizer_idx](
|
||||
self.y_hat_g, y, scores_fake, feats_fake, feats_real, self.y_hat_sub, self.y_sub_g
|
||||
)
|
||||
outputs = {"model_outputs": self.y_hat_g}
|
||||
return outputs, loss_dict
|
||||
|
||||
@staticmethod
|
||||
|
@ -210,6 +216,7 @@ class GAN(BaseVocoder):
|
|||
@torch.no_grad()
|
||||
def eval_step(self, batch: Dict, criterion: nn.Module, optimizer_idx: int) -> Tuple[Dict, Dict]:
|
||||
"""Call `train_step()` with `no_grad()`"""
|
||||
self.train_disc = True  # Avoid a bug in training with the missing discriminator loss
|
||||
return self.train_step(batch, criterion, optimizer_idx)
|
||||
|
||||
def eval_log(
|
||||
|
@ -266,7 +273,7 @@ class GAN(BaseVocoder):
|
|||
optimizer2 = get_optimizer(
|
||||
self.config.optimizer, self.config.optimizer_params, self.config.lr_disc, self.model_d
|
||||
)
|
||||
return [optimizer1, optimizer2]
|
||||
return [optimizer2, optimizer1]
|
||||
|
||||
def get_lr(self) -> List:
|
||||
"""Set the initial learning rates for each optimizer.
|
||||
|
@ -274,7 +281,7 @@ class GAN(BaseVocoder):
|
|||
Returns:
|
||||
List: learning rates for each optimizer.
|
||||
"""
|
||||
return [self.config.lr_gen, self.config.lr_disc]
|
||||
return [self.config.lr_disc, self.config.lr_gen]
|
||||
|
||||
def get_scheduler(self, optimizer) -> List:
|
||||
"""Set the schedulers for each optimizer.
|
||||
|
@ -287,7 +294,7 @@ class GAN(BaseVocoder):
|
|||
"""
|
||||
scheduler1 = get_scheduler(self.config.lr_scheduler_gen, self.config.lr_scheduler_gen_params, optimizer[0])
|
||||
scheduler2 = get_scheduler(self.config.lr_scheduler_disc, self.config.lr_scheduler_disc_params, optimizer[1])
|
||||
return [scheduler1, scheduler2]
|
||||
return [scheduler2, scheduler1]
|
||||
|
||||
@staticmethod
|
||||
def format_batch(batch: List) -> Dict:
|
||||
|
@ -359,7 +366,7 @@ class GAN(BaseVocoder):
|
|||

def get_criterion(self):
"""Return criterions for the optimizers"""
return [GeneratorLoss(self.config), DiscriminatorLoss(self.config)]
return [DiscriminatorLoss(self.config), GeneratorLoss(self.config)]

@staticmethod
def init_from_config(config: Coqpit, verbose=True) -> "GAN":
|
|
|
@ -59,8 +59,7 @@ We tried to collect common issues and questions we receive about 🐸TTS. It is
|
|||

6. Train your model.
- SingleGPU training: ```CUDA_VISIBLE_DEVICES="0" python train_tts.py --config_path config.json```
- MultiGPU training: ```CUDA_VISIBLE_DEVICES="0,1,2" python distribute.py --script train_tts.py --config_path config.json```
- This command uses all the GPUs given in ```CUDA_VISIBLE_DEVICES```. If you don't specify, it uses all the GPUs available.
- MultiGPU training: ```python3 -m trainer.distribute --gpus "0,1" --script TTS/bin/train_tts.py --config_path config.json```

**Note:** You can also train your model using pure 🐍 python. Check ```{eval-rst} :ref: 'tutorial_for_nervous_beginners'```.
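
If you prefer the pure-Python route, a training script roughly follows the recipe scripts added elsewhere in this changeset. The sketch below is a minimal, non-authoritative example based on the GlowTTS recipe; the dataset path, metadata file name and output folder are placeholders you need to adapt to your own setup.

```python
import os

from trainer import Trainer, TrainerArgs

from TTS.tts.configs.glow_tts_config import GlowTTSConfig
from TTS.tts.configs.shared_configs import BaseDatasetConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.glow_tts import GlowTTS
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor

output_path = "tts_train_dir"  # placeholder output folder

# point the formatter at your own LJSpeech-style dataset; path and metadata file are placeholders
dataset_config = BaseDatasetConfig(name="ljspeech", meta_file_train="metadata.csv", path="MyTTSDataset/")

config = GlowTTSConfig(batch_size=32, epochs=100, output_path=output_path, datasets=[dataset_config])

# audio processor (feature extraction / IO) and tokenizer (text to token IDs)
ap = AudioProcessor.init_from_config(config)
tokenizer, config = TTSTokenizer.init_from_config(config)

# load samples and split them for evaluation
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)

model = GlowTTS(config, ap, tokenizer, speaker_manager=None)
trainer = Trainer(
    TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
)
trainer.fit()
```

Run it like any of the commands above, e.g. ```CUDA_VISIBLE_DEVICES="0" python your_train_script.py``` (the script name here is just a placeholder).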
|
|
|
@ -1,6 +1,6 @@
|
|||
# Installation

🐸TTS supports python >=3.6 <=3.9 and tested on Ubuntu 18.10, 19.10, 20.10.
🐸TTS supports python >=3.7 <3.11.0 and is tested on Ubuntu 18.10, 19.10, 20.10.

## Using `pip`

|
|
|
@ -2,7 +2,7 @@
|
|||

1. Decide the model you want to use.

Each model has a different set of pros and cons that define the run-time efficiency and the voice quality. It is up to you to decide what model servers your needs. Other than referring to the papers, one easy way is to test the 🐸TTS
Each model has a different set of pros and cons that define the run-time efficiency and the voice quality. It is up to you to decide what model serves your needs. Other than referring to the papers, one easy way is to test the 🐸TTS
community models and see how fast and good each of the models is. Or you can start a discussion on our communication channels.

2. Understand the configuration, its fields and values.
|
|
|
@ -1,6 +1,5 @@
|
|||
{
"cells": [
{
"cells": [{
"cell_type": "markdown",
"metadata": {
"Collapsed": "false"
|
@ -37,9 +36,7 @@
|
|||
"import librosa.display\n",
"\n",
"from TTS.tts.layers import *\n",
"from TTS.utils.audio import AudioProcessor
\n",
"from TTS.utils.audio import AudioProcessor\n",
"from TTS.tts.utils.generic_utils import setup_model\n",
"from TTS.tts.utils.io import load_config\n",
"from TTS.tts.utils.text import text_to_sequence\n",
|
|
|
@ -0,0 +1,272 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "45ea3ef5",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"source": [
|
||||
"# Easy Inferencing with 🐸 TTS ⚡\n",
|
||||
"\n",
|
||||
"#### Do you want to quickly synthesize speech using a Coqui 🐸 TTS model?\n",
|
||||
"\n",
|
||||
"💡: Grab a pre-trained model and use it to synthesize speech using any speaker voice, including yours! ⚡\n",
|
||||
"\n",
|
||||
"🐸 TTS comes with a list of pretrained models and speaker voices. You can even start a local demo server that you can open in your favorite web browser and 🗣️.\n",
|
||||
"\n",
|
||||
"In this notebook, we will: \n",
|
||||
"```\n",
|
||||
"1. List available pre-trained 🐸 TTS models\n",
|
||||
"2. Run a 🐸 TTS model\n",
|
||||
"3. Listen to the synthesized wave 📣\n",
|
||||
"4. Run multispeaker 🐸 TTS model \n",
|
||||
"```\n",
|
||||
"So, let's jump right in!\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "a1e5c2a5-46eb-42fd-b550-2a052546857e",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Install 🐸 TTS ⬇️"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "fa2aec77",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"! pip install -U pip\n",
|
||||
"! pip install TTS"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "8c07a273",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## ✅ List available pre-trained 🐸 TTS models\n",
|
||||
"\n",
|
||||
"Coqui 🐸TTS comes with a list of pretrained models for different model types (ex: TTS, vocoder), languages, datasets used for training and architectures. \n",
|
||||
"\n",
|
||||
"You can either use your own model or the release models under 🐸TTS.\n",
|
||||
"\n",
|
||||
"Use `tts --list_models` to find out the available models.\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "608d203f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"! tts --list_models"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "ed9dd7ab",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## ✅ Run a 🐸 TTS model\n",
|
||||
"\n",
|
||||
"#### **First things first**: Using a release model and default vocoder:\n",
|
||||
"\n",
|
||||
"You can simply copy the full model name from the list above and use it \n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "cc9e4608-16ec-4dcd-bd6b-bd10d62286f8",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!tts --text \"hello world\" \\\n",
|
||||
"--model_name \"tts_models/en/ljspeech/glow-tts\" \\\n",
|
||||
"--out_path output.wav\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "0ca2cb14-1aba-400e-a219-8ce44d9410be",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 📣 Listen to the synthesized wave 📣"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "5fe63ef4-9284-4461-9dda-1ca7483a8f9b",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import IPython\n",
|
||||
"IPython.display.Audio(\"output.wav\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "5e67d178-1ebe-49c7-9a47-0593251bdb96",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### **Second things second**:\n",
|
||||
"\n",
|
||||
"🔶 A TTS model can be trained either on a single speaker voice or on multispeaker voices. This training choice is directly reflected in the inference ability and the available speaker voices that can be used to synthesize speech. \n",
|
||||
"\n",
|
||||
"🔶 If you want to run a multispeaker model from the released models list, you can first check the speaker IDs using the `--list_speaker_idxs` flag and then use one of these speaker voices to synthesize speech."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "87b18839-f750-4a61-bbb0-c964acaecab2",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# list the possible speaker IDs.\n",
|
||||
"!tts --model_name \"tts_models/en/vctk/vits\" \\\n",
|
||||
"--list_speaker_idxs \n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "c4365a9d-f922-4b14-88b0-d2b22a245b2e",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 💬 Synthesize speech using speaker ID 💬"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "52be0403-d13e-4d9b-99c2-c10b85154063",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!tts --text \"Trying out specific speaker voice\"\\\n",
|
||||
"--out_path spkr-out.wav --model_name \"tts_models/en/vctk/vits\" \\\n",
|
||||
"--speaker_idx \"p341\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "894a560a-f9c8-48ce-aaa6-afdf516c01f6",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 📣 Listen to the synthesized speaker specific wave 📣"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "ed485b0a-dfd5-4a7e-a571-ebf74bdfc41d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import IPython\n",
|
||||
"IPython.display.Audio(\"spkr-out.wav\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "84636a38-097e-4dad-933b-0aeaee650e92",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"🔶 If you want to use an external speaker to synthesize speech, you need to supply the `--speaker_wav` flag along with an external speaker encoder path and config file, as follows:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "cbdb15fa-123a-4282-a127-87b50dc70365",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"First, we need to get the speaker encoder model, its config and a reference `speaker_wav`"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "e54f1b13-560c-4fed-bafd-e38ec9712359",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!wget https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json\n",
|
||||
"!wget https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar\n",
|
||||
"!wget https://github.com/coqui-ai/TTS/raw/speaker_encoder_model/tests/data/ljspeech/wavs/LJ001-0001.wav"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "6dac1912-5054-4a68-8357-6d20fd99cb10",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!tts --model_name tts_models/multilingual/multi-dataset/your_tts \\\n",
|
||||
"--encoder_path model_se.pth.tar \\\n",
|
||||
"--encoder_config config_se.json \\\n",
|
||||
"--speaker_wav LJ001-0001.wav \\\n",
|
||||
"--text \"Are we not allowed to dim the lights so people can see that a bit better?\"\\\n",
|
||||
"--out_path spkr-out.wav \\\n",
|
||||
"--language_idx \"en\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "92ddce58-8aca-4f69-84c3-645ae1b12e7d",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 📣 Listen to the synthesized speaker specific wave 📣"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "cc889adc-9c71-4232-8e85-bfc8f76476f4",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import IPython\n",
|
||||
"IPython.display.Audio(\"spkr-out.wav\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "29101d01-0b01-4153-a216-5dae415a5dd6",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 🎉 Congratulations! 🎉 You now know how to use a TTS model to synthesize speech! \n",
|
||||
"Follow up with the next tutorials to learn more advanced material."
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.10"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
|
@ -0,0 +1,454 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "f79d99ef",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Train your first 🐸 TTS model 💫\n",
|
||||
"\n",
|
||||
"### 👋 Hello and welcome to Coqui (🐸) TTS\n",
|
||||
"\n",
|
||||
"The goal of this notebook is to show you a **typical workflow** for **training** and **testing** a TTS model with 🐸.\n",
|
||||
"\n",
|
||||
"Let's train a very small model on a very small amount of data so we can iterate quickly.\n",
|
||||
"\n",
|
||||
"In this notebook, we will:\n",
|
||||
"\n",
|
||||
"1. Download data and format it for 🐸 TTS.\n",
|
||||
"2. Configure the training and testing runs.\n",
|
||||
"3. Train a new model.\n",
|
||||
"4. Test the model and display its performance.\n",
|
||||
"\n",
|
||||
"So, let's jump right in!\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "fa2aec78",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"## Install Coqui TTS\n",
|
||||
"! pip install -U pip\n",
|
||||
"! pip install TTS"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "be5fe49c",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## ✅ Data Preparation\n",
|
||||
"\n",
|
||||
"### **First things first**: we need some data.\n",
|
||||
"\n",
|
||||
"We're training a Text-to-Speech model, so we need some _text_ and we need some _speech_. Specifically, we want _transcribed speech_. The speech must be divided into audio clips and each clip needs a transcription. More details about data requirements such as recording characteristics, background noise and vocabulary coverage can be found in the [🐸TTS documentation](https://tts.readthedocs.io/en/latest/formatting_your_dataset.html).\n",
|
||||
"\n",
|
||||
"If you have a single audio file, you need to **split** it into clips. It is also important to use a lossless audio file format to prevent compression artifacts. We recommend using the **wav** file format.\n",
|
||||
"\n",
|
||||
"The data format we will be adopting for this tutorial is taken from the widely-used **LJSpeech** dataset, where **waves** are collected under a folder:\n",
|
||||
"\n",
|
||||
"<span style=\"color:purple;font-size:15px\">\n",
|
||||
"/wavs<br /> \n",
|
||||
"  | - audio1.wav<br /> \n",
|
||||
"  | - audio2.wav<br /> \n",
|
||||
"  | - audio3.wav<br /> \n",
|
||||
" ...<br /> \n",
|
||||
"</span>\n",
|
||||
"\n",
|
||||
"and a **metadata.csv** file will have the audio file name in parallel to the transcript, delimited by `|`: \n",
|
||||
" \n",
|
||||
"<span style=\"color:purple;font-size:15px\">\n",
|
||||
"# metadata.csv <br /> \n",
|
||||
"audio1|This is my sentence. <br /> \n",
|
||||
"audio2|This is maybe my sentence. <br /> \n",
|
||||
"audio3|This is certainly my sentence. <br /> \n",
|
||||
"audio4|Let this be your sentence. <br /> \n",
|
||||
"...\n",
|
||||
"</span>\n",
|
||||
"\n",
|
||||
"In the end, we should have the following **folder structure**:\n",
|
||||
"\n",
|
||||
"<span style=\"color:purple;font-size:15px\">\n",
|
||||
"/MyTTSDataset <br /> \n",
|
||||
" | <br /> \n",
|
||||
" | -> metadata.txt<br /> \n",
|
||||
" | -> /wavs<br /> \n",
|
||||
"  | -> audio1.wav<br /> \n",
|
||||
"  | -> audio2.wav<br /> \n",
|
||||
"  | ...<br /> \n",
|
||||
"</span>"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "69501a10-3b53-4e75-ae66-90221d6f2271",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"🐸TTS already provides tooling for _LJSpeech_. If you use the same format, you can start training your models right away. <br /> \n",
|
||||
"\n",
|
||||
"After you collect and format your dataset, you need to check two things: whether you need a **_formatter_** and a **_text_cleaner_**. <br /> The **_formatter_** loads the text file (created above) as a list, and the **_text_cleaner_** performs a sequence of text normalization operations that convert the raw text into the spoken representation (e.g. converting numbers to text, and acronyms and symbols to the spoken format).\n",
|
||||
"\n",
|
||||
"If you use a dataset format other than LJSpeech or the other public datasets that 🐸TTS supports, then you need to write your own **_formatter_** and **_text_cleaner_**."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "e7f226c8-4e55-48fa-937b-8415d539b17c",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## ⏳️ Loading your dataset\n",
|
||||
"Load one of the datasets supported by 🐸TTS.\n",
|
||||
"\n",
|
||||
"We will start by defining the dataset config, setting LJSpeech as our target dataset and defining its path.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 33,
|
||||
"id": "b3cb0191-b8fc-4158-bd26-8423c2a8ba66",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"\n",
|
||||
"# BaseDatasetConfig: defines name, formatter and path of the dataset.\n",
|
||||
"from TTS.tts.configs.shared_configs import BaseDatasetConfig\n",
|
||||
"\n",
|
||||
"output_path = \"tts_train_dir\"\n",
|
||||
"if not os.path.exists(output_path):\n",
|
||||
" os.makedirs(output_path)\n",
|
||||
" "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "ae6b7019-3685-4b48-8917-c152e288d7e3",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Download and extract LJSpeech dataset.\n",
|
||||
"\n",
|
||||
"!wget -O $output_path/LJSpeech-1.1.tar.bz2 https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2 \n",
|
||||
"!tar -xf $output_path/LJSpeech-1.1.tar.bz2 -C $output_path"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 31,
|
||||
"id": "76cd3ab5-6387-45f1-b488-24734cc1beb5",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"dataset_config = BaseDatasetConfig(\n",
|
||||
" name=\"ljspeech\", meta_file_train=\"metadata.csv\", path=os.path.join(output_path, \"LJSpeech-1.1/\")\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "ae82fd75",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## ✅ Train a new model\n",
|
||||
"\n",
|
||||
"Let's kick off a training run 🚀🚀🚀.\n",
|
||||
"\n",
|
||||
"Deciding on the model architecture you want to use comes down to your needs and available resources. Each model architecture has its pros and cons that define the run-time efficiency and the voice quality.\n",
|
||||
"We have many recipes under `TTS/recipes/` that provide a good starting point. For this tutorial, we will be using `GlowTTS`."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "f5876e46-2aee-4bcf-b6b3-9e3c535c553f",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"We will begin by initializing the model training configuration."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "5483ca28-39d6-49f8-a18e-4fb53c50ad84",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# GlowTTSConfig: all model related values for training, validating and testing.\n",
|
||||
"from TTS.tts.configs.glow_tts_config import GlowTTSConfig\n",
|
||||
"config = GlowTTSConfig(\n",
|
||||
" batch_size=32,\n",
|
||||
" eval_batch_size=16,\n",
|
||||
" num_loader_workers=4,\n",
|
||||
" num_eval_loader_workers=4,\n",
|
||||
" run_eval=True,\n",
|
||||
" test_delay_epochs=-1,\n",
|
||||
" epochs=100,\n",
|
||||
" text_cleaner=\"phoneme_cleaners\",\n",
|
||||
" use_phonemes=True,\n",
|
||||
" phoneme_language=\"en-us\",\n",
|
||||
" phoneme_cache_path=os.path.join(output_path, \"phoneme_cache\"),\n",
|
||||
" print_step=25,\n",
|
||||
" print_eval=False,\n",
|
||||
" mixed_precision=True,\n",
|
||||
" output_path=output_path,\n",
|
||||
" datasets=[dataset_config],\n",
|
||||
" save_step=1000,\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "b93ed377-80b7-447b-bd92-106bffa777ee",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Next we will initialize the audio processor which is used for feature extraction and audio I/O."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "b1b12f61-f851-4565-84dd-7640947e04ab",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from TTS.utils.audio import AudioProcessor\n",
|
||||
"ap = AudioProcessor.init_from_config(config)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "1d461683-b05e-403f-815f-8007bda08c38",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Next we will initialize the tokenizer which is used to convert text to sequences of token IDs. If characters are not defined in the config, default characters are passed to the config."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "014879b7-f18d-44c0-b24a-e10f8002113a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from TTS.tts.utils.text.tokenizer import TTSTokenizer\n",
|
||||
"tokenizer, config = TTSTokenizer.init_from_config(config)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "df3016e1-9e99-4c4f-94e3-fa89231fd978",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Next we will load data samples. Each sample is a list of ```[text, audio_file_path, speaker_name]```. You can define your custom sample loader returning the list of samples."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "cadd6ada-c8eb-4f79-b8fe-6d72850af5a7",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from TTS.tts.datasets import load_tts_samples\n",
|
||||
"train_samples, eval_samples = load_tts_samples(\n",
|
||||
" dataset_config,\n",
|
||||
" eval_split=True,\n",
|
||||
" eval_split_max_size=config.eval_split_max_size,\n",
|
||||
" eval_split_size=config.eval_split_size,\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "db8b451e-1fe1-4aa3-b69e-ab22b925bd19",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Now we're ready to initialize the model.\n",
|
||||
"\n",
|
||||
"Models take a config object and a speaker manager as input. Config defines the details of the model like the number of layers, the size of the embedding, etc. Speaker manager is used by multi-speaker models."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "ac2ffe3e-ad0c-443e-800c-9b076ee811b4",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from TTS.tts.models.glow_tts import GlowTTS\n",
|
||||
"model = GlowTTS(config, ap, tokenizer, speaker_manager=None)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "e2832c56-889d-49a6-95b6-eb231892ecc6",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Trainer provides a generic API to train all the 🐸TTS models with all its perks like mixed-precision training, distributed training, etc."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "0f609945-4fe0-4d0d-b95e-11d7bfb63ebe",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from trainer import Trainer, TrainerArgs\n",
|
||||
"trainer = Trainer(\n",
|
||||
" TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "5b320831-dd83-429b-bb6a-473f9d49d321",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### AND... 3,2,1... START TRAINING 🚀🚀🚀"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "d4c07f99-3d1d-4bea-801e-9f33bbff0e9f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"trainer.fit()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "4cff0c40-2734-40a6-a905-e945a9fb3e98",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### 🚀 Run the Tensorboard. 🚀\n",
|
||||
"On the notebook and on Tensorboard, you can monitor the progress of your model. Tensorboard also provides certain figures and sample outputs."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "5a85cd3b-1646-40ad-a6c2-49323e08eeec",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!pip install tensorboard\n",
|
||||
"!tensorboard --logdir=tts_train_dir"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "9f6dc959",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## ✅ Test the model\n",
|
||||
"\n",
|
||||
"We made it! 🙌\n",
|
||||
"\n",
|
||||
"Let's kick off the testing run, which displays performance metrics.\n",
|
||||
"\n",
|
||||
"We're committing the cardinal sin of ML 😈 (aka - testing on our training data) so you don't want to deploy this model into production. In this notebook we're focusing on the workflow itself, so it's forgivable 😇\n",
|
||||
"\n",
|
||||
"You can see from the test output that our tiny model has overfit to the data, and basically memorized this one sentence.\n",
|
||||
"\n",
|
||||
"When you start training your own models, make sure your testing data doesn't include your training data 😅"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "99fada7a-592f-4a09-9369-e6f3d82de3a0",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Let's get the latest saved checkpoint. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "6dd47ed5-da8e-4bf9-b524-d686630d6961",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import glob, os\n",
|
||||
"output_path = \"tts_train_dir\"\n",
|
||||
"ckpts = sorted([f for f in glob.glob(output_path+\"/*/*.pth\")])\n",
|
||||
"configs = sorted([f for f in glob.glob(output_path+\"/*/*.json\")])\n",
"# pick the last checkpoint and config (assumed to be the most recent run) for the test cell below\n",
"test_ckpt = ckpts[-1]\n",
"test_config = configs[-1]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "dd42bc7a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
" !tts --text \"Text for TTS\" \\\n",
|
||||
" --model_path $test_ckpt \\\n",
|
||||
" --config_path $test_config \\\n",
|
||||
" --out_path out.wav"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "81cbcb3f-d952-469b-a0d8-8941cd7af670",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 📣 Listen to the synthesized wave 📣"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "e0000bd6-6763-4a10-a74d-911dd08ebcff",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import IPython\n",
|
||||
"IPython.display.Audio(\"out.wav\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "13914401-cad1-494a-b701-474e52829138",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 🎉 Congratulations! 🎉 You now have trained your first TTS model! \n",
|
||||
"Follow up with the next tutorials to learn more advanced material."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "950d9fc6-896f-4a2c-86fd-8fd1fcbbb3f7",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.10"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
|
@ -1,5 +1,5 @@
|
|||
[build-system]
requires = ["setuptools", "wheel", "Cython", "numpy==1.19.5"]
requires = ["setuptools", "wheel", "cython==0.29.28", "numpy==1.21.6"]

[flake8]
max-line-length=120
|
|
|
@ -0,0 +1,12 @@
|
|||
# How to get the Blizzard 2013 Dataset

The Capacitron model is a variational encoder extension of standard Tacotron-based models to model prosody.

To take full advantage of the model, it is advised to train it on a dataset that contains a significant amount of prosodic information in the utterances. A tested candidate for such applications is the blizzard2013 dataset from the Blizzard Challenge, which contains many hours of high-quality audiobook recordings.

To get a license and a download link for this dataset, you need to visit the [website](https://www.cstr.ed.ac.uk/projects/blizzard/2013/lessac_blizzard2013/license.html) of the Centre for Speech Technology Research of the University of Edinburgh.

You will get access to the raw dataset within a couple of days. There are a few preprocessing steps you need to take before you can use the high-fidelity dataset.

1. Get the forced time alignments for the blizzard dataset from [here](https://github.com/mueller91/tts_alignments).
2. Segment the high-fidelity audio-book files based on the instructions [here](https://github.com/Tomiinek/Blizzard2013_Segmentation).
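
Once segmentation is done, the Capacitron recipes in this folder read the segmented data with the LJSpeech-style formatter. The snippet below is a minimal sketch of that loading step; the `/srv/data/blizzard2013/segmented` path is copied from the Tacotron2 recipe and will likely differ on your machine.

```python
from TTS.tts.configs.shared_configs import BaseDatasetConfig
from TTS.tts.datasets import load_tts_samples

# LJSpeech-style layout: <path>/metadata.csv plus <path>/wavs/*.wav
dataset_config = BaseDatasetConfig(
    name="ljspeech", meta_file_train="metadata.csv", path="/srv/data/blizzard2013/segmented"
)
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
```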
|
|
@ -0,0 +1,101 @@
|
|||
import os
|
||||
|
||||
from trainer import Trainer, TrainerArgs
|
||||
|
||||
from TTS.config.shared_configs import BaseAudioConfig
|
||||
from TTS.tts.configs.shared_configs import BaseDatasetConfig, CapacitronVAEConfig
|
||||
from TTS.tts.configs.tacotron_config import TacotronConfig
|
||||
from TTS.tts.datasets import load_tts_samples
|
||||
from TTS.tts.models.tacotron import Tacotron
|
||||
from TTS.tts.utils.text.tokenizer import TTSTokenizer
|
||||
from TTS.utils.audio import AudioProcessor
|
||||
|
||||
output_path = os.path.dirname(os.path.abspath(__file__))
|
||||
|
||||
data_path = "/srv/data/"
|
||||
|
||||
# Using LJSpeech like dataset processing for the blizzard dataset
|
||||
dataset_config = BaseDatasetConfig(name="ljspeech", meta_file_train="metadata.csv", path=data_path)
|
||||
|
||||
audio_config = BaseAudioConfig(
|
||||
sample_rate=24000,
|
||||
do_trim_silence=True,
|
||||
trim_db=60.0,
|
||||
signal_norm=True,
|
||||
mel_fmin=80.0,
|
||||
mel_fmax=12000,
|
||||
spec_gain=20.0,
|
||||
log_func="np.log10",
|
||||
ref_level_db=20,
|
||||
preemphasis=0.0,
|
||||
min_level_db=-100,
|
||||
)
|
||||
|
||||
# Using the standard Capacitron config
|
||||
capacitron_config = CapacitronVAEConfig(capacitron_VAE_loss_alpha=1.0)
|
||||
|
||||
config = TacotronConfig(
|
||||
run_name="Blizzard-Capacitron-T1",
|
||||
audio=audio_config,
|
||||
capacitron_vae=capacitron_config,
|
||||
use_capacitron_vae=True,
|
||||
batch_size=128, # Tune this to your gpu
|
||||
max_audio_len=6 * 24000, # Tune this to your gpu
|
||||
min_audio_len=0.5 * 24000,
|
||||
eval_batch_size=16,
|
||||
num_loader_workers=12,
|
||||
num_eval_loader_workers=8,
|
||||
precompute_num_workers=24,
|
||||
run_eval=True,
|
||||
test_delay_epochs=5,
|
||||
ga_alpha=0.0,
|
||||
r=2,
|
||||
optimizer="CapacitronOptimizer",
|
||||
optimizer_params={"RAdam": {"betas": [0.9, 0.998], "weight_decay": 1e-6}, "SGD": {"lr": 1e-5, "momentum": 0.9}},
|
||||
attention_type="graves",
|
||||
attention_heads=5,
|
||||
epochs=1000,
|
||||
text_cleaner="phoneme_cleaners",
|
||||
use_phonemes=True,
|
||||
phoneme_language="en-us",
|
||||
phonemizer="espeak",
|
||||
phoneme_cache_path=os.path.join(data_path, "phoneme_cache"),
|
||||
stopnet_pos_weight=15,
|
||||
print_step=50,
|
||||
print_eval=True,
|
||||
mixed_precision=False,
|
||||
output_path=output_path,
|
||||
datasets=[dataset_config],
|
||||
lr=1e-3,
|
||||
lr_scheduler="StepwiseGradualLR",
|
||||
lr_scheduler_params={"gradual_learning_rates": [[0, 1e-3], [2e4, 5e-4], [4e5, 3e-4], [6e4, 1e-4], [8e4, 5e-5]]},
|
||||
scheduler_after_epoch=False, # scheduler doesn't work without this flag
|
||||
# Need to experiment with these below for capacitron
|
||||
loss_masking=False,
|
||||
decoder_loss_alpha=1.0,
|
||||
postnet_loss_alpha=1.0,
|
||||
postnet_diff_spec_alpha=0.0,
|
||||
decoder_diff_spec_alpha=0.0,
|
||||
decoder_ssim_alpha=0.0,
|
||||
postnet_ssim_alpha=0.0,
|
||||
)
|
||||
|
||||
ap = AudioProcessor(**config.audio.to_dict())
|
||||
|
||||
tokenizer, config = TTSTokenizer.init_from_config(config)
|
||||
|
||||
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
|
||||
|
||||
model = Tacotron(config, ap, tokenizer, speaker_manager=None)
|
||||
|
||||
trainer = Trainer(
|
||||
TrainerArgs(),
|
||||
config,
|
||||
output_path,
|
||||
model=model,
|
||||
train_samples=train_samples,
|
||||
eval_samples=eval_samples,
|
||||
)
|
||||
|
||||
# 🚀
|
||||
trainer.fit()
|
|
@ -0,0 +1,117 @@
|
|||
import os
|
||||
|
||||
from trainer import Trainer, TrainerArgs
|
||||
|
||||
from TTS.config.shared_configs import BaseAudioConfig
|
||||
from TTS.tts.configs.shared_configs import BaseDatasetConfig, CapacitronVAEConfig
|
||||
from TTS.tts.configs.tacotron2_config import Tacotron2Config
|
||||
from TTS.tts.datasets import load_tts_samples
|
||||
from TTS.tts.models.tacotron2 import Tacotron2
|
||||
from TTS.tts.utils.text.tokenizer import TTSTokenizer
|
||||
from TTS.utils.audio import AudioProcessor
|
||||
|
||||
output_path = os.path.dirname(os.path.abspath(__file__))
|
||||
|
||||
data_path = "/srv/data/blizzard2013/segmented"
|
||||
|
||||
# Using LJSpeech like dataset processing for the blizzard dataset
|
||||
dataset_config = BaseDatasetConfig(
|
||||
name="ljspeech",
|
||||
meta_file_train="metadata.csv",
|
||||
path=data_path,
|
||||
)
|
||||
|
||||
audio_config = BaseAudioConfig(
|
||||
sample_rate=24000,
|
||||
do_trim_silence=True,
|
||||
trim_db=60.0,
|
||||
signal_norm=True,
|
||||
mel_fmin=80.0,
|
||||
mel_fmax=12000,
|
||||
spec_gain=25.0,
|
||||
log_func="np.log10",
|
||||
ref_level_db=20,
|
||||
preemphasis=0.0,
|
||||
min_level_db=-100,
|
||||
)
|
||||
|
||||
# Using the standard Capacitron config
|
||||
capacitron_config = CapacitronVAEConfig(capacitron_VAE_loss_alpha=1.0)
|
||||
|
||||
config = Tacotron2Config(
|
||||
run_name="Blizzard-Capacitron-T2",
|
||||
audio=audio_config,
|
||||
capacitron_vae=capacitron_config,
|
||||
use_capacitron_vae=True,
|
||||
batch_size=246, # Tune this to your gpu
|
||||
max_audio_len=6 * 24000, # Tune this to your gpu
|
||||
min_audio_len=1 * 24000,
|
||||
eval_batch_size=16,
|
||||
num_loader_workers=12,
|
||||
num_eval_loader_workers=8,
|
||||
precompute_num_workers=24,
|
||||
run_eval=True,
|
||||
test_delay_epochs=5,
|
||||
ga_alpha=0.0,
|
||||
r=2,
|
||||
optimizer="CapacitronOptimizer",
|
||||
optimizer_params={"RAdam": {"betas": [0.9, 0.998], "weight_decay": 1e-6}, "SGD": {"lr": 1e-5, "momentum": 0.9}},
|
||||
attention_type="dynamic_convolution",
|
||||
grad_clip=0.0, # Important! We overwrite the standard grad_clip with capacitron_grad_clip
|
||||
double_decoder_consistency=False,
|
||||
epochs=1000,
|
||||
text_cleaner="phoneme_cleaners",
|
||||
use_phonemes=True,
|
||||
phoneme_language="en-us",
|
||||
phonemizer="espeak",
|
||||
phoneme_cache_path=os.path.join(data_path, "phoneme_cache"),
|
||||
stopnet_pos_weight=15,
|
||||
print_step=25,
|
||||
print_eval=True,
|
||||
mixed_precision=False,
|
||||
output_path=output_path,
|
||||
datasets=[dataset_config],
|
||||
lr=1e-3,
|
||||
lr_scheduler="StepwiseGradualLR",
|
||||
lr_scheduler_params={
|
||||
"gradual_learning_rates": [
|
||||
[0, 1e-3],
|
||||
[2e4, 5e-4],
|
||||
[4e5, 3e-4],
|
||||
[6e4, 1e-4],
|
||||
[8e4, 5e-5],
|
||||
]
|
||||
},
|
||||
scheduler_after_epoch=False, # scheduler doesn't work without this flag
|
||||
# dashboard_logger='wandb',
|
||||
# sort_by_audio_len=True,
|
||||
seq_len_norm=True,
|
||||
# Need to experiment with these below for capacitron
|
||||
loss_masking=False,
|
||||
decoder_loss_alpha=1.0,
|
||||
postnet_loss_alpha=1.0,
|
||||
postnet_diff_spec_alpha=0.0,
|
||||
decoder_diff_spec_alpha=0.0,
|
||||
decoder_ssim_alpha=0.0,
|
||||
postnet_ssim_alpha=0.0,
|
||||
)
|
||||
|
||||
ap = AudioProcessor(**config.audio.to_dict())
|
||||
|
||||
tokenizer, config = TTSTokenizer.init_from_config(config)
|
||||
|
||||
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
|
||||
|
||||
model = Tacotron2(config, ap, tokenizer, speaker_manager=None)
|
||||
|
||||
trainer = Trainer(
|
||||
TrainerArgs(),
|
||||
config,
|
||||
output_path,
|
||||
model=model,
|
||||
train_samples=train_samples,
|
||||
eval_samples=eval_samples,
|
||||
training_assets={"audio_processor": ap},
|
||||
)
|
||||
|
||||
trainer.fit()
|
|
@ -0,0 +1,115 @@
|
|||
import os
|
||||
|
||||
from trainer import Trainer, TrainerArgs
|
||||
|
||||
from TTS.config.shared_configs import BaseAudioConfig
|
||||
from TTS.tts.configs.shared_configs import BaseDatasetConfig, CapacitronVAEConfig
|
||||
from TTS.tts.configs.tacotron2_config import Tacotron2Config
|
||||
from TTS.tts.datasets import load_tts_samples
|
||||
from TTS.tts.models.tacotron2 import Tacotron2
|
||||
from TTS.tts.utils.text.tokenizer import TTSTokenizer
|
||||
from TTS.utils.audio import AudioProcessor
|
||||
|
||||
output_path = os.path.dirname(os.path.abspath(__file__))
|
||||
|
||||
data_path = "/srv/data/"
|
||||
|
||||
# Using LJSpeech like dataset processing for the blizzard dataset
|
||||
dataset_config = BaseDatasetConfig(
|
||||
name="ljspeech",
|
||||
meta_file_train="metadata.csv",
|
||||
path=data_path,
|
||||
)
|
||||
|
||||
audio_config = BaseAudioConfig(
|
||||
sample_rate=22050,
|
||||
do_trim_silence=True,
|
||||
trim_db=60.0,
|
||||
signal_norm=False,
|
||||
mel_fmin=0.0,
|
||||
mel_fmax=11025,
|
||||
spec_gain=1.0,
|
||||
log_func="np.log",
|
||||
ref_level_db=20,
|
||||
preemphasis=0.0,
|
||||
)
|
||||
|
||||
# Using the standard Capacitron config
|
||||
capacitron_config = CapacitronVAEConfig(capacitron_VAE_loss_alpha=1.0, capacitron_capacity=50)
|
||||
|
||||
config = Tacotron2Config(
|
||||
run_name="Capacitron-Tacotron2",
|
||||
audio=audio_config,
|
||||
capacitron_vae=capacitron_config,
|
||||
use_capacitron_vae=True,
|
||||
batch_size=128, # Tune this to your gpu
|
||||
max_audio_len=8 * 22050, # Tune this to your gpu
|
||||
min_audio_len=1 * 22050,
|
||||
eval_batch_size=16,
|
||||
num_loader_workers=8,
|
||||
num_eval_loader_workers=8,
|
||||
precompute_num_workers=24,
|
||||
run_eval=True,
|
||||
test_delay_epochs=25,
|
||||
ga_alpha=0.0,
|
||||
r=2,
|
||||
optimizer="CapacitronOptimizer",
|
||||
optimizer_params={"RAdam": {"betas": [0.9, 0.998], "weight_decay": 1e-6}, "SGD": {"lr": 1e-5, "momentum": 0.9}},
|
||||
attention_type="dynamic_convolution",
|
||||
grad_clip=0.0, # Important! We overwrite the standard grad_clip with capacitron_grad_clip
|
||||
double_decoder_consistency=False,
|
||||
epochs=1000,
|
||||
text_cleaner="phoneme_cleaners",
|
||||
use_phonemes=True,
|
||||
phoneme_language="en-us",
|
||||
phonemizer="espeak",
|
||||
phoneme_cache_path=os.path.join(data_path, "phoneme_cache"),
|
||||
stopnet_pos_weight=15,
|
||||
print_step=25,
|
||||
print_eval=True,
|
||||
mixed_precision=False,
|
||||
sort_by_audio_len=True,
|
||||
seq_len_norm=True,
|
||||
output_path=output_path,
|
||||
datasets=[dataset_config],
|
||||
lr=1e-3,
|
||||
lr_scheduler="StepwiseGradualLR",
|
||||
lr_scheduler_params={
|
||||
"gradual_learning_rates": [
|
||||
[0, 1e-3],
|
||||
[2e4, 5e-4],
|
||||
[4e5, 3e-4],
|
||||
[6e4, 1e-4],
|
||||
[8e4, 5e-5],
|
||||
]
|
||||
},
|
||||
scheduler_after_epoch=False, # scheduler doesn't work without this flag
|
||||
# Need to experiment with these below for capacitron
|
||||
loss_masking=False,
|
||||
decoder_loss_alpha=1.0,
|
||||
postnet_loss_alpha=1.0,
|
||||
postnet_diff_spec_alpha=0.0,
|
||||
decoder_diff_spec_alpha=0.0,
|
||||
decoder_ssim_alpha=0.0,
|
||||
postnet_ssim_alpha=0.0,
|
||||
)
|
||||
|
||||
ap = AudioProcessor(**config.audio.to_dict())
|
||||
|
||||
tokenizer, config = TTSTokenizer.init_from_config(config)
|
||||
|
||||
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
|
||||
|
||||
model = Tacotron2(config, ap, tokenizer, speaker_manager=None)
|
||||
|
||||
trainer = Trainer(
|
||||
TrainerArgs(),
|
||||
config,
|
||||
output_path,
|
||||
model=model,
|
||||
train_samples=train_samples,
|
||||
eval_samples=eval_samples,
|
||||
training_assets={"audio_processor": ap},
|
||||
)
|
||||
|
||||
trainer.fit()
|
|
@ -0,0 +1,15 @@
|
|||
# 🐸💬 TTS Thorsten Recipes

To run the recipes you need the [Thorsten-Voice](https://github.com/thorstenMueller/Thorsten-Voice) dataset.

You can download it manually from [the official website](https://www.thorsten-voice.de/) or use ```download_thorsten_de.sh```. Alternatively, running any of the **train_modelX.py** scripts will download the dataset if it is not already present (see the sketch below).

Then, go to your desired model folder and run the training.

Run the Python files (choose the desired GPU ID for your run and set ```CUDA_VISIBLE_DEVICES```):
```terminal
CUDA_VISIBLE_DEVICES="0" python train_modelX.py
```

💡 Note that these runs are just templates to help you start training your first model. They are not optimized for the best
result. Double-check the configurations and feel free to share your experiments to find better parameters together 💪.
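
The recipes fetch the dataset with the downloader that ships with 🐸TTS before training starts. A minimal sketch of that step is shown below; the target folder is a placeholder that mirrors the `../thorsten-de/` layout the recipes expect.

```python
import os

from TTS.utils.downloaders import download_thorsten_de

# the recipes expect the data under ../thorsten-de/ relative to the recipe folder
dataset_path = os.path.abspath("../thorsten-de/")
if not os.path.exists(dataset_path):
    # download_thorsten_de() takes the parent folder and places thorsten-de inside it
    download_thorsten_de(os.path.split(dataset_path)[0])
```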
|
|
@ -0,0 +1,84 @@
|
|||
import os
|
||||
|
||||
from trainer import Trainer, TrainerArgs
|
||||
|
||||
from TTS.tts.configs.align_tts_config import AlignTTSConfig
|
||||
from TTS.tts.configs.shared_configs import BaseDatasetConfig
|
||||
from TTS.tts.datasets import load_tts_samples
|
||||
from TTS.tts.models.align_tts import AlignTTS
|
||||
from TTS.tts.utils.text.tokenizer import TTSTokenizer
|
||||
from TTS.utils.audio import AudioProcessor
|
||||
from TTS.utils.downloaders import download_thorsten_de
|
||||
|
||||
output_path = os.path.dirname(os.path.abspath(__file__))
|
||||
|
||||
# init configs
|
||||
dataset_config = BaseDatasetConfig(
|
||||
name="thorsten", meta_file_train="metadata.csv", path=os.path.join(output_path, "../thorsten-de/")
|
||||
)
|
||||
|
||||
# download dataset if not already present
|
||||
if not os.path.exists(dataset_config.path):
|
||||
print("Downloading dataset")
|
||||
download_thorsten_de(os.path.split(os.path.abspath(dataset_config.path))[0])
|
||||
|
||||
config = AlignTTSConfig(
|
||||
batch_size=32,
|
||||
eval_batch_size=16,
|
||||
num_loader_workers=4,
|
||||
num_eval_loader_workers=4,
|
||||
run_eval=True,
|
||||
test_delay_epochs=-1,
|
||||
epochs=1000,
|
||||
text_cleaner="phoneme_cleaners",
|
||||
use_phonemes=False,
|
||||
phoneme_language="de",
|
||||
phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
|
||||
print_step=25,
|
||||
print_eval=True,
|
||||
mixed_precision=False,
|
||||
test_sentences=[
|
||||
"Es hat mich viel Zeit gekostet ein Stimme zu entwickeln, jetzt wo ich sie habe werde ich nicht mehr schweigen.",
|
||||
"Sei eine Stimme, kein Echo.",
|
||||
"Es tut mir Leid David. Das kann ich leider nicht machen.",
|
||||
"Dieser Kuchen ist großartig. Er ist so lecker und feucht.",
|
||||
"Vor dem 22. November 1963.",
|
||||
],
|
||||
output_path=output_path,
|
||||
datasets=[dataset_config],
|
||||
)
|
||||
|
||||
# INITIALIZE THE AUDIO PROCESSOR
|
||||
# Audio processor is used for feature extraction and audio I/O.
|
||||
# It mainly serves to the dataloader and the training loggers.
|
||||
ap = AudioProcessor.init_from_config(config)
|
||||
|
||||
# INITIALIZE THE TOKENIZER
|
||||
# Tokenizer is used to convert text to sequences of token IDs.
|
||||
# If characters are not defined in the config, default characters are passed to the config
|
||||
tokenizer, config = TTSTokenizer.init_from_config(config)
|
||||
|
||||
# LOAD DATA SAMPLES
|
||||
# Each sample is a list of ```[text, audio_file_path, speaker_name]```
|
||||
# You can define your custom sample loader returning the list of samples.
|
||||
# Or define your custom formatter and pass it to the `load_tts_samples`.
|
||||
# Check `TTS.tts.datasets.load_tts_samples` for more details.
|
||||
train_samples, eval_samples = load_tts_samples(
|
||||
dataset_config,
|
||||
eval_split=True,
|
||||
eval_split_max_size=config.eval_split_max_size,
|
||||
eval_split_size=config.eval_split_size,
|
||||
)
|
||||
|
||||
# init model
|
||||
model = AlignTTS(config, ap, tokenizer)
|
||||
|
||||
# INITIALIZE THE TRAINER
|
||||
# Trainer provides a generic API to train all the 🐸TTS models with all its perks like mixed-precision training,
|
||||
# distributed training, etc.
|
||||
trainer = Trainer(
|
||||
TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
|
||||
)
|
||||
|
||||
# AND... 3,2,1... 🚀
|
||||
trainer.fit()
|
|
@ -0,0 +1,21 @@
|
|||
# create venv
python3 -m venv env
source env/bin/activate
pip install pip --upgrade

# download Thorsten_DE dataset
pip install gdown
gdown --id 1yKJM1LAOQpRVojKunD9r8WN_p5KzBxjc -O dataset.tgz
tar -xzf dataset.tgz

# create train-val splits
shuf LJSpeech-1.1/metadata.csv > LJSpeech-1.1/metadata_shuf.csv
head -n 20668 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_train.csv
tail -n 2000 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_val.csv

# rename dataset and remove archive
mv LJSpeech-1.1 thorsten-de
rm dataset.tgz

# destroy venv
rm -rf env
|
|
@ -0,0 +1,97 @@
|
|||
import os
|
||||
|
||||
# Trainer: Where the ✨️ happens.
|
||||
# TrainingArgs: Defines the set of arguments of the Trainer.
|
||||
from trainer import Trainer, TrainerArgs
|
||||
|
||||
# GlowTTSConfig: all model related values for training, validating and testing.
|
||||
from TTS.tts.configs.glow_tts_config import GlowTTSConfig
|
||||
|
||||
# BaseDatasetConfig: defines name, formatter and path of the dataset.
|
||||
from TTS.tts.configs.shared_configs import BaseDatasetConfig
|
||||
from TTS.tts.datasets import load_tts_samples
|
||||
from TTS.tts.models.glow_tts import GlowTTS
|
||||
from TTS.tts.utils.text.tokenizer import TTSTokenizer
|
||||
from TTS.utils.audio import AudioProcessor
|
||||
from TTS.utils.downloaders import download_thorsten_de
|
||||
|
||||
# we use the same path as this script as our training folder.
|
||||
output_path = os.path.dirname(os.path.abspath(__file__))
|
||||
|
||||
# DEFINE DATASET CONFIG
|
||||
# Set LJSpeech as our target dataset and define its path.
|
||||
# You can also use a simple Dict to define the dataset and pass it to your custom formatter.
|
||||
dataset_config = BaseDatasetConfig(
|
||||
name="thorsten", meta_file_train="metadata.csv", path=os.path.join(output_path, "../thorsten-de/")
|
||||
)
|
||||
|
||||
# download dataset if not already present
|
||||
if not os.path.exists(dataset_config.path):
|
||||
print("Downloading dataset")
|
||||
download_thorsten_de(os.path.split(os.path.abspath(dataset_config.path))[0])
|
||||
|
||||
# INITIALIZE THE TRAINING CONFIGURATION
|
||||
# Configure the model. Every config class inherits the BaseTTSConfig.
|
||||
config = GlowTTSConfig(
|
||||
batch_size=32,
|
||||
eval_batch_size=16,
|
||||
num_loader_workers=4,
|
||||
num_eval_loader_workers=4,
|
||||
run_eval=True,
|
||||
test_delay_epochs=-1,
|
||||
epochs=1000,
|
||||
text_cleaner="phoneme_cleaners",
|
||||
use_phonemes=True,
|
||||
phoneme_language="de",
|
||||
phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
|
||||
print_step=25,
|
||||
print_eval=False,
|
||||
mixed_precision=True,
|
||||
test_sentences=[
|
||||
"Es hat mich viel Zeit gekostet ein Stimme zu entwickeln, jetzt wo ich sie habe werde ich nicht mehr schweigen.",
|
||||
"Sei eine Stimme, kein Echo.",
|
||||
"Es tut mir Leid David. Das kann ich leider nicht machen.",
|
||||
"Dieser Kuchen ist großartig. Er ist so lecker und feucht.",
|
||||
"Vor dem 22. November 1963.",
|
||||
],
|
||||
output_path=output_path,
|
||||
datasets=[dataset_config],
|
||||
)
|
||||
|
||||
# INITIALIZE THE AUDIO PROCESSOR
|
||||
# Audio processor is used for feature extraction and audio I/O.
|
||||
# It mainly serves to the dataloader and the training loggers.
|
||||
ap = AudioProcessor.init_from_config(config)
|
||||
|
||||
# INITIALIZE THE TOKENIZER
|
||||
# Tokenizer is used to convert text to sequences of token IDs.
|
||||
# If characters are not defined in the config, default characters are passed to the config
|
||||
tokenizer, config = TTSTokenizer.init_from_config(config)
|
||||
|
||||
# LOAD DATA SAMPLES
|
||||
# Each sample is a list of ```[text, audio_file_path, speaker_name]```
|
||||
# You can define your custom sample loader returning the list of samples.
|
||||
# Or define your custom formatter and pass it to the `load_tts_samples`.
|
||||
# Check `TTS.tts.datasets.load_tts_samples` for more details.
|
||||
train_samples, eval_samples = load_tts_samples(
|
||||
dataset_config,
|
||||
eval_split=True,
|
||||
eval_split_max_size=config.eval_split_max_size,
|
||||
eval_split_size=config.eval_split_size,
|
||||
)
|
||||
|
||||
# INITIALIZE THE MODEL
|
||||
# Models take a config object and a speaker manager as input
|
||||
# Config defines the details of the model like the number of layers, the size of the embedding, etc.
|
||||
# Speaker manager is used by multi-speaker models.
|
||||
model = GlowTTS(config, ap, tokenizer, speaker_manager=None)
|
||||
|
||||
# INITIALIZE THE TRAINER
|
||||
# Trainer provides a generic API to train all the 🐸TTS models with all its perks like mixed-precision training,
|
||||
# distributed training, etc.
|
||||
trainer = Trainer(
|
||||
TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
|
||||
)
|
||||
|
||||
# AND... 3,2,1... 🚀
|
||||
trainer.fit()
|
|
@ -0,0 +1,53 @@
|
|||
import os
|
||||
|
||||
from trainer import Trainer, TrainerArgs
|
||||
|
||||
from TTS.utils.audio import AudioProcessor
|
||||
from TTS.utils.downloaders import download_thorsten_de
|
||||
from TTS.vocoder.configs import HifiganConfig
|
||||
from TTS.vocoder.datasets.preprocess import load_wav_data
|
||||
from TTS.vocoder.models.gan import GAN
|
||||
|
||||
output_path = os.path.dirname(os.path.abspath(__file__))
|
||||
|
||||
config = HifiganConfig(
|
||||
batch_size=32,
|
||||
eval_batch_size=16,
|
||||
num_loader_workers=4,
|
||||
num_eval_loader_workers=4,
|
||||
run_eval=True,
|
||||
test_delay_epochs=5,
|
||||
epochs=1000,
|
||||
seq_len=8192,
|
||||
pad_short=2000,
|
||||
use_noise_augment=True,
|
||||
eval_split_size=10,
|
||||
print_step=25,
|
||||
print_eval=False,
|
||||
mixed_precision=False,
|
||||
lr_gen=1e-4,
|
||||
lr_disc=1e-4,
|
||||
data_path=os.path.join(output_path, "../thorsten-de/wavs/"),
|
||||
output_path=output_path,
|
||||
)
|
||||
|
||||
# download dataset if not already present
|
||||
if not os.path.exists(config.data_path):
|
||||
print("Downloading dataset")
|
||||
download_path = os.path.abspath(os.path.join(os.path.abspath(config.data_path), "../../"))
|
||||
download_thorsten_de(download_path)
|
||||
|
||||
# init audio processor
|
||||
ap = AudioProcessor(**config.audio.to_dict())
|
||||
|
||||
# load training samples
|
||||
eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size)
|
||||
|
||||
# init model
|
||||
model = GAN(config, ap)
|
||||
|
||||
# init the trainer and 🚀
|
||||
trainer = Trainer(
|
||||
TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
|
||||
)
|
||||
trainer.fit()
|
|
@ -0,0 +1,53 @@
|
|||
import os
|
||||
|
||||
from trainer import Trainer, TrainerArgs
|
||||
|
||||
from TTS.utils.audio import AudioProcessor
|
||||
from TTS.utils.downloaders import download_thorsten_de
|
||||
from TTS.vocoder.configs import MultibandMelganConfig
|
||||
from TTS.vocoder.datasets.preprocess import load_wav_data
|
||||
from TTS.vocoder.models.gan import GAN
|
||||
|
||||
output_path = os.path.dirname(os.path.abspath(__file__))
|
||||
|
||||
config = MultibandMelganConfig(
|
||||
batch_size=32,
|
||||
eval_batch_size=16,
|
||||
num_loader_workers=4,
|
||||
num_eval_loader_workers=4,
|
||||
run_eval=True,
|
||||
test_delay_epochs=5,
|
||||
epochs=1000,
|
||||
seq_len=8192,
|
||||
pad_short=2000,
|
||||
use_noise_augment=True,
|
||||
eval_split_size=10,
|
||||
print_step=25,
|
||||
print_eval=False,
|
||||
mixed_precision=False,
|
||||
lr_gen=1e-4,
|
||||
lr_disc=1e-4,
|
||||
data_path=os.path.join(output_path, "../thorsten-de/wavs/"),
|
||||
output_path=output_path,
|
||||
)
|
||||
|
||||
# download dataset if not already present
|
||||
if not os.path.exists(config.data_path):
|
||||
print("Downloading dataset")
|
||||
download_path = os.path.abspath(os.path.join(os.path.abspath(config.data_path), "../../"))
|
||||
download_thorsten_de(download_path)
|
||||
|
||||
# init audio processor
|
||||
ap = AudioProcessor(**config.audio.to_dict())
|
||||
|
||||
# load training samples
|
||||
eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size)
|
||||
|
||||
# init model
|
||||
model = GAN(config, ap)
|
||||
|
||||
# init the trainer and 🚀
|
||||
trainer = Trainer(
|
||||
TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
|
||||
)
|
||||
trainer.fit()
|
|
@ -0,0 +1,102 @@
|
|||
import os
|
||||
|
||||
from trainer import Trainer, TrainerArgs
|
||||
|
||||
from TTS.config import BaseAudioConfig, BaseDatasetConfig
|
||||
from TTS.tts.configs.speedy_speech_config import SpeedySpeechConfig
|
||||
from TTS.tts.datasets import load_tts_samples
|
||||
from TTS.tts.models.forward_tts import ForwardTTS
|
||||
from TTS.tts.utils.text.tokenizer import TTSTokenizer
|
||||
from TTS.utils.audio import AudioProcessor
|
||||
from TTS.utils.downloaders import download_thorsten_de
|
||||
|
||||
output_path = os.path.dirname(os.path.abspath(__file__))
|
||||
dataset_config = BaseDatasetConfig(
|
||||
name="thorsten", meta_file_train="metadata.csv", path=os.path.join(output_path, "../thorsten-de/")
|
||||
)
|
||||
|
||||
# download dataset if not already present
|
||||
if not os.path.exists(dataset_config.path):
|
||||
print("Downloading dataset")
|
||||
download_thorsten_de(os.path.split(os.path.abspath(dataset_config.path))[0])
|
||||
|
||||
audio_config = BaseAudioConfig(
|
||||
sample_rate=22050,
|
||||
do_trim_silence=True,
|
||||
trim_db=60.0,
|
||||
signal_norm=False,
|
||||
mel_fmin=0.0,
|
||||
mel_fmax=8000,
|
||||
spec_gain=1.0,
|
||||
log_func="np.log",
|
||||
ref_level_db=20,
|
||||
preemphasis=0.0,
|
||||
)
|
||||
|
||||
config = SpeedySpeechConfig(
|
||||
run_name="speedy_speech_thorsten-de",
|
||||
audio=audio_config,
|
||||
batch_size=32,
|
||||
eval_batch_size=16,
|
||||
num_loader_workers=4,
|
||||
num_eval_loader_workers=4,
|
||||
compute_input_seq_cache=True,
|
||||
run_eval=True,
|
||||
test_delay_epochs=-1,
|
||||
epochs=1000,
|
||||
min_audio_len=11050, # need to raise min_audio_len to avoid a SpeedySpeech error
|
||||
text_cleaner="phoneme_cleaners",
|
||||
use_phonemes=True,
|
||||
phoneme_language="de",
|
||||
phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
|
||||
precompute_num_workers=4,
|
||||
print_step=50,
|
||||
print_eval=False,
|
||||
mixed_precision=False,
|
||||
test_sentences=[
|
||||
"Es hat mich viel Zeit gekostet ein Stimme zu entwickeln, jetzt wo ich sie habe werde ich nicht mehr schweigen.",
|
||||
"Sei eine Stimme, kein Echo.",
|
||||
"Es tut mir Leid David. Das kann ich leider nicht machen.",
|
||||
"Dieser Kuchen ist großartig. Er ist so lecker und feucht.",
|
||||
"Vor dem 22. November 1963.",
|
||||
],
|
||||
sort_by_audio_len=True,
|
||||
max_seq_len=500000,
|
||||
output_path=output_path,
|
||||
datasets=[dataset_config],
|
||||
)
|
||||
|
||||
# INITIALIZE THE AUDIO PROCESSOR
|
||||
# Audio processor is used for feature extraction and audio I/O.
|
||||
# It mainly serves to the dataloader and the training loggers.
|
||||
ap = AudioProcessor.init_from_config(config)
|
||||
|
||||
# INITIALIZE THE TOKENIZER
|
||||
# Tokenizer is used to convert text to sequences of token IDs.
|
||||
# If characters are not defined in the config, default characters are passed to the config
|
||||
tokenizer, config = TTSTokenizer.init_from_config(config)
|
||||
|
||||
# LOAD DATA SAMPLES
|
||||
# Each sample is a list of ```[text, audio_file_path, speaker_name]```
|
||||
# You can define your custom sample loader returning the list of samples.
|
||||
# Or define your custom formatter and pass it to the `load_tts_samples`.
|
||||
# Check `TTS.tts.datasets.load_tts_samples` for more details.
|
||||
train_samples, eval_samples = load_tts_samples(
|
||||
dataset_config,
|
||||
eval_split=True,
|
||||
eval_split_max_size=config.eval_split_max_size,
|
||||
eval_split_size=config.eval_split_size,
|
||||
)
|
||||
|
||||
# init model
|
||||
model = ForwardTTS(config, ap, tokenizer)
|
||||
|
||||
# INITIALIZE THE TRAINER
|
||||
# Trainer provides a generic API to train all the 🐸TTS models with all its perks like mixed-precision training,
|
||||
# distributed training, etc.
|
||||
trainer = Trainer(
|
||||
TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
|
||||
)
|
||||
|
||||
# AND... 3,2,1... 🚀
|
||||
trainer.fit()
|
|
@ -0,0 +1,108 @@
|
|||
import os
|
||||
|
||||
from trainer import Trainer, TrainerArgs
|
||||
|
||||
from TTS.config.shared_configs import BaseAudioConfig
|
||||
from TTS.tts.configs.shared_configs import BaseDatasetConfig
|
||||
from TTS.tts.configs.tacotron2_config import Tacotron2Config
|
||||
from TTS.tts.datasets import load_tts_samples
|
||||
from TTS.tts.models.tacotron2 import Tacotron2
|
||||
from TTS.tts.utils.text.tokenizer import TTSTokenizer
|
||||
from TTS.utils.audio import AudioProcessor
|
||||
from TTS.utils.downloaders import download_thorsten_de
|
||||
|
||||
# from TTS.tts.datasets.tokenizer import Tokenizer
|
||||
output_path = os.path.dirname(os.path.abspath(__file__))
|
||||
|
||||
# init configs
|
||||
dataset_config = BaseDatasetConfig(
|
||||
name="thorsten", meta_file_train="metadata.csv", path=os.path.join(output_path, "../thorsten-de/")
|
||||
)
|
||||
|
||||
# download dataset if not already present
|
||||
if not os.path.exists(dataset_config.path):
|
||||
print("Downloading dataset")
|
||||
download_thorsten_de(os.path.split(os.path.abspath(dataset_config.path))[0])
|
||||
|
||||
audio_config = BaseAudioConfig(
|
||||
sample_rate=22050,
|
||||
do_trim_silence=True,
|
||||
trim_db=60.0,
|
||||
signal_norm=False,
|
||||
mel_fmin=0.0,
|
||||
mel_fmax=8000,
|
||||
spec_gain=1.0,
|
||||
log_func="np.log",
|
||||
ref_level_db=20,
|
||||
preemphasis=0.0,
|
||||
)
|
||||
|
||||
config = Tacotron2Config(  # This is the config that is saved for future use
|
||||
audio=audio_config,
|
||||
batch_size=40, # BS of 40 and max length of 10s will use about 20GB of GPU memory
|
||||
eval_batch_size=16,
|
||||
num_loader_workers=4,
|
||||
num_eval_loader_workers=4,
|
||||
run_eval=True,
|
||||
test_delay_epochs=-1,
|
||||
r=6,
|
||||
gradual_training=[[0, 6, 64], [10000, 4, 32], [50000, 3, 32], [100000, 2, 32]],
|
||||
double_decoder_consistency=True,
|
||||
epochs=1000,
|
||||
text_cleaner="phoneme_cleaners",
|
||||
use_phonemes=True,
|
||||
phoneme_language="de",
|
||||
phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
|
||||
precompute_num_workers=8,
|
||||
print_step=25,
|
||||
print_eval=True,
|
||||
mixed_precision=False,
|
||||
test_sentences=[
|
||||
"Es hat mich viel Zeit gekostet ein Stimme zu entwickeln, jetzt wo ich sie habe werde ich nicht mehr schweigen.",
|
||||
"Sei eine Stimme, kein Echo.",
|
||||
"Es tut mir Leid David. Das kann ich leider nicht machen.",
|
||||
"Dieser Kuchen ist großartig. Er ist so lecker und feucht.",
|
||||
"Vor dem 22. November 1963.",
|
||||
],
|
||||
# max audio length of 10 seconds; feel free to increase it if you have more than 20GB of GPU memory
|
||||
max_audio_len=22050 * 10,
|
||||
output_path=output_path,
|
||||
datasets=[dataset_config],
|
||||
)
|
||||
|
||||
|
||||
# INITIALIZE THE AUDIO PROCESSOR
|
||||
# Audio processor is used for feature extraction and audio I/O.
|
||||
# It mainly serves the dataloader and the training loggers.
|
||||
ap = AudioProcessor.init_from_config(config)
|
||||
|
||||
# INITIALIZE THE TOKENIZER
|
||||
# Tokenizer is used to convert text to sequences of token IDs.
|
||||
# If characters are not defined in the config, default characters are passed to the config
|
||||
tokenizer, config = TTSTokenizer.init_from_config(config)
|
||||
|
||||
# LOAD DATA SAMPLES
|
||||
# Each sample is a list of ```[text, audio_file_path, speaker_name]```
|
||||
# You can define your custom sample loader returning the list of samples.
|
||||
# Or define your custom formatter and pass it to the `load_tts_samples`.
|
||||
# Check `TTS.tts.datasets.load_tts_samples` for more details.
|
||||
train_samples, eval_samples = load_tts_samples(
|
||||
dataset_config,
|
||||
eval_split=True,
|
||||
eval_split_max_size=config.eval_split_max_size,
|
||||
eval_split_size=config.eval_split_size,
|
||||
)
|
||||
|
||||
# INITIALIZE THE MODEL
|
||||
# Models take a config object and a speaker manager as input
|
||||
# Config defines the details of the model like the number of layers, the size of the embedding, etc.
|
||||
# Speaker manager is used by multi-speaker models.
|
||||
model = Tacotron2(config, ap, tokenizer, speaker_manager=None)
|
||||
|
||||
# init the trainer and 🚀
|
||||
trainer = Trainer(
|
||||
TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
|
||||
)
|
||||
trainer.fit()
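The gradual_training schedule in the Tacotron2 config above is a list of [start_step, r, batch_size] entries: once training passes an entry's start step, the reduction factor r and the batch size switch to that entry's values. A minimal sketch of how such a schedule can be resolved at a given global step (this helper is illustrative, not the trainer's internal API):

def resolve_gradual_schedule(schedule, global_step):
    """Return the (r, batch_size) pair of the last schedule entry whose start step has been reached."""
    r, batch_size = schedule[0][1], schedule[0][2]
    for start_step, entry_r, entry_bs in schedule:
        if global_step >= start_step:
            r, batch_size = entry_r, entry_bs
    return r, batch_size

# resolve_gradual_schedule([[0, 6, 64], [10000, 4, 32], [50000, 3, 32], [100000, 2, 32]], 60000) == (3, 32)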
|
|
@ -0,0 +1,52 @@
|
|||
import os
|
||||
|
||||
from trainer import Trainer, TrainerArgs
|
||||
|
||||
from TTS.utils.audio import AudioProcessor
|
||||
from TTS.utils.downloaders import download_thorsten_de
|
||||
from TTS.vocoder.configs import UnivnetConfig
|
||||
from TTS.vocoder.datasets.preprocess import load_wav_data
|
||||
from TTS.vocoder.models.gan import GAN
|
||||
|
||||
output_path = os.path.dirname(os.path.abspath(__file__))
|
||||
config = UnivnetConfig(
|
||||
batch_size=64,
|
||||
eval_batch_size=16,
|
||||
num_loader_workers=4,
|
||||
num_eval_loader_workers=4,
|
||||
run_eval=True,
|
||||
test_delay_epochs=-1,
|
||||
epochs=1000,
|
||||
seq_len=8192,
|
||||
pad_short=2000,
|
||||
use_noise_augment=True,
|
||||
eval_split_size=10,
|
||||
print_step=25,
|
||||
print_eval=False,
|
||||
mixed_precision=False,
|
||||
lr_gen=1e-4,
|
||||
lr_disc=1e-4,
|
||||
data_path=os.path.join(output_path, "../thorsten-de/wavs/"),
|
||||
output_path=output_path,
|
||||
)
|
||||
|
||||
# download dataset if not already present
|
||||
if not os.path.exists(config.data_path):
|
||||
print("Downloading dataset")
|
||||
download_path = os.path.abspath(os.path.join(os.path.abspath(config.data_path), "../../"))
|
||||
download_thorsten_de(download_path)
|
||||
|
||||
# init audio processor
|
||||
ap = AudioProcessor(**config.audio.to_dict())
|
||||
|
||||
# load training samples
|
||||
eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size)
|
||||
|
||||
# init model
|
||||
model = GAN(config, ap)
|
||||
|
||||
# init the trainer and 🚀
|
||||
trainer = Trainer(
|
||||
TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
|
||||
)
|
||||
trainer.fit()
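load_wav_data above only needs a folder of wav files and an eval split size. A rough, self-contained stand-in that mirrors that behaviour (glob the wavs, carve off the first eval_split_size paths for evaluation); the real helper lives in TTS.vocoder.datasets.preprocess and may differ in details:

import glob
import os

def load_wav_data_sketch(data_path, eval_split_size):
    """Collect wav paths recursively and split off the first `eval_split_size` items for evaluation."""
    wav_paths = sorted(glob.glob(os.path.join(data_path, "**", "*.wav"), recursive=True))
    return wav_paths[:eval_split_size], wav_paths[eval_split_size:]

# eval_samples, train_samples = load_wav_data_sketch(config.data_path, config.eval_split_size)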
|
|
@ -0,0 +1,105 @@
|
|||
import os
|
||||
|
||||
from trainer import Trainer, TrainerArgs
|
||||
|
||||
from TTS.config.shared_configs import BaseAudioConfig
|
||||
from TTS.tts.configs.shared_configs import BaseDatasetConfig
|
||||
from TTS.tts.configs.vits_config import VitsConfig
|
||||
from TTS.tts.datasets import load_tts_samples
|
||||
from TTS.tts.models.vits import Vits
|
||||
from TTS.tts.utils.text.tokenizer import TTSTokenizer
|
||||
from TTS.utils.audio import AudioProcessor
|
||||
from TTS.utils.downloaders import download_thorsten_de
|
||||
|
||||
output_path = os.path.dirname(os.path.abspath(__file__))
|
||||
dataset_config = BaseDatasetConfig(
|
||||
name="thorsten", meta_file_train="metadata.csv", path=os.path.join(output_path, "../thorsten-de/")
|
||||
)
|
||||
|
||||
# download dataset if not already present
|
||||
if not os.path.exists(dataset_config.path):
|
||||
print("Downloading dataset")
|
||||
download_thorsten_de(os.path.split(os.path.abspath(dataset_config.path))[0])
|
||||
|
||||
audio_config = BaseAudioConfig(
|
||||
sample_rate=22050,
|
||||
win_length=1024,
|
||||
hop_length=256,
|
||||
num_mels=80,
|
||||
preemphasis=0.0,
|
||||
ref_level_db=20,
|
||||
log_func="np.log",
|
||||
do_trim_silence=True,
|
||||
trim_db=45,
|
||||
mel_fmin=0,
|
||||
mel_fmax=None,
|
||||
spec_gain=1.0,
|
||||
signal_norm=False,
|
||||
do_amp_to_db_linear=False,
|
||||
)
|
||||
|
||||
config = VitsConfig(
|
||||
audio=audio_config,
|
||||
run_name="vits_thorsten-de",
|
||||
batch_size=32,
|
||||
eval_batch_size=16,
|
||||
batch_group_size=5,
|
||||
num_loader_workers=0,
|
||||
num_eval_loader_workers=4,
|
||||
run_eval=True,
|
||||
test_delay_epochs=-1,
|
||||
epochs=1000,
|
||||
text_cleaner="phoneme_cleaners",
|
||||
use_phonemes=True,
|
||||
phoneme_language="de",
|
||||
phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
|
||||
compute_input_seq_cache=True,
|
||||
print_step=25,
|
||||
print_eval=True,
|
||||
mixed_precision=True,
|
||||
test_sentences=[
|
||||
"Es hat mich viel Zeit gekostet ein Stimme zu entwickeln, jetzt wo ich sie habe werde ich nicht mehr schweigen.",
|
||||
"Sei eine Stimme, kein Echo.",
|
||||
"Es tut mir Leid David. Das kann ich leider nicht machen.",
|
||||
"Dieser Kuchen ist großartig. Er ist so lecker und feucht.",
|
||||
"Vor dem 22. November 1963.",
|
||||
],
|
||||
output_path=output_path,
|
||||
datasets=[dataset_config],
|
||||
)
|
||||
|
||||
# INITIALIZE THE AUDIO PROCESSOR
|
||||
# Audio processor is used for feature extraction and audio I/O.
|
||||
# It mainly serves the dataloader and the training loggers.
|
||||
ap = AudioProcessor.init_from_config(config)
|
||||
|
||||
# INITIALIZE THE TOKENIZER
|
||||
# Tokenizer is used to convert text to sequences of token IDs.
|
||||
# config is updated with the default characters if not defined in the config.
|
||||
tokenizer, config = TTSTokenizer.init_from_config(config)
|
||||
|
||||
# LOAD DATA SAMPLES
|
||||
# Each sample is a list of ```[text, audio_file_path, speaker_name]```
|
||||
# You can define your custom sample loader returning the list of samples.
|
||||
# Or define your custom formatter and pass it to the `load_tts_samples`.
|
||||
# Check `TTS.tts.datasets.load_tts_samples` for more details.
|
||||
train_samples, eval_samples = load_tts_samples(
|
||||
dataset_config,
|
||||
eval_split=True,
|
||||
eval_split_max_size=config.eval_split_max_size,
|
||||
eval_split_size=config.eval_split_size,
|
||||
)
|
||||
|
||||
# init model
|
||||
model = Vits(config, ap, tokenizer, speaker_manager=None)
|
||||
|
||||
# init the trainer and 🚀
|
||||
trainer = Trainer(
|
||||
TrainerArgs(),
|
||||
config,
|
||||
output_path,
|
||||
model=model,
|
||||
train_samples=train_samples,
|
||||
eval_samples=eval_samples,
|
||||
)
|
||||
trainer.fit()
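To resume an interrupted run, the same recipe can usually be restarted by pointing the trainer at an existing checkpoint. A sketch assuming TrainerArgs exposes a restore_path field (check the installed trainer version); the checkpoint filename below is hypothetical:

# Sketch: restore model/optimizer state from a previous checkpoint before continuing training.
trainer = Trainer(
    TrainerArgs(restore_path=os.path.join(output_path, "checkpoint_30000.pth")),  # hypothetical checkpoint file
    config,
    output_path,
    model=model,
    train_samples=train_samples,
    eval_samples=eval_samples,
)
trainer.fit()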
|
|
@ -0,0 +1,56 @@
|
|||
import os
|
||||
|
||||
from trainer import Trainer, TrainerArgs
|
||||
|
||||
from TTS.utils.audio import AudioProcessor
|
||||
from TTS.utils.downloaders import download_thorsten_de
|
||||
from TTS.vocoder.configs import WavegradConfig
|
||||
from TTS.vocoder.datasets.preprocess import load_wav_data
|
||||
from TTS.vocoder.models.wavegrad import Wavegrad
|
||||
|
||||
output_path = os.path.dirname(os.path.abspath(__file__))
|
||||
config = WavegradConfig(
|
||||
batch_size=32,
|
||||
eval_batch_size=16,
|
||||
num_loader_workers=4,
|
||||
num_eval_loader_workers=4,
|
||||
run_eval=True,
|
||||
test_delay_epochs=-1,
|
||||
epochs=1000,
|
||||
seq_len=6144,
|
||||
pad_short=2000,
|
||||
use_noise_augment=True,
|
||||
eval_split_size=50,
|
||||
print_step=50,
|
||||
print_eval=True,
|
||||
mixed_precision=False,
|
||||
data_path=os.path.join(output_path, "../thorsten-de/wavs/"),
|
||||
output_path=output_path,
|
||||
)
|
||||
|
||||
# download dataset if not already present
|
||||
if not os.path.exists(config.data_path):
|
||||
print("Downloading dataset")
|
||||
download_path = os.path.abspath(os.path.join(os.path.abspath(config.data_path), "../../"))
|
||||
download_thorsten_de(download_path)
|
||||
|
||||
# init audio processor
|
||||
ap = AudioProcessor(**config.audio.to_dict())
|
||||
|
||||
# load training samples
|
||||
eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size)
|
||||
|
||||
# init model
|
||||
model = Wavegrad(config)
|
||||
|
||||
# init the trainer and 🚀
|
||||
trainer = Trainer(
|
||||
TrainerArgs(),
|
||||
config,
|
||||
output_path,
|
||||
model=model,
|
||||
train_samples=train_samples,
|
||||
eval_samples=eval_samples,
|
||||
training_assets={"audio_processor": ap},
|
||||
)
|
||||
trainer.fit()
|
|
@ -0,0 +1,58 @@
|
|||
import os
|
||||
|
||||
from trainer import Trainer, TrainerArgs
|
||||
|
||||
from TTS.utils.audio import AudioProcessor
|
||||
from TTS.utils.downloaders import download_thorsten_de
|
||||
from TTS.vocoder.configs import WavernnConfig
|
||||
from TTS.vocoder.datasets.preprocess import load_wav_data
|
||||
from TTS.vocoder.models.wavernn import Wavernn
|
||||
|
||||
output_path = os.path.dirname(os.path.abspath(__file__))
|
||||
config = WavernnConfig(
|
||||
batch_size=64,
|
||||
eval_batch_size=16,
|
||||
num_loader_workers=4,
|
||||
num_eval_loader_workers=4,
|
||||
run_eval=True,
|
||||
test_delay_epochs=-1,
|
||||
epochs=10000,
|
||||
seq_len=1280,
|
||||
pad_short=2000,
|
||||
use_noise_augment=False,
|
||||
eval_split_size=10,
|
||||
print_step=25,
|
||||
print_eval=True,
|
||||
mixed_precision=False,
|
||||
lr=1e-4,
|
||||
grad_clip=4,
|
||||
data_path=os.path.join(output_path, "../thorsten-de/wavs/"),
|
||||
output_path=output_path,
|
||||
)
|
||||
|
||||
# download dataset if not already present
|
||||
if not os.path.exists(config.data_path):
|
||||
print("Downloading dataset")
|
||||
download_path = os.path.abspath(os.path.join(os.path.abspath(config.data_path), "../../"))
|
||||
download_thorsten_de(download_path)
|
||||
|
||||
# init audio processor
|
||||
ap = AudioProcessor(**config.audio.to_dict())
|
||||
|
||||
# load training samples
|
||||
eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size)
|
||||
|
||||
# init model
|
||||
model = Wavernn(config)
|
||||
|
||||
# init the trainer and 🚀
|
||||
trainer = Trainer(
|
||||
TrainerArgs(),
|
||||
config,
|
||||
output_path,
|
||||
model=model,
|
||||
train_samples=train_samples,
|
||||
eval_samples=eval_samples,
|
||||
training_assets={"audio_processor": ap},
|
||||
)
|
||||
trainer.fit()
|
|
@ -1,5 +1,5 @@
|
|||
black
|
||||
coverage
|
||||
isort
|
||||
nose
|
||||
nose2
|
||||
pylint==2.10.2
|
||||
|
|
|
@ -1,12 +1,12 @@
|
|||
# core deps
|
||||
numpy==1.19.5
|
||||
cython
|
||||
numpy==1.21.6
|
||||
cython==0.29.28
|
||||
scipy>=1.4.0
|
||||
torch>=1.7
|
||||
torchaudio
|
||||
soundfile
|
||||
librosa==0.8.0
|
||||
numba==0.53
|
||||
numba==0.55.1
|
||||
inflect
|
||||
tqdm
|
||||
anyascii
|
||||
|
@ -21,16 +21,16 @@ umap-learn==0.5.1
|
|||
pandas
|
||||
# deps for training
|
||||
matplotlib
|
||||
tensorboardX
|
||||
pyworld
|
||||
pyworld==0.2.10  # versions > 0.2.10 are not compatible with Python 3.10.x
|
||||
# coqui stack
|
||||
trainer
|
||||
coqpit # config management
|
||||
# config management
|
||||
coqpit>=0.0.16
|
||||
# chinese g2p deps
|
||||
jieba
|
||||
pypinyin
|
||||
# japanese g2p deps
|
||||
mecab-python3==1.0.3
|
||||
mecab-python3==1.0.5
|
||||
unidic-lite==1.0.8
|
||||
# gruut+supported langs
|
||||
gruut[cs,de,es,fr,it,nl,pt,ru,sv]==2.2.3
|
||||
|
|
setup.py
|
@ -31,8 +31,8 @@ import setuptools.command.develop
|
|||
from Cython.Build import cythonize
|
||||
from setuptools import Extension, find_packages, setup
|
||||
|
||||
if LooseVersion(sys.version) < LooseVersion("3.6") or LooseVersion(sys.version) > LooseVersion("3.10"):
|
||||
raise RuntimeError("TTS requires python >= 3.6 and <=3.10 " "but your Python version is {}".format(sys.version))
|
||||
if LooseVersion(sys.version) < LooseVersion("3.7") or LooseVersion(sys.version) >= LooseVersion("3.11"):
|
||||
raise RuntimeError("TTS requires python >= 3.7 and < 3.11 " "but your Python version is {}".format(sys.version))
|
||||
|
||||
|
||||
cwd = os.path.dirname(os.path.abspath(__file__))
|
||||
|
@ -113,15 +113,15 @@ setup(
|
|||
"dev": requirements_dev,
|
||||
"notebooks": requirements_notebooks,
|
||||
},
|
||||
python_requires=">=3.6.0, <3.10",
|
||||
python_requires=">=3.7.0, <3.11",
|
||||
entry_points={"console_scripts": ["tts=TTS.bin.synthesize:main", "tts-server = TTS.server.server:main"]},
|
||||
classifiers=[
|
||||
"Programming Language :: Python",
|
||||
"Programming Language :: Python :: 3",
|
||||
"Programming Language :: Python :: 3.6",
|
||||
"Programming Language :: Python :: 3.7",
|
||||
"Programming Language :: Python :: 3.8",
|
||||
"Programming Language :: Python :: 3.9",
|
||||
"Programming Language :: Python :: 3.10",
|
||||
"Development Status :: 3 - Alpha",
|
||||
"Intended Audience :: Science/Research",
|
||||
"Intended Audience :: Developers",
|
||||
|
|
|
@ -16,6 +16,7 @@ encoder_model_path = os.path.join(get_tests_input_path(), "checkpoint_0.pth")
|
|||
sample_wav_path = os.path.join(get_tests_input_path(), "../data/ljspeech/wavs/LJ001-0001.wav")
|
||||
sample_wav_path2 = os.path.join(get_tests_input_path(), "../data/ljspeech/wavs/LJ001-0002.wav")
|
||||
d_vectors_file_path = os.path.join(get_tests_input_path(), "../data/dummy_speakers.json")
|
||||
d_vectors_file_pth_path = os.path.join(get_tests_input_path(), "../data/dummy_speakers.pth")
|
||||
|
||||
|
||||
class SpeakerManagerTest(unittest.TestCase):
|
||||
|
@ -58,12 +59,13 @@ class SpeakerManagerTest(unittest.TestCase):
|
|||
# remove dummy model
|
||||
os.remove(encoder_model_path)
|
||||
|
||||
@staticmethod
|
||||
def test_speakers_file_processing():
|
||||
def test_speakers_file_processing(self):
|
||||
manager = SpeakerManager(d_vectors_file_path=d_vectors_file_path)
|
||||
print(manager.num_speakers)
|
||||
print(manager.embedding_dim)
|
||||
print(manager.clip_ids)
|
||||
self.assertEqual(manager.num_speakers, 1)
|
||||
self.assertEqual(manager.embedding_dim, 256)
|
||||
manager = SpeakerManager(d_vectors_file_path=d_vectors_file_pth_path)
|
||||
self.assertEqual(manager.num_speakers, 1)
|
||||
self.assertEqual(manager.embedding_dim, 256)
|
||||
d_vector = manager.get_embedding_by_clip(manager.clip_ids[0])
|
||||
assert len(d_vector) == 256
|
||||
d_vectors = manager.get_embeddings_by_name(manager.speaker_names[0])
|
||||
|
|
Binary file not shown.
|
@ -6,7 +6,7 @@ import numpy as np
|
|||
import torch
|
||||
from torch.utils.data import DataLoader
|
||||
|
||||
from tests import get_tests_output_path
|
||||
from tests import get_tests_data_path, get_tests_output_path
|
||||
from TTS.tts.configs.shared_configs import BaseDatasetConfig, BaseTTSConfig
|
||||
from TTS.tts.datasets import TTSDataset, load_tts_samples
|
||||
from TTS.tts.utils.text.tokenizer import TTSTokenizer
|
||||
|
@ -20,7 +20,7 @@ os.makedirs(OUTPATH, exist_ok=True)
|
|||
# create a dummy config for testing data loaders.
|
||||
c = BaseTTSConfig(text_cleaner="english_cleaners", num_loader_workers=0, batch_size=2, use_noise_augment=False)
|
||||
c.r = 5
|
||||
c.data_path = "tests/data/ljspeech/"
|
||||
c.data_path = os.path.join(get_tests_data_path(), "ljspeech/")
|
||||
ok_ljspeech = os.path.exists(c.data_path)
|
||||
|
||||
dataset_config = BaseDatasetConfig(
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
import functools
|
||||
import random
|
||||
import unittest
|
||||
|
||||
import torch
|
||||
|
@ -6,6 +7,7 @@ import torch
|
|||
from TTS.config.shared_configs import BaseDatasetConfig
|
||||
from TTS.encoder.utils.samplers import PerfectBatchSampler
|
||||
from TTS.tts.datasets import load_tts_samples
|
||||
from TTS.tts.utils.data import get_length_balancer_weights
|
||||
from TTS.tts.utils.languages import get_language_balancer_weights
|
||||
from TTS.tts.utils.speakers import get_speaker_balancer_weights
|
||||
|
||||
|
@ -136,3 +138,28 @@ class TestSamplers(unittest.TestCase):
|
|||
else:
|
||||
spk2 += 1
|
||||
assert spk1 == spk2, "PerfectBatchSampler is supposed to be perfectly balanced"
|
||||
|
||||
def test_length_weighted_random_sampler(self): # pylint: disable=no-self-use
|
||||
for _ in range(1000):
|
||||
# generate a length-unbalanced dataset with random max/min audio lengths
|
||||
min_audio = random.randrange(1, 22050)
|
||||
max_audio = random.randrange(44100, 220500)
|
||||
for idx, item in enumerate(train_samples):
|
||||
# increase the diversity of durations
|
||||
random_increase = random.randrange(100, 1000)
|
||||
if idx < 5:
|
||||
item["audio_length"] = min_audio + random_increase
|
||||
else:
|
||||
item["audio_length"] = max_audio + random_increase
|
||||
|
||||
weighted_sampler = torch.utils.data.sampler.WeightedRandomSampler(
|
||||
get_length_balancer_weights(train_samples, num_buckets=2), len(train_samples)
|
||||
)
|
||||
ids = functools.reduce(lambda a, b: a + b, [list(weighted_sampler) for i in range(100)])
|
||||
len1, len2 = 0, 0
|
||||
for index in ids:
|
||||
if train_samples[index]["audio_length"] < max_audio:
|
||||
len1 += 1
|
||||
else:
|
||||
len2 += 1
|
||||
assert is_balanced(len1, len2), "Length Weighted sampler is supposed to be balanced"
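The length-balancing test above relies on bucket-based weights. A minimal sketch of the idea behind get_length_balancer_weights (bin samples by audio_length and weight each sample by the inverse of its bucket's size); the actual implementation in TTS.tts.utils.data may differ:

import numpy as np
import torch

def length_balancer_weights_sketch(samples, num_buckets=2):
    """Weight each sample by 1 / (size of its length bucket) so short and long clips are drawn evenly."""
    lengths = np.array([s["audio_length"] for s in samples], dtype=np.float64)
    edges = np.linspace(lengths.min(), lengths.max() + 1, num_buckets + 1)
    bucket_ids = np.digitize(lengths, edges[1:-1])      # bucket index in [0, num_buckets)
    bucket_sizes = np.bincount(bucket_ids, minlength=num_buckets)
    weights = 1.0 / bucket_sizes[bucket_ids]
    return torch.from_numpy(weights)

# sampler = torch.utils.data.WeightedRandomSampler(length_balancer_weights_sketch(train_samples), len(train_samples))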
|
||||
|
|
|
@ -6,7 +6,7 @@ import torch
|
|||
from torch import nn, optim
|
||||
|
||||
from tests import get_tests_input_path
|
||||
from TTS.tts.configs.shared_configs import GSTConfig
|
||||
from TTS.tts.configs.shared_configs import CapacitronVAEConfig, GSTConfig
|
||||
from TTS.tts.configs.tacotron2_config import Tacotron2Config
|
||||
from TTS.tts.layers.losses import MSELossMasked
|
||||
from TTS.tts.models.tacotron2 import Tacotron2
|
||||
|
@ -260,6 +260,73 @@ class TacotronGSTTrainTest(unittest.TestCase):
|
|||
count += 1
|
||||
|
||||
|
||||
class TacotronCapacitronTrainTest(unittest.TestCase):
|
||||
@staticmethod
|
||||
def test_train_step():
|
||||
config = Tacotron2Config(
|
||||
num_chars=32,
|
||||
num_speakers=10,
|
||||
use_speaker_embedding=True,
|
||||
out_channels=80,
|
||||
decoder_output_dim=80,
|
||||
use_capacitron_vae=True,
|
||||
capacitron_vae=CapacitronVAEConfig(),
|
||||
optimizer="CapacitronOptimizer",
|
||||
optimizer_params={
|
||||
"RAdam": {"betas": [0.9, 0.998], "weight_decay": 1e-6},
|
||||
"SGD": {"lr": 1e-5, "momentum": 0.9},
|
||||
},
|
||||
)
|
||||
|
||||
batch = dict({})
|
||||
batch["text_input"] = torch.randint(0, 24, (8, 128)).long().to(device)
|
||||
batch["text_lengths"] = torch.randint(100, 129, (8,)).long().to(device)
|
||||
batch["text_lengths"] = torch.sort(batch["text_lengths"], descending=True)[0]
|
||||
batch["text_lengths"][0] = 128
|
||||
batch["mel_input"] = torch.rand(8, 120, config.audio["num_mels"]).to(device)
|
||||
batch["mel_lengths"] = torch.randint(20, 120, (8,)).long().to(device)
|
||||
batch["mel_lengths"] = torch.sort(batch["mel_lengths"], descending=True)[0]
|
||||
batch["mel_lengths"][0] = 120
|
||||
batch["stop_targets"] = torch.zeros(8, 120, 1).float().to(device)
|
||||
batch["stop_target_lengths"] = torch.randint(0, 120, (8,)).to(device)
|
||||
batch["speaker_ids"] = torch.randint(0, 5, (8,)).long().to(device)
|
||||
batch["d_vectors"] = None
|
||||
|
||||
for idx in batch["mel_lengths"]:
|
||||
batch["stop_targets"][:, int(idx.item()) :, 0] = 1.0
|
||||
|
||||
batch["stop_targets"] = batch["stop_targets"].view(
|
||||
batch["text_input"].shape[0], batch["stop_targets"].size(1) // config.r, -1
|
||||
)
|
||||
batch["stop_targets"] = (batch["stop_targets"].sum(2) > 0.0).unsqueeze(2).float().squeeze()
|
||||
|
||||
model = Tacotron2(config).to(device)
|
||||
criterion = model.get_criterion()
|
||||
optimizer = model.get_optimizer()
|
||||
|
||||
model.train()
|
||||
model_ref = copy.deepcopy(model)
|
||||
count = 0
|
||||
for param, param_ref in zip(model.parameters(), model_ref.parameters()):
|
||||
assert (param - param_ref).sum() == 0, param
|
||||
count += 1
|
||||
for _ in range(10):
|
||||
_, loss_dict = model.train_step(batch, criterion)
|
||||
optimizer.zero_grad()
|
||||
loss_dict["capacitron_vae_beta_loss"].backward()
|
||||
optimizer.first_step()
|
||||
loss_dict["loss"].backward()
|
||||
optimizer.step()
|
||||
# check parameter changes
|
||||
count = 0
|
||||
for param, param_ref in zip(model.parameters(), model_ref.parameters()):
|
||||
# ignore the pre-highway layer since it is applied conditionally
|
||||
assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format(
|
||||
count, param.shape, param, param_ref
|
||||
)
|
||||
count += 1
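The Capacitron tests apply a two-phase update: first_step() consumes the VAE beta-loss gradients, then step() applies the main-loss update. A self-contained sketch of that pattern with two plain torch optimizers (this wrapper is illustrative only and is not the library's CapacitronOptimizer):

class TwoPhaseOptimizer:
    """Toy wrapper: a secondary optimizer updates a small parameter group before the primary update."""

    def __init__(self, primary, secondary):
        self.primary = primary      # e.g. RAdam over the model weights
        self.secondary = secondary  # e.g. SGD over the capacitron beta parameter

    def zero_grad(self):
        self.primary.zero_grad()
        self.secondary.zero_grad()

    def first_step(self):
        self.secondary.step()       # consume the beta-loss gradients
        self.secondary.zero_grad()

    def step(self):
        self.primary.step()         # consume the main-loss gradients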
|
||||
|
||||
|
||||
class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase):
|
||||
"""Test multi-speaker Tacotron2 with Global Style Tokens and d-vector inputs."""
|
||||
|
||||
|
|
|
@ -6,7 +6,7 @@ import torch
|
|||
from torch import nn, optim
|
||||
|
||||
from tests import get_tests_input_path
|
||||
from TTS.tts.configs.shared_configs import GSTConfig
|
||||
from TTS.tts.configs.shared_configs import CapacitronVAEConfig, GSTConfig
|
||||
from TTS.tts.configs.tacotron_config import TacotronConfig
|
||||
from TTS.tts.layers.losses import L1LossMasked
|
||||
from TTS.tts.models.tacotron import Tacotron
|
||||
|
@ -248,6 +248,74 @@ class TacotronGSTTrainTest(unittest.TestCase):
|
|||
count += 1
|
||||
|
||||
|
||||
class TacotronCapacitronTrainTest(unittest.TestCase):
|
||||
@staticmethod
|
||||
def test_train_step():
|
||||
config = TacotronConfig(
|
||||
num_chars=32,
|
||||
num_speakers=10,
|
||||
use_speaker_embedding=True,
|
||||
out_channels=513,
|
||||
decoder_output_dim=80,
|
||||
use_capacitron_vae=True,
|
||||
capacitron_vae=CapacitronVAEConfig(),
|
||||
optimizer="CapacitronOptimizer",
|
||||
optimizer_params={
|
||||
"RAdam": {"betas": [0.9, 0.998], "weight_decay": 1e-6},
|
||||
"SGD": {"lr": 1e-5, "momentum": 0.9},
|
||||
},
|
||||
)
|
||||
|
||||
batch = dict({})
|
||||
batch["text_input"] = torch.randint(0, 24, (8, 128)).long().to(device)
|
||||
batch["text_lengths"] = torch.randint(100, 129, (8,)).long().to(device)
|
||||
batch["text_lengths"] = torch.sort(batch["text_lengths"], descending=True)[0]
|
||||
batch["text_lengths"][0] = 128
|
||||
batch["linear_input"] = torch.rand(8, 120, config.audio["fft_size"] // 2 + 1).to(device)
|
||||
batch["mel_input"] = torch.rand(8, 120, config.audio["num_mels"]).to(device)
|
||||
batch["mel_lengths"] = torch.randint(20, 120, (8,)).long().to(device)
|
||||
batch["mel_lengths"] = torch.sort(batch["mel_lengths"], descending=True)[0]
|
||||
batch["mel_lengths"][0] = 120
|
||||
batch["stop_targets"] = torch.zeros(8, 120, 1).float().to(device)
|
||||
batch["stop_target_lengths"] = torch.randint(0, 120, (8,)).to(device)
|
||||
batch["speaker_ids"] = torch.randint(0, 5, (8,)).long().to(device)
|
||||
batch["d_vectors"] = None
|
||||
|
||||
for idx in batch["mel_lengths"]:
|
||||
batch["stop_targets"][:, int(idx.item()) :, 0] = 1.0
|
||||
|
||||
batch["stop_targets"] = batch["stop_targets"].view(
|
||||
batch["text_input"].shape[0], batch["stop_targets"].size(1) // config.r, -1
|
||||
)
|
||||
batch["stop_targets"] = (batch["stop_targets"].sum(2) > 0.0).unsqueeze(2).float().squeeze()
|
||||
|
||||
model = Tacotron(config).to(device)
|
||||
criterion = model.get_criterion()
|
||||
optimizer = model.get_optimizer()
|
||||
model.train()
|
||||
print(" > Num parameters for Tacotron with Capacitron VAE model:%s" % (count_parameters(model)))
|
||||
model_ref = copy.deepcopy(model)
|
||||
count = 0
|
||||
for param, param_ref in zip(model.parameters(), model_ref.parameters()):
|
||||
assert (param - param_ref).sum() == 0, param
|
||||
count += 1
|
||||
for _ in range(10):
|
||||
_, loss_dict = model.train_step(batch, criterion)
|
||||
optimizer.zero_grad()
|
||||
loss_dict["capacitron_vae_beta_loss"].backward()
|
||||
optimizer.first_step()
|
||||
loss_dict["loss"].backward()
|
||||
optimizer.step()
|
||||
# check parameter changes
|
||||
count = 0
|
||||
for param, param_ref in zip(model.parameters(), model_ref.parameters()):
|
||||
# ignore the pre-highway layer since it is applied conditionally
|
||||
assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format(
|
||||
count, param.shape, param, param_ref
|
||||
)
|
||||
count += 1
|
||||
|
||||
|
||||
class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase):
|
||||
@staticmethod
|
||||
def test_train_step():
|
||||
|
|
|
@ -122,7 +122,7 @@ class TestVits(unittest.TestCase):
|
|||
args = VitsArgs(num_speakers=num_speakers, use_speaker_embedding=True)
|
||||
model = Vits(args)
|
||||
|
||||
ref_inp = torch.randn(1, spec_len, 513)
|
||||
ref_inp = torch.randn(1, 513, spec_len)
|
||||
ref_inp_len = torch.randint(1, spec_effective_len, (1,))
|
||||
ref_spk_id = torch.randint(1, num_speakers, (1,))
|
||||
tgt_spk_id = torch.randint(1, num_speakers, (1,))
|
||||
|
@ -420,6 +420,76 @@ class TestVits(unittest.TestCase):
|
|||
# check parameter changes
|
||||
self._check_parameter_changes(model, model_ref)
|
||||
|
||||
def test_train_step_upsampling(self):
|
||||
# setup the model
|
||||
with torch.autograd.set_detect_anomaly(True):
|
||||
model_args = VitsArgs(
|
||||
num_chars=32,
|
||||
spec_segment_size=10,
|
||||
encoder_sample_rate=11025,
|
||||
interpolate_z=False,
|
||||
upsample_rates_decoder=[8, 8, 4, 2],
|
||||
)
|
||||
config = VitsConfig(model_args=model_args)
|
||||
model = Vits(config).to(device)
|
||||
model.train()
|
||||
# model to train
|
||||
optimizers = model.get_optimizer()
|
||||
criterions = model.get_criterion()
|
||||
criterions = [criterions[0].to(device), criterions[1].to(device)]
|
||||
# reference model to compare model weights
|
||||
model_ref = Vits(config).to(device)
|
||||
# pass the state to the reference model
|
||||
model_ref.load_state_dict(copy.deepcopy(model.state_dict()))
|
||||
count = 0
|
||||
for param, param_ref in zip(model.parameters(), model_ref.parameters()):
|
||||
assert (param - param_ref).sum() == 0, param
|
||||
count = count + 1
|
||||
for _ in range(5):
|
||||
batch = self._create_batch(config, 2)
|
||||
for idx in [0, 1]:
|
||||
outputs, loss_dict = model.train_step(batch, criterions, idx)
|
||||
self.assertFalse(not outputs)
|
||||
self.assertFalse(not loss_dict)
|
||||
loss_dict["loss"].backward()
|
||||
optimizers[idx].step()
|
||||
optimizers[idx].zero_grad()
|
||||
|
||||
# check parameter changes
|
||||
self._check_parameter_changes(model, model_ref)
|
||||
|
||||
def test_train_step_upsampling_interpolation(self):
|
||||
# setup the model
|
||||
with torch.autograd.set_detect_anomaly(True):
|
||||
model_args = VitsArgs(num_chars=32, spec_segment_size=10, encoder_sample_rate=11025, interpolate_z=True)
|
||||
config = VitsConfig(model_args=model_args)
|
||||
model = Vits(config).to(device)
|
||||
model.train()
|
||||
# model to train
|
||||
optimizers = model.get_optimizer()
|
||||
criterions = model.get_criterion()
|
||||
criterions = [criterions[0].to(device), criterions[1].to(device)]
|
||||
# reference model to compare model weights
|
||||
model_ref = Vits(config).to(device)
|
||||
# pass the state to the reference model
|
||||
model_ref.load_state_dict(copy.deepcopy(model.state_dict()))
|
||||
count = 0
|
||||
for param, param_ref in zip(model.parameters(), model_ref.parameters()):
|
||||
assert (param - param_ref).sum() == 0, param
|
||||
count = count + 1
|
||||
for _ in range(5):
|
||||
batch = self._create_batch(config, 2)
|
||||
for idx in [0, 1]:
|
||||
outputs, loss_dict = model.train_step(batch, criterions, idx)
|
||||
self.assertFalse(not outputs)
|
||||
self.assertFalse(not loss_dict)
|
||||
loss_dict["loss"].backward()
|
||||
optimizers[idx].step()
|
||||
optimizers[idx].zero_grad()
|
||||
|
||||
# check parameter changes
|
||||
self._check_parameter_changes(model, model_ref)
|
||||
|
||||
def test_train_eval_log(self):
|
||||
batch_size = 2
|
||||
config = VitsConfig(model_args=VitsArgs(num_chars=32, spec_segment_size=10))
|
||||
|
|
|
@ -3,7 +3,7 @@ import glob
|
|||
import os
|
||||
import shutil
|
||||
|
||||
from tests import get_tests_output_path, run_cli
|
||||
from tests import get_tests_data_path, get_tests_output_path, run_cli
|
||||
from TTS.tts.utils.languages import LanguageManager
|
||||
from TTS.tts.utils.speakers import SpeakerManager
|
||||
from TTS.utils.generic_utils import get_user_data_dir
|
||||
|
@ -56,3 +56,16 @@ def test_run_all_models():
|
|||
folders = glob.glob(os.path.join(manager.output_prefix, "*"))
|
||||
assert len(folders) == len(model_names)
|
||||
shutil.rmtree(manager.output_prefix)
|
||||
|
||||
|
||||
def test_voice_conversion():
|
||||
print(" > Run voice conversion inference using YourTTS model.")
|
||||
model_name = "tts_models/multilingual/multi-dataset/your_tts"
|
||||
language_id = "en"
|
||||
speaker_wav = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav")
|
||||
reference_wav = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0032.wav")
|
||||
output_path = os.path.join(get_tests_output_path(), "output.wav")
|
||||
run_cli(
|
||||
f"tts --model_name {model_name}"
|
||||
f" --out_path {output_path} --speaker_wav {speaker_wav} --reference_wav {reference_wav} --language_idx {language_id} "
|
||||
)
|
||||
|
|