Merge pull request #1537 from coqui-ai/dev

v0.7.0
Eren Gölge 2022-06-20 23:55:22 +02:00 committed by GitHub
commit c7cca4135d
78 changed files with 3667 additions and 421 deletions

View File

@@ -18,7 +18,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.6, 3.7, 3.8, 3.9]
+        python-version: [3.7, 3.8, 3.9, "3.10"]
         experimental: [false]
     steps:
       - uses: actions/checkout@v2
@@ -38,6 +38,9 @@ jobs:
           make system-deps
       - name: Install/upgrade Python setup deps
         run: python3 -m pip install --upgrade pip setuptools wheel
+      - name: Replace scarf urls
+        run: |
+          sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json
       - name: Install TTS
         run: |
           python3 -m pip install .[all]

View File

@@ -18,7 +18,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.6, 3.7, 3.8, 3.9]
+        python-version: [3.7, 3.8, 3.9, "3.10"]
         experimental: [false]
     steps:
       - uses: actions/checkout@v2
@@ -38,6 +38,9 @@ jobs:
           make system-deps
       - name: Install/upgrade Python setup deps
         run: python3 -m pip install --upgrade pip setuptools wheel
+      - name: Replace scarf urls
+        run: |
+          sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json
       - name: Install TTS
         run: |
           python3 -m pip install .[all]

View File

@@ -14,6 +14,9 @@ jobs:
     strategy:
       matrix:
         arch: ["amd64"]
+        base:
+          - "nvcr.io/nvidia/pytorch:22.03-py3" # GPU enabled
+          - "ubuntu:20.04" # CPU only
     steps:
       - uses: actions/checkout@v2
       - name: Log in to the Container registry
@@ -28,6 +31,11 @@ jobs:
           set -ex
           base="ghcr.io/coqui-ai/tts"
           tags="" # PR build
+          if [[ ${{ matrix.base }} = "ubuntu:20.04" ]]; then
+            base="ghcr.io/coqui-ai/tts-cpu"
+          fi
           if [[ "${{ startsWith(github.ref, 'refs/heads/') }}" = "true" ]]; then
             # Push to branch
             github_ref="${{ github.ref }}"
@@ -53,4 +61,5 @@ jobs:
           context: .
           platforms: linux/${{ matrix.arch }}
           push: ${{ github.event_name == 'push' }}
+          build-args: "BASE=${{ matrix.base }}"
           tags: ${{ steps.compute-tag.outputs.tags }}
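Net effect of the matrix: push builds should publish a GPU image under `ghcr.io/coqui-ai/tts` and a CPU-only one under `ghcr.io/coqui-ai/tts-cpu`, so the lighter image can be fetched with a plain `docker pull ghcr.io/coqui-ai/tts-cpu` once a build from this branch has gone through.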

View File

@@ -18,7 +18,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.6, 3.7, 3.8, 3.9]
+        python-version: [3.7, 3.8, 3.9, "3.10"]
         experimental: [false]
     steps:
       - uses: actions/checkout@v2
@@ -38,6 +38,9 @@ jobs:
           make system-deps
       - name: Install/upgrade Python setup deps
         run: python3 -m pip install --upgrade pip setuptools wheel
+      - name: Replace scarf urls
+        run: |
+          sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json
       - name: Install TTS
         run: |
           python3 -m pip install .[all]

View File

@@ -36,7 +36,7 @@ jobs:
     runs-on: ubuntu-20.04
     strategy:
       matrix:
-        python-version: ["3.6", "3.7", "3.8", "3.9"]
+        python-version: ["3.7", "3.8", "3.9", "3.10"]
     steps:
       - uses: actions/checkout@v2
       - uses: actions/setup-python@v2
@@ -62,10 +62,6 @@ jobs:
         with:
           name: "sdist"
           path: "dist/"
-      - uses: actions/download-artifact@v2
-        with:
-          name: "wheel-3.6"
-          path: "dist/"
       - uses: actions/download-artifact@v2
         with:
           name: "wheel-3.7"
@@ -78,6 +74,10 @@ jobs:
         with:
           name: "wheel-3.9"
           path: "dist/"
+      - uses: actions/download-artifact@v2
+        with:
+          name: "wheel-3.10"
+          path: "dist/"
       - run: |
           ls -lh dist/
       - name: Setup PyPI config

View File

@@ -18,7 +18,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.6, 3.7, 3.8, 3.9]
+        python-version: [3.7, 3.8, 3.9, "3.10"]
         experimental: [false]
     steps:
       - uses: actions/checkout@v2

View File

@@ -18,7 +18,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.6, 3.7, 3.8, 3.9]
+        python-version: [3.7, 3.8, 3.9, "3.10"]
         experimental: [false]
     steps:
       - uses: actions/checkout@v2
@@ -40,6 +40,9 @@ jobs:
           make system-deps
       - name: Install/upgrade Python setup deps
         run: python3 -m pip install --upgrade pip setuptools wheel
+      - name: Replace scarf urls
+        run: |
+          sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json
       - name: Install TTS
         run: |
           python3 -m pip install .[all]

View File

@@ -18,7 +18,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.6, 3.7, 3.8, 3.9]
+        python-version: [3.7, 3.8, 3.9, "3.10"]
         experimental: [false]
     steps:
       - uses: actions/checkout@v2

View File

@@ -18,7 +18,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.6, 3.7, 3.8, 3.9]
+        python-version: [3.7, 3.8, 3.9, "3.10"]
         experimental: [false]
     steps:
       - uses: actions/checkout@v2
@@ -39,6 +39,9 @@ jobs:
           make system-deps
       - name: Install/upgrade Python setup deps
         run: python3 -m pip install --upgrade pip setuptools wheel
+      - name: Replace scarf urls
+        run: |
+          sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json
       - name: Install TTS
         run: |
           python3 -m pip install .[all]

.gitignore
View File

@@ -117,6 +117,7 @@ venv.bak/
 # pytorch models
 *.pth
 *.pth.tar
+!dummy_speakers.pth
 result/
 # setup.py

View File

@@ -1,10 +1,19 @@
-FROM nvcr.io/nvidia/pytorch:22.03-py3
-RUN apt-get update && apt-get install -y --no-install-recommends espeak && rm -rf /var/lib/apt/lists/*
+ARG BASE=nvcr.io/nvidia/pytorch:22.03-py3
+FROM ${BASE}
+RUN apt-get update && apt-get install -y --no-install-recommends gcc g++ make python3 python3-dev python3-pip python3-venv python3-wheel espeak espeak-ng libsndfile1-dev && rm -rf /var/lib/apt/lists/*
+RUN pip install llvmlite --ignore-installed
+
+# Create and activate virtual env
+ENV VIRTUAL_ENV=/venv
+RUN python3 -m venv $VIRTUAL_ENV
+ENV PATH="$VIRTUAL_ENV/bin:$PATH"
+RUN pip install -U pip setuptools wheel
 WORKDIR /root
 COPY requirements.txt /root
 COPY requirements.dev.txt /root
 COPY requirements.notebooks.txt /root
-RUN pip install -r <(cat requirements.txt requirements.dev.txt requirements.notebooks.txt)
+RUN ["/bin/bash", "-c", "pip install -r <(cat requirements.txt requirements.dev.txt requirements.notebooks.txt)"]
 COPY . /root
 RUN make install
 ENTRYPOINT ["tts"]
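With the new `BASE` build argument, one Dockerfile now backs both the GPU and the CPU image; a local CPU-only build would look something like `docker build --build-arg BASE=ubuntu:20.04 -t tts-cpu .`, mirroring the values the Docker workflow passes in through `build-args`.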

View File

@@ -7,36 +7,36 @@ help:
 target_dirs := tests TTS notebooks recipes

 test_all: ## run tests and don't stop on an error.
-	nosetests --with-cov -cov --cover-erase --cover-package TTS tests --nologcapture --with-id
+	nose2 --with-coverage --coverage TTS tests
 	./run_bash_tests.sh

 test: ## run tests.
-	nosetests -x --with-cov -cov --cover-erase --cover-package TTS tests --nologcapture --with-id
+	nose2 -F -v -B --with-coverage --coverage TTS tests

 test_vocoder: ## run vocoder tests.
-	nosetests tests.vocoder_tests -x --with-cov -cov --cover-erase --cover-package TTS tests.vocoder_tests --nologcapture --with-id
+	nose2 -F -v -B --with-coverage --coverage TTS tests.vocoder_tests

 test_tts: ## run tts tests.
-	nosetests tests.tts_tests -x --with-cov -cov --cover-erase --cover-package TTS tests.tts_tests --nologcapture --with-id
+	nose2 -F -v -B --with-coverage --coverage TTS tests.tts_tests

 test_aux: ## run aux tests.
-	nosetests tests.aux_tests -x --with-cov -cov --cover-erase --cover-package TTS tests.aux_tests --nologcapture --with-id
+	nose2 -F -v -B --with-coverage --coverage TTS tests.aux_tests
 	./run_bash_tests.sh

 test_zoo: ## run zoo tests.
-	nosetests tests.zoo_tests -x --with-cov -cov --cover-erase --cover-package TTS tests.zoo_tests --nologcapture --with-id
+	nose2 -F -v -B --with-coverage --coverage TTS tests.zoo_tests

 inference_tests: ## run inference tests.
-	nosetests tests.inference_tests -x --with-cov -cov --cover-erase --cover-package TTS tests.inference_tests --nologcapture --with-id
+	nose2 -F -v -B --with-coverage --coverage TTS tests.inference_tests

 data_tests: ## run data tests.
-	nosetests tests.data_tests -x --with-cov -cov --cover-erase --cover-package TTS tests.data_tests --nologcapture --with-id
+	nose2 -F -v -B --with-coverage --coverage TTS tests.data_tests

 test_text: ## run text tests.
-	nosetests tests.text_tests -x --with-cov -cov --cover-erase --cover-package TTS tests.text_tests --nologcapture --with-id
+	nose2 -F -v -B --with-coverage --coverage TTS tests.text_tests

 test_failed: ## only run tests failed the last time.
-	nosetests -x --with-cov -cov --cover-erase --cover-package TTS tests --nologcapture --failed
+	nose2 -F -v -B --with-coverage --coverage TTS tests

 style: ## update code style.
 	black ${target_dirs}
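The target names stay the same (e.g. `make test_tts`, `make inference_tests`); they now shell out to `nose2 -F -v -B --with-coverage --coverage TTS <suite>` instead of the unmaintained nosetests runner, which is what makes the Python 3.10 entries in the CI matrices above workable.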

View File

@@ -3,15 +3,23 @@
 🐸TTS is a library for advanced Text-to-Speech generation. It's built on the latest research, was designed to achieve the best trade-off among ease-of-training, speed and quality.
 🐸TTS comes with pretrained models, tools for measuring dataset quality and already used in **20+ languages** for products and research projects.

-[![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/main.yml/badge.svg)](https://github.com/coqui-ai/TTS/actions)
+[![Gitter](https://badges.gitter.im/coqui-ai/TTS.svg)](https://gitter.im/coqui-ai/TTS?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge)
+[![License](<https://img.shields.io/badge/License-MPL%202.0-brightgreen.svg>)](https://opensource.org/licenses/MPL-2.0)
 [![PyPI version](https://badge.fury.io/py/TTS.svg)](https://badge.fury.io/py/TTS)
 [![Covenant](https://camo.githubusercontent.com/7d620efaa3eac1c5b060ece5d6aacfcc8b81a74a04d05cd0398689c01c4463bb/68747470733a2f2f696d672e736869656c64732e696f2f62616467652f436f6e7472696275746f72253230436f76656e616e742d76322e3025323061646f707465642d6666363962342e737667)](https://github.com/coqui-ai/TTS/blob/master/CODE_OF_CONDUCT.md)
 [![Downloads](https://pepy.tech/badge/tts)](https://pepy.tech/project/tts)
 [![DOI](https://zenodo.org/badge/265612440.svg)](https://zenodo.org/badge/latestdoi/265612440)
+
+![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/aux_tests.yml/badge.svg)
+![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/data_tests.yml/badge.svg)
+![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/docker.yaml/badge.svg)
+![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/inference_tests.yml/badge.svg)
+![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/style_check.yml/badge.svg)
+![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/text_tests.yml/badge.svg)
+![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/tts_tests.yml/badge.svg)
+![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/vocoder_tests.yml/badge.svg)
+![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/zoo_tests.yml/badge.svg)
 [![Docs](<https://readthedocs.org/projects/tts/badge/?version=latest&style=plastic>)](https://tts.readthedocs.io/en/latest/)
-[![Gitter](https://badges.gitter.im/coqui-ai/TTS.svg)](https://gitter.im/coqui-ai/TTS?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge)
-[![License](<https://img.shields.io/badge/License-MPL%202.0-brightgreen.svg>)](https://opensource.org/licenses/MPL-2.0)

 📰 [**Subscribe to 🐸Coqui.ai Newsletter**](https://coqui.ai/?subscription=true)
@@ -104,7 +112,7 @@ Underlined "TTS*" and "Judy*" are 🐸TTS models
 You can also help us implement more models.

 ## Install TTS
-🐸TTS is tested on Ubuntu 18.04 with **python >= 3.6, < 3.9**.
+🐸TTS is tested on Ubuntu 18.04 with **python >= 3.7, < 3.11.**.
 If you are only interested in [synthesizing speech](https://tts.readthedocs.io/en/latest/inference.html) with the released 🐸TTS models, installing from PyPI is the easiest option.

View File

@@ -119,6 +119,26 @@
                 "license": "apache 2.0",
                 "contact": "egolge@coqui.com"
             }
+        },
+        "blizzard2013": {
+            "capacitron-t2-c50": {
+                "description": "Capacitron additions to Tacotron 2 with Capacity at 50 as in https://arxiv.org/pdf/1906.03402.pdf",
+                "github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.0_models/tts_models--en--blizzard2013--capacitron-t2-c50.zip",
+                "commit": "d6284e7",
+                "default_vocoder": "vocoder_models/en/blizzard2013/hifigan_v2",
+                "author": "Adam Froghyar @a-froghyar",
+                "license": "apache 2.0",
+                "contact": "adamfroghyar@gmail.com"
+            },
+            "capacitron-t2-c150": {
+                "description": "Capacitron additions to Tacotron 2 with Capacity at 150 as in https://arxiv.org/pdf/1906.03402.pdf",
+                "github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.0_models/tts_models--en--blizzard2013--capacitron-t2-c150.zip",
+                "commit": "d6284e7",
+                "default_vocoder": "vocoder_models/en/blizzard2013/hifigan_v2",
+                "author": "Adam Froghyar @a-froghyar",
+                "license": "apache 2.0",
+                "contact": "adamfroghyar@gmail.com"
+            }
         }
     },
     "es": {
@@ -379,6 +399,16 @@
                 "contact": "egolge@coqui.ai"
             }
         },
+        "blizzard2013": {
+            "hifigan_v2": {
+                "description": "HiFiGAN_v2 LJSpeech vocoder from https://arxiv.org/abs/2010.05646.",
+                "github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.0_models/vocoder_models--en--blizzard2013--hifigan_v2.zip",
+                "commit": "d6284e7",
+                "author": "Adam Froghyar @a-froghyar",
+                "license": "apache 2.0",
+                "contact": "adamfroghyar@gmail.com"
+            }
+        },
         "vctk": {
             "hifigan_v2": {
                 "description": "Finetuned and intended to be used with tts_models/en/vctk/sc-glow-tts",

View File

@@ -1 +1 @@
-0.6.2
+0.7.0

View File

@@ -2,51 +2,48 @@ import argparse
 import os
 from argparse import RawTextHelpFormatter

+import torch
 from tqdm import tqdm

 from TTS.config import load_config
 from TTS.tts.datasets import load_tts_samples
+from TTS.tts.utils.managers import save_file
 from TTS.tts.utils.speakers import SpeakerManager

 parser = argparse.ArgumentParser(
     description="""Compute embedding vectors for each wav file in a dataset.\n\n"""
     """
     Example runs:
-    python TTS/bin/compute_embeddings.py speaker_encoder_model.pth speaker_encoder_config.json dataset_config.json embeddings_output_path/
+    python TTS/bin/compute_embeddings.py speaker_encoder_model.pth speaker_encoder_config.json dataset_config.json
     """,
     formatter_class=RawTextHelpFormatter,
 )
 parser.add_argument("model_path", type=str, help="Path to model checkpoint file.")
-parser.add_argument(
-    "config_path",
-    type=str,
-    help="Path to model config file.",
-)
-parser.add_argument(
-    "config_dataset_path",
-    type=str,
-    help="Path to dataset config file.",
-)
-parser.add_argument("output_path", type=str, help="path for output speakers.json and/or speakers.npy.")
-parser.add_argument(
-    "--old_file", type=str, help="Previous speakers.json file, only compute for new audios.", default=None
-)
-parser.add_argument("--use_cuda", type=bool, help="flag to set cuda.", default=True)
-parser.add_argument("--eval", type=bool, help="compute eval.", default=True)
+parser.add_argument("config_path", type=str, help="Path to model config file.")
+parser.add_argument("config_dataset_path", type=str, help="Path to dataset config file.")
+parser.add_argument("--output_path", type=str, help="Path for output `pth` or `json` file.", default="speakers.pth")
+parser.add_argument("--old_file", type=str, help="Previous embedding file to only compute new audios.", default=None)
+parser.add_argument("--disable_cuda", type=bool, help="Flag to disable cuda.", default=False)
+parser.add_argument("--no_eval", type=bool, help="Do not compute eval?. Default False", default=False)

 args = parser.parse_args()

+use_cuda = torch.cuda.is_available() and not args.disable_cuda
+
 c_dataset = load_config(args.config_dataset_path)

-meta_data_train, meta_data_eval = load_tts_samples(c_dataset.datasets, eval_split=args.eval)
-wav_files = meta_data_train + meta_data_eval
+meta_data_train, meta_data_eval = load_tts_samples(c_dataset.datasets, eval_split=not args.no_eval)
+
+if meta_data_eval is None:
+    wav_files = meta_data_train
+else:
+    wav_files = meta_data_train + meta_data_eval

 encoder_manager = SpeakerManager(
     encoder_model_path=args.model_path,
     encoder_config_path=args.config_path,
     d_vectors_file_path=args.old_file,
-    use_cuda=args.use_cuda,
+    use_cuda=use_cuda,
 )

 class_name_key = encoder_manager.encoder_config.class_name_key
@@ -75,13 +72,13 @@ for idx, wav_file in enumerate(tqdm(wav_files)):
 if speaker_mapping:
     # save speaker_mapping if target dataset is defined
-    if ".json" not in args.output_path:
-        mapping_file_path = os.path.join(args.output_path, "speakers.json")
+    if os.path.isdir(args.output_path):
+        mapping_file_path = os.path.join(args.output_path, "speakers.pth")
     else:
         mapping_file_path = args.output_path
-    os.makedirs(os.path.dirname(mapping_file_path), exist_ok=True)
-    # pylint: disable=W0212
-    encoder_manager._save_json(mapping_file_path, speaker_mapping)
+
+    if os.path.dirname(mapping_file_path) != "":
+        os.makedirs(os.path.dirname(mapping_file_path), exist_ok=True)
+
+    save_file(speaker_mapping, mapping_file_path)

     print("Speaker embeddings saved at:", mapping_file_path)

View File

@@ -39,6 +39,18 @@ If you don't specify any models, then it uses LJSpeech based English model.
    $ tts --list_models
    ```

+- Query info for model info by idx:
+    ```
+    $ tts --model_info_by_idx "<model_type>/<model_query_idx>"
+    ```
+
+- Query info for model info by full name:
+    ```
+    $ tts --model_info_by_name "<model_type>/<language>/<dataset>/<model_name>"
+    ```
+
- Run TTS with default models:
    ```
@@ -48,7 +60,7 @@ If you don't specify any models, then it uses LJSpeech based English model.
- Run a TTS model with its default vocoder model:
    ```
-    $ tts --text "Text for TTS" --model_name "<language>/<dataset>/<model_name>
+    $ tts --text "Text for TTS" --model_name "<language>/<dataset>/<model_name>"
    ```

- Run with specific TTS and vocoder models from the list:
@@ -104,6 +116,21 @@ If you don't specify any models, then it uses LJSpeech based English model.
        default=False,
        help="list available pre-trained TTS and vocoder models.",
    )
+
+    parser.add_argument(
+        "--model_info_by_idx",
+        type=str,
+        default=None,
+        help="model info using query format: <model_type>/<model_query_idx>",
+    )
+
+    parser.add_argument(
+        "--model_info_by_name",
+        type=str,
+        default=None,
+        help="model info using query format: <model_type>/<language>/<dataset>/<model_name>",
+    )
+
    parser.add_argument("--text", type=str, default=None, help="Text to generate speech.")

    # Args for running pre-trained TTS models.
@@ -171,7 +198,11 @@ If you don't specify any models, then it uses LJSpeech based English model.
        help="wav file(s) to condition a multi-speaker TTS model with a Speaker Encoder. You can give multiple file paths. The d_vectors is computed as their average.",
        default=None,
    )
-    parser.add_argument("--gst_style", help="Wav path file for GST stylereference.", default=None)
+    parser.add_argument("--gst_style", help="Wav path file for GST style reference.", default=None)
+    parser.add_argument(
+        "--capacitron_style_wav", type=str, help="Wav path file for Capacitron prosody reference.", default=None
+    )
+    parser.add_argument("--capacitron_style_text", type=str, help="Transcription of the reference.", default=None)
    parser.add_argument(
        "--list_speaker_idxs",
        help="List available speaker ids for the defined multi-speaker model.",
@@ -210,13 +241,16 @@ If you don't specify any models, then it uses LJSpeech based English model.
    args = parser.parse_args()

    # print the description if either text or list_models is not set
-    if (
-        not args.text
-        and not args.list_models
-        and not args.list_speaker_idxs
-        and not args.list_language_idxs
-        and not args.reference_wav
-    ):
+    check_args = [
+        args.text,
+        args.list_models,
+        args.list_speaker_idxs,
+        args.list_language_idxs,
+        args.reference_wav,
+        args.model_info_by_idx,
+        args.model_info_by_name,
+    ]
+    if not any(check_args):
        parser.parse_args(["-h"])

    # load model manager
@@ -232,12 +266,23 @@ If you don't specify any models, then it uses LJSpeech based English model.
    encoder_path = None
    encoder_config_path = None

-    # CASE1: list pre-trained TTS models
+    # CASE1 #list : list pre-trained TTS models
    if args.list_models:
        manager.list_models()
        sys.exit()

-    # CASE2: load pre-trained model paths
+    # CASE2 #info : model info of pre-trained TTS models
+    if args.model_info_by_idx:
+        model_query = args.model_info_by_idx
+        manager.model_info_by_idx(model_query)
+        sys.exit()
+
+    if args.model_info_by_name:
+        model_query_full_name = args.model_info_by_name
+        manager.model_info_by_full_name(model_query_full_name)
+        sys.exit()
+
+    # CASE3: load pre-trained model paths
    if args.model_name is not None and not args.model_path:
        model_path, config_path, model_item = manager.download_model(args.model_name)
        args.vocoder_name = model_item["default_vocoder"] if args.vocoder_name is None else args.vocoder_name
@@ -245,7 +290,7 @@ If you don't specify any models, then it uses LJSpeech based English model.
    if args.vocoder_name is not None and not args.vocoder_path:
        vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name)

-    # CASE3: set custom model paths
+    # CASE4: set custom model paths
    if args.model_path is not None:
        model_path = args.model_path
        config_path = args.config_path
@@ -308,6 +353,8 @@ If you don't specify any models, then it uses LJSpeech based English model.
            args.language_idx,
            args.speaker_wav,
            reference_wav=args.reference_wav,
+            style_wav=args.capacitron_style_wav,
+            style_text=args.capacitron_style_text,
            reference_speaker_name=args.reference_speaker_idx,
        )
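For Capacitron checkpoints, inference can now also be conditioned on a prosody reference; an illustrative invocation (file names are placeholders) would be `tts --text "Text for TTS" --model_name tts_models/en/blizzard2013/capacitron-t2-c150 --capacitron_style_wav ref.wav --capacitron_style_text "Transcription of ref.wav" --out_path out.wav`, with the two new flags forwarded to the synthesizer as `style_wav` and `style_text` as shown above.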

View File

@@ -1,9 +1,3 @@
-<!-- ## TTS example web-server
-You'll need a model package (Zip file, includes TTS Python wheel, model files, server configuration, and optional nginx/uwsgi configs). Publicly available models are listed [here](https://github.com/mozilla/TTS/wiki/Released-Models#simple-packaging---self-contained-package-that-runs-an-http-api-for-a-pre-trained-tts-model).
-Instructions below are based on a Ubuntu 18.04 machine, but it should be simple to adapt the package names to other distros if needed. Python 3.6 is recommended, as some of the dependencies' versions predate Python 3.7 and will force building from source, which requires extra dependencies and is not guaranteed to work. -->
 # :frog: TTS demo server
 Before you use the server, make sure you [install](https://github.com/coqui-ai/TTS/tree/dev#install-tts)) :frog: TTS properly. Then, you can follow the steps below.

View File

@@ -111,7 +111,10 @@ synthesizer = Synthesizer(
     use_cuda=args.use_cuda,
 )

-use_multi_speaker = hasattr(synthesizer.tts_model, "num_speakers") and synthesizer.tts_model.num_speakers > 1
+use_multi_speaker = hasattr(synthesizer.tts_model, "num_speakers") and (
+    synthesizer.tts_model.num_speakers > 1 or synthesizer.tts_speakers_file is not None
+)
 speaker_manager = getattr(synthesizer.tts_model, "speaker_manager", None)
 # TODO: set this from SpeakerManager
 use_gst = synthesizer.tts_config.get("use_gst", False)

View File

@@ -48,6 +48,50 @@ class GSTConfig(Coqpit):
         check_argument("gst_num_style_tokens", c, restricted=True, min_val=1, max_val=1000)

+@dataclass
+class CapacitronVAEConfig(Coqpit):
+    """Defines the capacitron VAE Module
+    Args:
+        capacitron_capacity (int):
+            Defines the variational capacity limit of the prosody embeddings. Defaults to 150.
+        capacitron_VAE_embedding_dim (int):
+            Defines the size of the Capacitron embedding vector dimension. Defaults to 128.
+        capacitron_use_text_summary_embeddings (bool):
+            If True, use a text summary embedding in Capacitron. Defaults to True.
+        capacitron_text_summary_embedding_dim (int):
+            Defines the size of the capacitron text embedding vector dimension. Defaults to 128.
+        capacitron_use_speaker_embedding (bool):
+            if True use speaker embeddings in Capacitron. Defaults to False.
+        capacitron_VAE_loss_alpha (float):
+            Weight for the VAE loss of the Tacotron model. If set less than or equal to zero, it disables the
+            corresponding loss function. Defaults to 0.25
+        capacitron_grad_clip (float):
+            Gradient clipping value for all gradients except beta. Defaults to 5.0
+    """
+
+    capacitron_loss_alpha: int = 1
+    capacitron_capacity: int = 150
+    capacitron_VAE_embedding_dim: int = 128
+    capacitron_use_text_summary_embeddings: bool = True
+    capacitron_text_summary_embedding_dim: int = 128
+    capacitron_use_speaker_embedding: bool = False
+    capacitron_VAE_loss_alpha: float = 0.25
+    capacitron_grad_clip: float = 5.0
+
+    def check_values(
+        self,
+    ):
+        """Check config fields"""
+        c = asdict(self)
+        super().check_values()
+        check_argument("capacitron_capacity", c, restricted=True, min_val=10, max_val=500)
+        check_argument("capacitron_VAE_embedding_dim", c, restricted=True, min_val=16, max_val=1024)
+        check_argument("capacitron_use_speaker_embedding", c, restricted=False)
+        check_argument("capacitron_text_summary_embedding_dim", c, restricted=False, min_val=16, max_val=512)
+        check_argument("capacitron_VAE_loss_alpha", c, restricted=False)
+        check_argument("capacitron_grad_clip", c, restricted=False)
+
 @dataclass
 class CharactersConfig(Coqpit):
     """Defines arguments for the `BaseCharacters` or `BaseVocabulary` and their subclasses.
@@ -232,6 +276,14 @@ class BaseTTSConfig(BaseTrainingConfig):
         language_weighted_sampler_alpha (float):
             Number that control the influence of the language sampler weights. Defaults to ```1.0```.

+        use_length_weighted_sampler (bool):
+            Enable / Disable the batch balancer by audio length. If enabled the dataset will be divided
+            into 10 buckets considering the min and max audio of the dataset. The sampler weights will be
+            computed forcing to have the same quantity of data for each bucket in each training batch. Defaults to ```False```.
+
+        length_weighted_sampler_alpha (float):
+            Number that control the influence of the length sampler weights. Defaults to ```1.0```.
     """

     audio: BaseAudioConfig = field(default_factory=BaseAudioConfig)
@@ -279,3 +331,5 @@ class BaseTTSConfig(BaseTrainingConfig):
     speaker_weighted_sampler_alpha: float = 1.0
     use_language_weighted_sampler: bool = False
     language_weighted_sampler_alpha: float = 1.0
+    use_length_weighted_sampler: bool = False
+    length_weighted_sampler_alpha: float = 1.0

View File

@@ -1,7 +1,7 @@
 from dataclasses import dataclass, field
 from typing import List

-from TTS.tts.configs.shared_configs import BaseTTSConfig, GSTConfig
+from TTS.tts.configs.shared_configs import BaseTTSConfig, CapacitronVAEConfig, GSTConfig

 @dataclass
@@ -23,6 +23,10 @@ class TacotronConfig(BaseTTSConfig):
         gst_style_input (str):
             Path to the wav file used at inference to set the speech style through GST. If `GST` is enabled and
             this is not defined, the model uses a zero vector as an input. Defaults to None.
+        use_capacitron_vae (bool):
+            enable / disable the use of Capacitron modules. Defaults to False.
+        capacitron_vae (CapacitronConfig):
+            Instance of `CapacitronConfig` class.
         num_chars (int):
             Number of characters used by the model. It must be defined before initializing the model. Defaults to None.
         num_speakers (int):
@@ -143,6 +147,9 @@ class TacotronConfig(BaseTTSConfig):
     gst: GSTConfig = None
     gst_style_input: str = None

+    use_capacitron_vae: bool = False
+    capacitron_vae: CapacitronVAEConfig = None
+
     # model specific params
     num_speakers: int = 1
     num_chars: int = 0
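Putting the two config additions together, enabling the module from user code could look roughly like the sketch below (a minimal illustration using the field names added in this PR, not a recipe taken from the PR itself; dataset, audio and trainer settings are omitted):

```
from TTS.tts.configs.shared_configs import CapacitronVAEConfig
from TTS.tts.configs.tacotron2_config import Tacotron2Config

# Minimal sketch: switch the Capacitron VAE on for a Tacotron 2 config.
config = Tacotron2Config(
    use_capacitron_vae=True,
    capacitron_vae=CapacitronVAEConfig(
        capacitron_capacity=50,            # the capacity limit "C" used in the KL term
        capacitron_VAE_embedding_dim=128,
        capacitron_use_text_summary_embeddings=True,
    ),
)
config.capacitron_vae.check_values()       # optional: run the bounds checks added above
```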

View File

@@ -5,6 +5,7 @@ from glob import glob
 from pathlib import Path
 from typing import List

+import pandas as pd
 from tqdm import tqdm

 ########################
@@ -12,6 +13,34 @@ from tqdm import tqdm
 ########################

+def coqui(root_path, meta_file, ignored_speakers=None):
+    """Interal dataset formatter."""
+    metadata = pd.read_csv(os.path.join(root_path, meta_file), sep="|")
+    assert all(x in metadata.columns for x in ["audio_file", "text"])
+    speaker_name = None if "speaker_name" in metadata.columns else "coqui"
+    emotion_name = None if "emotion_name" in metadata.columns else "neutral"
+    items = []
+    not_found_counter = 0
+    for row in metadata.itertuples():
+        if speaker_name is None and ignored_speakers is not None and row.speaker_name in ignored_speakers:
+            continue
+        audio_path = os.path.join(root_path, row.audio_file)
+        if not os.path.exists(audio_path):
+            not_found_counter += 1
+            continue
+        items.append(
+            {
+                "text": row.text,
+                "audio_file": audio_path,
+                "speaker_name": speaker_name if speaker_name is not None else row.speaker_name,
+                "emotion_name": emotion_name if emotion_name is not None else row.emotion_name,
+            }
+        )
+    if not_found_counter > 0:
+        print(f" | > [!] {not_found_counter} files not found")
+    return items
+
+
 def tweb(root_path, meta_file, **kwargs):  # pylint: disable=unused-argument
     """Normalize TWEB dataset.
     https://www.kaggle.com/bryanpark/the-world-english-bible-speech-dataset
@@ -141,6 +170,21 @@ def ljspeech_test(root_path, meta_file, **kwargs):  # pylint: disable=unused-arg
     return items

+def thorsten(root_path, meta_file, **kwargs):  # pylint: disable=unused-argument
+    """Normalizes the thorsten meta data file to TTS format
+    https://github.com/thorstenMueller/deep-learning-german-tts/"""
+    txt_file = os.path.join(root_path, meta_file)
+    items = []
+    speaker_name = "thorsten"
+    with open(txt_file, "r", encoding="utf-8") as ttf:
+        for line in ttf:
+            cols = line.split("|")
+            wav_file = os.path.join(root_path, "wavs", cols[0] + ".wav")
+            text = cols[1]
+            items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name})
+    return items
+
+
 def sam_accenture(root_path, meta_file, **kwargs):  # pylint: disable=unused-argument
     """Normalizes the sam-accenture meta data file to TTS format
     https://github.com/Sam-Accenture-Non-Binary-Voice/non-binary-voice-files"""
@@ -352,6 +396,25 @@ def vctk_old(root_path, meta_files=None, wavs_path="wav48", ignored_speakers=Non
     return items

+def synpaflex(root_path, metafiles=None, **kwargs):  # pylint: disable=unused-argument
+    items = []
+    speaker_name = "synpaflex"
+    root_path = os.path.join(root_path, "")
+    wav_files = glob(f"{root_path}**/*.wav", recursive=True)
+    for wav_file in wav_files:
+        if os.sep + "wav" + os.sep in wav_file:
+            txt_file = wav_file.replace("wav", "txt")
+        else:
+            txt_file = os.path.join(
+                os.path.dirname(wav_file), "txt", os.path.basename(wav_file).replace(".wav", ".txt")
+            )
+        if os.path.exists(txt_file) and os.path.exists(wav_file):
+            with open(txt_file, "r", encoding="utf-8") as file_text:
+                text = file_text.readlines()[0]
+            items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name})
+    return items
+
+
 def open_bible(root_path, meta_files="train", ignore_digits_sentences=True, ignored_speakers=None):
     """ToDo: Refer the paper when available"""
     items = []
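For orientation, the new `coqui` formatter reads a pipe-separated metadata file whose required columns are `audio_file` and `text` (`speaker_name` and `emotion_name` are optional). A file of that shape could be produced like this (illustrative only; the paths and sentences are made up):

```
import pandas as pd

# Columns asserted on by coqui(): "audio_file" and "text".
# "speaker_name"/"emotion_name" fall back to "coqui"/"neutral" when absent.
rows = [
    {"audio_file": "wavs/clip_0001.wav", "text": "Hello there.", "speaker_name": "spk1"},
    {"audio_file": "wavs/clip_0002.wav", "text": "A second utterance.", "speaker_name": "spk1"},
]
pd.DataFrame(rows).to_csv("metadata.csv", sep="|", index=False)
```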

View File

@@ -281,6 +281,10 @@ class TacotronLoss(torch.nn.Module):
     def __init__(self, c, ga_sigma=0.4):
         super().__init__()
         self.stopnet_pos_weight = c.stopnet_pos_weight
+        self.use_capacitron_vae = c.use_capacitron_vae
+        if self.use_capacitron_vae:
+            self.capacitron_capacity = c.capacitron_vae.capacitron_capacity
+            self.capacitron_vae_loss_alpha = c.capacitron_vae.capacitron_VAE_loss_alpha
         self.ga_alpha = c.ga_alpha
         self.decoder_diff_spec_alpha = c.decoder_diff_spec_alpha
         self.postnet_diff_spec_alpha = c.postnet_diff_spec_alpha
@@ -308,6 +312,9 @@ class TacotronLoss(torch.nn.Module):
         # pylint: disable=not-callable
         self.criterion_st = BCELossMasked(pos_weight=torch.tensor(self.stopnet_pos_weight)) if c.stopnet else None

+        # For dev pruposes only
+        self.criterion_capacitron_reconstruction_loss = nn.L1Loss(reduction="sum")
+
     def forward(
         self,
         postnet_output,
@@ -317,6 +324,7 @@ class TacotronLoss(torch.nn.Module):
         stopnet_output,
         stopnet_target,
         stop_target_length,
+        capacitron_vae_outputs,
         output_lens,
         decoder_b_output,
         alignments,
@@ -348,6 +356,55 @@ class TacotronLoss(torch.nn.Module):
         return_dict["decoder_loss"] = decoder_loss
         return_dict["postnet_loss"] = postnet_loss

+        if self.use_capacitron_vae:
+            # extract capacitron vae infos
+            posterior_distribution, prior_distribution, beta = capacitron_vae_outputs
+
+            # KL divergence term between the posterior and the prior
+            kl_term = torch.mean(torch.distributions.kl_divergence(posterior_distribution, prior_distribution))
+
+            # Limit the mutual information between the data and latent space by the variational capacity limit
+            kl_capacity = kl_term - self.capacitron_capacity
+
+            # pass beta through softplus to keep it positive
+            beta = torch.nn.functional.softplus(beta)[0]
+
+            # This is the term going to the main ADAM optimiser, we detach beta because
+            # beta is optimised by a separate, SGD optimiser below
+            capacitron_vae_loss = beta.detach() * kl_capacity
+
+            # normalize the capacitron_vae_loss as in L1Loss or MSELoss.
+            # After this, both the standard loss and capacitron_vae_loss will be in the same scale.
+            # For this reason we don't need use L1Loss and MSELoss in "sum" reduction mode.
+            # Note: the batch is not considered because the L1Loss was calculated in "sum" mode
+            # divided by the batch size, So not dividing the capacitron_vae_loss by B is legitimate.
+
+            # get B T D dimension from input
+            B, T, D = mel_input.size()
+            # normalize
+            if self.config.loss_masking:
+                # if mask loss get T using the mask
+                T = output_lens.sum() / B
+
+            # Only for dev purposes to be able to compare the reconstruction loss with the values in the
+            # original Capacitron paper
+            return_dict["capaciton_reconstruction_loss"] = (
+                self.criterion_capacitron_reconstruction_loss(decoder_output, mel_input) / decoder_output.size(0)
+            ) + kl_capacity
+
+            capacitron_vae_loss = capacitron_vae_loss / (T * D)
+            capacitron_vae_loss = capacitron_vae_loss * self.capacitron_vae_loss_alpha
+
+            # This is the term to purely optimise beta and to pass into the SGD optimizer
+            beta_loss = torch.negative(beta) * kl_capacity.detach()
+
+            loss += capacitron_vae_loss
+
+            return_dict["capacitron_vae_loss"] = capacitron_vae_loss
+            return_dict["capacitron_vae_beta_loss"] = beta_loss
+            return_dict["capacitron_vae_kl_term"] = kl_term
+            return_dict["capacitron_beta"] = beta
+
         stop_loss = (
             self.criterion_st(stopnet_output, stopnet_target, stop_target_length)
             if self.config.stopnet
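Read as equations, the block above implements the capacity-constrained objective of the Capacitron paper with a dual optimiser: the main (Adam) loss receives $\mathcal{L}_{\text{VAE}} = \operatorname{sg}[\beta]\,\bigl(\mathrm{KL}(q(z\mid x)\,\|\,p(z)) - C\bigr)$, later scaled by $\alpha/(T \cdot D)$, while the separate SGD step for $\beta$ minimises $\mathcal{L}_{\beta} = -\beta\,\operatorname{sg}\bigl[\mathrm{KL} - C\bigr]$. Here $C$ is `capacitron_capacity`, $\beta$ is kept positive through a softplus, and $\operatorname{sg}[\cdot]$ denotes `detach()` (stop-gradient).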

View File

@@ -484,4 +484,4 @@ def init_attn(
             beta=0.9,
         )
-    raise RuntimeError(" [!] Given Attention Type '{attn_type}' is not exist.")
+    raise RuntimeError(f" [!] Given Attention Type '{attn_type}' is not exist.")

View File

@@ -0,0 +1,205 @@
import torch
from torch import nn
from torch.distributions.multivariate_normal import MultivariateNormal as MVN
from torch.nn import functional as F
class CapacitronVAE(nn.Module):
"""Effective Use of Variational Embedding Capacity for prosody transfer.
See https://arxiv.org/abs/1906.03402"""
def __init__(
self,
num_mel,
capacitron_VAE_embedding_dim,
encoder_output_dim=256,
reference_encoder_out_dim=128,
speaker_embedding_dim=None,
text_summary_embedding_dim=None,
):
super().__init__()
# Init distributions
self.prior_distribution = MVN(
torch.zeros(capacitron_VAE_embedding_dim), torch.eye(capacitron_VAE_embedding_dim)
)
self.approximate_posterior_distribution = None
# define output ReferenceEncoder dim to the capacitron_VAE_embedding_dim
self.encoder = ReferenceEncoder(num_mel, out_dim=reference_encoder_out_dim)
# Init beta, the lagrange-like term for the KL distribution
self.beta = torch.nn.Parameter(torch.log(torch.exp(torch.Tensor([1.0])) - 1), requires_grad=True)
mlp_input_dimension = reference_encoder_out_dim
if text_summary_embedding_dim is not None:
self.text_summary_net = TextSummary(text_summary_embedding_dim, encoder_output_dim=encoder_output_dim)
mlp_input_dimension += text_summary_embedding_dim
if speaker_embedding_dim is not None:
# TODO: Test a multispeaker model!
mlp_input_dimension += speaker_embedding_dim
self.post_encoder_mlp = PostEncoderMLP(mlp_input_dimension, capacitron_VAE_embedding_dim)
def forward(self, reference_mel_info=None, text_info=None, speaker_embedding=None):
# Use reference
if reference_mel_info is not None:
reference_mels = reference_mel_info[0] # [batch_size, num_frames, num_mels]
mel_lengths = reference_mel_info[1] # [batch_size]
enc_out = self.encoder(reference_mels, mel_lengths)
# concat speaker_embedding and/or text summary embedding
if text_info is not None:
text_inputs = text_info[0] # [batch_size, num_characters, num_embedding]
input_lengths = text_info[1]
text_summary_out = self.text_summary_net(text_inputs, input_lengths).to(reference_mels.device)
enc_out = torch.cat([enc_out, text_summary_out], dim=-1)
if speaker_embedding is not None:
enc_out = torch.cat([enc_out, speaker_embedding], dim=-1)
# Feed the output of the ref encoder and information about text/speaker into
# an MLP to produce the parameteres for the approximate poterior distributions
mu, sigma = self.post_encoder_mlp(enc_out)
# convert to cpu because prior_distribution was created on cpu
mu = mu.cpu()
sigma = sigma.cpu()
# Sample from the posterior: z ~ q(z|x)
self.approximate_posterior_distribution = MVN(mu, torch.diag_embed(sigma))
VAE_embedding = self.approximate_posterior_distribution.rsample()
# Infer from the model, bypasses encoding
else:
# Sample from the prior: z ~ p(z)
VAE_embedding = self.prior_distribution.sample().unsqueeze(0)
# reshape to [batch_size, 1, capacitron_VAE_embedding_dim]
return VAE_embedding.unsqueeze(1), self.approximate_posterior_distribution, self.prior_distribution, self.beta
class ReferenceEncoder(nn.Module):
"""NN module creating a fixed size prosody embedding from a spectrogram.
inputs: mel spectrograms [batch_size, num_spec_frames, num_mel]
outputs: [batch_size, embedding_dim]
"""
def __init__(self, num_mel, out_dim):
super().__init__()
self.num_mel = num_mel
filters = [1] + [32, 32, 64, 64, 128, 128]
num_layers = len(filters) - 1
convs = [
nn.Conv2d(
in_channels=filters[i], out_channels=filters[i + 1], kernel_size=(3, 3), stride=(2, 2), padding=(2, 2)
)
for i in range(num_layers)
]
self.convs = nn.ModuleList(convs)
self.training = False
self.bns = nn.ModuleList([nn.BatchNorm2d(num_features=filter_size) for filter_size in filters[1:]])
post_conv_height = self.calculate_post_conv_height(num_mel, 3, 2, 2, num_layers)
self.recurrence = nn.LSTM(
input_size=filters[-1] * post_conv_height, hidden_size=out_dim, batch_first=True, bidirectional=False
)
def forward(self, inputs, input_lengths):
batch_size = inputs.size(0)
x = inputs.view(batch_size, 1, -1, self.num_mel) # [batch_size, num_channels==1, num_frames, num_mel]
valid_lengths = input_lengths.float() # [batch_size]
for conv, bn in zip(self.convs, self.bns):
x = conv(x)
x = bn(x)
x = F.relu(x)
# Create the post conv width mask based on the valid lengths of the output of the convolution.
# The valid lengths for the output of a convolution on varying length inputs is
# ceil(input_length/stride) + 1 for stride=3 and padding=2
# For example (kernel_size=3, stride=2, padding=2):
# 0 0 x x x x x 0 0 -> Input = 5, 0 is zero padding, x is valid values coming from padding=2 in conv2d
# _____
# x _____
# x _____
# x ____
# x
# x x x x -> Output valid length = 4
# Since every example in te batch is zero padded and therefore have separate valid_lengths,
# we need to mask off all the values AFTER the valid length for each example in the batch.
# Otherwise, the convolutions create noise and a lot of not real information
valid_lengths = (valid_lengths / 2).float()
valid_lengths = torch.ceil(valid_lengths).to(dtype=torch.int64) + 1 # 2 is stride -- size: [batch_size]
post_conv_max_width = x.size(2)
mask = torch.arange(post_conv_max_width).to(inputs.device).expand(
len(valid_lengths), post_conv_max_width
) < valid_lengths.unsqueeze(1)
mask = mask.expand(1, 1, -1, -1).transpose(2, 0).transpose(-1, 2) # [batch_size, 1, post_conv_max_width, 1]
x = x * mask
x = x.transpose(1, 2)
# x: 4D tensor [batch_size, post_conv_width,
# num_channels==128, post_conv_height]
post_conv_width = x.size(1)
x = x.contiguous().view(batch_size, post_conv_width, -1)
# x: 3D tensor [batch_size, post_conv_width,
# num_channels*post_conv_height]
# Routine for fetching the last valid output of a dynamic LSTM with varying input lengths and padding
post_conv_input_lengths = valid_lengths
packed_seqs = nn.utils.rnn.pack_padded_sequence(
x, post_conv_input_lengths.tolist(), batch_first=True, enforce_sorted=False
) # dynamic rnn sequence padding
self.recurrence.flatten_parameters()
_, (ht, _) = self.recurrence(packed_seqs)
last_output = ht[-1]
return last_output.to(inputs.device) # [B, 128]
@staticmethod
def calculate_post_conv_height(height, kernel_size, stride, pad, n_convs):
"""Height of spec after n convolutions with fixed kernel/stride/pad."""
for _ in range(n_convs):
height = (height - kernel_size + 2 * pad) // stride + 1
return height
class TextSummary(nn.Module):
def __init__(self, embedding_dim, encoder_output_dim):
super().__init__()
self.lstm = nn.LSTM(
encoder_output_dim, # text embedding dimension from the text encoder
embedding_dim, # fixed length output summary the lstm creates from the input
batch_first=True,
bidirectional=False,
)
def forward(self, inputs, input_lengths):
# Routine for fetching the last valid output of a dynamic LSTM with varying input lengths and padding
packed_seqs = nn.utils.rnn.pack_padded_sequence(
inputs, input_lengths.tolist(), batch_first=True, enforce_sorted=False
) # dynamic rnn sequence padding
self.lstm.flatten_parameters()
_, (ht, _) = self.lstm(packed_seqs)
last_output = ht[-1]
return last_output
class PostEncoderMLP(nn.Module):
def __init__(self, input_size, hidden_size):
super().__init__()
self.hidden_size = hidden_size
modules = [
nn.Linear(input_size, hidden_size), # Hidden Layer
nn.Tanh(),
nn.Linear(hidden_size, hidden_size * 2),
] # Output layer twice the size for mean and variance
self.net = nn.Sequential(*modules)
self.softplus = nn.Softplus()
def forward(self, _input):
mlp_output = self.net(_input)
# The mean parameter is unconstrained
mu = mlp_output[:, : self.hidden_size]
# The standard deviation must be positive. Parameterise with a softplus
sigma = self.softplus(mlp_output[:, self.hidden_size :])
return mu, sigma
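As a quick shape check of the new module (a sketch only: the import path is assumed to be `TTS/tts/layers/tacotron/capacitron_layers.py`, which the hunk header above does not show, and all dimensions are arbitrary):

```
import torch
from TTS.tts.layers.tacotron.capacitron_layers import CapacitronVAE

B, T, num_mel = 2, 64, 80
layer = CapacitronVAE(num_mel=num_mel, capacitron_VAE_embedding_dim=128)

mels = torch.randn(B, T, num_mel)   # [batch, frames, mels]
mel_lens = torch.tensor([T, T])     # per-sample frame counts

emb, posterior, prior, beta = layer(reference_mel_info=[mels, mel_lens])
print(emb.shape)  # torch.Size([2, 1, 128]); concatenated onto the encoder outputs downstream
```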

View File

@@ -139,7 +139,7 @@ class MultiHeadAttention(nn.Module):
         keys = torch.stack(torch.split(keys, split_size, dim=2), dim=0)  # [h, N, T_k, num_units/h]
         values = torch.stack(torch.split(values, split_size, dim=2), dim=0)  # [h, N, T_k, num_units/h]

-        # score = softmax(QK^T / (d_k ** 0.5))
+        # score = softmax(QK^T / (d_k**0.5))
         scores = torch.matmul(queries, keys.transpose(2, 3))  # [h, N, T_q, T_k]
         scores = scores / (self.key_dim**0.5)
         scores = F.softmax(scores, dim=3)

View File

@@ -58,10 +58,8 @@ class VitsDiscriminator(nn.Module):
         use_spectral_norm (bool): if `True` swith to spectral norm instead of weight norm.
     """

-    def __init__(self, use_spectral_norm=False):
+    def __init__(self, periods=(2, 3, 5, 7, 11), use_spectral_norm=False):
         super().__init__()
-        periods = [2, 3, 5, 7, 11]
         self.nets = nn.ModuleList()
         self.nets.append(DiscriminatorS(use_spectral_norm=use_spectral_norm))
         self.nets.extend([DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods])

View File

@@ -1,6 +1,6 @@
 import copy
 from abc import abstractmethod
-from typing import Dict
+from typing import Dict, Tuple

 import torch
 from coqpit import Coqpit
@@ -10,7 +10,9 @@ from TTS.tts.layers.losses import TacotronLoss
 from TTS.tts.models.base_tts import BaseTTS
 from TTS.tts.utils.helpers import sequence_mask
 from TTS.tts.utils.speakers import SpeakerManager
+from TTS.tts.utils.synthesis import synthesis
 from TTS.tts.utils.text.tokenizer import TTSTokenizer
+from TTS.tts.utils.visual import plot_alignment, plot_spectrogram
 from TTS.utils.generic_utils import format_aux_input
 from TTS.utils.io import load_fsspec
 from TTS.utils.training import gradual_training_scheduler
@@ -47,6 +49,11 @@ class BaseTacotron(BaseTTS):
             self.decoder_in_features += self.gst.gst_embedding_dim  # add gst embedding dim
             self.gst_layer = None

+        # Capacitron
+        if self.capacitron_vae and self.use_capacitron_vae:
+            self.decoder_in_features += self.capacitron_vae.capacitron_VAE_embedding_dim  # add capacitron embedding dim
+            self.capacitron_vae_layer = None
+
         # additional layers
         self.decoder_backward = None
         self.coarse_decoder = None
@@ -125,6 +132,53 @@ class BaseTacotron(BaseTTS):
         speaker_manager = SpeakerManager.init_from_config(config)
         return BaseTacotron(config, ap, tokenizer, speaker_manager)

+    ##########################
+    # TEST AND LOG FUNCTIONS #
+    ##########################
+
+    def test_run(self, assets: Dict) -> Tuple[Dict, Dict]:
+        """Generic test run for `tts` models used by `Trainer`.
+
+        You can override this for a different behaviour.
+
+        Args:
+            assets (dict): A dict of training assets. For `tts` models, it must include `{'audio_processor': ap}`.
+
+        Returns:
+            Tuple[Dict, Dict]: Test figures and audios to be projected to Tensorboard.
+        """
+        print(" | > Synthesizing test sentences.")
+        test_audios = {}
+        test_figures = {}
+        test_sentences = self.config.test_sentences
+        aux_inputs = self._get_test_aux_input()
+        for idx, sen in enumerate(test_sentences):
+            outputs_dict = synthesis(
+                self,
+                sen,
+                self.config,
+                "cuda" in str(next(self.parameters()).device),
+                speaker_id=aux_inputs["speaker_id"],
+                d_vector=aux_inputs["d_vector"],
+                style_wav=aux_inputs["style_wav"],
+                use_griffin_lim=True,
+                do_trim_silence=False,
+            )
+            test_audios["{}-audio".format(idx)] = outputs_dict["wav"]
+            test_figures["{}-prediction".format(idx)] = plot_spectrogram(
+                outputs_dict["outputs"]["model_outputs"], self.ap, output_fig=False
+            )
+            test_figures["{}-alignment".format(idx)] = plot_alignment(
+                outputs_dict["outputs"]["alignments"], output_fig=False
+            )
+        return {"figures": test_figures, "audios": test_audios}
+
+    def test_log(
+        self, outputs: dict, logger: "Logger", assets: dict, steps: int  # pylint: disable=unused-argument
+    ) -> None:
+        logger.test_audios(steps, outputs["audios"], self.ap.sample_rate)
+        logger.test_figures(steps, outputs["figures"])
+
     #############################
     # COMMON COMPUTE FUNCTIONS
     #############################
@@ -160,7 +214,9 @@ class BaseTacotron(BaseTTS):
             )
             # scale_factor = self.decoder.r_init / self.decoder.r
             alignments_backward = torch.nn.functional.interpolate(
-                alignments_backward.transpose(1, 2), size=alignments.shape[1], mode="nearest"
+                alignments_backward.transpose(1, 2),
+                size=alignments.shape[1],
+                mode="nearest",
             ).transpose(1, 2)
             decoder_outputs_backward = decoder_outputs_backward.transpose(1, 2)
             decoder_outputs_backward = decoder_outputs_backward[:, :T, :]
@@ -193,6 +249,25 @@ class BaseTacotron(BaseTTS):
             inputs = self._concat_speaker_embedding(inputs, gst_outputs)
         return inputs

+    def compute_capacitron_VAE_embedding(self, inputs, reference_mel_info, text_info=None, speaker_embedding=None):
+        """Capacitron Variational Autoencoder"""
+        (VAE_outputs, posterior_distribution, prior_distribution, capacitron_beta,) = self.capacitron_vae_layer(
+            reference_mel_info,
+            text_info,
+            speaker_embedding,  # pylint: disable=not-callable
+        )
+
+        VAE_outputs = VAE_outputs.to(inputs.device)
+        encoder_output = self._concat_speaker_embedding(
+            inputs, VAE_outputs
+        )  # concatenate to the output of the basic tacotron encoder
+        return (
+            encoder_output,
+            posterior_distribution,
+            prior_distribution,
+            capacitron_beta,
+        )
+
     @staticmethod
     def _add_speaker_embedding(outputs, embedded_speakers):
         embedded_speakers_ = embedded_speakers.expand(outputs.size(0), outputs.size(1), -1)

View File

@ -12,6 +12,7 @@ from trainer.torch import DistributedSampler, DistributedSamplerWrapper
from TTS.model import BaseTrainerModel from TTS.model import BaseTrainerModel
from TTS.tts.datasets.dataset import TTSDataset from TTS.tts.datasets.dataset import TTSDataset
from TTS.tts.utils.data import get_length_balancer_weights
from TTS.tts.utils.languages import LanguageManager, get_language_balancer_weights from TTS.tts.utils.languages import LanguageManager, get_language_balancer_weights
from TTS.tts.utils.speakers import SpeakerManager, get_speaker_balancer_weights, get_speaker_manager from TTS.tts.utils.speakers import SpeakerManager, get_speaker_balancer_weights, get_speaker_manager
from TTS.tts.utils.synthesis import synthesis from TTS.tts.utils.synthesis import synthesis
@ -250,6 +251,14 @@ class BaseTTS(BaseTrainerModel):
else: else:
weights = get_speaker_balancer_weights(data_items) * alpha weights = get_speaker_balancer_weights(data_items) * alpha
if getattr(config, "use_length_weighted_sampler", False):
alpha = getattr(config, "length_weighted_sampler_alpha", 1.0)
print(" > Using Length weighted sampler with alpha:", alpha)
if weights is not None:
weights += get_length_balancer_weights(data_items) * alpha
else:
weights = get_length_balancer_weights(data_items) * alpha
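A minimal sketch of the two new config fields read above; any object exposing these attributes satisfies the `getattr()` checks, so `SimpleNamespace` stands in for the real Coqpit config:

    from types import SimpleNamespace

    config = SimpleNamespace(
        use_length_weighted_sampler=True,   # enable bucket-based length balancing
        length_weighted_sampler_alpha=1.0,  # scale of the length weights when summed with other sampler weights
    )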
if weights is not None: if weights is not None:
sampler = WeightedRandomSampler(weights, len(weights)) sampler = WeightedRandomSampler(weights, len(weights))
else: else:
@ -398,16 +407,16 @@ class BaseTTS(BaseTrainerModel):
return test_figures, test_audios return test_figures, test_audios
def on_init_start(self, trainer): def on_init_start(self, trainer):
"""Save the speaker.json and language_ids.json at the beginning of the training. Also update both paths.""" """Save the speaker.pth and language_ids.json at the beginning of the training. Also update both paths."""
if self.speaker_manager is not None: if self.speaker_manager is not None:
output_path = os.path.join(trainer.output_path, "speakers.json") output_path = os.path.join(trainer.output_path, "speakers.pth")
self.speaker_manager.save_ids_to_file(output_path) self.speaker_manager.save_ids_to_file(output_path)
trainer.config.speakers_file = output_path trainer.config.speakers_file = output_path
# some models don't have `model_args` set # some models don't have `model_args` set
if hasattr(trainer.config, "model_args"): if hasattr(trainer.config, "model_args"):
trainer.config.model_args.speakers_file = output_path trainer.config.model_args.speakers_file = output_path
trainer.config.save_json(os.path.join(trainer.output_path, "config.json")) trainer.config.save_json(os.path.join(trainer.output_path, "config.json"))
print(f" > `speakers.json` is saved to {output_path}.") print(f" > `speakers.pth` is saved to {output_path}.")
print(" > `speakers_file` is updated in the config.json.") print(" > `speakers_file` is updated in the config.json.")
if hasattr(self, "language_manager") and self.language_manager is not None: if hasattr(self, "language_manager") and self.language_manager is not None:

View File

@ -1,11 +1,13 @@
# coding: utf-8 # coding: utf-8
from typing import Dict, List, Union from typing import Dict, List, Tuple, Union
import torch import torch
from torch import nn from torch import nn
from torch.cuda.amp.autocast_mode import autocast from torch.cuda.amp.autocast_mode import autocast
from trainer.trainer_utils import get_optimizer, get_scheduler
from TTS.tts.layers.tacotron.capacitron_layers import CapacitronVAE
from TTS.tts.layers.tacotron.gst_layers import GST from TTS.tts.layers.tacotron.gst_layers import GST
from TTS.tts.layers.tacotron.tacotron import Decoder, Encoder, PostCBHG from TTS.tts.layers.tacotron.tacotron import Decoder, Encoder, PostCBHG
from TTS.tts.models.base_tacotron import BaseTacotron from TTS.tts.models.base_tacotron import BaseTacotron
@ -13,6 +15,7 @@ from TTS.tts.utils.measures import alignment_diagonal_score
from TTS.tts.utils.speakers import SpeakerManager from TTS.tts.utils.speakers import SpeakerManager
from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.tts.utils.visual import plot_alignment, plot_spectrogram from TTS.tts.utils.visual import plot_alignment, plot_spectrogram
from TTS.utils.capacitron_optimizer import CapacitronOptimizer
class Tacotron(BaseTacotron): class Tacotron(BaseTacotron):
@ -51,6 +54,9 @@ class Tacotron(BaseTacotron):
if self.use_gst: if self.use_gst:
self.decoder_in_features += self.gst.gst_embedding_dim self.decoder_in_features += self.gst.gst_embedding_dim
if self.use_capacitron_vae:
self.decoder_in_features += self.capacitron_vae.capacitron_VAE_embedding_dim
# embedding layer # embedding layer
self.embedding = nn.Embedding(self.num_chars, 256, padding_idx=0) self.embedding = nn.Embedding(self.num_chars, 256, padding_idx=0)
self.embedding.weight.data.normal_(0, 0.3) self.embedding.weight.data.normal_(0, 0.3)
@ -90,6 +96,20 @@ class Tacotron(BaseTacotron):
gst_embedding_dim=self.gst.gst_embedding_dim, gst_embedding_dim=self.gst.gst_embedding_dim,
) )
# Capacitron layers
if self.capacitron_vae and self.use_capacitron_vae:
self.capacitron_vae_layer = CapacitronVAE(
num_mel=self.decoder_output_dim,
encoder_output_dim=self.encoder_in_features,
capacitron_VAE_embedding_dim=self.capacitron_vae.capacitron_VAE_embedding_dim,
speaker_embedding_dim=self.embedded_speaker_dim
if self.use_speaker_embedding and self.capacitron_vae.capacitron_use_speaker_embedding
else None,
text_summary_embedding_dim=self.capacitron_vae.capacitron_text_summary_embedding_dim
if self.capacitron_vae.capacitron_use_text_summary_embeddings
else None,
)
# backward pass decoder # backward pass decoder
if self.bidirectional_decoder: if self.bidirectional_decoder:
self._init_backward_decoder() self._init_backward_decoder()
@ -146,6 +166,19 @@ class Tacotron(BaseTacotron):
# B x 1 x speaker_embed_dim # B x 1 x speaker_embed_dim
embedded_speakers = torch.unsqueeze(aux_input["d_vectors"], 1) embedded_speakers = torch.unsqueeze(aux_input["d_vectors"], 1)
encoder_outputs = self._concat_speaker_embedding(encoder_outputs, embedded_speakers) encoder_outputs = self._concat_speaker_embedding(encoder_outputs, embedded_speakers)
# Capacitron
if self.capacitron_vae and self.use_capacitron_vae:
# B x capacitron_VAE_embedding_dim
encoder_outputs, *capacitron_vae_outputs = self.compute_capacitron_VAE_embedding(
encoder_outputs,
reference_mel_info=[mel_specs, mel_lengths],
text_info=[inputs, text_lengths]
if self.capacitron_vae.capacitron_use_text_summary_embeddings
else None,
speaker_embedding=embedded_speakers if self.capacitron_vae.capacitron_use_speaker_embedding else None,
)
else:
capacitron_vae_outputs = None
# decoder_outputs: B x decoder_in_features x T_out # decoder_outputs: B x decoder_in_features x T_out
# alignments: B x T_in x encoder_in_features # alignments: B x T_in x encoder_in_features
# stop_tokens: B x T_in # stop_tokens: B x T_in
@ -178,6 +211,7 @@ class Tacotron(BaseTacotron):
"decoder_outputs": decoder_outputs, "decoder_outputs": decoder_outputs,
"alignments": alignments, "alignments": alignments,
"stop_tokens": stop_tokens, "stop_tokens": stop_tokens,
"capacitron_vae_outputs": capacitron_vae_outputs,
} }
) )
return outputs return outputs
@ -190,6 +224,28 @@ class Tacotron(BaseTacotron):
if self.gst and self.use_gst: if self.gst and self.use_gst:
# B x gst_dim # B x gst_dim
encoder_outputs = self.compute_gst(encoder_outputs, aux_input["style_mel"], aux_input["d_vectors"]) encoder_outputs = self.compute_gst(encoder_outputs, aux_input["style_mel"], aux_input["d_vectors"])
if self.capacitron_vae and self.use_capacitron_vae:
if aux_input["style_text"] is not None:
style_text_embedding = self.embedding(aux_input["style_text"])
style_text_length = torch.tensor([style_text_embedding.size(1)], dtype=torch.int64).to(
encoder_outputs.device
) # pylint: disable=not-callable
reference_mel_length = (
torch.tensor([aux_input["style_mel"].size(1)], dtype=torch.int64).to(encoder_outputs.device)
if aux_input["style_mel"] is not None
else None
) # pylint: disable=not-callable
# B x capacitron_VAE_embedding_dim
encoder_outputs, *_ = self.compute_capacitron_VAE_embedding(
encoder_outputs,
reference_mel_info=[aux_input["style_mel"], reference_mel_length]
if aux_input["style_mel"] is not None
else None,
text_info=[style_text_embedding, style_text_length] if aux_input["style_text"] is not None else None,
speaker_embedding=aux_input["d_vectors"]
if self.capacitron_vae.capacitron_use_speaker_embedding
else None,
)
if self.num_speakers > 1: if self.num_speakers > 1:
if not self.use_d_vector_file: if not self.use_d_vector_file:
# B x 1 x speaker_embed_dim # B x 1 x speaker_embed_dim
@ -215,12 +271,19 @@ class Tacotron(BaseTacotron):
} }
return outputs return outputs
def train_step(self, batch, criterion): def before_backward_pass(self, loss_dict, optimizer) -> None:
"""Perform a single training step by fetching the right set if samples from the batch. # Extracting custom training specific operations for capacitron
# from the trainer
if self.use_capacitron_vae:
loss_dict["capacitron_vae_beta_loss"].backward()
optimizer.first_step()
def train_step(self, batch: Dict, criterion: torch.nn.Module) -> Tuple[Dict, Dict]:
"""Perform a single training step by fetching the right set of samples from the batch.
Args: Args:
batch ([type]): [description] batch ([Dict]): A dictionary of input tensors.
criterion ([type]): [description] criterion ([torch.nn.Module]): Callable criterion to compute model loss.
""" """
text_input = batch["text_input"] text_input = batch["text_input"]
text_lengths = batch["text_lengths"] text_lengths = batch["text_lengths"]
@ -232,14 +295,8 @@ class Tacotron(BaseTacotron):
speaker_ids = batch["speaker_ids"] speaker_ids = batch["speaker_ids"]
d_vectors = batch["d_vectors"] d_vectors = batch["d_vectors"]
# forward pass model aux_input = {"speaker_ids": speaker_ids, "d_vectors": d_vectors}
outputs = self.forward( outputs = self.forward(text_input, text_lengths, mel_input, mel_lengths, aux_input)
text_input,
text_lengths,
mel_input,
mel_lengths,
aux_input={"speaker_ids": speaker_ids, "d_vectors": d_vectors},
)
# set the [alignment] lengths wrt reduction factor for guided attention # set the [alignment] lengths wrt reduction factor for guided attention
if mel_lengths.max() % self.decoder.r != 0: if mel_lengths.max() % self.decoder.r != 0:
@ -249,9 +306,6 @@ class Tacotron(BaseTacotron):
else: else:
alignment_lengths = mel_lengths // self.decoder.r alignment_lengths = mel_lengths // self.decoder.r
aux_input = {"speaker_ids": speaker_ids, "d_vectors": d_vectors}
outputs = self.forward(text_input, text_lengths, mel_input, mel_lengths, aux_input)
# compute loss # compute loss
with autocast(enabled=False): # use float32 for the criterion with autocast(enabled=False): # use float32 for the criterion
loss_dict = criterion( loss_dict = criterion(
@ -262,6 +316,7 @@ class Tacotron(BaseTacotron):
outputs["stop_tokens"].float(), outputs["stop_tokens"].float(),
stop_targets.float(), stop_targets.float(),
stop_target_lengths, stop_target_lengths,
outputs["capacitron_vae_outputs"] if self.capacitron_vae else None,
mel_lengths, mel_lengths,
None if outputs["decoder_outputs_backward"] is None else outputs["decoder_outputs_backward"].float(), None if outputs["decoder_outputs_backward"] is None else outputs["decoder_outputs_backward"].float(),
outputs["alignments"].float(), outputs["alignments"].float(),
@ -275,6 +330,25 @@ class Tacotron(BaseTacotron):
loss_dict["align_error"] = align_error loss_dict["align_error"] = align_error
return outputs, loss_dict return outputs, loss_dict
def get_optimizer(self) -> List:
if self.use_capacitron_vae:
return CapacitronOptimizer(self.config, self.named_parameters())
return get_optimizer(self.config.optimizer, self.config.optimizer_params, self.config.lr, self)
def get_scheduler(self, optimizer: object):
opt = optimizer.primary_optimizer if self.use_capacitron_vae else optimizer
return get_scheduler(self.config.lr_scheduler, self.config.lr_scheduler_params, opt)
def before_gradient_clipping(self):
if self.use_capacitron_vae:
# Capacitron model specific gradient clipping
model_params_to_clip = []
for name, param in self.named_parameters():
if param.requires_grad:
if name != "capacitron_vae_layer.beta":
model_params_to_clip.append(param)
torch.nn.utils.clip_grad_norm_(model_params_to_clip, self.capacitron_vae.capacitron_grad_clip)
def _create_logs(self, batch, outputs, ap): def _create_logs(self, batch, outputs, ap):
postnet_outputs = outputs["model_outputs"] postnet_outputs = outputs["model_outputs"]
decoder_outputs = outputs["decoder_outputs"] decoder_outputs = outputs["decoder_outputs"]

View File

@ -5,7 +5,9 @@ from typing import Dict, List, Union
import torch import torch
from torch import nn from torch import nn
from torch.cuda.amp.autocast_mode import autocast from torch.cuda.amp.autocast_mode import autocast
from trainer.trainer_utils import get_optimizer, get_scheduler
from TTS.tts.layers.tacotron.capacitron_layers import CapacitronVAE
from TTS.tts.layers.tacotron.gst_layers import GST from TTS.tts.layers.tacotron.gst_layers import GST
from TTS.tts.layers.tacotron.tacotron2 import Decoder, Encoder, Postnet from TTS.tts.layers.tacotron.tacotron2 import Decoder, Encoder, Postnet
from TTS.tts.models.base_tacotron import BaseTacotron from TTS.tts.models.base_tacotron import BaseTacotron
@ -13,6 +15,7 @@ from TTS.tts.utils.measures import alignment_diagonal_score
from TTS.tts.utils.speakers import SpeakerManager from TTS.tts.utils.speakers import SpeakerManager
from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.tts.utils.visual import plot_alignment, plot_spectrogram from TTS.tts.utils.visual import plot_alignment, plot_spectrogram
from TTS.utils.capacitron_optimizer import CapacitronOptimizer
class Tacotron2(BaseTacotron): class Tacotron2(BaseTacotron):
@ -65,6 +68,9 @@ class Tacotron2(BaseTacotron):
if self.use_gst: if self.use_gst:
self.decoder_in_features += self.gst.gst_embedding_dim self.decoder_in_features += self.gst.gst_embedding_dim
if self.use_capacitron_vae:
self.decoder_in_features += self.capacitron_vae.capacitron_VAE_embedding_dim
# embedding layer # embedding layer
self.embedding = nn.Embedding(self.num_chars, 512, padding_idx=0) self.embedding = nn.Embedding(self.num_chars, 512, padding_idx=0)
@ -102,6 +108,20 @@ class Tacotron2(BaseTacotron):
gst_embedding_dim=self.gst.gst_embedding_dim, gst_embedding_dim=self.gst.gst_embedding_dim,
) )
# Capacitron VAE Layers
if self.capacitron_vae and self.use_capacitron_vae:
self.capacitron_vae_layer = CapacitronVAE(
num_mel=self.decoder_output_dim,
encoder_output_dim=self.encoder_in_features,
capacitron_VAE_embedding_dim=self.capacitron_vae.capacitron_VAE_embedding_dim,
speaker_embedding_dim=self.embedded_speaker_dim
if self.capacitron_vae.capacitron_use_speaker_embedding
else None,
text_summary_embedding_dim=self.capacitron_vae.capacitron_text_summary_embedding_dim
if self.capacitron_vae.capacitron_use_text_summary_embeddings
else None,
)
# backward pass decoder # backward pass decoder
if self.bidirectional_decoder: if self.bidirectional_decoder:
self._init_backward_decoder() self._init_backward_decoder()
@ -166,6 +186,20 @@ class Tacotron2(BaseTacotron):
embedded_speakers = torch.unsqueeze(aux_input["d_vectors"], 1) embedded_speakers = torch.unsqueeze(aux_input["d_vectors"], 1)
encoder_outputs = self._concat_speaker_embedding(encoder_outputs, embedded_speakers) encoder_outputs = self._concat_speaker_embedding(encoder_outputs, embedded_speakers)
# capacitron
if self.capacitron_vae and self.use_capacitron_vae:
# B x capacitron_VAE_embedding_dim
encoder_outputs, *capacitron_vae_outputs = self.compute_capacitron_VAE_embedding(
encoder_outputs,
reference_mel_info=[mel_specs, mel_lengths],
text_info=[embedded_inputs.transpose(1, 2), text_lengths]
if self.capacitron_vae.capacitron_use_text_summary_embeddings
else None,
speaker_embedding=embedded_speakers if self.capacitron_vae.capacitron_use_speaker_embedding else None,
)
else:
capacitron_vae_outputs = None
encoder_outputs = encoder_outputs * input_mask.unsqueeze(2).expand_as(encoder_outputs) encoder_outputs = encoder_outputs * input_mask.unsqueeze(2).expand_as(encoder_outputs)
# B x mel_dim x T_out -- B x T_out//r x T_in -- B x T_out//r # B x mel_dim x T_out -- B x T_out//r x T_in -- B x T_out//r
@ -197,6 +231,7 @@ class Tacotron2(BaseTacotron):
"decoder_outputs": decoder_outputs, "decoder_outputs": decoder_outputs,
"alignments": alignments, "alignments": alignments,
"stop_tokens": stop_tokens, "stop_tokens": stop_tokens,
"capacitron_vae_outputs": capacitron_vae_outputs,
} }
) )
return outputs return outputs
@ -217,6 +252,29 @@ class Tacotron2(BaseTacotron):
# B x gst_dim # B x gst_dim
encoder_outputs = self.compute_gst(encoder_outputs, aux_input["style_mel"], aux_input["d_vectors"]) encoder_outputs = self.compute_gst(encoder_outputs, aux_input["style_mel"], aux_input["d_vectors"])
if self.capacitron_vae and self.use_capacitron_vae:
if aux_input["style_text"] is not None:
style_text_embedding = self.embedding(aux_input["style_text"])
style_text_length = torch.tensor([style_text_embedding.size(1)], dtype=torch.int64).to(
encoder_outputs.device
) # pylint: disable=not-callable
reference_mel_length = (
torch.tensor([aux_input["style_mel"].size(1)], dtype=torch.int64).to(encoder_outputs.device)
if aux_input["style_mel"] is not None
else None
) # pylint: disable=not-callable
# B x capacitron_VAE_embedding_dim
encoder_outputs, *_ = self.compute_capacitron_VAE_embedding(
encoder_outputs,
reference_mel_info=[aux_input["style_mel"], reference_mel_length]
if aux_input["style_mel"] is not None
else None,
text_info=[style_text_embedding, style_text_length] if aux_input["style_text"] is not None else None,
speaker_embedding=aux_input["d_vectors"]
if self.capacitron_vae.capacitron_use_speaker_embedding
else None,
)
if self.num_speakers > 1: if self.num_speakers > 1:
if not self.use_d_vector_file: if not self.use_d_vector_file:
embedded_speakers = self.speaker_embedding(aux_input["speaker_ids"])[None] embedded_speakers = self.speaker_embedding(aux_input["speaker_ids"])[None]
@ -242,6 +300,13 @@ class Tacotron2(BaseTacotron):
} }
return outputs return outputs
def before_backward_pass(self, loss_dict, optimizer) -> None:
# Extracting custom training specific operations for capacitron
# from the trainer
if self.use_capacitron_vae:
loss_dict["capacitron_vae_beta_loss"].backward()
optimizer.first_step()
def train_step(self, batch: Dict, criterion: torch.nn.Module): def train_step(self, batch: Dict, criterion: torch.nn.Module):
"""A single training step. Forward pass and loss computation. """A single training step. Forward pass and loss computation.
@ -258,14 +323,8 @@ class Tacotron2(BaseTacotron):
speaker_ids = batch["speaker_ids"] speaker_ids = batch["speaker_ids"]
d_vectors = batch["d_vectors"] d_vectors = batch["d_vectors"]
# forward pass model aux_input = {"speaker_ids": speaker_ids, "d_vectors": d_vectors}
outputs = self.forward( outputs = self.forward(text_input, text_lengths, mel_input, mel_lengths, aux_input)
text_input,
text_lengths,
mel_input,
mel_lengths,
aux_input={"speaker_ids": speaker_ids, "d_vectors": d_vectors},
)
# set the [alignment] lengths wrt reduction factor for guided attention # set the [alignment] lengths wrt reduction factor for guided attention
if mel_lengths.max() % self.decoder.r != 0: if mel_lengths.max() % self.decoder.r != 0:
@ -275,9 +334,6 @@ class Tacotron2(BaseTacotron):
else: else:
alignment_lengths = mel_lengths // self.decoder.r alignment_lengths = mel_lengths // self.decoder.r
aux_input = {"speaker_ids": speaker_ids, "d_vectors": d_vectors}
outputs = self.forward(text_input, text_lengths, mel_input, mel_lengths, aux_input)
# compute loss # compute loss
with autocast(enabled=False): # use float32 for the criterion with autocast(enabled=False): # use float32 for the criterion
loss_dict = criterion( loss_dict = criterion(
@ -288,6 +344,7 @@ class Tacotron2(BaseTacotron):
outputs["stop_tokens"].float(), outputs["stop_tokens"].float(),
stop_targets.float(), stop_targets.float(),
stop_target_lengths, stop_target_lengths,
outputs["capacitron_vae_outputs"] if self.capacitron_vae else None,
mel_lengths, mel_lengths,
None if outputs["decoder_outputs_backward"] is None else outputs["decoder_outputs_backward"].float(), None if outputs["decoder_outputs_backward"] is None else outputs["decoder_outputs_backward"].float(),
outputs["alignments"].float(), outputs["alignments"].float(),
@ -301,6 +358,25 @@ class Tacotron2(BaseTacotron):
loss_dict["align_error"] = align_error loss_dict["align_error"] = align_error
return outputs, loss_dict return outputs, loss_dict
def get_optimizer(self) -> List:
if self.use_capacitron_vae:
return CapacitronOptimizer(self.config, self.named_parameters())
return get_optimizer(self.config.optimizer, self.config.optimizer_params, self.config.lr, self)
def get_scheduler(self, optimizer: object):
opt = optimizer.primary_optimizer if self.use_capacitron_vae else optimizer
return get_scheduler(self.config.lr_scheduler, self.config.lr_scheduler_params, opt)
def before_gradient_clipping(self):
if self.use_capacitron_vae:
# Capacitron model specific gradient clipping
model_params_to_clip = []
for name, param in self.named_parameters():
if param.requires_grad:
if name != "capacitron_vae_layer.beta":
model_params_to_clip.append(param)
torch.nn.utils.clip_grad_norm_(model_params_to_clip, self.capacitron_vae.capacitron_grad_clip)
def _create_logs(self, batch, outputs, ap): def _create_logs(self, batch, outputs, ap):
"""Create dashboard log information.""" """Create dashboard log information."""
postnet_outputs = outputs["model_outputs"] postnet_outputs = outputs["model_outputs"]

View File

@ -41,6 +41,23 @@ hann_window = {}
mel_basis = {} mel_basis = {}
@torch.no_grad()
def weights_reset(m: nn.Module):
# check if the current module has reset_parameters and, if so, reset its weights
reset_parameters = getattr(m, "reset_parameters", None)
if callable(reset_parameters):
m.reset_parameters()
def get_module_weights_sum(mdl: nn.Module):
dict_sums = {}
for name, w in mdl.named_parameters():
if "weight" in name:
value = w.data.sum().item()
dict_sums[name] = value
return dict_sums
def load_audio(file_path): def load_audio(file_path):
"""Load the audio file normalized in [-1, 1] """Load the audio file normalized in [-1, 1]
@ -189,15 +206,20 @@ def wav_to_mel(y, n_fft, num_mels, sample_rate, hop_length, win_length, fmin, fm
class VitsDataset(TTSDataset): class VitsDataset(TTSDataset):
def __init__(self, *args, **kwargs): def __init__(self, model_args, *args, **kwargs):
super().__init__(*args, **kwargs) super().__init__(*args, **kwargs)
self.pad_id = self.tokenizer.characters.pad_id self.pad_id = self.tokenizer.characters.pad_id
self.model_args = model_args
def __getitem__(self, idx): def __getitem__(self, idx):
item = self.samples[idx] item = self.samples[idx]
raw_text = item["text"] raw_text = item["text"]
wav, _ = load_audio(item["audio_file"]) wav, _ = load_audio(item["audio_file"])
if self.model_args.encoder_sample_rate is not None:
if wav.size(1) % self.model_args.encoder_sample_rate != 0:
wav = wav[:, : -int(wav.size(1) % self.model_args.encoder_sample_rate)]
wav_filename = os.path.basename(item["audio_file"]) wav_filename = os.path.basename(item["audio_file"])
token_ids = self.get_token_ids(idx, item["text"]) token_ids = self.get_token_ids(idx, item["text"])
@ -362,6 +384,9 @@ class VitsArgs(Coqpit):
upsample_kernel_sizes_decoder (List[int]): upsample_kernel_sizes_decoder (List[int]):
Kernel sizes for each upsampling layer of the decoder network. Defaults to `[16, 16, 4, 4]`. Kernel sizes for each upsampling layer of the decoder network. Defaults to `[16, 16, 4, 4]`.
periods_multi_period_discriminator (List[int]):
Period values for the Vits Multi-Period Discriminator. Defaults to `[2, 3, 5, 7, 11]`.
use_sdp (bool): use_sdp (bool):
Use Stochastic Duration Predictor. Defaults to True. Use Stochastic Duration Predictor. Defaults to True.
@ -451,6 +476,18 @@ class VitsArgs(Coqpit):
freeze_waveform_decoder (bool): freeze_waveform_decoder (bool):
Freeze the waveform decoder weights during training. Defaults to False. Freeze the waveform decoder weights during training. Defaults to False.
encoder_sample_rate (int):
If not None, this sample rate is used for training the Posterior Encoder,
flow, text_encoder and duration predictor. The decoder part (vocoder) is
trained with `config.audio.sample_rate`. Defaults to None.
interpolate_z (bool):
If `encoder_sample_rate` is not None and this parameter is True, nearest interpolation
is used to upsample the latent variable z from the sampling rate `encoder_sample_rate`
to `config.audio.sample_rate`. If it is False, you will need to add extra
`upsample_rates_decoder` entries to match the shape. Defaults to True.
""" """
num_chars: int = 100 num_chars: int = 100
@ -475,6 +512,7 @@ class VitsArgs(Coqpit):
upsample_rates_decoder: List[int] = field(default_factory=lambda: [8, 8, 2, 2]) upsample_rates_decoder: List[int] = field(default_factory=lambda: [8, 8, 2, 2])
upsample_initial_channel_decoder: int = 512 upsample_initial_channel_decoder: int = 512
upsample_kernel_sizes_decoder: List[int] = field(default_factory=lambda: [16, 16, 4, 4]) upsample_kernel_sizes_decoder: List[int] = field(default_factory=lambda: [16, 16, 4, 4])
periods_multi_period_discriminator: List[int] = field(default_factory=lambda: [2, 3, 5, 7, 11])
use_sdp: bool = True use_sdp: bool = True
noise_scale: float = 1.0 noise_scale: float = 1.0
inference_noise_scale: float = 0.667 inference_noise_scale: float = 0.667
@ -505,6 +543,10 @@ class VitsArgs(Coqpit):
freeze_PE: bool = False freeze_PE: bool = False
freeze_flow_decoder: bool = False freeze_flow_decoder: bool = False
freeze_waveform_decoder: bool = False freeze_waveform_decoder: bool = False
encoder_sample_rate: int = None
interpolate_z: bool = True
reinit_DP: bool = False
reinit_text_encoder: bool = False
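A hedged sketch of how these new `VitsArgs` fields might be combined, assuming an audio config with `sample_rate=44100` (the values are illustrative, not defaults):

    from TTS.tts.models.vits import VitsArgs

    args = VitsArgs(
        encoder_sample_rate=22050,  # posterior encoder / flow / text encoder / duration predictor rate
        interpolate_z=True,         # upsample z by 44100 / 22050 = 2x with nearest interpolation
        reinit_DP=False,            # optionally re-initialize the duration predictor before fine-tuning
        reinit_text_encoder=False,  # optionally re-initialize the text encoder before fine-tuning
    )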
class Vits(BaseTTS): class Vits(BaseTTS):
@ -548,6 +590,7 @@ class Vits(BaseTTS):
self.init_multispeaker(config) self.init_multispeaker(config)
self.init_multilingual(config) self.init_multilingual(config)
self.init_upsampling()
self.length_scale = self.args.length_scale self.length_scale = self.args.length_scale
self.noise_scale = self.args.noise_scale self.noise_scale = self.args.noise_scale
@ -625,7 +668,10 @@ class Vits(BaseTTS):
) )
if self.args.init_discriminator: if self.args.init_discriminator:
self.disc = VitsDiscriminator(use_spectral_norm=self.args.use_spectral_norm_disriminator) self.disc = VitsDiscriminator(
periods=self.args.periods_multi_period_discriminator,
use_spectral_norm=self.args.use_spectral_norm_disriminator,
)
def init_multispeaker(self, config: Coqpit): def init_multispeaker(self, config: Coqpit):
"""Initialize multi-speaker modules of a model. A model can be trained either with a speaker embedding layer """Initialize multi-speaker modules of a model. A model can be trained either with a speaker embedding layer
@ -707,6 +753,38 @@ class Vits(BaseTTS):
else: else:
self.embedded_language_dim = 0 self.embedded_language_dim = 0
def init_upsampling(self):
"""
Initialize upsampling modules of a model.
"""
if self.args.encoder_sample_rate:
self.interpolate_factor = self.config.audio["sample_rate"] / self.args.encoder_sample_rate
self.audio_resampler = torchaudio.transforms.Resample(
orig_freq=self.config.audio["sample_rate"], new_freq=self.args.encoder_sample_rate
) # pylint: disable=W0201
def on_init_end(self, trainer): # pylint: disable=W0613
"""Reinit layes if needed"""
if self.args.reinit_DP:
before_dict = get_module_weights_sum(self.duration_predictor)
# Applies weights_reset recursively to every submodule of the duration predictor
self.duration_predictor.apply(fn=weights_reset)
after_dict = get_module_weights_sum(self.duration_predictor)
for key, value in after_dict.items():
if value == before_dict[key]:
raise RuntimeError(" [!] The weights of Duration Predictor was not reinit check it !")
print(" > Duration Predictor was reinit.")
if self.args.reinit_text_encoder:
before_dict = get_module_weights_sum(self.text_encoder)
# Applies weights_reset recursively to every submodule of the text encoder
self.text_encoder.apply(fn=weights_reset)
after_dict = get_module_weights_sum(self.text_encoder)
for key, value in after_dict.items():
if value == before_dict[key]:
raise RuntimeError(" [!] The weights of Text Encoder was not reinit check it !")
print(" > Text Encoder was reinit.")
def get_aux_input(self, aux_input: Dict): def get_aux_input(self, aux_input: Dict):
sid, g, lid = self._set_cond_input(aux_input) sid, g, lid = self._set_cond_input(aux_input)
return {"speaker_ids": sid, "style_wav": None, "d_vectors": g, "language_ids": lid} return {"speaker_ids": sid, "style_wav": None, "d_vectors": g, "language_ids": lid}
@ -804,6 +882,23 @@ class Vits(BaseTTS):
outputs["loss_duration"] = loss_duration outputs["loss_duration"] = loss_duration
return outputs, attn return outputs, attn
def upsampling_z(self, z, slice_ids=None, y_lengths=None, y_mask=None):
spec_segment_size = self.spec_segment_size
if self.args.encoder_sample_rate:
# recompute the slices and spec_segment_size if needed
slice_ids = slice_ids * int(self.interpolate_factor) if slice_ids is not None else slice_ids
spec_segment_size = spec_segment_size * int(self.interpolate_factor)
# interpolate z if needed
if self.args.interpolate_z:
z = torch.nn.functional.interpolate(z, scale_factor=[self.interpolate_factor], mode="linear").squeeze(0)
# recompute the mask if needed
if y_lengths is not None and y_mask is not None:
y_mask = (
sequence_mask(y_lengths * self.interpolate_factor, None).to(y_mask.dtype).unsqueeze(1)
) # [B, 1, T_dec_resampled]
return z, spec_segment_size, slice_ids, y_mask
def forward( # pylint: disable=dangerous-default-value def forward( # pylint: disable=dangerous-default-value
self, self,
x: torch.tensor, x: torch.tensor,
@ -878,12 +973,16 @@ class Vits(BaseTTS):
# select a random feature segment for the waveform decoder # select a random feature segment for the waveform decoder
z_slice, slice_ids = rand_segments(z, y_lengths, self.spec_segment_size, let_short_samples=True, pad_short=True) z_slice, slice_ids = rand_segments(z, y_lengths, self.spec_segment_size, let_short_samples=True, pad_short=True)
# interpolate z if needed
z_slice, spec_segment_size, slice_ids, _ = self.upsampling_z(z_slice, slice_ids=slice_ids)
o = self.waveform_decoder(z_slice, g=g) o = self.waveform_decoder(z_slice, g=g)
wav_seg = segment( wav_seg = segment(
waveform, waveform,
slice_ids * self.config.audio.hop_length, slice_ids * self.config.audio.hop_length,
self.args.spec_segment_size * self.config.audio.hop_length, spec_segment_size * self.config.audio.hop_length,
pad_short=True, pad_short=True,
) )
@ -927,6 +1026,7 @@ class Vits(BaseTTS):
return aux_input["x_lengths"] return aux_input["x_lengths"]
return torch.tensor(x.shape[1:2]).to(x.device) return torch.tensor(x.shape[1:2]).to(x.device)
@torch.no_grad()
def inference( def inference(
self, x, aux_input={"x_lengths": None, "d_vectors": None, "speaker_ids": None, "language_ids": None} self, x, aux_input={"x_lengths": None, "d_vectors": None, "speaker_ids": None, "language_ids": None}
): # pylint: disable=dangerous-default-value ): # pylint: disable=dangerous-default-value
@ -989,9 +1089,22 @@ class Vits(BaseTTS):
z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * self.inference_noise_scale z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * self.inference_noise_scale
z = self.flow(z_p, y_mask, g=g, reverse=True) z = self.flow(z_p, y_mask, g=g, reverse=True)
# upsampling if needed
z, _, _, y_mask = self.upsampling_z(z, y_lengths=y_lengths, y_mask=y_mask)
o = self.waveform_decoder((z * y_mask)[:, :, : self.max_inference_len], g=g) o = self.waveform_decoder((z * y_mask)[:, :, : self.max_inference_len], g=g)
outputs = {"model_outputs": o, "alignments": attn.squeeze(1), "z": z, "z_p": z_p, "m_p": m_p, "logs_p": logs_p} outputs = {
"model_outputs": o,
"alignments": attn.squeeze(1),
"durations": w_ceil,
"z": z,
"z_p": z_p,
"m_p": m_p,
"logs_p": logs_p,
"y_mask": y_mask,
}
return outputs return outputs
@torch.no_grad() @torch.no_grad()
@ -1014,7 +1127,7 @@ class Vits(BaseTTS):
self.config.audio.hop_length, self.config.audio.hop_length,
self.config.audio.win_length, self.config.audio.win_length,
center=False, center=False,
).transpose(1, 2) )
y_lengths = torch.tensor([y.size(-1)]).to(y.device) y_lengths = torch.tensor([y.size(-1)]).to(y.device)
speaker_cond_src = reference_speaker_id if reference_speaker_id is not None else reference_d_vector speaker_cond_src = reference_speaker_id if reference_speaker_id is not None else reference_d_vector
speaker_cond_tgt = speaker_id if speaker_id is not None else d_vector speaker_cond_tgt = speaker_id if speaker_id is not None else d_vector
@ -1044,7 +1157,7 @@ class Vits(BaseTTS):
else: else:
raise RuntimeError(" [!] Voice conversion is only supported on multi-speaker models.") raise RuntimeError(" [!] Voice conversion is only supported on multi-speaker models.")
z, _, _, y_mask = self.posterior_encoder(y.transpose(1, 2), y_lengths, g=g_src) z, _, _, y_mask = self.posterior_encoder(y, y_lengths, g=g_src)
z_p = self.flow(z, y_mask, g=g_src) z_p = self.flow(z, y_mask, g=g_src)
z_hat = self.flow(z_p, y_mask, g=g_tgt, reverse=True) z_hat = self.flow(z_p, y_mask, g=g_tgt, reverse=True)
o_hat = self.waveform_decoder(z_hat * y_mask, g=g_tgt) o_hat = self.waveform_decoder(z_hat * y_mask, g=g_tgt)
@ -1064,13 +1177,12 @@ class Vits(BaseTTS):
self._freeze_layers() self._freeze_layers()
mel_lens = batch["mel_lens"] spec_lens = batch["spec_lens"]
if optimizer_idx == 0: if optimizer_idx == 0:
tokens = batch["tokens"] tokens = batch["tokens"]
token_lenghts = batch["token_lens"] token_lenghts = batch["token_lens"]
spec = batch["spec"] spec = batch["spec"]
spec_lens = batch["spec_lens"]
d_vectors = batch["d_vectors"] d_vectors = batch["d_vectors"]
speaker_ids = batch["speaker_ids"] speaker_ids = batch["speaker_ids"]
@ -1108,8 +1220,14 @@ class Vits(BaseTTS):
# compute melspec segment # compute melspec segment
with autocast(enabled=False): with autocast(enabled=False):
if self.args.encoder_sample_rate:
spec_segment_size = self.spec_segment_size * int(self.interpolate_factor)
else:
spec_segment_size = self.spec_segment_size
mel_slice = segment( mel_slice = segment(
mel.float(), self.model_outputs_cache["slice_ids"], self.spec_segment_size, pad_short=True mel.float(), self.model_outputs_cache["slice_ids"], spec_segment_size, pad_short=True
) )
mel_slice_hat = wav_to_mel( mel_slice_hat = wav_to_mel(
y=self.model_outputs_cache["model_outputs"].float(), y=self.model_outputs_cache["model_outputs"].float(),
@ -1137,7 +1255,7 @@ class Vits(BaseTTS):
logs_q=self.model_outputs_cache["logs_q"].float(), logs_q=self.model_outputs_cache["logs_q"].float(),
m_p=self.model_outputs_cache["m_p"].float(), m_p=self.model_outputs_cache["m_p"].float(),
logs_p=self.model_outputs_cache["logs_p"].float(), logs_p=self.model_outputs_cache["logs_p"].float(),
z_len=mel_lens, z_len=spec_lens,
scores_disc_fake=scores_disc_fake, scores_disc_fake=scores_disc_fake,
feats_disc_fake=feats_disc_fake, feats_disc_fake=feats_disc_fake,
feats_disc_real=feats_disc_real, feats_disc_real=feats_disc_real,
@ -1318,21 +1436,48 @@ class Vits(BaseTTS):
"""Compute spectrograms on the device.""" """Compute spectrograms on the device."""
ac = self.config.audio ac = self.config.audio
if self.args.encoder_sample_rate:
wav = self.audio_resampler(batch["waveform"])
else:
wav = batch["waveform"]
# compute spectrograms # compute spectrograms
batch["spec"] = wav_to_spec(batch["waveform"], ac.fft_size, ac.hop_length, ac.win_length, center=False) batch["spec"] = wav_to_spec(wav, ac.fft_size, ac.hop_length, ac.win_length, center=False)
if self.args.encoder_sample_rate:
# recompute the spectrogram at the original (higher) sampling rate for the loss
spec_mel = wav_to_spec(batch["waveform"], ac.fft_size, ac.hop_length, ac.win_length, center=False)
# remove extra stft frames if needed
if spec_mel.size(2) > int(batch["spec"].size(2) * self.interpolate_factor):
spec_mel = spec_mel[:, :, : int(batch["spec"].size(2) * self.interpolate_factor)]
else:
batch["spec"] = batch["spec"][:, :, : int(spec_mel.size(2) / self.interpolate_factor)]
else:
spec_mel = batch["spec"]
batch["mel"] = spec_to_mel( batch["mel"] = spec_to_mel(
spec=batch["spec"], spec=spec_mel,
n_fft=ac.fft_size, n_fft=ac.fft_size,
num_mels=ac.num_mels, num_mels=ac.num_mels,
sample_rate=ac.sample_rate, sample_rate=ac.sample_rate,
fmin=ac.mel_fmin, fmin=ac.mel_fmin,
fmax=ac.mel_fmax, fmax=ac.mel_fmax,
) )
if self.args.encoder_sample_rate:
assert batch["spec"].shape[2] == int(
batch["mel"].shape[2] / self.interpolate_factor
), f"{batch['spec'].shape[2]}, {batch['mel'].shape[2]}"
else:
assert batch["spec"].shape[2] == batch["mel"].shape[2], f"{batch['spec'].shape[2]}, {batch['mel'].shape[2]}" assert batch["spec"].shape[2] == batch["mel"].shape[2], f"{batch['spec'].shape[2]}, {batch['mel'].shape[2]}"
# compute spectrogram frame lengths # compute spectrogram frame lengths
batch["spec_lens"] = (batch["spec"].shape[2] * batch["waveform_rel_lens"]).int() batch["spec_lens"] = (batch["spec"].shape[2] * batch["waveform_rel_lens"]).int()
batch["mel_lens"] = (batch["mel"].shape[2] * batch["waveform_rel_lens"]).int() batch["mel_lens"] = (batch["mel"].shape[2] * batch["waveform_rel_lens"]).int()
if self.args.encoder_sample_rate:
assert (batch["spec_lens"] - (batch["mel_lens"] / self.interpolate_factor).int()).sum() == 0
else:
assert (batch["spec_lens"] - batch["mel_lens"]).sum() == 0 assert (batch["spec_lens"] - batch["mel_lens"]).sum() == 0
# zero the padding frames # zero the padding frames
@ -1355,8 +1500,9 @@ class Vits(BaseTTS):
else: else:
# init dataloader # init dataloader
dataset = VitsDataset( dataset = VitsDataset(
model_args=self.args,
samples=samples, samples=samples,
# batch_group_size=0 if is_eval else config.batch_group_size * config.batch_size, batch_group_size=0 if is_eval else config.batch_group_size * config.batch_size,
min_text_len=config.min_text_len, min_text_len=config.min_text_len,
max_text_len=config.max_text_len, max_text_len=config.max_text_len,
min_audio_len=config.min_audio_len, min_audio_len=config.min_audio_len,
@ -1449,6 +1595,11 @@ class Vits(BaseTTS):
# TODO: consider baking the speaker encoder into the model and call it from there. # TODO: consider baking the speaker encoder into the model and call it from there.
# as it is probably easier for model distribution. # as it is probably easier for model distribution.
state["model"] = {k: v for k, v in state["model"].items() if "speaker_encoder" not in k} state["model"] = {k: v for k, v in state["model"].items() if "speaker_encoder" not in k}
if self.args.encoder_sample_rate is not None and eval:
# the audio resampler is not used at inference time
self.audio_resampler = None
# handle fine-tuning from a checkpoint with additional speakers # handle fine-tuning from a checkpoint with additional speakers
if hasattr(self, "emb_g") and state["model"]["emb_g.weight"].shape != self.emb_g.weight.shape: if hasattr(self, "emb_g") and state["model"]["emb_g.weight"].shape != self.emb_g.weight.shape:
num_new_speakers = self.emb_g.weight.shape[0] - state["model"]["emb_g.weight"].shape[0] num_new_speakers = self.emb_g.weight.shape[0] - state["model"]["emb_g.weight"].shape[0]
@ -1476,9 +1627,17 @@ class Vits(BaseTTS):
from TTS.utils.audio import AudioProcessor from TTS.utils.audio import AudioProcessor
upsample_rate = torch.prod(torch.as_tensor(config.model_args.upsample_rates_decoder)).item() upsample_rate = torch.prod(torch.as_tensor(config.model_args.upsample_rates_decoder)).item()
if not config.model_args.encoder_sample_rate:
assert ( assert (
upsample_rate == config.audio.hop_length upsample_rate == config.audio.hop_length
), f" [!] Product of upsample rates must be equal to the hop length - {upsample_rate} vs {config.audio.hop_length}" ), f" [!] Product of upsample rates must be equal to the hop length - {upsample_rate} vs {config.audio.hop_length}"
else:
encoder_to_vocoder_upsampling_factor = config.audio.sample_rate / config.model_args.encoder_sample_rate
effective_hop_length = config.audio.hop_length * encoder_to_vocoder_upsampling_factor
assert (
upsample_rate == effective_hop_length
), f" [!] Product of upsample rates must be equal to the hop length - {upsample_rate} vs {effective_hop_length}"
ap = AudioProcessor.init_from_config(config, verbose=verbose) ap = AudioProcessor.init_from_config(config, verbose=verbose)
tokenizer, new_config = TTSTokenizer.init_from_config(config) tokenizer, new_config = TTSTokenizer.init_from_config(config)

View File

@ -1,4 +1,7 @@
import bisect
import numpy as np import numpy as np
import torch
def _pad_data(x, length): def _pad_data(x, length):
@ -51,3 +54,26 @@ def prepare_stop_target(inputs, out_steps):
def pad_per_step(inputs, pad_len): def pad_per_step(inputs, pad_len):
return np.pad(inputs, [[0, 0], [0, 0], [0, pad_len]], mode="constant", constant_values=0.0) return np.pad(inputs, [[0, 0], [0, 0], [0, pad_len]], mode="constant", constant_values=0.0)
def get_length_balancer_weights(items: list, num_buckets=10):
# get all durations
audio_lengths = np.array([item["audio_length"] for item in items])
# create $num_buckets bucket classes based on the dataset's max and min lengths
max_length = int(max(audio_lengths))
min_length = int(min(audio_lengths))
step = int((max_length - min_length) / num_buckets) + 1
buckets_classes = [i + step for i in range(min_length, (max_length - step) + num_buckets + 1, step)]
# assign each sample to its length bucket
buckets_names = np.array(
[buckets_classes[bisect.bisect_left(buckets_classes, item["audio_length"])] for item in items]
)
# count and compute the weights_bucket for each sample
unique_buckets_names = np.unique(buckets_names).tolist()
bucket_ids = [unique_buckets_names.index(l) for l in buckets_names]
bucket_count = np.array([len(np.where(buckets_names == l)[0]) for l in unique_buckets_names])
weight_bucket = 1.0 / bucket_count
dataset_samples_weight = np.array([weight_bucket[l] for l in bucket_ids])
# normalize
dataset_samples_weight = dataset_samples_weight / np.linalg.norm(dataset_samples_weight)
return torch.from_numpy(dataset_samples_weight).float()
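A small worked example of the bucketing above; the items are made up and only need the `audio_length` field the function reads:

    items = [{"audio_length": 2.0}, {"audio_length": 2.1}, {"audio_length": 2.2}, {"audio_length": 9.0}]
    weights = get_length_balancer_weights(items, num_buckets=2)
    # the three clips sharing the short bucket each get weight 1/3, the lone long clip gets 1;
    # after L2 normalization the long clip therefore carries the largest sampling weight
    print(weights)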

View File

@ -11,6 +11,28 @@ from TTS.encoder.utils.generic_utils import setup_encoder_model
from TTS.utils.audio import AudioProcessor from TTS.utils.audio import AudioProcessor
def load_file(path: str):
if path.endswith(".json"):
with fsspec.open(path, "r") as f:
return json.load(f)
elif path.endswith(".pth"):
with fsspec.open(path, "rb") as f:
return torch.load(f, map_location="cpu")
else:
raise ValueError("Unsupported file type")
def save_file(obj: Any, path: str):
if path.endswith(".json"):
with fsspec.open(path, "w") as f:
json.dump(obj, f, indent=4)
elif path.endswith(".pth"):
with fsspec.open(path, "wb") as f:
torch.save(obj, f)
else:
raise ValueError("Unsupported file type")
class BaseIDManager: class BaseIDManager:
"""Base `ID` Manager class. Every new `ID` manager must inherit this. """Base `ID` Manager class. Every new `ID` manager must inherit this.
It defines common `ID` manager specific functions. It defines common `ID` manager specific functions.
@ -46,7 +68,7 @@ class BaseIDManager:
Args: Args:
file_path (str): Path to the file. file_path (str): Path to the file.
""" """
self.ids = self._load_json(file_path) self.ids = load_file(file_path)
def save_ids_to_file(self, file_path: str) -> None: def save_ids_to_file(self, file_path: str) -> None:
"""Save IDs to a json file. """Save IDs to a json file.
@ -54,7 +76,7 @@ class BaseIDManager:
Args: Args:
file_path (str): Path to the output file. file_path (str): Path to the output file.
""" """
self._save_json(file_path, self.ids) save_file(self.ids, file_path)
def get_random_id(self) -> Any: def get_random_id(self) -> Any:
"""Get a random embedding. """Get a random embedding.
@ -110,7 +132,7 @@ class EmbeddingManager(BaseIDManager):
self.load_embeddings_from_file(embedding_file_path) self.load_embeddings_from_file(embedding_file_path)
if encoder_model_path and encoder_config_path: if encoder_model_path and encoder_config_path:
self.init_encoder(encoder_model_path, encoder_config_path) self.init_encoder(encoder_model_path, encoder_config_path, use_cuda)
@property @property
def embedding_dim(self): def embedding_dim(self):
@ -125,7 +147,7 @@ class EmbeddingManager(BaseIDManager):
Args: Args:
file_path (str): Path to the output file. file_path (str): Path to the output file.
""" """
self._save_json(file_path, self.embeddings) save_file(self.embeddings, file_path)
def load_embeddings_from_file(self, file_path: str) -> None: def load_embeddings_from_file(self, file_path: str) -> None:
"""Load embeddings from a json file. """Load embeddings from a json file.
@ -133,7 +155,7 @@ class EmbeddingManager(BaseIDManager):
Args: Args:
file_path (str): Path to the target json file. file_path (str): Path to the target json file.
""" """
self.embeddings = self._load_json(file_path) self.embeddings = load_file(file_path)
speakers = sorted({x["name"] for x in self.embeddings.values()}) speakers = sorted({x["name"] for x in self.embeddings.values()})
self.ids = {name: i for i, name in enumerate(speakers)} self.ids = {name: i for i, name in enumerate(speakers)}
@ -216,17 +238,19 @@ class EmbeddingManager(BaseIDManager):
def get_clips(self) -> List: def get_clips(self) -> List:
return sorted(self.embeddings.keys()) return sorted(self.embeddings.keys())
def init_encoder(self, model_path: str, config_path: str) -> None: def init_encoder(self, model_path: str, config_path: str, use_cuda=False) -> None:
"""Initialize a speaker encoder model. """Initialize a speaker encoder model.
Args: Args:
model_path (str): Model file path. model_path (str): Model file path.
config_path (str): Model config file path. config_path (str): Model config file path.
use_cuda (bool, optional): Use CUDA. Defaults to False.
""" """
self.use_cuda = use_cuda
self.encoder_config = load_config(config_path) self.encoder_config = load_config(config_path)
self.encoder = setup_encoder_model(self.encoder_config) self.encoder = setup_encoder_model(self.encoder_config)
self.encoder_criterion = self.encoder.load_checkpoint( self.encoder_criterion = self.encoder.load_checkpoint(
self.encoder_config, model_path, eval=True, use_cuda=self.use_cuda self.encoder_config, model_path, eval=True, use_cuda=use_cuda
) )
self.encoder_ap = AudioProcessor(**self.encoder_config.audio) self.encoder_ap = AudioProcessor(**self.encoder_config.audio)

View File

@ -108,6 +108,7 @@ class SpeakerManager(EmbeddingManager):
) )
if get_from_config_or_model_args_with_default(config, "use_d_vector_file", False): if get_from_config_or_model_args_with_default(config, "use_d_vector_file", False):
speaker_manager = SpeakerManager()
if get_from_config_or_model_args_with_default(config, "speakers_file", None): if get_from_config_or_model_args_with_default(config, "speakers_file", None):
speaker_manager = SpeakerManager( speaker_manager = SpeakerManager(
d_vectors_file_path=get_from_config_or_model_args_with_default(config, "speaker_file", None) d_vectors_file_path=get_from_config_or_model_args_with_default(config, "speaker_file", None)

View File

@ -26,6 +26,7 @@ def run_model_torch(
inputs: torch.Tensor, inputs: torch.Tensor,
speaker_id: int = None, speaker_id: int = None,
style_mel: torch.Tensor = None, style_mel: torch.Tensor = None,
style_text: str = None,
d_vector: torch.Tensor = None, d_vector: torch.Tensor = None,
language_id: torch.Tensor = None, language_id: torch.Tensor = None,
) -> Dict: ) -> Dict:
@ -53,6 +54,7 @@ def run_model_torch(
"speaker_ids": speaker_id, "speaker_ids": speaker_id,
"d_vectors": d_vector, "d_vectors": d_vector,
"style_mel": style_mel, "style_mel": style_mel,
"style_text": style_text,
"language_ids": language_id, "language_ids": language_id,
}, },
) )
@ -115,6 +117,7 @@ def synthesis(
use_cuda, use_cuda,
speaker_id=None, speaker_id=None,
style_wav=None, style_wav=None,
style_text=None,
use_griffin_lim=False, use_griffin_lim=False,
do_trim_silence=False, do_trim_silence=False,
d_vector=None, d_vector=None,
@ -140,7 +143,12 @@ def synthesis(
Speaker ID passed to the speaker embedding layer in multi-speaker model. Defaults to None. Speaker ID passed to the speaker embedding layer in multi-speaker model. Defaults to None.
style_wav (str | Dict[str, float]): style_wav (str | Dict[str, float]):
Path or tensor to/of a waveform used for computing the style embedding. Defaults to None. Path or tensor to/of a waveform used for computing the style embedding based on GST or Capacitron.
Defaults to None, meaning that Capacitron models will sample from the prior distribution to
generate random but realistic prosody.
style_text (str):
Transcription of style_wav for Capacitron models. Defaults to None.
enable_eos_bos_chars (bool): enable_eos_bos_chars (bool):
enable special chars for end of sentence and start of sentence. Defaults to False. enable special chars for end of sentence and start of sentence. Defaults to False.
@ -154,13 +162,19 @@ def synthesis(
language_id (int): language_id (int):
Language ID passed to the language embedding layer in multi-langual model. Defaults to None. Language ID passed to the language embedding layer in multi-langual model. Defaults to None.
""" """
# GST processing # GST or Capacitron processing
# TODO: need to handle the case of setting both gst and capacitron to true somewhere
style_mel = None style_mel = None
if CONFIG.has("gst") and CONFIG.gst and style_wav is not None: if CONFIG.has("gst") and CONFIG.gst and style_wav is not None:
if isinstance(style_wav, dict): if isinstance(style_wav, dict):
style_mel = style_wav style_mel = style_wav
else: else:
style_mel = compute_style_mel(style_wav, model.ap, cuda=use_cuda) style_mel = compute_style_mel(style_wav, model.ap, cuda=use_cuda)
if CONFIG.has("capacitron_vae") and CONFIG.use_capacitron_vae and style_wav is not None:
style_mel = compute_style_mel(style_wav, model.ap, cuda=use_cuda)
style_mel = style_mel.transpose(1, 2) # [1, time, depth]
# convert text to sequence of token IDs # convert text to sequence of token IDs
text_inputs = np.asarray( text_inputs = np.asarray(
model.tokenizer.text_to_ids(text, language=language_id), model.tokenizer.text_to_ids(text, language=language_id),
@ -177,11 +191,28 @@ def synthesis(
language_id = id_to_torch(language_id, cuda=use_cuda) language_id = id_to_torch(language_id, cuda=use_cuda)
if not isinstance(style_mel, dict): if not isinstance(style_mel, dict):
# GST or Capacitron style mel
style_mel = numpy_to_torch(style_mel, torch.float, cuda=use_cuda) style_mel = numpy_to_torch(style_mel, torch.float, cuda=use_cuda)
if style_text is not None:
style_text = np.asarray(
model.tokenizer.text_to_ids(style_text, language=language_id),
dtype=np.int32,
)
style_text = numpy_to_torch(style_text, torch.long, cuda=use_cuda)
style_text = style_text.unsqueeze(0)
text_inputs = numpy_to_torch(text_inputs, torch.long, cuda=use_cuda) text_inputs = numpy_to_torch(text_inputs, torch.long, cuda=use_cuda)
text_inputs = text_inputs.unsqueeze(0) text_inputs = text_inputs.unsqueeze(0)
# synthesize voice # synthesize voice
outputs = run_model_torch(model, text_inputs, speaker_id, style_mel, d_vector=d_vector, language_id=language_id) outputs = run_model_torch(
model,
text_inputs,
speaker_id,
style_mel,
style_text,
d_vector=d_vector,
language_id=language_id,
)
model_outputs = outputs["model_outputs"] model_outputs = outputs["model_outputs"]
model_outputs = model_outputs[0].data.cpu().numpy() model_outputs = model_outputs[0].data.cpu().numpy()
alignments = outputs["alignments"] alignments = outputs["alignments"]
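Based on the extended `synthesis()` signature above, a hedged usage sketch for a Capacitron model; `model`, `config` and the file path are placeholders, and the transcription is only needed when `capacitron_use_text_summary_embeddings` is enabled:

    wav = synthesis(
        model,                                            # a loaded Capacitron Tacotron2 model (placeholder)
        "This is what I want the model to say.",
        config,                                           # its training config (placeholder)
        use_cuda=False,
        style_wav="reference_prosody.wav",                # reference audio for prosody transfer
        style_text="Transcript of the reference audio.",  # Capacitron-only transcription of style_wav
        use_griffin_lim=True,
        do_trim_silence=True,
    )["wav"]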

View File

@ -107,15 +107,6 @@ class ESpeak(BasePhonemizer):
if backend not in ["espeak", "espeak-ng"]: if backend not in ["espeak", "espeak-ng"]:
raise Exception("Unknown backend: %s" % backend) raise Exception("Unknown backend: %s" % backend)
self._ESPEAK_LIB = backend self._ESPEAK_LIB = backend
# skip first two characters of the retuned text
# "_ p_ɹ_ˈaɪ_ɚ t_ə n_oʊ_v_ˈɛ_m_b_ɚ t_w_ˈɛ_n_t_i t_ˈuː\n"
# ^^
self.num_skip_chars = 2
if backend == "espeak-ng":
# skip the first character of the retuned text
# "_p_ɹ_ˈaɪ_ɚ t_ə n_oʊ_v_ˈɛ_m_b_ɚ t_w_ˈɛ_n_t_i t_ˈuː\n"
# ^
self.num_skip_chars = 1
def auto_set_espeak_lib(self) -> None: def auto_set_espeak_lib(self) -> None:
if is_tool("espeak-ng"): if is_tool("espeak-ng"):
@ -163,7 +154,16 @@ class ESpeak(BasePhonemizer):
phonemes = "" phonemes = ""
for line in _espeak_exe(self._ESPEAK_LIB, args, sync=True): for line in _espeak_exe(self._ESPEAK_LIB, args, sync=True):
logging.debug("line: %s", repr(line)) logging.debug("line: %s", repr(line))
phonemes += line.decode("utf8").strip()[self.num_skip_chars :] # skip initial redundant characters ph_decoded = line.decode("utf8").strip()
# espeak needs to skip the first two characters of the returned text:
# version 1.48.03: "_ p_ɹ_ˈaɪ_ɚ t_ə n_oʊ_v_ˈɛ_m_b_ɚ t_w_ˈɛ_n_t_i t_ˈuː\n"
# version 1.48.15: " p_ɹ_ˈaɪ_ɚ t_ə n_oʊ_v_ˈɛ_m_b_ɚ t_w_ˈɛ_n_t_i t_ˈuː\n"
# espeak-ng needs to skip the first character of the returned text:
# "_p_ɹ_ˈaɪ_ɚ t_ə n_oʊ_v_ˈɛ_m_b_ɚ t_w_ˈɛ_n_t_i t_ˈuː\n"
# deal with the conditions described above
ph_decoded = ph_decoded[:1].replace("_", "") + ph_decoded[1:]
phonemes += ph_decoded.strip()
return phonemes.replace("_", separator) return phonemes.replace("_", separator)
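A quick illustration of the leading-character handling above on a made-up phoneme string (not captured espeak output):

    ph_decoded = "_p_ɹ_ˈaɪ_ɚ t_ə"                        # espeak-ng style line with a leading "_"
    ph_decoded = ph_decoded[:1].replace("_", "") + ph_decoded[1:]
    print(ph_decoded.replace("_", "|"))                  # -> p|ɹ|ˈaɪ|ɚ t|ə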
def _phonemize(self, text, separator=None): def _phonemize(self, text, separator=None):

View File

@ -859,7 +859,11 @@ class AudioProcessor(object):
path (str): Path to a output file. path (str): Path to a output file.
sr (int, optional): Sampling rate used for saving to the file. Defaults to None. sr (int, optional): Sampling rate used for saving to the file. Defaults to None.
""" """
if self.do_rms_norm:
wav_norm = self.rms_volume_norm(wav, self.db_level) * 32767
else:
wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav)))) wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav))))
scipy.io.wavfile.write(path, sr if sr else self.sample_rate, wav_norm.astype(np.int16)) scipy.io.wavfile.write(path, sr if sr else self.sample_rate, wav_norm.astype(np.int16))
def get_duration(self, filename: str) -> float: def get_duration(self, filename: str) -> float:

View File

@ -0,0 +1,65 @@
from typing import Generator
from trainer.trainer_utils import get_optimizer
class CapacitronOptimizer:
"""Double optimizer class for the Capacitron model."""
def __init__(self, config: dict, model_params: Generator) -> None:
self.primary_params, self.secondary_params = self.split_model_parameters(model_params)
optimizer_names = list(config.optimizer_params.keys())
optimizer_parameters = list(config.optimizer_params.values())
self.primary_optimizer = get_optimizer(
optimizer_names[0],
optimizer_parameters[0],
config.lr,
parameters=self.primary_params,
)
self.secondary_optimizer = get_optimizer(
optimizer_names[1],
self.extract_optimizer_parameters(optimizer_parameters[1]),
optimizer_parameters[1]["lr"],
parameters=self.secondary_params,
)
self.param_groups = self.primary_optimizer.param_groups
def first_step(self):
self.secondary_optimizer.step()
self.secondary_optimizer.zero_grad()
self.primary_optimizer.zero_grad()
def step(self):
self.primary_optimizer.step()
def zero_grad(self):
self.primary_optimizer.zero_grad()
self.secondary_optimizer.zero_grad()
def load_state_dict(self, state_dict):
self.primary_optimizer.load_state_dict(state_dict[0])
self.secondary_optimizer.load_state_dict(state_dict[1])
def state_dict(self):
return [self.primary_optimizer.state_dict(), self.secondary_optimizer.state_dict()]
@staticmethod
def split_model_parameters(model_params: Generator) -> list:
primary_params = []
secondary_params = []
for name, param in model_params:
if param.requires_grad:
if name == "capacitron_vae_layer.beta":
secondary_params.append(param)
else:
primary_params.append(param)
return [iter(primary_params), iter(secondary_params)]
@staticmethod
def extract_optimizer_parameters(params: dict) -> dict:
"""Extract parameters that are not the learning rate"""
return {k: v for k, v in params.items() if k != "lr"}
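
A hedged, toy-sized sketch of how this wrapper is meant to be driven; the module path `TTS.utils.capacitron_optimizer`, the toy model, and the two-pass loss order are assumptions for illustration, not the exact Trainer flow.

```python
from types import SimpleNamespace

import torch

from TTS.utils.capacitron_optimizer import CapacitronOptimizer  # assumed module path


class TinyCapacitronModel(torch.nn.Module):
    """Toy stand-in exposing the `capacitron_vae_layer.beta` name the splitter looks for."""

    def __init__(self):
        super().__init__()
        self.linear = torch.nn.Linear(4, 4)
        self.capacitron_vae_layer = torch.nn.Module()
        self.capacitron_vae_layer.beta = torch.nn.Parameter(torch.tensor(1.0))


# config mirroring the Capacitron recipes later in this diff
config = SimpleNamespace(
    lr=1e-3,
    optimizer_params={
        "RAdam": {"betas": [0.9, 0.998], "weight_decay": 1e-6},
        "SGD": {"lr": 1e-5, "momentum": 0.9},
    },
)

model = TinyCapacitronModel()
opt = CapacitronOptimizer(config, model.named_parameters())

# pass 1: a loss touching only beta drives the secondary (SGD) optimizer
beta_loss = (model.capacitron_vae_layer.beta - 0.5) ** 2
beta_loss.backward()
opt.first_step()  # steps the secondary optimizer, then zeroes both gradient sets

# pass 2: the main loss drives the primary (RAdam) optimizer
main_loss = model.linear(torch.randn(2, 4)).pow(2).mean()
main_loss.backward()
opt.step()
opt.zero_grad()
```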

View File

@ -106,6 +106,8 @@ def save_model(config, model, optimizer, scaler, current_step, epoch, output_pat
model_state = model.state_dict() model_state = model.state_dict()
if isinstance(optimizer, list): if isinstance(optimizer, list):
optimizer_state = [optim.state_dict() for optim in optimizer] optimizer_state = [optim.state_dict() for optim in optimizer]
elif optimizer.__class__.__name__ == "CapacitronOptimizer":
optimizer_state = [optimizer.primary_optimizer.state_dict(), optimizer.secondary_optimizer.state_dict()]
else: else:
optimizer_state = optimizer.state_dict() if optimizer is not None else None optimizer_state = optimizer.state_dict() if optimizer is not None else None
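
As a hedged illustration of what the new branch stores: the optimizer entry of the checkpoint becomes a two-element list, which is exactly what `CapacitronOptimizer.load_state_dict` consumes. The file name and the "optimizer" key below are assumptions for illustration.

```python
import torch

# placeholder path; "optimizer" is assumed to be the key written by the surrounding checkpoint code
checkpoint = torch.load("checkpoint_10000.pth", map_location="cpu")
primary_state, secondary_state = checkpoint["optimizer"]

# `capacitron_optimizer` stands for an already-built CapacitronOptimizer instance
capacitron_optimizer.load_state_dict([primary_state, secondary_state])
```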

View File

@ -90,6 +90,81 @@ class ModelManager(object):
models_name_list.extend(model_list) models_name_list.extend(model_list)
return models_name_list return models_name_list
def model_info_by_idx(self, model_query):
"""Print the description of the model from .models.json file using model_idx
Args:
model_query (str): <model_type>/<model_idx>
"""
model_name_list = []
model_type, model_query_idx = model_query.split("/")
try:
model_query_idx = int(model_query_idx)
if model_query_idx <= 0:
print("> model_query_idx should be a positive integer!")
return
except ValueError:
print("> model_query_idx should be an integer!")
return
model_count = 0
if model_type in self.models_dict:
for lang in self.models_dict[model_type]:
for dataset in self.models_dict[model_type][lang]:
for model in self.models_dict[model_type][lang][dataset]:
model_name_list.append(f"{model_type}/{lang}/{dataset}/{model}")
model_count += 1
else:
print(f"> model_type {model_type} does not exist in the list.")
return
if model_query_idx > model_count:
print(f"model query idx exceeds the number of available models [{model_count}] ")
else:
model_type, lang, dataset, model = model_name_list[model_query_idx - 1].split("/")
print(f"> model type : {model_type}")
print(f"> language supported : {lang}")
print(f"> dataset used : {dataset}")
print(f"> model name : {model}")
if "description" in self.models_dict[model_type][lang][dataset][model]:
print(f"> description : {self.models_dict[model_type][lang][dataset][model]['description']}")
else:
print("> description : coming soon")
if "default_vocoder" in self.models_dict[model_type][lang][dataset][model]:
print(f"> default_vocoder : {self.models_dict[model_type][lang][dataset][model]['default_vocoder']}")
def model_info_by_full_name(self, model_query_name):
"""Print the description of the model from .models.json file using model_full_name
Args:
model_query_name (str): Format is <model_type>/<language>/<dataset>/<model_name>
"""
model_type, lang, dataset, model = model_query_name.split("/")
if model_type in self.models_dict:
if lang in self.models_dict[model_type]:
if dataset in self.models_dict[model_type][lang]:
if model in self.models_dict[model_type][lang][dataset]:
print(f"> model type : {model_type}")
print(f"> language supported : {lang}")
print(f"> dataset used : {dataset}")
print(f"> model name : {model}")
if "description" in self.models_dict[model_type][lang][dataset][model]:
print(
f"> description : {self.models_dict[model_type][lang][dataset][model]['description']}"
)
else:
print("> description : coming soon")
if "default_vocoder" in self.models_dict[model_type][lang][dataset][model]:
print(
f"> default_vocoder : {self.models_dict[model_type][lang][dataset][model]['default_vocoder']}"
)
else:
print(f"> model {model} does not exist for {model_type}/{lang}/{dataset}.")
else:
print(f"> dataset {dataset} does not exist for {model_type}/{lang}.")
else:
print(f"> lang {lang} does not exist for {model_type}.")
else:
print(f"> model_type {model_type} does not exist in the list.")
def list_tts_models(self): def list_tts_models(self):
"""Print all `TTS` models and return a list of model names """Print all `TTS` models and return a list of model names

View File

@ -1,5 +1,5 @@
import time import time
from typing import List, Union from typing import List
import numpy as np import numpy as np
import pysbd import pysbd
@ -97,10 +97,10 @@ class Synthesizer(object):
"""Load the TTS model. """Load the TTS model.
1. Load the model config. 1. Load the model config.
2. Init the AudioProcessor. 2. Init the model from the config.
3. Init the model from the config. 3. Load the model weights.
4. Move the model to the GPU if CUDA is enabled. 4. Move the model to the GPU if CUDA is enabled.
5. Init the speaker manager for the model. 5. Init the speaker manager in the model.
Args: Args:
tts_checkpoint (str): path to the model checkpoint. tts_checkpoint (str): path to the model checkpoint.
@ -122,7 +122,7 @@ class Synthesizer(object):
self.tts_model.cuda() self.tts_model.cuda()
if self.encoder_checkpoint and hasattr(self.tts_model, "speaker_manager"): if self.encoder_checkpoint and hasattr(self.tts_model, "speaker_manager"):
self.tts_model.speaker_manager.init_encoder(self.encoder_checkpoint, self.encoder_config) self.tts_model.speaker_manager.init_encoder(self.encoder_checkpoint, self.encoder_config, use_cuda)
def _set_speaker_encoder_paths_from_tts_config(self): def _set_speaker_encoder_paths_from_tts_config(self):
"""Set the encoder paths from the tts model config for models with speaker encoders.""" """Set the encoder paths from the tts model config for models with speaker encoders."""
@ -178,8 +178,9 @@ class Synthesizer(object):
text: str = "", text: str = "",
speaker_name: str = "", speaker_name: str = "",
language_name: str = "", language_name: str = "",
speaker_wav: Union[str, List[str]] = None, speaker_wav=None,
style_wav=None, style_wav=None,
style_text=None,
reference_wav=None, reference_wav=None,
reference_speaker_name=None, reference_speaker_name=None,
) -> List[int]: ) -> List[int]:
@ -191,6 +192,7 @@ class Synthesizer(object):
language_name (str, optional): language id for multi-language models. Defaults to "". language_name (str, optional): language id for multi-language models. Defaults to "".
speaker_wav (Union[str, List[str]], optional): path to the speaker wav. Defaults to None. speaker_wav (Union[str, List[str]], optional): path to the speaker wav. Defaults to None.
style_wav ([type], optional): style waveform for GST. Defaults to None. style_wav ([type], optional): style waveform for GST. Defaults to None.
style_text ([type], optional): transcription of style_wav for Capacitron. Defaults to None.
reference_wav ([type], optional): reference waveform for voice conversion. Defaults to None. reference_wav ([type], optional): reference waveform for voice conversion. Defaults to None.
reference_speaker_name ([type], optional): speaker id of reference waveform. Defaults to None. reference_speaker_name ([type], optional): speaker id of reference waveform. Defaults to None.
Returns: Returns:
@ -273,10 +275,11 @@ class Synthesizer(object):
CONFIG=self.tts_config, CONFIG=self.tts_config,
use_cuda=self.use_cuda, use_cuda=self.use_cuda,
speaker_id=speaker_id, speaker_id=speaker_id,
language_id=language_id,
style_wav=style_wav, style_wav=style_wav,
style_text=style_text,
use_griffin_lim=use_gl, use_griffin_lim=use_gl,
d_vector=speaker_embedding, d_vector=speaker_embedding,
language_id=language_id,
) )
waveform = outputs["wav"] waveform = outputs["wav"]
mel_postnet_spec = outputs["outputs"]["model_outputs"][0].detach().cpu().numpy() mel_postnet_spec = outputs["outputs"]["model_outputs"][0].detach().cpu().numpy()
@ -315,7 +318,7 @@ class Synthesizer(object):
# get the speaker embedding or speaker id for the reference wav file # get the speaker embedding or speaker id for the reference wav file
reference_speaker_embedding = None reference_speaker_embedding = None
reference_speaker_id = None reference_speaker_id = None
if self.tts_speakers_file or hasattr(self.tts_model.speaker_manager, "speaker_ids"): if self.tts_speakers_file or hasattr(self.tts_model.speaker_manager, "ids"):
if reference_speaker_name and isinstance(reference_speaker_name, str): if reference_speaker_name and isinstance(reference_speaker_name, str):
if self.tts_config.use_d_vector_file: if self.tts_config.use_d_vector_file:
# get the speaker embedding from the saved d_vectors. # get the speaker embedding from the saved d_vectors.
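
Since `style_text` is new in this signature, here is a short hedged example of how the extended `tts()` call might look for a Capacitron model; the checkpoint, config, and wav paths are placeholders.

```python
from TTS.utils.synthesizer import Synthesizer

# placeholder paths for a Capacitron model you have trained yourself
synthesizer = Synthesizer(
    tts_checkpoint="capacitron_model.pth",
    tts_config_path="capacitron_config.json",
    use_cuda=False,
)

wav = synthesizer.tts(
    text="Hello there, how are you today?",
    style_wav="prosody_reference.wav",             # reference audio for prosody
    style_text="Hello there, how are you today?",  # transcription of style_wav (Capacitron)
)
synthesizer.save_wav(wav, "output.wav")
```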

View File

@ -115,8 +115,8 @@ class GANDataset(Dataset):
audio, mel = self.cache[idx] audio, mel = self.cache[idx]
else: else:
audio = self.ap.load_wav(wavpath) audio = self.ap.load_wav(wavpath)
audio, _ = self._pad_short_samples(audio)
mel = self.ap.melspectrogram(audio) mel = self.ap.melspectrogram(audio)
audio, mel = self._pad_short_samples(audio, mel)
else: else:
# load precomputed features # load precomputed features

View File

@ -90,50 +90,26 @@ class GAN(BaseVocoder):
raise ValueError(" [!] Unexpected `optimizer_idx`.") raise ValueError(" [!] Unexpected `optimizer_idx`.")
if optimizer_idx == 0: if optimizer_idx == 0:
# GENERATOR # DISCRIMINATOR optimization
# generator pass # generator pass
y_hat = self.model_g(x)[:, :, : y.size(2)] y_hat = self.model_g(x)[:, :, : y.size(2)]
self.y_hat_g = y_hat # save for discriminator
y_hat_sub = None # cache for generator loss
y_sub = None # pylint: disable=W0201
self.y_hat_g = y_hat
self.y_hat_sub = None
self.y_sub_g = None
# PQMF formatting # PQMF formatting
if y_hat.shape[1] > 1: if y_hat.shape[1] > 1:
y_hat_sub = y_hat self.y_hat_sub = y_hat
y_hat = self.model_g.pqmf_synthesis(y_hat) y_hat = self.model_g.pqmf_synthesis(y_hat)
self.y_hat_g = y_hat # save for discriminator self.y_hat_g = y_hat # save for generator loss
y_sub = self.model_g.pqmf_analysis(y) self.y_sub_g = self.model_g.pqmf_analysis(y)
scores_fake, feats_fake, feats_real = None, None, None scores_fake, feats_fake, feats_real = None, None, None
if self.train_disc:
if len(signature(self.model_d.forward).parameters) == 2:
D_out_fake = self.model_d(y_hat, x)
else:
D_out_fake = self.model_d(y_hat)
D_out_real = None
if self.config.use_feat_match_loss:
with torch.no_grad():
D_out_real = self.model_d(y)
# format D outputs
if isinstance(D_out_fake, tuple):
scores_fake, feats_fake = D_out_fake
if D_out_real is None:
feats_real = None
else:
_, feats_real = D_out_real
else:
scores_fake = D_out_fake
feats_fake, feats_real = None, None
# compute losses
loss_dict = criterion[optimizer_idx](y_hat, y, scores_fake, feats_fake, feats_real, y_hat_sub, y_sub)
outputs = {"model_outputs": y_hat}
if optimizer_idx == 1:
# DISCRIMINATOR
if self.train_disc: if self.train_disc:
# use different samples for G and D trainings # use different samples for G and D trainings
if self.config.diff_samples_for_G_and_D: if self.config.diff_samples_for_G_and_D:
@ -177,6 +153,36 @@ class GAN(BaseVocoder):
loss_dict = criterion[optimizer_idx](scores_fake, scores_real) loss_dict = criterion[optimizer_idx](scores_fake, scores_real)
outputs = {"model_outputs": y_hat} outputs = {"model_outputs": y_hat}
if optimizer_idx == 1:
# GENERATOR loss
scores_fake, feats_fake, feats_real = None, None, None
if self.train_disc:
if len(signature(self.model_d.forward).parameters) == 2:
D_out_fake = self.model_d(self.y_hat_g, x)
else:
D_out_fake = self.model_d(self.y_hat_g)
D_out_real = None
if self.config.use_feat_match_loss:
with torch.no_grad():
D_out_real = self.model_d(y)
# format D outputs
if isinstance(D_out_fake, tuple):
scores_fake, feats_fake = D_out_fake
if D_out_real is None:
feats_real = None
else:
_, feats_real = D_out_real
else:
scores_fake = D_out_fake
feats_fake, feats_real = None, None
# compute losses
loss_dict = criterion[optimizer_idx](
self.y_hat_g, y, scores_fake, feats_fake, feats_real, self.y_hat_sub, self.y_sub_g
)
outputs = {"model_outputs": self.y_hat_g}
return outputs, loss_dict return outputs, loss_dict
@staticmethod @staticmethod
@ -210,6 +216,7 @@ class GAN(BaseVocoder):
@torch.no_grad() @torch.no_grad()
def eval_step(self, batch: Dict, criterion: nn.Module, optimizer_idx: int) -> Tuple[Dict, Dict]: def eval_step(self, batch: Dict, criterion: nn.Module, optimizer_idx: int) -> Tuple[Dict, Dict]:
"""Call `train_step()` with `no_grad()`""" """Call `train_step()` with `no_grad()`"""
self.train_disc = True # avoid a bug in training caused by the missing discriminator loss
return self.train_step(batch, criterion, optimizer_idx) return self.train_step(batch, criterion, optimizer_idx)
def eval_log( def eval_log(
@ -266,7 +273,7 @@ class GAN(BaseVocoder):
optimizer2 = get_optimizer( optimizer2 = get_optimizer(
self.config.optimizer, self.config.optimizer_params, self.config.lr_disc, self.model_d self.config.optimizer, self.config.optimizer_params, self.config.lr_disc, self.model_d
) )
return [optimizer1, optimizer2] return [optimizer2, optimizer1]
def get_lr(self) -> List: def get_lr(self) -> List:
"""Set the initial learning rates for each optimizer. """Set the initial learning rates for each optimizer.
@ -274,7 +281,7 @@ class GAN(BaseVocoder):
Returns: Returns:
List: learning rates for each optimizer. List: learning rates for each optimizer.
""" """
return [self.config.lr_gen, self.config.lr_disc] return [self.config.lr_disc, self.config.lr_gen]
def get_scheduler(self, optimizer) -> List: def get_scheduler(self, optimizer) -> List:
"""Set the schedulers for each optimizer. """Set the schedulers for each optimizer.
@ -287,7 +294,7 @@ class GAN(BaseVocoder):
""" """
scheduler1 = get_scheduler(self.config.lr_scheduler_gen, self.config.lr_scheduler_gen_params, optimizer[0]) scheduler1 = get_scheduler(self.config.lr_scheduler_gen, self.config.lr_scheduler_gen_params, optimizer[0])
scheduler2 = get_scheduler(self.config.lr_scheduler_disc, self.config.lr_scheduler_disc_params, optimizer[1]) scheduler2 = get_scheduler(self.config.lr_scheduler_disc, self.config.lr_scheduler_disc_params, optimizer[1])
return [scheduler1, scheduler2] return [scheduler2, scheduler1]
@staticmethod @staticmethod
def format_batch(batch: List) -> Dict: def format_batch(batch: List) -> Dict:
@ -359,7 +366,7 @@ class GAN(BaseVocoder):
def get_criterion(self): def get_criterion(self):
"""Return criterions for the optimizers""" """Return criterions for the optimizers"""
return [GeneratorLoss(self.config), DiscriminatorLoss(self.config)] return [DiscriminatorLoss(self.config), GeneratorLoss(self.config)]
@staticmethod @staticmethod
def init_from_config(config: Coqpit, verbose=True) -> "GAN": def init_from_config(config: Coqpit, verbose=True) -> "GAN":
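
To make the reordering above concrete, a minimal hedged sketch of the index convention after this change; `gan` and `batch` are placeholders and the real loop lives in the Trainer, so treat this as illustration only.

```python
optimizers = gan.get_optimizer()  # [discriminator_optimizer, generator_optimizer] after this change
criteria = gan.get_criterion()    # [DiscriminatorLoss(...), GeneratorLoss(...)]

for optimizer_idx, optimizer in enumerate(optimizers):
    # optimizer_idx == 0 -> discriminator update, optimizer_idx == 1 -> generator update
    outputs, loss_dict = gan.train_step(batch, criteria, optimizer_idx)
    loss_dict["loss"].backward()  # the "loss" key is assumed from the criterion return dicts
    optimizer.step()
    optimizer.zero_grad()
```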

View File

@ -59,8 +59,7 @@ We tried to collect common issues and questions we receive about 🐸TTS. It is
6. Train your model. 6. Train your model.
- SingleGPU training: ```CUDA_VISIBLE_DEVICES="0" python train_tts.py --config_path config.json``` - SingleGPU training: ```CUDA_VISIBLE_DEVICES="0" python train_tts.py --config_path config.json```
- MultiGPU training: ```CUDA_VISIBLE_DEVICES="0,1,2" python distribute.py --script train_tts.py --config_path config.json``` - MultiGPU training: ```python3 -m trainer.distribute --gpus "0,1" --script TTS/bin/train_tts.py --config_path config.json```
- This command uses all the GPUs given in ```CUDA_VISIBLE_DEVICES```. If you don't specify, it uses all the GPUs available.
**Note:** You can also train your model using pure 🐍 python. Check ```{eval-rst} :ref: 'tutorial_for_nervous_beginners'```. **Note:** You can also train your model using pure 🐍 python. Check ```{eval-rst} :ref: 'tutorial_for_nervous_beginners'```.

View File

@ -1,6 +1,6 @@
# Installation # Installation
🐸TTS supports python >=3.6 <=3.9 and tested on Ubuntu 18.10, 19.10, 20.10. 🐸TTS supports python >=3.7 <3.11.0 and is tested on Ubuntu 18.10, 19.10, 20.10.
## Using `pip` ## Using `pip`

View File

@ -2,7 +2,7 @@
1. Decide the model you want to use. 1. Decide the model you want to use.
Each model has a different set of pros and cons that define the run-time efficiency and the voice quality. It is up to you to decide what model servers your needs. Other than referring to the papers, one easy way is to test the 🐸TTS Each model has a different set of pros and cons that define the run-time efficiency and the voice quality. It is up to you to decide what model serves your needs. Other than referring to the papers, one easy way is to test the 🐸TTS
community models and see how fast and good each of the models. Or you can start a discussion on our communication channels. community models and see how fast and good each of the models. Or you can start a discussion on our communication channels.
2. Understand the configuration, its fields and values. 2. Understand the configuration, its fields and values.

View File

@ -1,6 +1,5 @@
{ {
"cells": [ "cells": [{
{
"cell_type": "markdown", "cell_type": "markdown",
"metadata": { "metadata": {
"Collapsed": "false" "Collapsed": "false"
@ -37,9 +36,7 @@
"import librosa.display\n", "import librosa.display\n",
"\n", "\n",
"from TTS.tts.layers import *\n", "from TTS.tts.layers import *\n",
"from TTS.utils.audio import AudioProcessor "from TTS.utils.audio import AudioProcessor\n",
\n",
"from TTS.tts.utils.generic_utils import setup_model\n", "from TTS.tts.utils.generic_utils import setup_model\n",
"from TTS.tts.utils.io import load_config\n", "from TTS.tts.utils.io import load_config\n",
"from TTS.tts.utils.text import text_to_sequence\n", "from TTS.tts.utils.text import text_to_sequence\n",

View File

@ -0,0 +1,272 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "45ea3ef5",
"metadata": {
"tags": []
},
"source": [
"# Easy Inferencing with 🐸 TTS ⚡\n",
"\n",
"#### You want to quicly synthesize speech using Coqui 🐸 TTS model?\n",
"\n",
"💡: Grab a pre-trained model and use it to synthesize speech using any speaker voice, including yours! ⚡\n",
"\n",
"🐸 TTS comes with a list of pretrained models and speaker voices. You can even start a local demo server that you can open it on your favorite web browser and 🗣️ .\n",
"\n",
"In this notebook, we will: \n",
"```\n",
"1. List available pre-trained 🐸 TTS models\n",
"2. Run a 🐸 TTS model\n",
"3. Listen to the synthesized wave 📣\n",
"4. Run multispeaker 🐸 TTS model \n",
"```\n",
"So, let's jump right in!\n"
]
},
{
"cell_type": "markdown",
"id": "a1e5c2a5-46eb-42fd-b550-2a052546857e",
"metadata": {},
"source": [
"## Install 🐸 TTS ⬇️"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fa2aec77",
"metadata": {},
"outputs": [],
"source": [
"! pip install -U pip\n",
"! pip install TTS"
]
},
{
"cell_type": "markdown",
"id": "8c07a273",
"metadata": {},
"source": [
"## ✅ List available pre-trained 🐸 TTS models\n",
"\n",
"Coqui 🐸TTS comes with a list of pretrained models for different model types (ex: TTS, vocoder), languages, datasets used for training and architectures. \n",
"\n",
"You can either use your own model or the release models under 🐸TTS.\n",
"\n",
"Use `tts --list_models` to find out the availble models.\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "608d203f",
"metadata": {},
"outputs": [],
"source": [
"! tts --list_models"
]
},
{
"cell_type": "markdown",
"id": "ed9dd7ab",
"metadata": {},
"source": [
"## ✅ Run a 🐸 TTS model\n",
"\n",
"#### **First things first**: Using a release model and default vocoder:\n",
"\n",
"You can simply copy the full model name from the list above and use it \n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cc9e4608-16ec-4dcd-bd6b-bd10d62286f8",
"metadata": {},
"outputs": [],
"source": [
"!tts --text \"hello world\" \\\n",
"--model_name \"tts_models/en/ljspeech/glow-tts\" \\\n",
"--out_path output.wav\n"
]
},
{
"cell_type": "markdown",
"id": "0ca2cb14-1aba-400e-a219-8ce44d9410be",
"metadata": {},
"source": [
"## 📣 Listen to the synthesized wave 📣"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5fe63ef4-9284-4461-9dda-1ca7483a8f9b",
"metadata": {},
"outputs": [],
"source": [
"import IPython\n",
"IPython.display.Audio(\"output.wav\")"
]
},
{
"cell_type": "markdown",
"id": "5e67d178-1ebe-49c7-9a47-0593251bdb96",
"metadata": {},
"source": [
"### **Second things second**:\n",
"\n",
"🔶 A TTS model can be either trained on a single speaker voice or multispeaker voices. This training choice is directly reflected on the inference ability and the available speaker voices that can be used to synthesize speech. \n",
"\n",
"🔶 If you want to run a multispeaker model from the released models list, you can first check the speaker ids using `--list_speaker_idx` flag and use this speaker voice to synthesize speech."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "87b18839-f750-4a61-bbb0-c964acaecab2",
"metadata": {},
"outputs": [],
"source": [
"# list the possible speaker IDs.\n",
"!tts --model_name \"tts_models/en/vctk/vits\" \\\n",
"--list_speaker_idxs \n"
]
},
{
"cell_type": "markdown",
"id": "c4365a9d-f922-4b14-88b0-d2b22a245b2e",
"metadata": {},
"source": [
"## 💬 Synthesize speech using speaker ID 💬"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "52be0403-d13e-4d9b-99c2-c10b85154063",
"metadata": {},
"outputs": [],
"source": [
"!tts --text \"Trying out specific speaker voice\"\\\n",
"--out_path spkr-out.wav --model_name \"tts_models/en/vctk/vits\" \\\n",
"--speaker_idx \"p341\""
]
},
{
"cell_type": "markdown",
"id": "894a560a-f9c8-48ce-aaa6-afdf516c01f6",
"metadata": {},
"source": [
"## 📣 Listen to the synthesized speaker specific wave 📣"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ed485b0a-dfd5-4a7e-a571-ebf74bdfc41d",
"metadata": {},
"outputs": [],
"source": [
"import IPython\n",
"IPython.display.Audio(\"spkr-out.wav\")"
]
},
{
"cell_type": "markdown",
"id": "84636a38-097e-4dad-933b-0aeaee650e92",
"metadata": {},
"source": [
"🔶 If you want to use an external speaker to synthesize speech, you need to supply `--speaker_wav` flag along with an external speaker encoder path and config file, as follows:"
]
},
{
"cell_type": "markdown",
"id": "cbdb15fa-123a-4282-a127-87b50dc70365",
"metadata": {},
"source": [
"First we need to get the speaker encoder model, its config and a referece `speaker_wav`"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e54f1b13-560c-4fed-bafd-e38ec9712359",
"metadata": {},
"outputs": [],
"source": [
"!wget https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json\n",
"!wget https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar\n",
"!wget https://github.com/coqui-ai/TTS/raw/speaker_encoder_model/tests/data/ljspeech/wavs/LJ001-0001.wav"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6dac1912-5054-4a68-8357-6d20fd99cb10",
"metadata": {},
"outputs": [],
"source": [
"!tts --model_name tts_models/multilingual/multi-dataset/your_tts \\\n",
"--encoder_path model_se.pth.tar \\\n",
"--encoder_config config_se.json \\\n",
"--speaker_wav LJ001-0001.wav \\\n",
"--text \"Are we not allowed to dim the lights so people can see that a bit better?\"\\\n",
"--out_path spkr-out.wav \\\n",
"--language_idx \"en\""
]
},
{
"cell_type": "markdown",
"id": "92ddce58-8aca-4f69-84c3-645ae1b12e7d",
"metadata": {},
"source": [
"## 📣 Listen to the synthesized speaker specific wave 📣"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cc889adc-9c71-4232-8e85-bfc8f76476f4",
"metadata": {},
"outputs": [],
"source": [
"import IPython\n",
"IPython.display.Audio(\"spkr-out.wav\")"
]
},
{
"cell_type": "markdown",
"id": "29101d01-0b01-4153-a216-5dae415a5dd6",
"metadata": {},
"source": [
"## 🎉 Congratulations! 🎉 You now know how to use a TTS model to synthesize speech! \n",
"Follow up with the next tutorials to learn more adnavced material."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@ -0,0 +1,454 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "f79d99ef",
"metadata": {},
"source": [
"# Train your first 🐸 TTS model 💫\n",
"\n",
"### 👋 Hello and welcome to Coqui (🐸) TTS\n",
"\n",
"The goal of this notebook is to show you a **typical workflow** for **training** and **testing** a TTS model with 🐸.\n",
"\n",
"Let's train a very small model on a very small amount of data so we can iterate quickly.\n",
"\n",
"In this notebook, we will:\n",
"\n",
"1. Download data and format it for 🐸 TTS.\n",
"2. Configure the training and testing runs.\n",
"3. Train a new model.\n",
"4. Test the model and display its performance.\n",
"\n",
"So, let's jump right in!\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fa2aec78",
"metadata": {},
"outputs": [],
"source": [
"## Install Coqui TTS\n",
"! pip install -U pip\n",
"! pip install TTS"
]
},
{
"cell_type": "markdown",
"id": "be5fe49c",
"metadata": {},
"source": [
"## ✅ Data Preparation\n",
"\n",
"### **First things first**: we need some data.\n",
"\n",
"We're training a Text-to-Speech model, so we need some _text_ and we need some _speech_. Specificially, we want _transcribed speech_. The speech must be divided into audio clips and each clip needs transcription. More details about data requirements such as recording characteristics, background noise abd vocabulary coverage can be found in the [🐸TTS documentation](https://tts.readthedocs.io/en/latest/formatting_your_dataset.html).\n",
"\n",
"If you have a single audio file and you need to **split** it into clips. It is also important to use a lossless audio file format to prevent compression artifacts. We recommend using **wav** file format.\n",
"\n",
"The data format we will be adopting for this tutorial is taken from the widely-used **LJSpeech** dataset, where **waves** are collected under a folder:\n",
"\n",
"<span style=\"color:purple;font-size:15px\">\n",
"/wavs<br /> \n",
" &emsp;| - audio1.wav<br /> \n",
" &emsp;| - audio2.wav<br /> \n",
" &emsp;| - audio3.wav<br /> \n",
" ...<br /> \n",
"</span>\n",
"\n",
"and a **metadata.csv** file will have the audio file name in parallel to the transcript, delimited by `|`: \n",
" \n",
"<span style=\"color:purple;font-size:15px\">\n",
"# metadata.csv <br /> \n",
"audio1|This is my sentence. <br /> \n",
"audio2|This is maybe my sentence. <br /> \n",
"audio3|This is certainly my sentence. <br /> \n",
"audio4|Let this be your sentence. <br /> \n",
"...\n",
"</span>\n",
"\n",
"In the end, we should have the following **folder structure**:\n",
"\n",
"<span style=\"color:purple;font-size:15px\">\n",
"/MyTTSDataset <br /> \n",
"&emsp;| <br /> \n",
"&emsp;| -> metadata.txt<br /> \n",
"&emsp;| -> /wavs<br /> \n",
"&emsp;&emsp;| -> audio1.wav<br /> \n",
"&emsp;&emsp;| -> audio2.wav<br /> \n",
"&emsp;&emsp;| ...<br /> \n",
"</span>"
]
},
{
"cell_type": "markdown",
"id": "69501a10-3b53-4e75-ae66-90221d6f2271",
"metadata": {},
"source": [
"🐸TTS already provides tooling for the _LJSpeech_. if you use the same format, you can start training your models right away. <br /> \n",
"\n",
"After you collect and format your dataset, you need to check two things. Whether you need a **_formatter_** and a **_text_cleaner_**. <br /> The **_formatter_** loads the text file (created above) as a list and the **_text_cleaner_** performs a sequence of text normalization operations that converts the raw text into the spoken representation (e.g. converting numbers to text, acronyms, and symbols to the spoken format).\n",
"\n",
"If you use a different dataset format then the LJSpeech or the other public datasets that 🐸TTS supports, then you need to write your own **_formatter_** and **_text_cleaner_**."
]
},
{
"cell_type": "markdown",
"id": "e7f226c8-4e55-48fa-937b-8415d539b17c",
"metadata": {},
"source": [
"## ⏳️ Loading your dataset\n",
"Load one of the dataset supported by 🐸TTS.\n",
"\n",
"We will start by defining dataset config and setting LJSpeech as our target dataset and define its path.\n"
]
},
{
"cell_type": "code",
"execution_count": 33,
"id": "b3cb0191-b8fc-4158-bd26-8423c2a8ba66",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"# BaseDatasetConfig: defines name, formatter and path of the dataset.\n",
"from TTS.tts.configs.shared_configs import BaseDatasetConfig\n",
"\n",
"output_path = \"tts_train_dir\"\n",
"if not os.path.exists(output_path):\n",
" os.makedirs(output_path)\n",
" "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ae6b7019-3685-4b48-8917-c152e288d7e3",
"metadata": {},
"outputs": [],
"source": [
"# Download and extract LJSpeech dataset.\n",
"\n",
"!wget -O $output_path/LJSpeech-1.1.tar.bz2 https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2 \n",
"!tar -xf $output_path/LJSpeech-1.1.tar.bz2 -C $output_path"
]
},
{
"cell_type": "code",
"execution_count": 31,
"id": "76cd3ab5-6387-45f1-b488-24734cc1beb5",
"metadata": {},
"outputs": [],
"source": [
"dataset_config = BaseDatasetConfig(\n",
" name=\"ljspeech\", meta_file_train=\"metadata.csv\", path=os.path.join(output_path, \"LJSpeech-1.1/\")\n",
")"
]
},
{
"cell_type": "markdown",
"id": "ae82fd75",
"metadata": {},
"source": [
"## ✅ Train a new model\n",
"\n",
"Let's kick off a training run 🚀🚀🚀.\n",
"\n",
"Deciding on the model architecture you'd want to use is based on your needs and available resources. Each model architecture has it's pros and cons that define the run-time efficiency and the voice quality.\n",
"We have many recipes under `TTS/recipes/` that provide a good starting point. For this tutorial, we will be using `GlowTTS`."
]
},
{
"cell_type": "markdown",
"id": "f5876e46-2aee-4bcf-b6b3-9e3c535c553f",
"metadata": {},
"source": [
"We will begin by initializing the model training configuration."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5483ca28-39d6-49f8-a18e-4fb53c50ad84",
"metadata": {},
"outputs": [],
"source": [
"# GlowTTSConfig: all model related values for training, validating and testing.\n",
"from TTS.tts.configs.glow_tts_config import GlowTTSConfig\n",
"config = GlowTTSConfig(\n",
" batch_size=32,\n",
" eval_batch_size=16,\n",
" num_loader_workers=4,\n",
" num_eval_loader_workers=4,\n",
" run_eval=True,\n",
" test_delay_epochs=-1,\n",
" epochs=100,\n",
" text_cleaner=\"phoneme_cleaners\",\n",
" use_phonemes=True,\n",
" phoneme_language=\"en-us\",\n",
" phoneme_cache_path=os.path.join(output_path, \"phoneme_cache\"),\n",
" print_step=25,\n",
" print_eval=False,\n",
" mixed_precision=True,\n",
" output_path=output_path,\n",
" datasets=[dataset_config],\n",
" save_step=1000,\n",
")"
]
},
{
"cell_type": "markdown",
"id": "b93ed377-80b7-447b-bd92-106bffa777ee",
"metadata": {},
"source": [
"Next we will initialize the audio processor which is used for feature extraction and audio I/O."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b1b12f61-f851-4565-84dd-7640947e04ab",
"metadata": {},
"outputs": [],
"source": [
"from TTS.utils.audio import AudioProcessor\n",
"ap = AudioProcessor.init_from_config(config)"
]
},
{
"cell_type": "markdown",
"id": "1d461683-b05e-403f-815f-8007bda08c38",
"metadata": {},
"source": [
"Next we will initialize the tokenizer which is used to convert text to sequences of token IDs. If characters are not defined in the config, default characters are passed to the config."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "014879b7-f18d-44c0-b24a-e10f8002113a",
"metadata": {},
"outputs": [],
"source": [
"from TTS.tts.utils.text.tokenizer import TTSTokenizer\n",
"tokenizer, config = TTSTokenizer.init_from_config(config)"
]
},
{
"cell_type": "markdown",
"id": "df3016e1-9e99-4c4f-94e3-fa89231fd978",
"metadata": {},
"source": [
"Next we will load data samples. Each sample is a list of ```[text, audio_file_path, speaker_name]```. You can define your custom sample loader returning the list of samples."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cadd6ada-c8eb-4f79-b8fe-6d72850af5a7",
"metadata": {},
"outputs": [],
"source": [
"from TTS.tts.datasets import load_tts_samples\n",
"train_samples, eval_samples = load_tts_samples(\n",
" dataset_config,\n",
" eval_split=True,\n",
" eval_split_max_size=config.eval_split_max_size,\n",
" eval_split_size=config.eval_split_size,\n",
")"
]
},
{
"cell_type": "markdown",
"id": "db8b451e-1fe1-4aa3-b69e-ab22b925bd19",
"metadata": {},
"source": [
"Now we're ready to initialize the model.\n",
"\n",
"Models take a config object and a speaker manager as input. Config defines the details of the model like the number of layers, the size of the embedding, etc. Speaker manager is used by multi-speaker models."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ac2ffe3e-ad0c-443e-800c-9b076ee811b4",
"metadata": {},
"outputs": [],
"source": [
"from TTS.tts.models.glow_tts import GlowTTS\n",
"model = GlowTTS(config, ap, tokenizer, speaker_manager=None)"
]
},
{
"cell_type": "markdown",
"id": "e2832c56-889d-49a6-95b6-eb231892ecc6",
"metadata": {},
"source": [
"Trainer provides a generic API to train all the 🐸TTS models with all its perks like mixed-precision training, distributed training, etc."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0f609945-4fe0-4d0d-b95e-11d7bfb63ebe",
"metadata": {},
"outputs": [],
"source": [
"from trainer import Trainer, TrainerArgs\n",
"trainer = Trainer(\n",
" TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples\n",
")"
]
},
{
"cell_type": "markdown",
"id": "5b320831-dd83-429b-bb6a-473f9d49d321",
"metadata": {},
"source": [
"### AND... 3,2,1... START TRAINING 🚀🚀🚀"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d4c07f99-3d1d-4bea-801e-9f33bbff0e9f",
"metadata": {},
"outputs": [],
"source": [
"trainer.fit()"
]
},
{
"cell_type": "markdown",
"id": "4cff0c40-2734-40a6-a905-e945a9fb3e98",
"metadata": {},
"source": [
"#### 🚀 Run the Tensorboard. 🚀\n",
"On the notebook and Tensorboard, you can monitor the progress of your model. Also Tensorboard provides certain figures and sample outputs."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5a85cd3b-1646-40ad-a6c2-49323e08eeec",
"metadata": {},
"outputs": [],
"source": [
"!pip install tensorboard\n",
"!tensorboard --logdir=tts_train_dir"
]
},
{
"cell_type": "markdown",
"id": "9f6dc959",
"metadata": {},
"source": [
"## ✅ Test the model\n",
"\n",
"We made it! 🙌\n",
"\n",
"Let's kick off the testing run, which displays performance metrics.\n",
"\n",
"We're committing the cardinal sin of ML 😈 (aka - testing on our training data) so you don't want to deploy this model into production. In this notebook we're focusing on the workflow itself, so it's forgivable 😇\n",
"\n",
"You can see from the test output that our tiny model has overfit to the data, and basically memorized this one sentence.\n",
"\n",
"When you start training your own models, make sure your testing data doesn't include your training data 😅"
]
},
{
"cell_type": "markdown",
"id": "99fada7a-592f-4a09-9369-e6f3d82de3a0",
"metadata": {},
"source": [
"Let's get the latest saved checkpoint. "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6dd47ed5-da8e-4bf9-b524-d686630d6961",
"metadata": {},
"outputs": [],
"source": [
"import glob, os\n",
"output_path = \"tts_train_dir\"\n",
"ckpts = sorted([f for f in glob.glob(output_path+\"/*/*.pth\")])\n",
"configs = sorted([f for f in glob.glob(output_path+\"/*/*.json\")])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "dd42bc7a",
"metadata": {},
"outputs": [],
"source": [
" !tts --text \"Text for TTS\" \\\n",
" --model_path $test_ckpt \\\n",
" --config_path $test_config \\\n",
" --out_path out.wav"
]
},
{
"cell_type": "markdown",
"id": "81cbcb3f-d952-469b-a0d8-8941cd7af670",
"metadata": {},
"source": [
"## 📣 Listen to the synthesized wave 📣"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e0000bd6-6763-4a10-a74d-911dd08ebcff",
"metadata": {},
"outputs": [],
"source": [
"import IPython\n",
"IPython.display.Audio(\"out.wav\")"
]
},
{
"cell_type": "markdown",
"id": "13914401-cad1-494a-b701-474e52829138",
"metadata": {},
"source": [
"## 🎉 Congratulations! 🎉 You now have trained your first TTS model! \n",
"Follow up with the next tutorials to learn more advanced material."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "950d9fc6-896f-4a2c-86fd-8fd1fcbbb3f7",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@ -1,5 +1,5 @@
[build-system] [build-system]
requires = ["setuptools", "wheel", "Cython", "numpy==1.19.5"] requires = ["setuptools", "wheel", "cython==0.29.28", "numpy==1.21.6"]
[flake8] [flake8]
max-line-length=120 max-line-length=120

View File

@ -0,0 +1,12 @@
# How to get the Blizzard 2013 Dataset
The Capacitron model is a variational encoder extension of standard Tacotron based models to model prosody.
To take full advantage of the model, it is advised to train the model with a dataset that contains a significant amount of prosodic information in the utterances. A tested candidate for such applications is the blizzard2013 dataset from the Blizzard Challenge, containing many hours of high quality audio book recordings.
To get a license and download link for this dataset, you need to visit the [website](https://www.cstr.ed.ac.uk/projects/blizzard/2013/lessac_blizzard2013/license.html) of the Centre for Speech Technology Research of the University of Edinburgh.
You will get access to the raw dataset within a couple of days. There are a few preprocessing steps you need to complete before you can use the high fidelity dataset.
1. Get the forced time alignments for the blizzard dataset from [here](https://github.com/mueller91/tts_alignments).
2. Segment the high fidelity audio-book files based on the instructions [here](https://github.com/Tomiinek/Blizzard2013_Segmentation).

View File

@ -0,0 +1,101 @@
import os
from trainer import Trainer, TrainerArgs
from TTS.config.shared_configs import BaseAudioConfig
from TTS.tts.configs.shared_configs import BaseDatasetConfig, CapacitronVAEConfig
from TTS.tts.configs.tacotron_config import TacotronConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.tacotron import Tacotron
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor
output_path = os.path.dirname(os.path.abspath(__file__))
data_path = "/srv/data/"
# Using LJSpeech like dataset processing for the blizzard dataset
dataset_config = BaseDatasetConfig(name="ljspeech", meta_file_train="metadata.csv", path=data_path)
audio_config = BaseAudioConfig(
sample_rate=24000,
do_trim_silence=True,
trim_db=60.0,
signal_norm=True,
mel_fmin=80.0,
mel_fmax=12000,
spec_gain=20.0,
log_func="np.log10",
ref_level_db=20,
preemphasis=0.0,
min_level_db=-100,
)
# Using the standard Capacitron config
capacitron_config = CapacitronVAEConfig(capacitron_VAE_loss_alpha=1.0)
config = TacotronConfig(
run_name="Blizzard-Capacitron-T1",
audio=audio_config,
capacitron_vae=capacitron_config,
use_capacitron_vae=True,
batch_size=128, # Tune this to your gpu
max_audio_len=6 * 24000, # Tune this to your gpu
min_audio_len=0.5 * 24000,
eval_batch_size=16,
num_loader_workers=12,
num_eval_loader_workers=8,
precompute_num_workers=24,
run_eval=True,
test_delay_epochs=5,
ga_alpha=0.0,
r=2,
optimizer="CapacitronOptimizer",
optimizer_params={"RAdam": {"betas": [0.9, 0.998], "weight_decay": 1e-6}, "SGD": {"lr": 1e-5, "momentum": 0.9}},
attention_type="graves",
attention_heads=5,
epochs=1000,
text_cleaner="phoneme_cleaners",
use_phonemes=True,
phoneme_language="en-us",
phonemizer="espeak",
phoneme_cache_path=os.path.join(data_path, "phoneme_cache"),
stopnet_pos_weight=15,
print_step=50,
print_eval=True,
mixed_precision=False,
output_path=output_path,
datasets=[dataset_config],
lr=1e-3,
lr_scheduler="StepwiseGradualLR",
lr_scheduler_params={"gradual_learning_rates": [[0, 1e-3], [2e4, 5e-4], [4e5, 3e-4], [6e4, 1e-4], [8e4, 5e-5]]},
scheduler_after_epoch=False, # scheduler doesn't work without this flag
# Need to experiment with these below for capacitron
loss_masking=False,
decoder_loss_alpha=1.0,
postnet_loss_alpha=1.0,
postnet_diff_spec_alpha=0.0,
decoder_diff_spec_alpha=0.0,
decoder_ssim_alpha=0.0,
postnet_ssim_alpha=0.0,
)
ap = AudioProcessor(**config.audio.to_dict())
tokenizer, config = TTSTokenizer.init_from_config(config)
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
model = Tacotron(config, ap, tokenizer, speaker_manager=None)
trainer = Trainer(
TrainerArgs(),
config,
output_path,
model=model,
train_samples=train_samples,
eval_samples=eval_samples,
)
# 🚀
trainer.fit()

View File

@ -0,0 +1,117 @@
import os
from trainer import Trainer, TrainerArgs
from TTS.config.shared_configs import BaseAudioConfig
from TTS.tts.configs.shared_configs import BaseDatasetConfig, CapacitronVAEConfig
from TTS.tts.configs.tacotron2_config import Tacotron2Config
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.tacotron2 import Tacotron2
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor
output_path = os.path.dirname(os.path.abspath(__file__))
data_path = "/srv/data/blizzard2013/segmented"
# Using LJSpeech like dataset processing for the blizzard dataset
dataset_config = BaseDatasetConfig(
name="ljspeech",
meta_file_train="metadata.csv",
path=data_path,
)
audio_config = BaseAudioConfig(
sample_rate=24000,
do_trim_silence=True,
trim_db=60.0,
signal_norm=True,
mel_fmin=80.0,
mel_fmax=12000,
spec_gain=25.0,
log_func="np.log10",
ref_level_db=20,
preemphasis=0.0,
min_level_db=-100,
)
# Using the standard Capacitron config
capacitron_config = CapacitronVAEConfig(capacitron_VAE_loss_alpha=1.0)
config = Tacotron2Config(
run_name="Blizzard-Capacitron-T2",
audio=audio_config,
capacitron_vae=capacitron_config,
use_capacitron_vae=True,
batch_size=246, # Tune this to your gpu
max_audio_len=6 * 24000, # Tune this to your gpu
min_audio_len=1 * 24000,
eval_batch_size=16,
num_loader_workers=12,
num_eval_loader_workers=8,
precompute_num_workers=24,
run_eval=True,
test_delay_epochs=5,
ga_alpha=0.0,
r=2,
optimizer="CapacitronOptimizer",
optimizer_params={"RAdam": {"betas": [0.9, 0.998], "weight_decay": 1e-6}, "SGD": {"lr": 1e-5, "momentum": 0.9}},
attention_type="dynamic_convolution",
grad_clip=0.0, # Important! We overwrite the standard grad_clip with capacitron_grad_clip
double_decoder_consistency=False,
epochs=1000,
text_cleaner="phoneme_cleaners",
use_phonemes=True,
phoneme_language="en-us",
phonemizer="espeak",
phoneme_cache_path=os.path.join(data_path, "phoneme_cache"),
stopnet_pos_weight=15,
print_step=25,
print_eval=True,
mixed_precision=False,
output_path=output_path,
datasets=[dataset_config],
lr=1e-3,
lr_scheduler="StepwiseGradualLR",
lr_scheduler_params={
"gradual_learning_rates": [
[0, 1e-3],
[2e4, 5e-4],
[4e5, 3e-4],
[6e4, 1e-4],
[8e4, 5e-5],
]
},
scheduler_after_epoch=False, # scheduler doesn't work without this flag
# dashboard_logger='wandb',
# sort_by_audio_len=True,
seq_len_norm=True,
# Need to experiment with these below for capacitron
loss_masking=False,
decoder_loss_alpha=1.0,
postnet_loss_alpha=1.0,
postnet_diff_spec_alpha=0.0,
decoder_diff_spec_alpha=0.0,
decoder_ssim_alpha=0.0,
postnet_ssim_alpha=0.0,
)
ap = AudioProcessor(**config.audio.to_dict())
tokenizer, config = TTSTokenizer.init_from_config(config)
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
model = Tacotron2(config, ap, tokenizer, speaker_manager=None)
trainer = Trainer(
TrainerArgs(),
config,
output_path,
model=model,
train_samples=train_samples,
eval_samples=eval_samples,
training_assets={"audio_processor": ap},
)
trainer.fit()

View File

@ -0,0 +1,115 @@
import os
from trainer import Trainer, TrainerArgs
from TTS.config.shared_configs import BaseAudioConfig
from TTS.tts.configs.shared_configs import BaseDatasetConfig, CapacitronVAEConfig
from TTS.tts.configs.tacotron2_config import Tacotron2Config
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.tacotron2 import Tacotron2
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor
output_path = os.path.dirname(os.path.abspath(__file__))
data_path = "/srv/data/"
# Using LJSpeech like dataset processing for the blizzard dataset
dataset_config = BaseDatasetConfig(
name="ljspeech",
meta_file_train="metadata.csv",
path=data_path,
)
audio_config = BaseAudioConfig(
sample_rate=22050,
do_trim_silence=True,
trim_db=60.0,
signal_norm=False,
mel_fmin=0.0,
mel_fmax=11025,
spec_gain=1.0,
log_func="np.log",
ref_level_db=20,
preemphasis=0.0,
)
# Using the standard Capacitron config
capacitron_config = CapacitronVAEConfig(capacitron_VAE_loss_alpha=1.0, capacitron_capacity=50)
config = Tacotron2Config(
run_name="Capacitron-Tacotron2",
audio=audio_config,
capacitron_vae=capacitron_config,
use_capacitron_vae=True,
batch_size=128, # Tune this to your gpu
max_audio_len=8 * 22050, # Tune this to your gpu
min_audio_len=1 * 22050,
eval_batch_size=16,
num_loader_workers=8,
num_eval_loader_workers=8,
precompute_num_workers=24,
run_eval=True,
test_delay_epochs=25,
ga_alpha=0.0,
r=2,
optimizer="CapacitronOptimizer",
optimizer_params={"RAdam": {"betas": [0.9, 0.998], "weight_decay": 1e-6}, "SGD": {"lr": 1e-5, "momentum": 0.9}},
attention_type="dynamic_convolution",
grad_clip=0.0, # Important! We overwrite the standard grad_clip with capacitron_grad_clip
double_decoder_consistency=False,
epochs=1000,
text_cleaner="phoneme_cleaners",
use_phonemes=True,
phoneme_language="en-us",
phonemizer="espeak",
phoneme_cache_path=os.path.join(data_path, "phoneme_cache"),
stopnet_pos_weight=15,
print_step=25,
print_eval=True,
mixed_precision=False,
sort_by_audio_len=True,
seq_len_norm=True,
output_path=output_path,
datasets=[dataset_config],
lr=1e-3,
lr_scheduler="StepwiseGradualLR",
lr_scheduler_params={
"gradual_learning_rates": [
[0, 1e-3],
[2e4, 5e-4],
[4e5, 3e-4],
[6e4, 1e-4],
[8e4, 5e-5],
]
},
scheduler_after_epoch=False, # scheduler doesn't work without this flag
# Need to experiment with these below for capacitron
loss_masking=False,
decoder_loss_alpha=1.0,
postnet_loss_alpha=1.0,
postnet_diff_spec_alpha=0.0,
decoder_diff_spec_alpha=0.0,
decoder_ssim_alpha=0.0,
postnet_ssim_alpha=0.0,
)
ap = AudioProcessor(**config.audio.to_dict())
tokenizer, config = TTSTokenizer.init_from_config(config)
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
model = Tacotron2(config, ap, tokenizer, speaker_manager=None)
trainer = Trainer(
TrainerArgs(),
config,
output_path,
model=model,
train_samples=train_samples,
eval_samples=eval_samples,
training_assets={"audio_processor": ap},
)
trainer.fit()

View File

@ -0,0 +1,15 @@
# 🐸💬 TTS Thorsten Recipes
For running the recipes you need the [Thorsten-Voice](https://github.com/thorstenMueller/Thorsten-Voice) dataset.
You can download it manually from [the official website](https://www.thorsten-voice.de/), use ```download_thorsten_de.sh```, or simply run any of the **train_modelX.py** scripts, which will download the dataset if it is not already present.
Then, go to your desired model folder and run the training.
Run the Python training script (choose the desired GPU ID for your run and set ```CUDA_VISIBLE_DEVICES```):
```terminal
CUDA_VISIBLE_DEVICES="0" python train_modelX.py
```
💡 Note that these runs are just templates to help you start training your first model. They are not optimized for the best
result. Double-check the configurations and feel free to share your experiments to find better parameters together 💪.

View File

@ -0,0 +1,84 @@
import os
from trainer import Trainer, TrainerArgs
from TTS.tts.configs.align_tts_config import AlignTTSConfig
from TTS.tts.configs.shared_configs import BaseDatasetConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.align_tts import AlignTTS
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor
from TTS.utils.downloaders import download_thorsten_de
output_path = os.path.dirname(os.path.abspath(__file__))
# init configs
dataset_config = BaseDatasetConfig(
name="thorsten", meta_file_train="metadata.csv", path=os.path.join(output_path, "../thorsten-de/")
)
# download dataset if not already present
if not os.path.exists(dataset_config.path):
print("Downloading dataset")
download_thorsten_de(os.path.split(os.path.abspath(dataset_config.path))[0])
config = AlignTTSConfig(
batch_size=32,
eval_batch_size=16,
num_loader_workers=4,
num_eval_loader_workers=4,
run_eval=True,
test_delay_epochs=-1,
epochs=1000,
text_cleaner="phoneme_cleaners",
use_phonemes=False,
phoneme_language="de",
phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
print_step=25,
print_eval=True,
mixed_precision=False,
test_sentences=[
"Es hat mich viel Zeit gekostet ein Stimme zu entwickeln, jetzt wo ich sie habe werde ich nicht mehr schweigen.",
"Sei eine Stimme, kein Echo.",
"Es tut mir Leid David. Das kann ich leider nicht machen.",
"Dieser Kuchen ist großartig. Er ist so lecker und feucht.",
"Vor dem 22. November 1963.",
],
output_path=output_path,
datasets=[dataset_config],
)
# INITIALIZE THE AUDIO PROCESSOR
# Audio processor is used for feature extraction and audio I/O.
# It mainly serves to the dataloader and the training loggers.
ap = AudioProcessor.init_from_config(config)
# INITIALIZE THE TOKENIZER
# Tokenizer is used to convert text to sequences of token IDs.
# If characters are not defined in the config, default characters are passed to the config
tokenizer, config = TTSTokenizer.init_from_config(config)
# LOAD DATA SAMPLES
# Each sample is a list of ```[text, audio_file_path, speaker_name]```
# You can define your custom sample loader returning the list of samples.
# Or define your custom formatter and pass it to the `load_tts_samples`.
# Check `TTS.tts.datasets.load_tts_samples` for more details.
train_samples, eval_samples = load_tts_samples(
dataset_config,
eval_split=True,
eval_split_max_size=config.eval_split_max_size,
eval_split_size=config.eval_split_size,
)
# init model
model = AlignTTS(config, ap, tokenizer)
# INITIALIZE THE TRAINER
# Trainer provides a generic API to train all the 🐸TTS models with all its perks like mixed-precision training,
# distributed training, etc.
trainer = Trainer(
TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
)
# AND... 3,2,1... 🚀
trainer.fit()

View File

@ -0,0 +1,21 @@
# create venv
python3 -m venv env
source env/bin/activate
pip install pip --upgrade
# download Thorsten_DE dataset
pip install gdown
gdown --id 1yKJM1LAOQpRVojKunD9r8WN_p5KzBxjc -O dataset.tgz
tar -xzf dataset.tgz
# create train-val splits
shuf LJSpeech-1.1/metadata.csv > LJSpeech-1.1/metadata_shuf.csv
head -n 20668 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_train.csv
tail -n 2000 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_val.csv
# rename dataset and remove archive
mv LJSpeech-1.1 thorsten-de
rm dataset.tgz
# destroy venv
rm -rf env

View File

@ -0,0 +1,97 @@
import os
# Trainer: Where the ✨️ happens.
# TrainingArgs: Defines the set of arguments of the Trainer.
from trainer import Trainer, TrainerArgs
# GlowTTSConfig: all model related values for training, validating and testing.
from TTS.tts.configs.glow_tts_config import GlowTTSConfig
# BaseDatasetConfig: defines name, formatter and path of the dataset.
from TTS.tts.configs.shared_configs import BaseDatasetConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.glow_tts import GlowTTS
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor
from TTS.utils.downloaders import download_thorsten_de
# we use the same path as this script as our training folder.
output_path = os.path.dirname(os.path.abspath(__file__))
# DEFINE DATASET CONFIG
# Set LJSpeech as our target dataset and define its path.
# You can also use a simple Dict to define the dataset and pass it to your custom formatter.
dataset_config = BaseDatasetConfig(
name="thorsten", meta_file_train="metadata.csv", path=os.path.join(output_path, "../thorsten-de/")
)
# download dataset if not already present
if not os.path.exists(dataset_config.path):
print("Downloading dataset")
download_thorsten_de(os.path.split(os.path.abspath(dataset_config.path))[0])
# INITIALIZE THE TRAINING CONFIGURATION
# Configure the model. Every config class inherits the BaseTTSConfig.
config = GlowTTSConfig(
batch_size=32,
eval_batch_size=16,
num_loader_workers=4,
num_eval_loader_workers=4,
run_eval=True,
test_delay_epochs=-1,
epochs=1000,
text_cleaner="phoneme_cleaners",
use_phonemes=True,
phoneme_language="de",
phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
print_step=25,
print_eval=False,
mixed_precision=True,
test_sentences=[
"Es hat mich viel Zeit gekostet ein Stimme zu entwickeln, jetzt wo ich sie habe werde ich nicht mehr schweigen.",
"Sei eine Stimme, kein Echo.",
"Es tut mir Leid David. Das kann ich leider nicht machen.",
"Dieser Kuchen ist großartig. Er ist so lecker und feucht.",
"Vor dem 22. November 1963.",
],
output_path=output_path,
datasets=[dataset_config],
)
# INITIALIZE THE AUDIO PROCESSOR
# Audio processor is used for feature extraction and audio I/O.
# It mainly serves to the dataloader and the training loggers.
ap = AudioProcessor.init_from_config(config)
# INITIALIZE THE TOKENIZER
# Tokenizer is used to convert text to sequences of token IDs.
# If characters are not defined in the config, default characters are passed to the config
tokenizer, config = TTSTokenizer.init_from_config(config)
# LOAD DATA SAMPLES
# Each sample is a list of ```[text, audio_file_path, speaker_name]```
# You can define your custom sample loader returning the list of samples.
# Or define your custom formatter and pass it to the `load_tts_samples`.
# Check `TTS.tts.datasets.load_tts_samples` for more details.
train_samples, eval_samples = load_tts_samples(
dataset_config,
eval_split=True,
eval_split_max_size=config.eval_split_max_size,
eval_split_size=config.eval_split_size,
)
# INITIALIZE THE MODEL
# Models take a config object and a speaker manager as input
# Config defines the details of the model like the number of layers, the size of the embedding, etc.
# Speaker manager is used by multi-speaker models.
model = GlowTTS(config, ap, tokenizer, speaker_manager=None)
# INITIALIZE THE TRAINER
# Trainer provides a generic API to train all the 🐸TTS models with all its perks like mixed-precision training,
# distributed training, etc.
trainer = Trainer(
TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
)
# AND... 3,2,1... 🚀
trainer.fit()

View File

@ -0,0 +1,53 @@
import os
from trainer import Trainer, TrainerArgs
from TTS.utils.audio import AudioProcessor
from TTS.utils.downloaders import download_thorsten_de
from TTS.vocoder.configs import HifiganConfig
from TTS.vocoder.datasets.preprocess import load_wav_data
from TTS.vocoder.models.gan import GAN
output_path = os.path.dirname(os.path.abspath(__file__))
config = HifiganConfig(
batch_size=32,
eval_batch_size=16,
num_loader_workers=4,
num_eval_loader_workers=4,
run_eval=True,
test_delay_epochs=5,
epochs=1000,
seq_len=8192,
pad_short=2000,
use_noise_augment=True,
eval_split_size=10,
print_step=25,
print_eval=False,
mixed_precision=False,
lr_gen=1e-4,
lr_disc=1e-4,
data_path=os.path.join(output_path, "../thorsten-de/wavs/"),
output_path=output_path,
)
# download dataset if not already present
if not os.path.exists(config.data_path):
print("Downloading dataset")
download_path = os.path.abspath(os.path.join(os.path.abspath(config.data_path), "../../"))
download_thorsten_de(download_path)
# init audio processor
ap = AudioProcessor(**config.audio.to_dict())
# load training samples
eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size)
# init model
model = GAN(config, ap)
# init the trainer and 🚀
trainer = Trainer(
TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
)
trainer.fit()

View File

@ -0,0 +1,53 @@
import os
from trainer import Trainer, TrainerArgs
from TTS.utils.audio import AudioProcessor
from TTS.utils.downloaders import download_thorsten_de
from TTS.vocoder.configs import MultibandMelganConfig
from TTS.vocoder.datasets.preprocess import load_wav_data
from TTS.vocoder.models.gan import GAN
output_path = os.path.dirname(os.path.abspath(__file__))
config = MultibandMelganConfig(
batch_size=32,
eval_batch_size=16,
num_loader_workers=4,
num_eval_loader_workers=4,
run_eval=True,
test_delay_epochs=5,
epochs=1000,
seq_len=8192,
pad_short=2000,
use_noise_augment=True,
eval_split_size=10,
print_step=25,
print_eval=False,
mixed_precision=False,
lr_gen=1e-4,
lr_disc=1e-4,
data_path=os.path.join(output_path, "../thorsten-de/wavs/"),
output_path=output_path,
)
# download dataset if not already present
if not os.path.exists(config.data_path):
print("Downloading dataset")
download_path = os.path.abspath(os.path.join(os.path.abspath(config.data_path), "../../"))
download_thorsten_de(download_path)
# init audio processor
ap = AudioProcessor(**config.audio.to_dict())
# load training samples
eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size)
# init model
model = GAN(config, ap)
# init the trainer and 🚀
trainer = Trainer(
TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
)
trainer.fit()

View File

@ -0,0 +1,102 @@
import os
from trainer import Trainer, TrainerArgs
from TTS.config import BaseAudioConfig, BaseDatasetConfig
from TTS.tts.configs.speedy_speech_config import SpeedySpeechConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.forward_tts import ForwardTTS
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor
from TTS.utils.downloaders import download_thorsten_de
output_path = os.path.dirname(os.path.abspath(__file__))
dataset_config = BaseDatasetConfig(
name="thorsten", meta_file_train="metadata.csv", path=os.path.join(output_path, "../thorsten-de/")
)
# download dataset if not already present
if not os.path.exists(dataset_config.path):
print("Downloading dataset")
download_thorsten_de(os.path.split(os.path.abspath(dataset_config.path))[0])
audio_config = BaseAudioConfig(
sample_rate=22050,
do_trim_silence=True,
trim_db=60.0,
signal_norm=False,
mel_fmin=0.0,
mel_fmax=8000,
spec_gain=1.0,
log_func="np.log",
ref_level_db=20,
preemphasis=0.0,
)
config = SpeedySpeechConfig(
run_name="speedy_speech_thorsten-de",
audio=audio_config,
batch_size=32,
eval_batch_size=16,
num_loader_workers=4,
num_eval_loader_workers=4,
compute_input_seq_cache=True,
run_eval=True,
test_delay_epochs=-1,
epochs=1000,
min_audio_len=11050,  # min_audio_len must be raised to avoid a SpeedySpeech error (see the note after this recipe)
text_cleaner="phoneme_cleaners",
use_phonemes=True,
phoneme_language="de",
phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
precompute_num_workers=4,
print_step=50,
print_eval=False,
mixed_precision=False,
test_sentences=[
"Es hat mich viel Zeit gekostet ein Stimme zu entwickeln, jetzt wo ich sie habe werde ich nicht mehr schweigen.",
"Sei eine Stimme, kein Echo.",
"Es tut mir Leid David. Das kann ich leider nicht machen.",
"Dieser Kuchen ist großartig. Er ist so lecker und feucht.",
"Vor dem 22. November 1963.",
],
sort_by_audio_len=True,
max_seq_len=500000,
output_path=output_path,
datasets=[dataset_config],
)
# INITIALIZE THE AUDIO PROCESSOR
# Audio processor is used for feature extraction and audio I/O.
# It mainly serves the dataloader and the training loggers.
ap = AudioProcessor.init_from_config(config)
# INITIALIZE THE TOKENIZER
# Tokenizer is used to convert text to sequences of token IDs.
# If characters are not defined in the config, default characters are added to the config.
tokenizer, config = TTSTokenizer.init_from_config(config)
# LOAD DATA SAMPLES
# Each sample is a list of `[text, audio_file_path, speaker_name]`.
# You can define a custom sample loader that returns the list of samples,
# or define a custom formatter and pass it to `load_tts_samples`.
# Check `TTS.tts.datasets.load_tts_samples` for more details.
train_samples, eval_samples = load_tts_samples(
dataset_config,
eval_split=True,
eval_split_max_size=config.eval_split_max_size,
eval_split_size=config.eval_split_size,
)
# init model
model = ForwardTTS(config, ap, tokenizer)
# INITIALIZE THE TRAINER
# Trainer provides a generic API to train all 🐸TTS models, with perks like mixed-precision training,
# distributed training, etc.
trainer = Trainer(
TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
)
# AND... 3,2,1... 🚀
trainer.fit()
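For orientation, `min_audio_len` above is measured in audio samples, so at the dataset's 22050 Hz sample rate it corresponds to roughly half a second of audio; a tiny illustration (not part of the recipe):
def samples_to_seconds(num_samples, sample_rate=22050):
    # Convert a length in audio samples to seconds at the given sample rate.
    return num_samples / sample_rate
print(samples_to_seconds(11050))  # ~0.50 s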

View File

@ -0,0 +1,108 @@
import os
from trainer import Trainer, TrainerArgs
from TTS.config.shared_configs import BaseAudioConfig
from TTS.tts.configs.shared_configs import BaseDatasetConfig
from TTS.tts.configs.tacotron2_config import Tacotron2Config
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.tacotron2 import Tacotron2
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor
from TTS.utils.downloaders import download_thorsten_de
# from TTS.tts.datasets.tokenizer import Tokenizer
output_path = os.path.dirname(os.path.abspath(__file__))
# init configs
dataset_config = BaseDatasetConfig(
name="thorsten", meta_file_train="metadata.csv", path=os.path.join(output_path, "../thorsten-de/")
)
# download dataset if not already present
if not os.path.exists(dataset_config.path):
print("Downloading dataset")
download_thorsten_de(os.path.split(os.path.abspath(dataset_config.path))[0])
audio_config = BaseAudioConfig(
sample_rate=22050,
do_trim_silence=True,
trim_db=60.0,
signal_norm=False,
mel_fmin=0.0,
mel_fmax=8000,
spec_gain=1.0,
log_func="np.log",
ref_level_db=20,
preemphasis=0.0,
)
config = Tacotron2Config(  # This is the config that is saved for future use
audio=audio_config,
batch_size=40, # BS of 40 and max length of 10s will use about 20GB of GPU memory
eval_batch_size=16,
num_loader_workers=4,
num_eval_loader_workers=4,
run_eval=True,
test_delay_epochs=-1,
r=6,
gradual_training=[[0, 6, 64], [10000, 4, 32], [50000, 3, 32], [100000, 2, 32]],
double_decoder_consistency=True,
epochs=1000,
text_cleaner="phoneme_cleaners",
use_phonemes=True,
phoneme_language="de",
phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
precompute_num_workers=8,
print_step=25,
print_eval=True,
mixed_precision=False,
test_sentences=[
"Es hat mich viel Zeit gekostet ein Stimme zu entwickeln, jetzt wo ich sie habe werde ich nicht mehr schweigen.",
"Sei eine Stimme, kein Echo.",
"Es tut mir Leid David. Das kann ich leider nicht machen.",
"Dieser Kuchen ist großartig. Er ist so lecker und feucht.",
"Vor dem 22. November 1963.",
],
# max audio length of 10 seconds; feel free to increase it if you have more than 20GB of GPU memory
max_audio_len=22050 * 10,
output_path=output_path,
datasets=[dataset_config],
)
# INITIALIZE THE AUDIO PROCESSOR
# Audio processor is used for feature extraction and audio I/O.
# It mainly serves the dataloader and the training loggers.
ap = AudioProcessor.init_from_config(config)
# INITIALIZE THE TOKENIZER
# Tokenizer is used to convert text to sequences of token IDs.
# If characters are not defined in the config, default characters are added to the config.
tokenizer, config = TTSTokenizer.init_from_config(config)
# LOAD DATA SAMPLES
# Each sample is a list of `[text, audio_file_path, speaker_name]`.
# You can define a custom sample loader that returns the list of samples,
# or define a custom formatter and pass it to `load_tts_samples`.
# Check `TTS.tts.datasets.load_tts_samples` for more details.
train_samples, eval_samples = load_tts_samples(
dataset_config,
eval_split=True,
eval_split_max_size=config.eval_split_max_size,
eval_split_size=config.eval_split_size,
)
# INITIALIZE THE MODEL
# Models take a config object and a speaker manager as input
# Config defines the details of the model like the number of layers, the size of the embedding, etc.
# Speaker manager is used by multi-speaker models.
model = Tacotron2(config, ap, tokenizer, speaker_manager=None)
# init the trainer and 🚀
trainer = Trainer(
TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
)
trainer.fit()
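The `gradual_training` schedule in the config above is a list of `[start_step, r, batch_size]` entries; the helper below is only an illustration of how such a schedule resolves to an active `(r, batch_size)` pair for a given global step, not the trainer's implementation:
def resolve_gradual_training(schedule, global_step):
    # The last entry whose start_step has been reached wins.
    r, batch_size = schedule[0][1], schedule[0][2]
    for start_step, sched_r, sched_bs in schedule:
        if global_step >= start_step:
            r, batch_size = sched_r, sched_bs
    return r, batch_size
schedule = [[0, 6, 64], [10000, 4, 32], [50000, 3, 32], [100000, 2, 32]]
print(resolve_gradual_training(schedule, 20000))  # (4, 32)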

View File

@ -0,0 +1,52 @@
import os
from trainer import Trainer, TrainerArgs
from TTS.utils.audio import AudioProcessor
from TTS.utils.downloaders import download_thorsten_de
from TTS.vocoder.configs import UnivnetConfig
from TTS.vocoder.datasets.preprocess import load_wav_data
from TTS.vocoder.models.gan import GAN
output_path = os.path.dirname(os.path.abspath(__file__))
config = UnivnetConfig(
batch_size=64,
eval_batch_size=16,
num_loader_workers=4,
num_eval_loader_workers=4,
run_eval=True,
test_delay_epochs=-1,
epochs=1000,
seq_len=8192,
pad_short=2000,
use_noise_augment=True,
eval_split_size=10,
print_step=25,
print_eval=False,
mixed_precision=False,
lr_gen=1e-4,
lr_disc=1e-4,
data_path=os.path.join(output_path, "../thorsten-de/wavs/"),
output_path=output_path,
)
# download dataset if not already present
if not os.path.exists(config.data_path):
print("Downloading dataset")
download_path = os.path.abspath(os.path.join(os.path.abspath(config.data_path), "../../"))
download_thorsten_de(download_path)
# init audio processor
ap = AudioProcessor(**config.audio.to_dict())
# load training samples
eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size)
# init model
model = GAN(config, ap)
# init the trainer and 🚀
trainer = Trainer(
TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
)
trainer.fit()

View File

@ -0,0 +1,105 @@
import os
from trainer import Trainer, TrainerArgs
from TTS.config.shared_configs import BaseAudioConfig
from TTS.tts.configs.shared_configs import BaseDatasetConfig
from TTS.tts.configs.vits_config import VitsConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.vits import Vits
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor
from TTS.utils.downloaders import download_thorsten_de
output_path = os.path.dirname(os.path.abspath(__file__))
dataset_config = BaseDatasetConfig(
name="thorsten", meta_file_train="metadata.csv", path=os.path.join(output_path, "../thorsten-de/")
)
# download dataset if not already present
if not os.path.exists(dataset_config.path):
print("Downloading dataset")
download_thorsten_de(os.path.split(os.path.abspath(dataset_config.path))[0])
audio_config = BaseAudioConfig(
sample_rate=22050,
win_length=1024,
hop_length=256,
num_mels=80,
preemphasis=0.0,
ref_level_db=20,
log_func="np.log",
do_trim_silence=True,
trim_db=45,
mel_fmin=0,
mel_fmax=None,
spec_gain=1.0,
signal_norm=False,
do_amp_to_db_linear=False,
)
config = VitsConfig(
audio=audio_config,
run_name="vits_thorsten-de",
batch_size=32,
eval_batch_size=16,
batch_group_size=5,
num_loader_workers=0,
num_eval_loader_workers=4,
run_eval=True,
test_delay_epochs=-1,
epochs=1000,
text_cleaner="phoneme_cleaners",
use_phonemes=True,
phoneme_language="de",
phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
compute_input_seq_cache=True,
print_step=25,
print_eval=True,
mixed_precision=True,
test_sentences=[
"Es hat mich viel Zeit gekostet ein Stimme zu entwickeln, jetzt wo ich sie habe werde ich nicht mehr schweigen.",
"Sei eine Stimme, kein Echo.",
"Es tut mir Leid David. Das kann ich leider nicht machen.",
"Dieser Kuchen ist großartig. Er ist so lecker und feucht.",
"Vor dem 22. November 1963.",
],
output_path=output_path,
datasets=[dataset_config],
)
# INITIALIZE THE AUDIO PROCESSOR
# Audio processor is used for feature extraction and audio I/O.
# It mainly serves the dataloader and the training loggers.
ap = AudioProcessor.init_from_config(config)
# INITIALIZE THE TOKENIZER
# Tokenizer is used to convert text to sequences of token IDs.
# config is updated with the default characters if not defined in the config.
tokenizer, config = TTSTokenizer.init_from_config(config)
# LOAD DATA SAMPLES
# Each sample is a list of `[text, audio_file_path, speaker_name]`.
# You can define a custom sample loader that returns the list of samples,
# or define a custom formatter and pass it to `load_tts_samples`.
# Check `TTS.tts.datasets.load_tts_samples` for more details.
train_samples, eval_samples = load_tts_samples(
dataset_config,
eval_split=True,
eval_split_max_size=config.eval_split_max_size,
eval_split_size=config.eval_split_size,
)
# init model
model = Vits(config, ap, tokenizer, speaker_manager=None)
# init the trainer and 🚀
trainer = Trainer(
TrainerArgs(),
config,
output_path,
model=model,
train_samples=train_samples,
eval_samples=eval_samples,
)
trainer.fit()

View File

@ -0,0 +1,56 @@
import os
from trainer import Trainer, TrainerArgs
from TTS.utils.audio import AudioProcessor
from TTS.utils.downloaders import download_thorsten_de
from TTS.vocoder.configs import WavegradConfig
from TTS.vocoder.datasets.preprocess import load_wav_data
from TTS.vocoder.models.wavegrad import Wavegrad
output_path = os.path.dirname(os.path.abspath(__file__))
config = WavegradConfig(
batch_size=32,
eval_batch_size=16,
num_loader_workers=4,
num_eval_loader_workers=4,
run_eval=True,
test_delay_epochs=-1,
epochs=1000,
seq_len=6144,
pad_short=2000,
use_noise_augment=True,
eval_split_size=50,
print_step=50,
print_eval=True,
mixed_precision=False,
data_path=os.path.join(output_path, "../thorsten-de/wavs/"),
output_path=output_path,
)
# download dataset if not already present
if not os.path.exists(config.data_path):
print("Downloading dataset")
download_path = os.path.abspath(os.path.join(os.path.abspath(config.data_path), "../../"))
download_thorsten_de(download_path)
# init audio processor
ap = AudioProcessor(**config.audio.to_dict())
# load training samples
eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size)
# init model
model = Wavegrad(config)
# init the trainer and 🚀
trainer = Trainer(
TrainerArgs(),
config,
output_path,
model=model,
train_samples=train_samples,
eval_samples=eval_samples,
training_assets={"audio_processor": ap},
)
trainer.fit()

View File

@ -0,0 +1,58 @@
import os
from trainer import Trainer, TrainerArgs
from TTS.utils.audio import AudioProcessor
from TTS.utils.downloaders import download_thorsten_de
from TTS.vocoder.configs import WavernnConfig
from TTS.vocoder.datasets.preprocess import load_wav_data
from TTS.vocoder.models.wavernn import Wavernn
output_path = os.path.dirname(os.path.abspath(__file__))
config = WavernnConfig(
batch_size=64,
eval_batch_size=16,
num_loader_workers=4,
num_eval_loader_workers=4,
run_eval=True,
test_delay_epochs=-1,
epochs=10000,
seq_len=1280,
pad_short=2000,
use_noise_augment=False,
eval_split_size=10,
print_step=25,
print_eval=True,
mixed_precision=False,
lr=1e-4,
grad_clip=4,
data_path=os.path.join(output_path, "../thorsten-de/wavs/"),
output_path=output_path,
)
# download dataset if not already present
if not os.path.exists(config.data_path):
print("Downloading dataset")
download_path = os.path.abspath(os.path.join(os.path.abspath(config.data_path), "../../"))
download_thorsten_de(download_path)
# init audio processor
ap = AudioProcessor(**config.audio.to_dict())
# load training samples
eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size)
# init model
model = Wavernn(config)
# init the trainer and 🚀
trainer = Trainer(
TrainerArgs(),
config,
output_path,
model=model,
train_samples=train_samples,
eval_samples=eval_samples,
training_assets={"audio_processor": ap},
)
trainer.fit()

View File

@ -1,5 +1,5 @@
black black
coverage coverage
isort isort
nose nose2
pylint==2.10.2 pylint==2.10.2

View File

@ -1,12 +1,12 @@
# core deps # core deps
numpy==1.19.5 numpy==1.21.6
cython cython==0.29.28
scipy>=1.4.0 scipy>=1.4.0
torch>=1.7 torch>=1.7
torchaudio torchaudio
soundfile soundfile
librosa==0.8.0 librosa==0.8.0
numba==0.53 numba==0.55.1
inflect inflect
tqdm tqdm
anyascii anyascii
@ -21,16 +21,16 @@ umap-learn==0.5.1
pandas pandas
# deps for training # deps for training
matplotlib matplotlib
tensorboardX pyworld==0.2.10 # > 0.2.10 is not p3.10.x compatible
pyworld
# coqui stack # coqui stack
trainer trainer
coqpit # config management # config management
coqpit>=0.0.16
# chinese g2p deps # chinese g2p deps
jieba jieba
pypinyin pypinyin
# japanese g2p deps # japanese g2p deps
mecab-python3==1.0.3 mecab-python3==1.0.5
unidic-lite==1.0.8 unidic-lite==1.0.8
# gruut+supported langs # gruut+supported langs
gruut[cs,de,es,fr,it,nl,pt,ru,sv]==2.2.3 gruut[cs,de,es,fr,it,nl,pt,ru,sv]==2.2.3

View File

@ -31,8 +31,8 @@ import setuptools.command.develop
from Cython.Build import cythonize from Cython.Build import cythonize
from setuptools import Extension, find_packages, setup from setuptools import Extension, find_packages, setup
if LooseVersion(sys.version) < LooseVersion("3.6") or LooseVersion(sys.version) > LooseVersion("3.10"): if LooseVersion(sys.version) < LooseVersion("3.7") or LooseVersion(sys.version) >= LooseVersion("3.11"):
raise RuntimeError("TTS requires python >= 3.6 and <=3.10 " "but your Python version is {}".format(sys.version)) raise RuntimeError("TTS requires python >= 3.7 and < 3.11 " "but your Python version is {}".format(sys.version))
cwd = os.path.dirname(os.path.abspath(__file__)) cwd = os.path.dirname(os.path.abspath(__file__))
@ -113,15 +113,15 @@ setup(
"dev": requirements_dev, "dev": requirements_dev,
"notebooks": requirements_notebooks, "notebooks": requirements_notebooks,
}, },
python_requires=">=3.6.0, <3.10", python_requires=">=3.7.0, <3.11",
entry_points={"console_scripts": ["tts=TTS.bin.synthesize:main", "tts-server = TTS.server.server:main"]}, entry_points={"console_scripts": ["tts=TTS.bin.synthesize:main", "tts-server = TTS.server.server:main"]},
classifiers=[ classifiers=[
"Programming Language :: Python", "Programming Language :: Python",
"Programming Language :: Python :: 3", "Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.6",
"Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.7",
"Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Development Status :: 3 - Alpha", "Development Status :: 3 - Alpha",
"Intended Audience :: Science/Research", "Intended Audience :: Science/Research",
"Intended Audience :: Developers", "Intended Audience :: Developers",

View File

@ -16,6 +16,7 @@ encoder_model_path = os.path.join(get_tests_input_path(), "checkpoint_0.pth")
sample_wav_path = os.path.join(get_tests_input_path(), "../data/ljspeech/wavs/LJ001-0001.wav") sample_wav_path = os.path.join(get_tests_input_path(), "../data/ljspeech/wavs/LJ001-0001.wav")
sample_wav_path2 = os.path.join(get_tests_input_path(), "../data/ljspeech/wavs/LJ001-0002.wav") sample_wav_path2 = os.path.join(get_tests_input_path(), "../data/ljspeech/wavs/LJ001-0002.wav")
d_vectors_file_path = os.path.join(get_tests_input_path(), "../data/dummy_speakers.json") d_vectors_file_path = os.path.join(get_tests_input_path(), "../data/dummy_speakers.json")
d_vectors_file_pth_path = os.path.join(get_tests_input_path(), "../data/dummy_speakers.pth")
class SpeakerManagerTest(unittest.TestCase): class SpeakerManagerTest(unittest.TestCase):
@ -58,12 +59,13 @@ class SpeakerManagerTest(unittest.TestCase):
# remove dummy model # remove dummy model
os.remove(encoder_model_path) os.remove(encoder_model_path)
@staticmethod def test_speakers_file_processing(self):
def test_speakers_file_processing():
manager = SpeakerManager(d_vectors_file_path=d_vectors_file_path) manager = SpeakerManager(d_vectors_file_path=d_vectors_file_path)
print(manager.num_speakers) self.assertEqual(manager.num_speakers, 1)
print(manager.embedding_dim) self.assertEqual(manager.embedding_dim, 256)
print(manager.clip_ids) manager = SpeakerManager(d_vectors_file_path=d_vectors_file_pth_path)
self.assertEqual(manager.num_speakers, 1)
self.assertEqual(manager.embedding_dim, 256)
d_vector = manager.get_embedding_by_clip(manager.clip_ids[0]) d_vector = manager.get_embedding_by_clip(manager.clip_ids[0])
assert len(d_vector) == 256 assert len(d_vector) == 256
d_vectors = manager.get_embeddings_by_name(manager.speaker_names[0]) d_vectors = manager.get_embeddings_by_name(manager.speaker_names[0])
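For context, the assertions above imply a d-vector file with a single speaker and 256-dimensional embeddings keyed by clip id; a rough sketch of such a layout is shown below. The key names and the speaker name are assumptions and should be checked against the actual `dummy_speakers` fixtures.
# Hedged sketch of a d-vector mapping consistent with the test's assertions.
dummy_d_vectors = {
    "LJ001-0001": {"name": "ljspeech", "embedding": [0.0] * 256},  # names are hypothetical
    "LJ001-0002": {"name": "ljspeech", "embedding": [0.1] * 256},
}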

Binary file not shown.

View File

@ -6,7 +6,7 @@ import numpy as np
import torch import torch
from torch.utils.data import DataLoader from torch.utils.data import DataLoader
from tests import get_tests_output_path from tests import get_tests_data_path, get_tests_output_path
from TTS.tts.configs.shared_configs import BaseDatasetConfig, BaseTTSConfig from TTS.tts.configs.shared_configs import BaseDatasetConfig, BaseTTSConfig
from TTS.tts.datasets import TTSDataset, load_tts_samples from TTS.tts.datasets import TTSDataset, load_tts_samples
from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.tts.utils.text.tokenizer import TTSTokenizer
@ -20,7 +20,7 @@ os.makedirs(OUTPATH, exist_ok=True)
# create a dummy config for testing data loaders. # create a dummy config for testing data loaders.
c = BaseTTSConfig(text_cleaner="english_cleaners", num_loader_workers=0, batch_size=2, use_noise_augment=False) c = BaseTTSConfig(text_cleaner="english_cleaners", num_loader_workers=0, batch_size=2, use_noise_augment=False)
c.r = 5 c.r = 5
c.data_path = "tests/data/ljspeech/" c.data_path = os.path.join(get_tests_data_path(), "ljspeech/")
ok_ljspeech = os.path.exists(c.data_path) ok_ljspeech = os.path.exists(c.data_path)
dataset_config = BaseDatasetConfig( dataset_config = BaseDatasetConfig(

View File

@ -1,4 +1,5 @@
import functools import functools
import random
import unittest import unittest
import torch import torch
@ -6,6 +7,7 @@ import torch
from TTS.config.shared_configs import BaseDatasetConfig from TTS.config.shared_configs import BaseDatasetConfig
from TTS.encoder.utils.samplers import PerfectBatchSampler from TTS.encoder.utils.samplers import PerfectBatchSampler
from TTS.tts.datasets import load_tts_samples from TTS.tts.datasets import load_tts_samples
from TTS.tts.utils.data import get_length_balancer_weights
from TTS.tts.utils.languages import get_language_balancer_weights from TTS.tts.utils.languages import get_language_balancer_weights
from TTS.tts.utils.speakers import get_speaker_balancer_weights from TTS.tts.utils.speakers import get_speaker_balancer_weights
@ -136,3 +138,28 @@ class TestSamplers(unittest.TestCase):
else: else:
spk2 += 1 spk2 += 1
assert spk1 == spk2, "PerfectBatchSampler is supposed to be perfectly balanced" assert spk1 == spk2, "PerfectBatchSampler is supposed to be perfectly balanced"
def test_length_weighted_random_sampler(self): # pylint: disable=no-self-use
for _ in range(1000):
# generate a length-unbalanced dataset with random max/min audio lengths
min_audio = random.randrange(1, 22050)
max_audio = random.randrange(44100, 220500)
for idx, item in enumerate(train_samples):
# increase the diversity of durations
random_increase = random.randrange(100, 1000)
if idx < 5:
item["audio_length"] = min_audio + random_increase
else:
item["audio_length"] = max_audio + random_increase
weighted_sampler = torch.utils.data.sampler.WeightedRandomSampler(
get_length_balancer_weights(train_samples, num_buckets=2), len(train_samples)
)
ids = functools.reduce(lambda a, b: a + b, [list(weighted_sampler) for i in range(100)])
len1, len2 = 0, 0
for index in ids:
if train_samples[index]["audio_length"] < max_audio:
len1 += 1
else:
len2 += 1
assert is_balanced(len1, len2), "Length Weighted sampler is supposed to be balanced"
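Conceptually, `get_length_balancer_weights` buckets samples by `audio_length` and weights each sample by the inverse frequency of its bucket, so the `WeightedRandomSampler` draws short and long clips about equally often. The function below is a rough sketch of that idea, not the library implementation:
import numpy as np
def length_balancer_weights_sketch(samples, num_buckets=2):
    # Assign each sample to a length bucket and weight it by 1 / bucket frequency.
    lengths = np.array([s["audio_length"] for s in samples], dtype=np.float64)
    edges = np.linspace(lengths.min(), lengths.max(), num_buckets + 1)
    bucket_ids = np.clip(np.digitize(lengths, edges[1:-1]), 0, num_buckets - 1)
    counts = np.bincount(bucket_ids, minlength=num_buckets)
    return 1.0 / counts[bucket_ids]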

View File

@ -6,7 +6,7 @@ import torch
from torch import nn, optim from torch import nn, optim
from tests import get_tests_input_path from tests import get_tests_input_path
from TTS.tts.configs.shared_configs import GSTConfig from TTS.tts.configs.shared_configs import CapacitronVAEConfig, GSTConfig
from TTS.tts.configs.tacotron2_config import Tacotron2Config from TTS.tts.configs.tacotron2_config import Tacotron2Config
from TTS.tts.layers.losses import MSELossMasked from TTS.tts.layers.losses import MSELossMasked
from TTS.tts.models.tacotron2 import Tacotron2 from TTS.tts.models.tacotron2 import Tacotron2
@ -260,6 +260,73 @@ class TacotronGSTTrainTest(unittest.TestCase):
count += 1 count += 1
class TacotronCapacitronTrainTest(unittest.TestCase):
@staticmethod
def test_train_step():
config = Tacotron2Config(
num_chars=32,
num_speakers=10,
use_speaker_embedding=True,
out_channels=80,
decoder_output_dim=80,
use_capacitron_vae=True,
capacitron_vae=CapacitronVAEConfig(),
optimizer="CapacitronOptimizer",
optimizer_params={
"RAdam": {"betas": [0.9, 0.998], "weight_decay": 1e-6},
"SGD": {"lr": 1e-5, "momentum": 0.9},
},
)
batch = dict({})
batch["text_input"] = torch.randint(0, 24, (8, 128)).long().to(device)
batch["text_lengths"] = torch.randint(100, 129, (8,)).long().to(device)
batch["text_lengths"] = torch.sort(batch["text_lengths"], descending=True)[0]
batch["text_lengths"][0] = 128
batch["mel_input"] = torch.rand(8, 120, config.audio["num_mels"]).to(device)
batch["mel_lengths"] = torch.randint(20, 120, (8,)).long().to(device)
batch["mel_lengths"] = torch.sort(batch["mel_lengths"], descending=True)[0]
batch["mel_lengths"][0] = 120
batch["stop_targets"] = torch.zeros(8, 120, 1).float().to(device)
batch["stop_target_lengths"] = torch.randint(0, 120, (8,)).to(device)
batch["speaker_ids"] = torch.randint(0, 5, (8,)).long().to(device)
batch["d_vectors"] = None
for idx in batch["mel_lengths"]:
batch["stop_targets"][:, int(idx.item()) :, 0] = 1.0
batch["stop_targets"] = batch["stop_targets"].view(
batch["text_input"].shape[0], batch["stop_targets"].size(1) // config.r, -1
)
batch["stop_targets"] = (batch["stop_targets"].sum(2) > 0.0).unsqueeze(2).float().squeeze()
model = Tacotron2(config).to(device)
criterion = model.get_criterion()
optimizer = model.get_optimizer()
model.train()
model_ref = copy.deepcopy(model)
count = 0
for param, param_ref in zip(model.parameters(), model_ref.parameters()):
assert (param - param_ref).sum() == 0, param
count += 1
for _ in range(10):
_, loss_dict = model.train_step(batch, criterion)
optimizer.zero_grad()
loss_dict["capacitron_vae_beta_loss"].backward()
optimizer.first_step()
loss_dict["loss"].backward()
optimizer.step()
# check parameter changes
count = 0
for param, param_ref in zip(model.parameters(), model_ref.parameters()):
# ignore the pre-highway layer since it is applied conditionally
assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format(
count, param.shape, param, param_ref
)
count += 1
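The loop above relies on `CapacitronOptimizer` exposing a two-phase update: `first_step()` after backpropagating the beta loss and `step()` after the main loss. The class below only sketches that interface over two plain `torch.optim` optimizers to make the test's call order readable; it is not the library's implementation:
class TwoPhaseOptimizerSketch:
    def __init__(self, primary, secondary):
        self.primary = primary      # e.g. RAdam over the model weights
        self.secondary = secondary  # e.g. SGD over the Capacitron beta parameter
    def zero_grad(self):
        self.primary.zero_grad()
        self.secondary.zero_grad()
    def first_step(self):
        # Advance only the secondary group after the beta loss has been backpropagated.
        self.secondary.step()
        self.secondary.zero_grad()
    def step(self):
        # Advance the primary group after the main loss has been backpropagated.
        self.primary.step()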
class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase): class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase):
"""Test multi-speaker Tacotron2 with Global Style Tokens and d-vector inputs.""" """Test multi-speaker Tacotron2 with Global Style Tokens and d-vector inputs."""

View File

@ -6,7 +6,7 @@ import torch
from torch import nn, optim from torch import nn, optim
from tests import get_tests_input_path from tests import get_tests_input_path
from TTS.tts.configs.shared_configs import GSTConfig from TTS.tts.configs.shared_configs import CapacitronVAEConfig, GSTConfig
from TTS.tts.configs.tacotron_config import TacotronConfig from TTS.tts.configs.tacotron_config import TacotronConfig
from TTS.tts.layers.losses import L1LossMasked from TTS.tts.layers.losses import L1LossMasked
from TTS.tts.models.tacotron import Tacotron from TTS.tts.models.tacotron import Tacotron
@ -248,6 +248,74 @@ class TacotronGSTTrainTest(unittest.TestCase):
count += 1 count += 1
class TacotronCapacitronTrainTest(unittest.TestCase):
@staticmethod
def test_train_step():
config = TacotronConfig(
num_chars=32,
num_speakers=10,
use_speaker_embedding=True,
out_channels=513,
decoder_output_dim=80,
use_capacitron_vae=True,
capacitron_vae=CapacitronVAEConfig(),
optimizer="CapacitronOptimizer",
optimizer_params={
"RAdam": {"betas": [0.9, 0.998], "weight_decay": 1e-6},
"SGD": {"lr": 1e-5, "momentum": 0.9},
},
)
batch = dict({})
batch["text_input"] = torch.randint(0, 24, (8, 128)).long().to(device)
batch["text_lengths"] = torch.randint(100, 129, (8,)).long().to(device)
batch["text_lengths"] = torch.sort(batch["text_lengths"], descending=True)[0]
batch["text_lengths"][0] = 128
batch["linear_input"] = torch.rand(8, 120, config.audio["fft_size"] // 2 + 1).to(device)
batch["mel_input"] = torch.rand(8, 120, config.audio["num_mels"]).to(device)
batch["mel_lengths"] = torch.randint(20, 120, (8,)).long().to(device)
batch["mel_lengths"] = torch.sort(batch["mel_lengths"], descending=True)[0]
batch["mel_lengths"][0] = 120
batch["stop_targets"] = torch.zeros(8, 120, 1).float().to(device)
batch["stop_target_lengths"] = torch.randint(0, 120, (8,)).to(device)
batch["speaker_ids"] = torch.randint(0, 5, (8,)).long().to(device)
batch["d_vectors"] = None
for idx in batch["mel_lengths"]:
batch["stop_targets"][:, int(idx.item()) :, 0] = 1.0
batch["stop_targets"] = batch["stop_targets"].view(
batch["text_input"].shape[0], batch["stop_targets"].size(1) // config.r, -1
)
batch["stop_targets"] = (batch["stop_targets"].sum(2) > 0.0).unsqueeze(2).float().squeeze()
model = Tacotron(config).to(device)
criterion = model.get_criterion()
optimizer = model.get_optimizer()
model.train()
print(" > Num parameters for Tacotron with Capacitron VAE model:%s" % (count_parameters(model)))
model_ref = copy.deepcopy(model)
count = 0
for param, param_ref in zip(model.parameters(), model_ref.parameters()):
assert (param - param_ref).sum() == 0, param
count += 1
for _ in range(10):
_, loss_dict = model.train_step(batch, criterion)
optimizer.zero_grad()
loss_dict["capacitron_vae_beta_loss"].backward()
optimizer.first_step()
loss_dict["loss"].backward()
optimizer.step()
# check parameter changes
count = 0
for param, param_ref in zip(model.parameters(), model_ref.parameters()):
# ignore the pre-highway layer since it is applied conditionally
assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format(
count, param.shape, param, param_ref
)
count += 1
class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase): class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase):
@staticmethod @staticmethod
def test_train_step(): def test_train_step():

View File

@ -122,7 +122,7 @@ class TestVits(unittest.TestCase):
args = VitsArgs(num_speakers=num_speakers, use_speaker_embedding=True) args = VitsArgs(num_speakers=num_speakers, use_speaker_embedding=True)
model = Vits(args) model = Vits(args)
ref_inp = torch.randn(1, spec_len, 513) ref_inp = torch.randn(1, 513, spec_len)
ref_inp_len = torch.randint(1, spec_effective_len, (1,)) ref_inp_len = torch.randint(1, spec_effective_len, (1,))
ref_spk_id = torch.randint(1, num_speakers, (1,)) ref_spk_id = torch.randint(1, num_speakers, (1,))
tgt_spk_id = torch.randint(1, num_speakers, (1,)) tgt_spk_id = torch.randint(1, num_speakers, (1,))
@ -420,6 +420,76 @@ class TestVits(unittest.TestCase):
# check parameter changes # check parameter changes
self._check_parameter_changes(model, model_ref) self._check_parameter_changes(model, model_ref)
def test_train_step_upsampling(self):
# setup the model
with torch.autograd.set_detect_anomaly(True):
model_args = VitsArgs(
num_chars=32,
spec_segment_size=10,
encoder_sample_rate=11025,
interpolate_z=False,
upsample_rates_decoder=[8, 8, 4, 2],
)
config = VitsConfig(model_args=model_args)
model = Vits(config).to(device)
model.train()
# model to train
optimizers = model.get_optimizer()
criterions = model.get_criterion()
criterions = [criterions[0].to(device), criterions[1].to(device)]
# reference model to compare model weights
model_ref = Vits(config).to(device)
# pass the state to the ref model
model_ref.load_state_dict(copy.deepcopy(model.state_dict()))
count = 0
for param, param_ref in zip(model.parameters(), model_ref.parameters()):
assert (param - param_ref).sum() == 0, param
count = count + 1
for _ in range(5):
batch = self._create_batch(config, 2)
for idx in [0, 1]:
outputs, loss_dict = model.train_step(batch, criterions, idx)
self.assertFalse(not outputs)
self.assertFalse(not loss_dict)
loss_dict["loss"].backward()
optimizers[idx].step()
optimizers[idx].zero_grad()
# check parameter changes
self._check_parameter_changes(model, model_ref)
def test_train_step_upsampling_interpolation(self):
# setup the model
with torch.autograd.set_detect_anomaly(True):
model_args = VitsArgs(num_chars=32, spec_segment_size=10, encoder_sample_rate=11025, interpolate_z=True)
config = VitsConfig(model_args=model_args)
model = Vits(config).to(device)
model.train()
# model to train
optimizers = model.get_optimizer()
criterions = model.get_criterion()
criterions = [criterions[0].to(device), criterions[1].to(device)]
# reference model to compare model weights
model_ref = Vits(config).to(device)
# pass the state to the ref model
model_ref.load_state_dict(copy.deepcopy(model.state_dict()))
count = 0
for param, param_ref in zip(model.parameters(), model_ref.parameters()):
assert (param - param_ref).sum() == 0, param
count = count + 1
for _ in range(5):
batch = self._create_batch(config, 2)
for idx in [0, 1]:
outputs, loss_dict = model.train_step(batch, criterions, idx)
self.assertFalse(not outputs)
self.assertFalse(not loss_dict)
loss_dict["loss"].backward()
optimizers[idx].step()
optimizers[idx].zero_grad()
# check parameter changes
self._check_parameter_changes(model, model_ref)
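A note on the numbers in these two new tests: the decoder's total upsampling factor is the product of `upsample_rates_decoder`. Assuming the default factors are `[8, 8, 2, 2]` (a 256x hop), running the encoder at 11025 Hz against 22050 Hz output with `interpolate_z=False` means the decoder must also do the 2x upsampling, hence `[8, 8, 4, 2]`; with `interpolate_z=True` the latent is interpolated instead and the factors are left at their defaults.
# Hedged arithmetic behind the two upsampling tests (default factors are assumed).
default_factors = [8, 8, 2, 2]      # assumed default: 256x total upsampling
upsampling_factors = [8, 8, 4, 2]   # used when the decoder also covers the 2x rate gap
def product(factors):
    total = 1
    for factor in factors:
        total *= factor
    return total
assert product(upsampling_factors) == 2 * product(default_factors)  # 512 == 2 * 256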
def test_train_eval_log(self): def test_train_eval_log(self):
batch_size = 2 batch_size = 2
config = VitsConfig(model_args=VitsArgs(num_chars=32, spec_segment_size=10)) config = VitsConfig(model_args=VitsArgs(num_chars=32, spec_segment_size=10))

View File

@ -3,7 +3,7 @@ import glob
import os import os
import shutil import shutil
from tests import get_tests_output_path, run_cli from tests import get_tests_data_path, get_tests_output_path, run_cli
from TTS.tts.utils.languages import LanguageManager from TTS.tts.utils.languages import LanguageManager
from TTS.tts.utils.speakers import SpeakerManager from TTS.tts.utils.speakers import SpeakerManager
from TTS.utils.generic_utils import get_user_data_dir from TTS.utils.generic_utils import get_user_data_dir
@ -56,3 +56,16 @@ def test_run_all_models():
folders = glob.glob(os.path.join(manager.output_prefix, "*")) folders = glob.glob(os.path.join(manager.output_prefix, "*"))
assert len(folders) == len(model_names) assert len(folders) == len(model_names)
shutil.rmtree(manager.output_prefix) shutil.rmtree(manager.output_prefix)
def test_voice_conversion():
print(" > Run voice conversion inference using YourTTS model.")
model_name = "tts_models/multilingual/multi-dataset/your_tts"
language_id = "en"
speaker_wav = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav")
reference_wav = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0032.wav")
output_path = os.path.join(get_tests_output_path(), "output.wav")
run_cli(
f"tts --model_name {model_name}"
f" --out_path {output_path} --speaker_wav {speaker_wav} --reference_wav {reference_wav} --language_idx {language_id} "
)