merge local changes and official repo update 0.6.2

jmaty 2022-05-05 14:22:45 +02:00
commit 1aa05feeb4
134 changed files with 2768 additions and 4048 deletions

View File

@ -1 +1,2 @@
.git/
.git/
Dockerfile

View File

@ -1,58 +0,0 @@
---
name: 🐛 Bug report
about: Create a bug report to help 🐸 improve
title: '[Bug] '
labels: bug
assignees: ''
---
<!-- Welcome to the 🐸TTS!
We are excited to see your interest, and appreciate your support! --->
## 🐛 Description
<!-- A clear and concise description of what the bug is. -->
### To Reproduce
<!--
Please share your code to reproduce the error. Issues are fixed faster if you can provide a working example.
The best place to share code is Colab (https://colab.research.google.com/),
so we can run your code directly and reproduce the issue.
In the worst case, provide steps to reproduce the behaviour.
1. Run the following command '...'
2. ...
3. See error
-->
### Expected behavior
<!-- Write down the expected behaviour. -->
### Environment
<!--
You can either run `TTS/bin/collect_env_info.py`
```bash
wget https://raw.githubusercontent.com/coqui-ai/TTS/main/TTS/bin/collect_env_info.py
python collect_env_info.py
```
or fill in the fields below manually.
-->
- 🐸TTS Version (e.g., 1.3.0):
- PyTorch Version (e.g., 1.8)
- Python version:
- OS (e.g., Linux):
- CUDA/cuDNN version:
- GPU models and configuration:
- How you installed PyTorch (`conda`, `pip`, source):
- Any other relevant information:
### Additional context
<!-- Add any other context about the problem here. -->

85
.github/ISSUE_TEMPLATE/bug_report.yaml vendored Normal file
View File

@ -0,0 +1,85 @@
name: "🐛 Bug report"
description: Create a bug report to help 🐸 improve
title: '[Bug] '
labels: [ "bug" ]
body:
- type: markdown
attributes:
value: |
Welcome to the 🐸TTS! Thanks for taking the time to fill out this bug report!
- type: textarea
id: bug-description
attributes:
label: Describe the bug
description: A clear and concise description of what the bug is. If you intend to submit a PR for this issue, tell us in the description. Thanks!
placeholder: Bug description
validations:
required: true
- type: textarea
id: reproduction
attributes:
label: To Reproduce
description: |
Please share your code to reproduce the error.
Issues are fixed faster if you can provide a working example.
The best place to share code is Colab (https://colab.research.google.com/),
so we can run your code directly and reproduce the issue.
In the worst case, provide steps to reproduce the behavior.
1. Run the following command '...'
2. ...
3. See error
placeholder: Reproduction
validations:
required: true
- type: textarea
id: expected-behavior
attributes:
label: Expected behavior
description: "Write down what the expected behaviour"
- type: textarea
id: logs
attributes:
label: Logs
description: "Please include the relevant logs if you can."
render: shell
- type: textarea
id: system-info
attributes:
label: Environment
description: |
You can either run `TTS/bin/collect_env_info.py`
```bash
wget https://raw.githubusercontent.com/coqui-ai/TTS/main/TTS/bin/collect_env_info.py
python collect_env_info.py
```
or fill in the fields below manually.
render: shell
placeholder: |
- 🐸TTS Version (e.g., 1.3.0):
- PyTorch Version (e.g., 1.8)
- Python version:
- OS (e.g., Linux):
- CUDA/cuDNN version:
- GPU models and configuration:
- How you installed PyTorch (`conda`, `pip`, source):
- Any other relevant information:
validations:
required: true
- type: textarea
id: context
attributes:
label: Additional context
description: Add any other context about the problem here.
validations:
required: false

56
.github/workflows/docker.yaml vendored Normal file
View File

@ -0,0 +1,56 @@
name: "Docker build and push"
on:
pull_request:
push:
branches:
- main
- dev
tags:
- v*
jobs:
docker-build:
name: "Build and push Docker image"
runs-on: ubuntu-20.04
strategy:
matrix:
arch: ["amd64"]
steps:
- uses: actions/checkout@v2
- name: Log in to the Container registry
uses: docker/login-action@v1
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Compute Docker tags, check VERSION file matches tag
id: compute-tag
run: |
set -ex
base="ghcr.io/coqui-ai/tts"
tags="" # PR build
if [[ "${{ startsWith(github.ref, 'refs/heads/') }}" = "true" ]]; then
# Push to branch
github_ref="${{ github.ref }}"
branch=${github_ref#*refs/heads/} # strip prefix to get branch name
tags="${base}:${branch},${base}:${{ github.sha }},"
elif [[ "${{ startsWith(github.ref, 'refs/tags/') }}" = "true" ]]; then
VERSION="v$(cat TTS/VERSION)"
if [[ "${{ github.ref }}" != "refs/tags/${VERSION}" ]]; then
echo "Pushed tag does not match VERSION file. Aborting push."
exit 1
fi
tags="${base}:${VERSION},${base}:latest,${base}:${{ github.sha }}"
fi
echo "::set-output name=tags::${tags}"
- name: Set up QEMU
uses: docker/setup-qemu-action@v1
- name: Set up Docker Buildx
id: buildx
uses: docker/setup-buildx-action@v1
- name: Build and push
uses: docker/build-push-action@v2
with:
context: .
platforms: linux/${{ matrix.arch }}
push: ${{ github.event_name == 'push' }}
tags: ${{ steps.compute-tag.outputs.tags }}
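The tag-computation step above runs as plain shell inside the workflow; as a rough, illustrative Python sketch (the function name and inputs are not part of the commit), the branching works like this:
```python
def compute_tags(github_ref: str, sha: str, version_file: str) -> str:
    """Mirror the shell step: branch pushes get branch+sha tags; tag pushes
    must match TTS/VERSION and additionally get the 'latest' tag."""
    base = "ghcr.io/coqui-ai/tts"
    if github_ref.startswith("refs/heads/"):
        branch = github_ref[len("refs/heads/"):]
        return f"{base}:{branch},{base}:{sha},"
    if github_ref.startswith("refs/tags/"):
        version = "v" + version_file.strip()
        if github_ref != f"refs/tags/{version}":
            raise SystemExit("Pushed tag does not match VERSION file. Aborting push.")
        return f"{base}:{version},{base}:latest,{base}:{sha}"
    return ""  # pull_request builds keep an empty tag list and are never pushed


print(compute_tags("refs/tags/v0.6.2", "1aa05feeb4", "0.6.2"))
```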

View File

@ -1,4 +1,4 @@
name: tts-tests
name: text-tests
on:
push:

1
.gitignore vendored
View File

@ -115,6 +115,7 @@ venv.bak/
*.swo
# pytorch models
*.pth
*.pth.tar
result/

20
CITATION.cff Normal file
View File

@ -0,0 +1,20 @@
cff-version: 1.2.0
message: "If you want to cite 🐸💬, feel free to use this (but only if you loved it 😊)"
title: "Coqui TTS"
abstract: "A deep learning toolkit for Text-to-Speech, battle-tested in research and production"
date-released: 2021-01-01
authors:
- family-names: "Eren"
given-names: "Gölge"
- name: "The Coqui TTS Team"
version: 1.4
doi: 10.5281/zenodo.6334862
license: "MPL-2.0"
url: "https://www.coqui.ai"
repository-code: "https://github.com/coqui-ai/TTS"
keywords:
- machine learning
- deep learning
- artificial intelligence
- text to speech
- TTS

View File

@ -26,7 +26,8 @@ If you like to contribute code, squash a bug but if you don't know where to star
We list all the target improvements for the next version. You can pick one of them and start contributing.
- Also feel free to suggest new features, ideas and models. We're always open for new things.
#####Call for sharing language models
## Call for sharing language models
If possible, please consider sharing your pre-trained models in any language (if the licences allow you to do so). We will include them in our model catalogue for public use and give proper attribution, whether it be your name, company, website or any other source specified.
This model can be shared in two ways:
@ -36,6 +37,7 @@ This model can be shared in two ways:
Models are served through the `.models.json` file, and any model is available via the TTS CLI or the server endpoints.
Either way you choose, please make sure you send the models [here](https://github.com/coqui-ai/TTS/issues/380).
## Sending a ✨**PR**✨
If you have a new feature, a model to implement, or a bug to squash, go ahead and send a ✨**PR**✨.

11
Dockerfile Normal file
View File

@ -0,0 +1,11 @@
FROM nvcr.io/nvidia/pytorch:22.03-py3
RUN apt-get update && apt-get install -y --no-install-recommends espeak && rm -rf /var/lib/apt/lists/*
WORKDIR /root
COPY requirements.txt /root
COPY requirements.dev.txt /root
COPY requirements.notebooks.txt /root
RUN pip install -r <(cat requirements.txt requirements.dev.txt requirements.notebooks.txt)
COPY . /root
RUN make install
ENTRYPOINT ["tts"]
CMD ["--help"]

View File

@ -1,6 +1,7 @@
include README.md
include LICENSE.txt
include requirements.*.txt
include *.cff
include requirements.txt
include TTS/VERSION
recursive-include TTS *.json

View File

@ -44,6 +44,8 @@ style: ## update code style.
lint: ## run pylint linter.
pylint ${target_dirs}
black ${target_dirs} --check
isort ${target_dirs} --check-only
system-deps: ## install linux system deps
sudo apt-get install -y libsndfile1-dev

View File

@ -159,13 +159,13 @@ If you are on Windows, 👑@GuyPaddock wrote installation instructions [here](ht
- Run your own TTS model (Using Griffin-Lim Vocoder):
```
$ tts --text "Text for TTS" --model_path path/to/model.pth.tar --config_path path/to/config.json --out_path output/path/speech.wav
$ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav
```
- Run your own TTS and Vocoder models:
```
$ tts --text "Text for TTS" --model_path path/to/config.json --config_path path/to/model.pth.tar --out_path output/path/speech.wav
--vocoder_path path/to/vocoder.pth.tar --vocoder_config_path path/to/vocoder_config.json
$ tts --text "Text for TTS" --model_path path/to/config.json --config_path path/to/model.pth --out_path output/path/speech.wav
--vocoder_path path/to/vocoder.pth --vocoder_config_path path/to/vocoder_config.json
```
### Multi-speaker Models
@ -185,7 +185,7 @@ If you are on Windows, 👑@GuyPaddock wrote installation instructions [here](ht
- Run your own multi-speaker TTS model:
```
$ tts --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/config.json --config_path path/to/model.pth.tar --speakers_file_path path/to/speaker.json --speaker_idx <speaker_id>
$ tts --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/config.json --config_path path/to/model.pth --speakers_file_path path/to/speaker.json --speaker_idx <speaker_id>
```
## Directory Structure

View File

@ -4,7 +4,7 @@
"multi-dataset":{
"your_tts":{
"description": "Your TTS model accompanying the paper https://arxiv.org/abs/2112.02418",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--multilingual--multi-dataset--your_tts.zip",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--multilingual--multi-dataset--your_tts.zip",
"default_vocoder": null,
"commit": "e9a1953e",
"license": "CC BY-NC-ND 4.0",
@ -16,33 +16,34 @@
"ek1": {
"tacotron2": {
"description": "EK1 en-rp tacotron2 by NMStoker",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.1.0/tts_models--en--ek1--tacotron2.zip",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ek1--tacotron2.zip",
"default_vocoder": "vocoder_models/en/ek1/wavegrad",
"commit": "c802255"
"commit": "c802255",
"license": "apache 2.0"
}
},
"ljspeech": {
"tacotron2-DDC": {
"description": "Tacotron2 with Double Decoder Consistency.",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.12/tts_models--en--ljspeech--tacotron2-DDC.zip",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--tacotron2-DDC.zip",
"default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
"commit": "bae2ad0f",
"author": "Eren Gölge @erogol",
"license": "",
"license": "apache 2.0",
"contact": "egolge@coqui.com"
},
"tacotron2-DDC_ph": {
"description": "Tacotron2 with Double Decoder Consistency with phonemes.",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--en--ljspeech--tacotron2-DDC_ph.zip",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--tacotron2-DDC_ph.zip",
"default_vocoder": "vocoder_models/en/ljspeech/univnet",
"commit": "3900448",
"author": "Eren Gölge @erogol",
"license": "",
"license": "apache 2.0",
"contact": "egolge@coqui.com"
},
"glow-tts": {
"description": "",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.9/tts_models--en--ljspeech--glow-tts.zip",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--glow-tts.zip",
"stats_file": null,
"default_vocoder": "vocoder_models/en/ljspeech/multiband-melgan",
"commit": "",
@ -52,17 +53,17 @@
},
"speedy-speech": {
"description": "Speedy Speech model trained on LJSpeech dataset using the Alignment Network for learning the durations.",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.3.0/tts_models--en--ljspeech--speedy_speech.zip",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--speedy-speech.zip",
"stats_file": null,
"default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
"commit": "4581e3d",
"author": "Eren Gölge @erogol",
"license": "TBD",
"license": "apache 2.0",
"contact": "egolge@coqui.com"
},
"tacotron2-DCA": {
"description": "",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.9/tts_models--en--ljspeech--tacotron2-DCA.zip",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--tacotron2-DCA.zip",
"default_vocoder": "vocoder_models/en/ljspeech/multiband-melgan",
"commit": "",
"author": "Eren Gölge @erogol",
@ -71,36 +72,36 @@
},
"vits": {
"description": "VITS is an End2End TTS model trained on LJSpeech dataset with phonemes.",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--en--ljspeech--vits.zip",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--vits.zip",
"default_vocoder": null,
"commit": "3900448",
"author": "Eren Gölge @erogol",
"license": "TBD",
"license": "apache 2.0",
"contact": "egolge@coqui.com"
},
"fast_pitch": {
"description": "FastPitch model trained on LJSpeech using the Aligner Network",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.2.2/tts_models--en--ljspeech--fast_pitch.zip",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--fast_pitch.zip",
"default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
"commit": "b27b3ba",
"author": "Eren Gölge @erogol",
"license": "TBD",
"license": "apache 2.0",
"contact": "egolge@coqui.com"
}
},
"vctk": {
"vits": {
"description": "VITS End2End TTS model trained on VCTK dataset with 109 different speakers with EN accent.",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--en--vctk--vits.zip",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--vctk--vits.zip",
"default_vocoder": null,
"commit": "3900448",
"author": "Eren @erogol",
"license": "",
"license": "apache 2.0",
"contact": "egolge@coqui.ai"
},
"fast_pitch":{
"description": "FastPitch model trained on VCTK dataseset.",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--en--vctk--fast_pitch.zip",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--vctk--fast_pitch.zip",
"default_vocoder": null,
"commit": "bdab788d",
"author": "Eren @erogol",
@ -111,11 +112,11 @@
"sam": {
"tacotron-DDC": {
"description": "Tacotron2 with Double Decoder Consistency trained with Aceenture's Sam dataset.",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.13/tts_models--en--sam--tacotron_DDC.zip",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--sam--tacotron-DDC.zip",
"default_vocoder": "vocoder_models/en/sam/hifigan_v2",
"commit": "bae2ad0f",
"author": "Eren Gölge @erogol",
"license": "",
"license": "apache 2.0",
"contact": "egolge@coqui.com"
}
}
@ -123,7 +124,7 @@
"es": {
"mai": {
"tacotron2-DDC": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.9/tts_models--es--mai--tacotron2-DDC.zip",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--es--mai--tacotron2-DDC.zip",
"default_vocoder": "vocoder_models/universal/libri-tts/fullband-melgan",
"commit": "",
"author": "Eren Gölge @erogol",
@ -135,7 +136,7 @@
"fr": {
"mai": {
"tacotron2-DDC": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.9/tts_models--fr--mai--tacotron2-DDC.zip",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--fr--mai--tacotron2-DDC.zip",
"default_vocoder": "vocoder_models/universal/libri-tts/fullband-melgan",
"commit": "",
"author": "Eren Gölge @erogol",
@ -147,7 +148,7 @@
"uk":{
"mai": {
"glow-tts": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--uk--mai--glow-tts.zip",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--uk--mai--glow-tts.zip",
"author":"@robinhad",
"commit": "bdab788d",
"license": "MIT",
@ -159,9 +160,10 @@
"zh-CN": {
"baker": {
"tacotron2-DDC-GST": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--zh-CN--baker--tacotron2-DDC-GST.zip",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--zh-CN--baker--tacotron2-DDC-GST.zip",
"commit": "unknown",
"author": "@kirianguiller",
"license": "apache 2.0",
"default_vocoder": null
}
}
@ -169,8 +171,9 @@
"nl": {
"mai": {
"tacotron2-DDC": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.10/tts_models--nl--mai--tacotron2-DDC.zip",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--nl--mai--tacotron2-DDC.zip",
"author": "@r-dh",
"license": "apache 2.0",
"default_vocoder": "vocoder_models/nl/mai/parallel-wavegan",
"stats_file": null,
"commit": "540d811"
@ -180,9 +183,10 @@
"de": {
"thorsten": {
"tacotron2-DCA": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.11/tts_models--de--thorsten--tacotron2-DCA.zip",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--de--thorsten--tacotron2-DCA.zip",
"default_vocoder": "vocoder_models/de/thorsten/fullband-melgan",
"author": "@thorstenMueller",
"license": "apache 2.0",
"commit": "unknown"
}
}
@ -190,10 +194,11 @@
"ja": {
"kokoro": {
"tacotron2-DDC": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.15/tts_models--jp--kokoro--tacotron2-DDC.zip",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--ja--kokoro--tacotron2-DDC.zip",
"default_vocoder": "vocoder_models/ja/kokoro/hifigan_v1",
"description": "Tacotron2 with Double Decoder Consistency trained with Kokoro Speech Dataset.",
"author": "@kaiidams",
"license": "apache 2.0",
"commit": "401fbd89"
}
}
@ -201,7 +206,7 @@
"tr":{
"common-voice": {
"glow-tts":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--tr--common-voice--glow-tts.zip",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--tr--common-voice--glow-tts.zip",
"default_vocoder": "vocoder_models/tr/common-voice/hifigan",
"license": "MIT",
"description": "Turkish GlowTTS model using an unknown speaker from the Common-Voice dataset.",
@ -213,50 +218,126 @@
"it": {
"mai_female": {
"glow-tts":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--it--mai_female--glow-tts.zip",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_female--glow-tts.zip",
"default_vocoder": null,
"description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
"author": "@nicolalandro",
"license": "apache 2.0",
"commit": null
},
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--it--mai_female--vits.zip",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_female--vits.zip",
"default_vocoder": null,
"description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
"author": "@nicolalandro",
"license": "apache 2.0",
"commit": null
}
},
"mai_male": {
"glow-tts":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--it--mai_male--glow-tts.zip",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_male--glow-tts.zip",
"default_vocoder": null,
"description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
"author": "@nicolalandro",
"license": "apache 2.0",
"commit": null
},
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--it--mai_male--vits.zip",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_male--vits.zip",
"default_vocoder": null,
"description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
"author": "@nicolalandro",
"license": "apache 2.0",
"commit": null
}
}
},
"ewe": {
"openbible": {
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--ewe--openbible--vits.zip",
"default_vocoder": null,
"license": "CC-BY-SA 4.0",
"description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
"author": "@coqui_ai",
"commit": "1b22f03"
}
}
},
"hau": {
"openbible": {
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--hau--openbible--vits.zip",
"default_vocoder": null,
"license": "CC-BY-SA 4.0",
"description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
"author": "@coqui_ai",
"commit": "1b22f03"
}
}
},
"lin": {
"openbible": {
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--lin--openbible--vits.zip",
"default_vocoder": null,
"license": "CC-BY-SA 4.0",
"description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
"author": "@coqui_ai",
"commit": "1b22f03"
}
}
},
"tw_akuapem": {
"openbible": {
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--tw_akuapem--openbible--vits.zip",
"default_vocoder": null,
"license": "CC-BY-SA 4.0",
"description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
"author": "@coqui_ai",
"commit": "1b22f03"
}
}
},
"tw_asante": {
"openbible": {
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--tw_asante--openbible--vits.zip",
"default_vocoder": null,
"license": "CC-BY-SA 4.0",
"description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
"author": "@coqui_ai",
"commit": "1b22f03"
}
}
},
"yor": {
"openbible": {
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--yor--openbible--vits.zip",
"default_vocoder": null,
"license": "CC-BY-SA 4.0",
"description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
"author": "@coqui_ai",
"commit": "1b22f03"
}
}
}
},
"vocoder_models": {
"universal": {
"libri-tts": {
"wavegrad": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.9/vocoder_models--universal--libri-tts--wavegrad.zip",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--universal--libri-tts--wavegrad.zip",
"commit": "ea976b0",
"author": "Eren Gölge @erogol",
"license": "MPL",
"contact": "egolge@coqui.com"
},
"fullband-melgan": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.9/vocoder_models--universal--libri-tts--fullband-melgan.zip",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--universal--libri-tts--fullband-melgan.zip",
"commit": "4132240",
"author": "Eren Gölge @erogol",
"license": "MPL",
@ -268,13 +349,14 @@
"ek1": {
"wavegrad": {
"description": "EK1 en-rp wavegrad by NMStoker",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.10/vocoder_models--en--ek1--wavegrad.zip",
"commit": "c802255"
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ek1--wavegrad.zip",
"commit": "c802255",
"license": "apache 2.0"
}
},
"ljspeech": {
"multiband-melgan": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.9/vocoder_models--en--ljspeech--mulitband-melgan.zip",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ljspeech--multiband-melgan.zip",
"commit": "ea976b0",
"author": "Eren Gölge @erogol",
"license": "MPL",
@ -282,38 +364,38 @@
},
"hifigan_v2": {
"description": "HiFiGAN_v2 LJSpeech vocoder from https://arxiv.org/abs/2010.05646.",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.12/vocoder_model--en--ljspeech-hifigan_v2.zip",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ljspeech--hifigan_v2.zip",
"commit": "bae2ad0f",
"author": "@erogol",
"license": "",
"license": "apache 2.0",
"contact": "egolge@coqui.ai"
},
"univnet": {
"description": "UnivNet model finetuned on TacotronDDC_ph spectrograms for better compatibility.",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.3.0/vocoder_models--en--ljspeech--univnet_v2.zip",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ljspeech--univnet_v2.zip",
"commit": "4581e3d",
"author": "Eren @erogol",
"license": "TBD",
"license": "apache 2.0",
"contact": "egolge@coqui.ai"
}
},
"vctk": {
"hifigan_v2": {
"description": "Finetuned and intended to be used with tts_models/en/vctk/sc-glow-tts",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.12/vocoder_model--en--vctk--hifigan_v2.zip",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--vctk--hifigan_v2.zip",
"commit": "2f07160",
"author": "Edresson Casanova",
"license": "",
"license": "apache 2.0",
"contact": ""
}
},
"sam": {
"hifigan_v2": {
"description": "Finetuned and intended to be used with tts_models/en/sam/tacotron_DDC",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.13/vocoder_models--en--sam--hifigan_v2.zip",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--sam--hifigan_v2.zip",
"commit": "2f07160",
"author": "Eren Gölge @erogol",
"license": "",
"license": "apache 2.0",
"contact": "egolge@coqui.ai"
}
}
@ -321,8 +403,9 @@
"nl": {
"mai": {
"parallel-wavegan": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.10/vocoder_models--nl--mai--parallel-wavegan.zip",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--nl--mai--parallel-wavegan.zip",
"author": "@r-dh",
"license": "apache 2.0",
"commit": "unknown"
}
}
@ -330,13 +413,15 @@
"de": {
"thorsten": {
"wavegrad": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.11/vocoder_models--de--thorsten--wavegrad.zip",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--de--thorsten--wavegrad.zip",
"author": "@thorstenMueller",
"license": "apache 2.0",
"commit": "unknown"
},
"fullband-melgan": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.1.3/vocoder_models--de--thorsten--fullband-melgan.zip",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--de--thorsten--fullband-melgan.zip",
"author": "@thorstenMueller",
"license": "apache 2.0",
"commit": "unknown"
}
}
@ -344,9 +429,10 @@
"ja": {
"kokoro": {
"hifigan_v1": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.2.0/vocoder_models--ja--kokoro--hifigan_v1.zip",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--ja--kokoro--hifigan_v1.zip",
"description": "HifiGAN model trained for kokoro dataset by @kaiidams",
"author": "@kaiidams",
"license": "apache 2.0",
"commit": "3900448"
}
}
@ -354,7 +440,7 @@
"uk": {
"mai": {
"multiband-melgan": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.5.0_models/vocoder_models--uk--mai--multiband-melgan.zip",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--uk--mai--multiband-melgan.zip",
"author":"@robinhad",
"commit": "bdab788d",
"license": "MIT",
@ -365,7 +451,7 @@
"tr":{
"common-voice": {
"hifigan":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/vocoder_models--tr--common-voice--hifigan.zip",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--tr--common-voice--hifigan.zip",
"description": "HifiGAN model using an unknown speaker from the Common-Voice dataset.",
"author": "Fatih Akademi",
"license": "MIT",

View File

@ -1 +1 @@
0.6.1
0.6.2

View File

@ -25,7 +25,7 @@ These masks can be used for different purposes including training a TTS model wi
"""
Example run:
CUDA_VISIBLE_DEVICE="0" python TTS/bin/compute_attention_masks.py
--model_path /data/rw/home/Models/ljspeech-dcattn-December-14-2020_11+10AM-9d0e8c7/checkpoint_200000.pth.tar
--model_path /data/rw/home/Models/ljspeech-dcattn-December-14-2020_11+10AM-9d0e8c7/checkpoint_200000.pth
--config_path /data/rw/home/Models/ljspeech-dcattn-December-14-2020_11+10AM-9d0e8c7/config.json
--dataset_metafile metadata.csv
--data_path /root/LJSpeech-1.1/

View File

@ -12,7 +12,7 @@ parser = argparse.ArgumentParser(
description="""Compute embedding vectors for each wav file in a dataset.\n\n"""
"""
Example runs:
python TTS/bin/compute_embeddings.py speaker_encoder_model.pth.tar speaker_encoder_config.json dataset_config.json embeddings_output_path/
python TTS/bin/compute_embeddings.py speaker_encoder_model.pth speaker_encoder_config.json dataset_config.json embeddings_output_path/
""",
formatter_class=RawTextHelpFormatter,
)
@ -42,33 +42,35 @@ c_dataset = load_config(args.config_dataset_path)
meta_data_train, meta_data_eval = load_tts_samples(c_dataset.datasets, eval_split=args.eval)
wav_files = meta_data_train + meta_data_eval
speaker_manager = SpeakerManager(
encoder_manager = SpeakerManager(
encoder_model_path=args.model_path,
encoder_config_path=args.config_path,
d_vectors_file_path=args.old_file,
use_cuda=args.use_cuda,
)
class_name_key = encoder_manager.encoder_config.class_name_key
# compute speaker embeddings
speaker_mapping = {}
for idx, wav_file in enumerate(tqdm(wav_files)):
if isinstance(wav_file, list):
speaker_name = wav_file[2]
wav_file = wav_file[1]
if isinstance(wav_file, dict):
class_name = wav_file[class_name_key]
wav_file = wav_file["audio_file"]
else:
speaker_name = None
class_name = None
wav_file_name = os.path.basename(wav_file)
if args.old_file is not None and wav_file_name in speaker_manager.clip_ids:
if args.old_file is not None and wav_file_name in encoder_manager.clip_ids:
# get the embedding from the old file
embedd = speaker_manager.get_d_vector_by_clip(wav_file_name)
embedd = encoder_manager.get_embedding_by_clip(wav_file_name)
else:
# extract the embedding
embedd = speaker_manager.compute_d_vector_from_clip(wav_file)
embedd = encoder_manager.compute_embedding_from_clip(wav_file)
# create speaker_mapping if target dataset is defined
speaker_mapping[wav_file_name] = {}
speaker_mapping[wav_file_name]["name"] = speaker_name
speaker_mapping[wav_file_name]["name"] = class_name
speaker_mapping[wav_file_name]["embedding"] = embedd
if speaker_mapping:
@ -81,5 +83,5 @@ if speaker_mapping:
os.makedirs(os.path.dirname(mapping_file_path), exist_ok=True)
# pylint: disable=W0212
speaker_manager._save_json(mapping_file_path, speaker_mapping)
encoder_manager._save_json(mapping_file_path, speaker_mapping)
print("Speaker embeddings saved at:", mapping_file_path)

View File

@ -1,55 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import pathlib
import subprocess
import time
import torch
from trainer import TrainerArgs
def main():
"""
Call train.py as a new process and pass command arguments
"""
parser = TrainerArgs().init_argparse(arg_prefix="")
parser.add_argument("--script", type=str, help="Target training script to distibute.")
args, unargs = parser.parse_known_args()
num_gpus = torch.cuda.device_count()
group_id = time.strftime("%Y_%m_%d-%H%M%S")
# set arguments for train.py
folder_path = pathlib.Path(__file__).parent.absolute()
if os.path.exists(os.path.join(folder_path, args.script)):
command = [os.path.join(folder_path, args.script)]
else:
command = [args.script]
command.append("--continue_path={}".format(args.continue_path))
command.append("--restore_path={}".format(args.restore_path))
command.append("--config_path={}".format(args.config_path))
command.append("--group_id=group_{}".format(group_id))
command.append("--use_ddp=true")
command += unargs
command.append("")
# run processes
processes = []
for i in range(num_gpus):
my_env = os.environ.copy()
my_env["PYTHON_EGG_CACHE"] = "/tmp/tmp{}".format(i)
command[-1] = "--rank={}".format(i)
# prevent stdout for processes with rank != 0
stdout = None
p = subprocess.Popen(["python3"] + command, stdout=stdout, env=my_env) # pylint: disable=consider-using-with
processes.append(p)
print(command)
for p in processes:
p.wait()
if __name__ == "__main__":
main()

89
TTS/bin/eval_encoder.py Normal file
View File

@ -0,0 +1,89 @@
import argparse
from argparse import RawTextHelpFormatter
import torch
from tqdm import tqdm
from TTS.config import load_config
from TTS.tts.datasets import load_tts_samples
from TTS.tts.utils.speakers import SpeakerManager
def compute_encoder_accuracy(dataset_items, encoder_manager):
class_name_key = encoder_manager.encoder_config.class_name_key
map_classid_to_classname = getattr(encoder_manager.encoder_config, "map_classid_to_classname", None)
class_acc_dict = {}
# compute embeddings for all wav_files
for item in tqdm(dataset_items):
class_name = item[class_name_key]
wav_file = item["audio_file"]
# extract the embedding
embedd = encoder_manager.compute_embedding_from_clip(wav_file)
if encoder_manager.encoder_criterion is not None and map_classid_to_classname is not None:
embedding = torch.FloatTensor(embedd).unsqueeze(0)
if encoder_manager.use_cuda:
embedding = embedding.cuda()
class_id = encoder_manager.encoder_criterion.softmax.inference(embedding).item()
predicted_label = map_classid_to_classname[str(class_id)]
else:
predicted_label = None
if class_name is not None and predicted_label is not None:
is_equal = int(class_name == predicted_label)
if class_name not in class_acc_dict:
class_acc_dict[class_name] = [is_equal]
else:
class_acc_dict[class_name].append(is_equal)
else:
raise RuntimeError("Error: class_name or/and predicted_label are None")
acc_avg = 0
for key, values in class_acc_dict.items():
acc = sum(values) / len(values)
print("Class", key, "Accuracy:", acc)
acc_avg += acc
print("Average Accuracy:", acc_avg / len(class_acc_dict))
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="""Compute the accuracy of the encoder.\n\n"""
"""
Example runs:
python TTS/bin/eval_encoder.py emotion_encoder_model.pth emotion_encoder_config.json dataset_config.json
""",
formatter_class=RawTextHelpFormatter,
)
parser.add_argument("model_path", type=str, help="Path to model checkpoint file.")
parser.add_argument(
"config_path",
type=str,
help="Path to model config file.",
)
parser.add_argument(
"config_dataset_path",
type=str,
help="Path to dataset config file.",
)
parser.add_argument("--use_cuda", type=bool, help="flag to set cuda.", default=True)
parser.add_argument("--eval", type=bool, help="compute eval.", default=True)
args = parser.parse_args()
c_dataset = load_config(args.config_dataset_path)
meta_data_train, meta_data_eval = load_tts_samples(c_dataset.datasets, eval_split=args.eval)
items = meta_data_train + meta_data_eval
enc_manager = SpeakerManager(
encoder_model_path=args.model_path, encoder_config_path=args.config_path, use_cuda=args.use_cuda
)
compute_encoder_accuracy(items, enc_manager)

View File

@ -37,8 +37,8 @@ def setup_loader(ap, r, verbose=False):
precompute_num_workers=0,
use_noise_augment=False,
verbose=verbose,
speaker_id_mapping=speaker_manager.speaker_ids if c.use_speaker_embedding else None,
d_vector_mapping=speaker_manager.d_vectors if c.use_d_vector_file else None,
speaker_id_mapping=speaker_manager.ids if c.use_speaker_embedding else None,
d_vector_mapping=speaker_manager.embeddings if c.use_d_vector_file else None,
)
if c.use_phonemes and c.compute_input_seq_cache:

View File

@ -29,7 +29,7 @@ def main():
items = train_items + eval_items
texts = "".join(item[0] for item in items)
texts = "".join(item["text"] for item in items)
chars = set(texts)
lower_chars = filter(lambda c: c.islower(), chars)
chars_force_lower = [c.lower() for c in chars]

View File

@ -1,51 +1,31 @@
import argparse
import glob
import multiprocessing
import os
import pathlib
from tqdm.contrib.concurrent import process_map
from tqdm import tqdm
from TTS.utils.vad import get_vad_speech_segments, read_wave, write_wave
from TTS.utils.vad import get_vad_model_and_utils, remove_silence
def remove_silence(filepath):
output_path = filepath.replace(os.path.join(args.input_dir, ""), os.path.join(args.output_dir, ""))
def adjust_path_and_remove_silence(audio_path):
output_path = audio_path.replace(os.path.join(args.input_dir, ""), os.path.join(args.output_dir, ""))
# ignore if the file exists
if os.path.exists(output_path) and not args.force:
return
return output_path
# create all directory structure
pathlib.Path(output_path).parent.mkdir(parents=True, exist_ok=True)
# load wave
audio, sample_rate = read_wave(filepath)
# remove the silence and save the audio
output_path = remove_silence(
model_and_utils,
audio_path,
output_path,
trim_just_beginning_and_end=args.trim_just_beginning_and_end,
use_cuda=args.use_cuda,
)
# get speech segments
segments = get_vad_speech_segments(audio, sample_rate, aggressiveness=args.aggressiveness)
segments = list(segments)
num_segments = len(segments)
flag = False
# create the output wave
if num_segments != 0:
for i, segment in reversed(list(enumerate(segments))):
if i >= 1:
if not flag:
concat_segment = segment
flag = True
else:
concat_segment = segment + concat_segment
else:
if flag:
segment = segment + concat_segment
# print("Saving: ", output_path)
write_wave(output_path, segment, sample_rate)
return
else:
print("> Just Copying the file to:", output_path)
# if fail to remove silence just write the file
write_wave(output_path, audio, sample_rate)
return
return output_path
def preprocess_audios():
@ -54,17 +34,24 @@ def preprocess_audios():
if not args.force:
print("> Ignoring files that already exist in the output directory.")
if args.trim_just_beginning_and_end:
print("> Trimming just the beginning and the end with nonspeech parts.")
else:
print("> Trimming all nonspeech parts.")
if files:
# create threads
num_threads = multiprocessing.cpu_count()
process_map(remove_silence, files, max_workers=num_threads, chunksize=15)
# num_threads = multiprocessing.cpu_count()
# process_map(adjust_path_and_remove_silence, files, max_workers=num_threads, chunksize=15)
for f in tqdm(files):
adjust_path_and_remove_silence(f)
else:
print("> No files Found !")
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="python remove_silence.py -i=VCTK-Corpus-bk/ -o=../VCTK-Corpus-removed-silence -g=wav48/*/*.wav -a=2"
description="python TTS/bin/remove_silence_using_vad.py -i=VCTK-Corpus/ -o=VCTK-Corpus-removed-silence/ -g=wav48_silence_trimmed/*/*_mic1.flac --trim_just_beginning_and_end True"
)
parser.add_argument("-i", "--input_dir", type=str, default="../VCTK-Corpus", help="Dataset root dir")
parser.add_argument(
@ -79,11 +66,20 @@ if __name__ == "__main__":
help="path in glob format for acess wavs from input_dir. ex: wav48/*/*.wav",
)
parser.add_argument(
"-a",
"--aggressiveness",
type=int,
default=2,
help="set its aggressiveness mode, which is an integer between 0 and 3. 0 is the least aggressive about filtering out non-speech, 3 is the most aggressive.",
"-t",
"--trim_just_beginning_and_end",
type=bool,
default=True,
help="If True this script will trim just the beginning and end nonspeech parts. If False all nonspeech parts will be trim. Default True",
)
parser.add_argument(
"-c",
"--use_cuda",
type=bool,
default=False,
help="If True use cuda",
)
args = parser.parse_args()
# load the model and utils
model_and_utils = get_vad_model_and_utils(use_cuda=args.use_cuda)
preprocess_audios()
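A minimal single-file sketch of the new flow above, assuming only what the script itself imports (`get_vad_model_and_utils` and `remove_silence` from `TTS.utils.vad`); paths are placeholders:
```python
from TTS.utils.vad import get_vad_model_and_utils, remove_silence

# Load the VAD model once, then trim the leading/trailing silence of one clip.
model_and_utils = get_vad_model_and_utils(use_cuda=False)
output_path = remove_silence(
    model_and_utils,
    "path/to/input.wav",
    "path/to/output.wav",
    trim_just_beginning_and_end=True,
    use_cuda=False,
)
print("Silence-trimmed audio written to:", output_path)
```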

View File

@ -60,13 +60,13 @@ If you don't specify any models, then it uses LJSpeech based English model.
- Run your own TTS model (Using Griffin-Lim Vocoder):
```
$ tts --text "Text for TTS" --model_path path/to/model.pth.tar --config_path path/to/config.json --out_path output/path/speech.wav
$ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav
```
- Run your own TTS and Vocoder models:
```
$ tts --text "Text for TTS" --model_path path/to/config.json --config_path path/to/model.pth.tar --out_path output/path/speech.wav
--vocoder_path path/to/vocoder.pth.tar --vocoder_config_path path/to/vocoder_config.json
$ tts --text "Text for TTS" --model_path path/to/config.json --config_path path/to/model.pth --out_path output/path/speech.wav
--vocoder_path path/to/vocoder.pth --vocoder_config_path path/to/vocoder_config.json
```
### Multi-speaker Models
@ -86,7 +86,7 @@ If you don't specify any models, then it uses LJSpeech based English model.
- Run your own multi-speaker TTS model:
```
$ tts --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/config.json --config_path path/to/model.pth.tar --speakers_file_path path/to/speaker.json --speaker_idx <speaker_id>
$ tts --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/config.json --config_path path/to/model.pth --speakers_file_path path/to/speaker.json --speaker_idx <speaker_id>
```
"""
# We remove Markdown code formatting programmatically here to allow us to copy-and-paste from main README to keep
@ -195,11 +195,28 @@ If you don't specify any models, then it uses LJSpeech based English model.
help="If true save raw spectogram for further (vocoder) processing in out_path.",
default=False,
)
parser.add_argument(
"--reference_wav",
type=str,
help="Reference wav file to convert in the voice of the speaker_idx or speaker_wav",
default=None,
)
parser.add_argument(
"--reference_speaker_idx",
type=str,
help="speaker ID of the reference_wav speaker (If not provided the embedding will be computed using the Speaker Encoder).",
default=None,
)
args = parser.parse_args()
# print the description if either text or list_models is not set
if args.text is None and not args.list_models and not args.list_speaker_idxs and not args.list_language_idxs:
if (
not args.text
and not args.list_models
and not args.list_speaker_idxs
and not args.list_language_idxs
and not args.reference_wav
):
parser.parse_args(["-h"])
# load model manager
@ -261,7 +278,7 @@ If you don't specify any models, then it uses LJSpeech based English model.
print(
" > Available speaker ids: (Set --speaker_idx flag to one of these values to use the multi-speaker model."
)
print(synthesizer.tts_model.speaker_manager.speaker_ids)
print(synthesizer.tts_model.speaker_manager.ids)
return
# query language ids of a multi-lingual model.
@ -269,7 +286,7 @@ If you don't specify any models, then it uses LJSpeech based English model.
print(
" > Available language ids: (Set --language_idx flag to one of these values to use the multi-lingual model."
)
print(synthesizer.tts_model.language_manager.language_id_mapping)
print(synthesizer.tts_model.language_manager.ids)
return
# check the arguments against a multi-speaker model.
@ -281,10 +298,18 @@ If you don't specify any models, then it uses LJSpeech based English model.
return
# RUN THE SYNTHESIS
print(" > Text: {}".format(args.text))
if args.text:
print(" > Text: {}".format(args.text))
# kick it
wav = synthesizer.tts(args.text, args.speaker_idx, args.language_idx, args.speaker_wav)
wav = synthesizer.tts(
args.text,
args.speaker_idx,
args.language_idx,
args.speaker_wav,
reference_wav=args.reference_wav,
reference_speaker_name=args.reference_speaker_idx,
)
# save the results
print(" > Saving output to {}".format(args.out_path))

View File

@ -9,17 +9,17 @@ import traceback
import torch
from torch.utils.data import DataLoader
from trainer.torch import NoamLR
from trainer.trainer_utils import get_optimizer
from TTS.speaker_encoder.dataset import SpeakerEncoderDataset
from TTS.speaker_encoder.losses import AngleProtoLoss, GE2ELoss, SoftmaxAngleProtoLoss
from TTS.speaker_encoder.utils.generic_utils import save_best_model, setup_speaker_encoder_model
from TTS.speaker_encoder.utils.training import init_training
from TTS.speaker_encoder.utils.visual import plot_embeddings
from TTS.encoder.dataset import EncoderDataset
from TTS.encoder.utils.generic_utils import save_best_model, save_checkpoint, setup_encoder_model
from TTS.encoder.utils.samplers import PerfectBatchSampler
from TTS.encoder.utils.training import init_training
from TTS.encoder.utils.visual import plot_embeddings
from TTS.tts.datasets import load_tts_samples
from TTS.utils.audio import AudioProcessor
from TTS.utils.generic_utils import count_parameters, remove_experiment_folder, set_init_dict
from TTS.utils.io import load_fsspec
from TTS.utils.radam import RAdam
from TTS.utils.generic_utils import count_parameters, remove_experiment_folder
from TTS.utils.io import copy_model_files
from TTS.utils.training import check_update
torch.backends.cudnn.enabled = True
@ -32,163 +32,257 @@ print(" > Number of GPUs: ", num_gpus)
def setup_loader(ap: AudioProcessor, is_val: bool = False, verbose: bool = False):
num_utter_per_class = c.num_utter_per_class if not is_val else c.eval_num_utter_per_class
num_classes_in_batch = c.num_classes_in_batch if not is_val else c.eval_num_classes_in_batch
dataset = EncoderDataset(
c,
ap,
meta_data_eval if is_val else meta_data_train,
voice_len=c.voice_len,
num_utter_per_class=num_utter_per_class,
num_classes_in_batch=num_classes_in_batch,
verbose=verbose,
augmentation_config=c.audio_augmentation if not is_val else None,
use_torch_spec=c.model_params.get("use_torch_spec", False),
)
# get classes list
classes = dataset.get_class_list()
sampler = PerfectBatchSampler(
dataset.items,
classes,
batch_size=num_classes_in_batch * num_utter_per_class, # total batch size
num_classes_in_batch=num_classes_in_batch,
num_gpus=1,
shuffle=not is_val,
drop_last=True,
)
if len(classes) < num_classes_in_batch:
if is_val:
raise RuntimeError(
f"config.eval_num_classes_in_batch ({num_classes_in_batch}) need to be <= {len(classes)} (Number total of Classes in the Eval dataset) !"
)
raise RuntimeError(
f"config.num_classes_in_batch ({num_classes_in_batch}) need to be <= {len(classes)} (Number total of Classes in the Train dataset) !"
)
# set the classes to avoid getting the wrong class_id when the numbers of training and eval classes are not equal
if is_val:
loader = None
else:
dataset = SpeakerEncoderDataset(
ap,
meta_data_eval if is_val else meta_data_train,
voice_len=c.voice_len,
num_utter_per_speaker=c.num_utters_per_speaker,
num_speakers_in_batch=c.num_speakers_in_batch,
skip_speakers=c.skip_speakers,
storage_size=c.storage["storage_size"],
sample_from_storage_p=c.storage["sample_from_storage_p"],
verbose=verbose,
augmentation_config=c.audio_augmentation,
)
dataset.set_classes(train_classes)
# sampler = DistributedSampler(dataset) if num_gpus > 1 else None
loader = DataLoader(
dataset,
batch_size=c.num_speakers_in_batch,
shuffle=False,
num_workers=c.num_loader_workers,
collate_fn=dataset.collate_fn,
)
return loader, dataset.get_num_speakers()
loader = DataLoader(
dataset,
num_workers=c.num_loader_workers,
batch_sampler=sampler,
collate_fn=dataset.collate_fn,
)
return loader, classes, dataset.get_map_classid_to_classname()
def train(model, optimizer, scheduler, criterion, data_loader, global_step):
def evaluation(model, criterion, data_loader, global_step):
eval_loss = 0
for _, data in enumerate(data_loader):
with torch.no_grad():
# setup input data
inputs, labels = data
# group samples of each class in the batch. The perfect sampler produces [3,2,1,3,2,1]; we need [3,3,2,2,1,1]
labels = torch.transpose(
labels.view(c.eval_num_utter_per_class, c.eval_num_classes_in_batch), 0, 1
).reshape(labels.shape)
inputs = torch.transpose(
inputs.view(c.eval_num_utter_per_class, c.eval_num_classes_in_batch, -1), 0, 1
).reshape(inputs.shape)
# dispatch data to GPU
if use_cuda:
inputs = inputs.cuda(non_blocking=True)
labels = labels.cuda(non_blocking=True)
# forward pass model
outputs = model(inputs)
# loss computation
loss = criterion(
outputs.view(c.eval_num_classes_in_batch, outputs.shape[0] // c.eval_num_classes_in_batch, -1), labels
)
eval_loss += loss.item()
eval_avg_loss = eval_loss / len(data_loader)
# save stats
dashboard_logger.eval_stats(global_step, {"loss": eval_avg_loss})
# plot the last batch in the evaluation
figures = {
"UMAP Plot": plot_embeddings(outputs.detach().cpu().numpy(), c.num_classes_in_batch),
}
dashboard_logger.eval_figures(global_step, figures)
return eval_avg_loss
def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader, global_step):
model.train()
epoch_time = 0
best_loss = float("inf")
avg_loss = 0
avg_loss_all = 0
avg_loader_time = 0
end_time = time.time()
for epoch in range(c.epochs):
tot_loss = 0
epoch_time = 0
for _, data in enumerate(data_loader):
start_time = time.time()
for _, data in enumerate(data_loader):
start_time = time.time()
# setup input data
inputs, labels = data
# group samples of each class in the batch. The perfect sampler produces [3,2,1,3,2,1]; we need [3,3,2,2,1,1]
labels = torch.transpose(labels.view(c.num_utter_per_class, c.num_classes_in_batch), 0, 1).reshape(
labels.shape
)
inputs = torch.transpose(inputs.view(c.num_utter_per_class, c.num_classes_in_batch, -1), 0, 1).reshape(
inputs.shape
)
# ToDo: move it to a unit test
# labels_converted = torch.transpose(labels.view(c.num_utter_per_class, c.num_classes_in_batch), 0, 1).reshape(labels.shape)
# inputs_converted = torch.transpose(inputs.view(c.num_utter_per_class, c.num_classes_in_batch, -1), 0, 1).reshape(inputs.shape)
# idx = 0
# for j in range(0, c.num_classes_in_batch, 1):
# for i in range(j, len(labels), c.num_classes_in_batch):
# if not torch.all(labels[i].eq(labels_converted[idx])) or not torch.all(inputs[i].eq(inputs_converted[idx])):
# print("Invalid")
# print(labels)
# exit()
# idx += 1
# labels = labels_converted
# inputs = inputs_converted
# setup input data
inputs, labels = data
loader_time = time.time() - end_time
global_step += 1
loader_time = time.time() - end_time
global_step += 1
# setup lr
if c.lr_decay:
scheduler.step()
optimizer.zero_grad()
# setup lr
if c.lr_decay:
scheduler.step()
optimizer.zero_grad()
# dispatch data to GPU
if use_cuda:
inputs = inputs.cuda(non_blocking=True)
labels = labels.cuda(non_blocking=True)
# dispatch data to GPU
if use_cuda:
inputs = inputs.cuda(non_blocking=True)
labels = labels.cuda(non_blocking=True)
# forward pass model
outputs = model(inputs)
# forward pass model
outputs = model(inputs)
# loss computation
loss = criterion(outputs.view(c.num_speakers_in_batch, outputs.shape[0] // c.num_speakers_in_batch, -1), labels)
loss.backward()
grad_norm, _ = check_update(model, c.grad_clip)
optimizer.step()
# loss computation
loss = criterion(
outputs.view(c.num_classes_in_batch, outputs.shape[0] // c.num_classes_in_batch, -1), labels
)
loss.backward()
grad_norm, _ = check_update(model, c.grad_clip)
optimizer.step()
step_time = time.time() - start_time
epoch_time += step_time
step_time = time.time() - start_time
epoch_time += step_time
# Averaged Loss and Averaged Loader Time
avg_loss = 0.01 * loss.item() + 0.99 * avg_loss if avg_loss != 0 else loss.item()
num_loader_workers = c.num_loader_workers if c.num_loader_workers > 0 else 1
avg_loader_time = (
1 / num_loader_workers * loader_time + (num_loader_workers - 1) / num_loader_workers * avg_loader_time
if avg_loader_time != 0
else loader_time
# accumulate the total epoch loss
tot_loss += loss.item()
# Averaged Loader Time
num_loader_workers = c.num_loader_workers if c.num_loader_workers > 0 else 1
avg_loader_time = (
1 / num_loader_workers * loader_time + (num_loader_workers - 1) / num_loader_workers * avg_loader_time
if avg_loader_time != 0
else loader_time
)
current_lr = optimizer.param_groups[0]["lr"]
if global_step % c.steps_plot_stats == 0:
# Plot Training Epoch Stats
train_stats = {
"loss": loss.item(),
"lr": current_lr,
"grad_norm": grad_norm,
"step_time": step_time,
"avg_loader_time": avg_loader_time,
}
dashboard_logger.train_epoch_stats(global_step, train_stats)
figures = {
"UMAP Plot": plot_embeddings(outputs.detach().cpu().numpy(), c.num_classes_in_batch),
}
dashboard_logger.train_figures(global_step, figures)
if global_step % c.print_step == 0:
print(
" | > Step:{} Loss:{:.5f} GradNorm:{:.5f} "
"StepTime:{:.2f} LoaderTime:{:.2f} AvGLoaderTime:{:.2f} LR:{:.6f}".format(
global_step, loss.item(), grad_norm, step_time, loader_time, avg_loader_time, current_lr
),
flush=True,
)
if global_step % c.save_step == 0:
# save model
save_checkpoint(model, optimizer, criterion, loss.item(), OUT_PATH, global_step, epoch)
end_time = time.time()
print("")
print(
">>> Epoch:{} AvgLoss: {:.5f} GradNorm:{:.5f} "
"EpochTime:{:.2f} AvGLoaderTime:{:.2f} ".format(
epoch, tot_loss / len(data_loader), grad_norm, epoch_time, avg_loader_time
),
flush=True,
)
current_lr = optimizer.param_groups[0]["lr"]
if global_step % c.steps_plot_stats == 0:
# Plot Training Epoch Stats
train_stats = {
"loss": avg_loss,
"lr": current_lr,
"grad_norm": grad_norm,
"step_time": step_time,
"avg_loader_time": avg_loader_time,
}
dashboard_logger.train_epoch_stats(global_step, train_stats)
figures = {
# FIXME: not constant
"UMAP Plot": plot_embeddings(outputs.detach().cpu().numpy(), 10),
}
dashboard_logger.train_figures(global_step, figures)
if global_step % c.print_step == 0:
# evaluation
if c.run_eval:
model.eval()
eval_loss = evaluation(model, criterion, eval_data_loader, global_step)
print("\n\n")
print("--> EVAL PERFORMANCE")
print(
" | > Step:{} Loss:{:.5f} AvgLoss:{:.5f} GradNorm:{:.5f} "
"StepTime:{:.2f} LoaderTime:{:.2f} AvGLoaderTime:{:.2f} LR:{:.6f}".format(
global_step, loss.item(), avg_loss, grad_norm, step_time, loader_time, avg_loader_time, current_lr
),
" | > Epoch:{} AvgLoss: {:.5f} ".format(epoch, eval_loss),
flush=True,
)
avg_loss_all += avg_loss
# save the best checkpoint
best_loss = save_best_model(model, optimizer, criterion, eval_loss, best_loss, OUT_PATH, global_step, epoch)
model.train()
if global_step >= c.max_train_step or global_step % c.save_step == 0:
# save best model only
best_loss = save_best_model(model, optimizer, criterion, avg_loss, best_loss, OUT_PATH, global_step)
avg_loss_all = 0
if global_step >= c.max_train_step:
break
end_time = time.time()
return avg_loss, global_step
return best_loss, global_step
def main(args): # pylint: disable=redefined-outer-name
# pylint: disable=global-variable-undefined
global meta_data_train
global meta_data_eval
global train_classes
ap = AudioProcessor(**c.audio)
model = setup_speaker_encoder_model(c)
model = setup_encoder_model(c)
optimizer = RAdam(model.parameters(), lr=c.lr)
optimizer = get_optimizer(c.optimizer, c.optimizer_params, c.lr, model)
# pylint: disable=redefined-outer-name
meta_data_train, meta_data_eval = load_tts_samples(c.datasets, eval_split=False)
meta_data_train, meta_data_eval = load_tts_samples(c.datasets, eval_split=True)
data_loader, num_speakers = setup_loader(ap, is_val=False, verbose=True)
if c.loss == "ge2e":
criterion = GE2ELoss(loss_method="softmax")
elif c.loss == "angleproto":
criterion = AngleProtoLoss()
elif c.loss == "softmaxproto":
criterion = SoftmaxAngleProtoLoss(c.model_params["proj_dim"], num_speakers)
train_data_loader, train_classes, map_classid_to_classname = setup_loader(ap, is_val=False, verbose=True)
if c.run_eval:
eval_data_loader, _, _ = setup_loader(ap, is_val=True, verbose=True)
else:
raise Exception("The %s not is a loss supported" % c.loss)
eval_data_loader = None
num_classes = len(train_classes)
criterion = model.get_criterion(c, num_classes)
if c.loss == "softmaxproto" and c.model != "speaker_encoder":
c.map_classid_to_classname = map_classid_to_classname
copy_model_files(c, OUT_PATH)
if args.restore_path:
checkpoint = load_fsspec(args.restore_path)
try:
model.load_state_dict(checkpoint["model"])
if "criterion" in checkpoint:
criterion.load_state_dict(checkpoint["criterion"])
except (KeyError, RuntimeError):
print(" > Partial model initialization.")
model_dict = model.state_dict()
model_dict = set_init_dict(model_dict, checkpoint["model"], c)
model.load_state_dict(model_dict)
del model_dict
for group in optimizer.param_groups:
group["lr"] = c.lr
print(" > Model restored from step %d" % checkpoint["step"], flush=True)
args.restore_step = checkpoint["step"]
criterion, args.restore_step = model.load_checkpoint(
c, args.restore_path, eval=False, use_cuda=use_cuda, criterion=criterion
)
print(" > Model restored from step %d" % args.restore_step, flush=True)
else:
args.restore_step = 0
@ -205,7 +299,7 @@ def main(args): # pylint: disable=redefined-outer-name
criterion.cuda()
global_step = args.restore_step
_, global_step = train(model, optimizer, scheduler, criterion, data_loader, global_step)
_, global_step = train(model, optimizer, scheduler, criterion, train_data_loader, eval_data_loader, global_step)
if __name__ == "__main__":
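The regrouping trick used in the training and evaluation loops above can be checked in isolation; this small torch snippet (batch sizes are made up) shows how the interleaved sampler output is reordered per class:
```python
import torch

num_utter_per_class, num_classes_in_batch = 2, 3
labels = torch.tensor([3, 2, 1, 3, 2, 1])  # PerfectBatchSampler order: classes interleaved
grouped = torch.transpose(
    labels.view(num_utter_per_class, num_classes_in_batch), 0, 1
).reshape(labels.shape)
print(grouped)  # tensor([3, 3, 2, 2, 1, 1]) -- utterances grouped per class for the loss
```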

View File

@ -57,7 +57,7 @@ def main():
# init the trainer and 🚀
trainer = Trainer(
train_args,
config,
model.config,
config.output_path,
model=model,
train_samples=train_samples,

View File

@ -37,7 +37,7 @@ def register_config(model_name: str) -> Coqpit:
"""
config_class = None
config_name = model_name + "_config"
paths = ["TTS.tts.configs", "TTS.vocoder.configs", "TTS.speaker_encoder"]
paths = ["TTS.tts.configs", "TTS.vocoder.configs", "TTS.encoder.configs"]
for path in paths:
try:
config_class = find_module(path, config_name)

View File

@ -258,4 +258,3 @@ class BaseTrainingConfig(TrainerConfig):
num_loader_workers: int = 0
num_eval_loader_workers: int = 0
use_noise_augment: bool = False
use_language_weighted_sampler: bool = False

View File

@ -14,5 +14,5 @@ To run the code, you need to follow the same flow as in TTS.
- Define 'config.json' for your needs. Note that the audio parameters should match your TTS model.
- Example training call ```python speaker_encoder/train.py --config_path speaker_encoder/config.json --data_path ~/Data/Libri-TTS/train-clean-360```
- Generate embedding vectors ```python speaker_encoder/compute_embeddings.py --use_cuda true /model/path/best_model.pth.tar model/config/path/config.json dataset/path/ output_path``` . This code parses all .wav files at the given dataset path and generates the same folder structure under the output path with the generated embedding files.
- Generate embedding vectors ```python speaker_encoder/compute_embeddings.py --use_cuda true /model/path/best_model.pth model/config/path/config.json dataset/path/ output_path``` . This code parses all .wav files at the given dataset path and generates the same folder structure under the output path with the generated embedding files.
- Watch training on Tensorboard as in TTS

View File

@ -7,10 +7,10 @@ from TTS.config.shared_configs import BaseAudioConfig, BaseDatasetConfig, BaseTr
@dataclass
class SpeakerEncoderConfig(BaseTrainingConfig):
"""Defines parameters for Speaker Encoder model."""
class BaseEncoderConfig(BaseTrainingConfig):
"""Defines parameters for a Generic Encoder model."""
model: str = "speaker_encoder"
model: str = None
audio: BaseAudioConfig = field(default_factory=BaseAudioConfig)
datasets: List[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()])
# model params
@ -27,34 +27,30 @@ class SpeakerEncoderConfig(BaseTrainingConfig):
audio_augmentation: Dict = field(default_factory=lambda: {})
storage: Dict = field(
default_factory=lambda: {
"sample_from_storage_p": 0.66, # the probability with which we'll sample from the DataSet in-memory storage
"storage_size": 15, # the size of the in-memory storage with respect to a single batch
}
)
# training params
max_train_step: int = 1000000 # end training when number of training steps reaches this value.
epochs: int = 10000
loss: str = "angleproto"
grad_clip: float = 3.0
lr: float = 0.0001
optimizer: str = "radam"
optimizer_params: Dict = field(default_factory=lambda: {"betas": [0.9, 0.999], "weight_decay": 0})
lr_decay: bool = False
warmup_steps: int = 4000
wd: float = 1e-6
# logging params
tb_model_param_stats: bool = False
steps_plot_stats: int = 10
checkpoint: bool = True
save_step: int = 1000
print_step: int = 20
run_eval: bool = False
# data loader
num_speakers_in_batch: int = MISSING
num_utters_per_speaker: int = MISSING
num_classes_in_batch: int = MISSING
num_utter_per_class: int = MISSING
eval_num_classes_in_batch: int = None
eval_num_utter_per_class: int = None
num_loader_workers: int = MISSING
skip_speakers: bool = False
voice_len: float = 1.6
def check_values(self):

View File

@ -0,0 +1,12 @@
from dataclasses import asdict, dataclass
from TTS.encoder.configs.base_encoder_config import BaseEncoderConfig
@dataclass
class EmotionEncoderConfig(BaseEncoderConfig):
"""Defines parameters for Emotion Encoder model."""
model: str = "emotion_encoder"
map_classid_to_classname: dict = None
class_name_key: str = "emotion_name"

View File

@ -0,0 +1,11 @@
from dataclasses import asdict, dataclass
from TTS.encoder.configs.base_encoder_config import BaseEncoderConfig
@dataclass
class SpeakerEncoderConfig(BaseEncoderConfig):
"""Defines parameters for Speaker Encoder model."""
model: str = "speaker_encoder"
class_name_key: str = "speaker_name"
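
A minimal usage sketch of the new config classes above. The module paths follow the `TTS.encoder.configs` layout introduced in these diffs (the exact file names for the speaker/emotion configs are assumed), and the field values are purely illustrative.

```python
# Hypothetical construction of the renamed encoder configs (illustrative values only).
from TTS.encoder.configs.emotion_encoder_config import EmotionEncoderConfig  # assumed module path
from TTS.encoder.configs.speaker_encoder_config import SpeakerEncoderConfig  # assumed module path

# The generic num_classes_in_batch / num_utter_per_class fields replace the old
# num_speakers_in_batch / num_utters_per_speaker parameters of SpeakerEncoderConfig.
speaker_config = SpeakerEncoderConfig(
    num_classes_in_batch=32,
    num_utter_per_class=4,
    num_loader_workers=4,
    voice_len=2.0,
)
print(speaker_config.model, speaker_config.class_name_key)  # speaker_encoder speaker_name

emotion_config = EmotionEncoderConfig(
    num_classes_in_batch=8,
    num_utter_per_class=4,
    num_loader_workers=2,
)
print(emotion_config.model, emotion_config.class_name_key)  # emotion_encoder emotion_name
```

The `class_name_key` field is what lets the same dataset and training code serve both encoders: it selects which metadata field ("speaker_name" or "emotion_name") is used as the class label.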

147
TTS/encoder/dataset.py Normal file
View File

@ -0,0 +1,147 @@
import random
import torch
from torch.utils.data import Dataset
from TTS.encoder.utils.generic_utils import AugmentWAV
class EncoderDataset(Dataset):
def __init__(
self,
config,
ap,
meta_data,
voice_len=1.6,
num_classes_in_batch=64,
num_utter_per_class=10,
verbose=False,
augmentation_config=None,
use_torch_spec=None,
):
"""
Args:
config (Coqpit): encoder model configuration.
ap (AudioProcessor): audio processor object.
meta_data (list): list of dataset instances.
voice_len (float): voice segment length in seconds.
num_classes_in_batch (int): number of classes sampled in each mini-batch.
num_utter_per_class (int): number of utterances sampled per class.
verbose (bool): print diagnostic information.
augmentation_config (dict): audio augmentation configuration.
use_torch_spec (bool): if True, return raw waveforms so the model computes spectrograms on-the-fly.
"""
super().__init__()
self.config = config
self.items = meta_data
self.sample_rate = ap.sample_rate
self.seq_len = int(voice_len * self.sample_rate)
self.num_utter_per_class = num_utter_per_class
self.ap = ap
self.verbose = verbose
self.use_torch_spec = use_torch_spec
self.classes, self.items = self.__parse_items()
self.classname_to_classid = {key: i for i, key in enumerate(self.classes)}
# Data Augmentation
self.augmentator = None
self.gaussian_augmentation_config = None
if augmentation_config:
self.data_augmentation_p = augmentation_config["p"]
if self.data_augmentation_p and ("additive" in augmentation_config or "rir" in augmentation_config):
self.augmentator = AugmentWAV(ap, augmentation_config)
if "gaussian" in augmentation_config.keys():
self.gaussian_augmentation_config = augmentation_config["gaussian"]
if self.verbose:
print("\n > DataLoader initialization")
print(f" | > Classes per Batch: {num_classes_in_batch}")
print(f" | > Number of instances : {len(self.items)}")
print(f" | > Sequence length: {self.seq_len}")
print(f" | > Num Classes: {len(self.classes)}")
print(f" | > Classes: {self.classes}")
def load_wav(self, filename):
audio = self.ap.load_wav(filename, sr=self.ap.sample_rate)
return audio
def __parse_items(self):
class_to_utters = {}
for item in self.items:
path_ = item["audio_file"]
class_name = item[self.config.class_name_key]
if class_name in class_to_utters.keys():
class_to_utters[class_name].append(path_)
else:
class_to_utters[class_name] = [
path_,
]
# skip classes with fewer than self.num_utter_per_class utterances
class_to_utters = {k: v for (k, v) in class_to_utters.items() if len(v) >= self.num_utter_per_class}
classes = list(class_to_utters.keys())
classes.sort()
new_items = []
for item in self.items:
path_ = item["audio_file"]
class_name = item["emotion_name"] if self.config.model == "emotion_encoder" else item["speaker_name"]
# ignore filtered classes
if class_name not in classes:
continue
# ignore small audios
if self.load_wav(path_).shape[0] - self.seq_len <= 0:
continue
new_items.append({"wav_file_path": path_, "class_name": class_name})
return classes, new_items
def __len__(self):
return len(self.items)
def get_num_classes(self):
return len(self.classes)
def get_class_list(self):
return self.classes
def set_classes(self, classes):
self.classes = classes
self.classname_to_classid = {key: i for i, key in enumerate(self.classes)}
def get_map_classid_to_classname(self):
return dict((c_id, c_n) for c_n, c_id in self.classname_to_classid.items())
def __getitem__(self, idx):
return self.items[idx]
def collate_fn(self, batch):
# get the batch class_ids
labels = []
feats = []
for item in batch:
utter_path = item["wav_file_path"]
class_name = item["class_name"]
# get classid
class_id = self.classname_to_classid[class_name]
# load wav file
wav = self.load_wav(utter_path)
offset = random.randint(0, wav.shape[0] - self.seq_len)
wav = wav[offset : offset + self.seq_len]
if self.augmentator is not None and self.data_augmentation_p:
if random.random() < self.data_augmentation_p:
wav = self.augmentator.apply_one(wav)
if not self.use_torch_spec:
mel = self.ap.melspectrogram(wav)
feats.append(torch.FloatTensor(mel))
else:
feats.append(torch.FloatTensor(wav))
labels.append(class_id)
feats = torch.stack(feats)
labels = torch.LongTensor(labels)
return feats, labels
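
A minimal sketch of feeding the new EncoderDataset into a plain torch DataLoader. It assumes an encoder config (with `datasets` and `audio` filled in for your data) and metadata loaded as in the training script above; the batch composition is illustrative, and the real training loop pairs the dataset with the balanced sampler shown further below.

```python
# Hypothetical wiring of EncoderDataset into a DataLoader (illustrative only).
from torch.utils.data import DataLoader

from TTS.encoder.dataset import EncoderDataset
from TTS.tts.datasets import load_tts_samples
from TTS.utils.audio import AudioProcessor

# `config` is an encoder config as sketched above, with datasets/audio configured.
ap = AudioProcessor(**config.audio)
meta_data_train, meta_data_eval = load_tts_samples(config.datasets, eval_split=True)

dataset = EncoderDataset(
    config,
    ap,
    meta_data_train,  # list of {"audio_file": ..., "speaker_name"/"emotion_name": ...}
    voice_len=config.voice_len,
    num_classes_in_batch=config.num_classes_in_batch,
    num_utter_per_class=config.num_utter_per_class,
    verbose=True,
    augmentation_config=config.audio_augmentation,
)

# collate_fn crops each utterance to seq_len samples and stacks either
# mel spectrograms (use_torch_spec=False) or raw waveforms, plus class ids.
loader = DataLoader(
    dataset,
    batch_size=config.num_classes_in_batch * config.num_utter_per_class,
    collate_fn=dataset.collate_fn,
    num_workers=config.num_loader_workers,
)
feats, labels = next(iter(loader))
```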

View File

@ -189,6 +189,12 @@ class SoftmaxLoss(nn.Module):
return L
def inference(self, embedding):
x = self.fc(embedding)
activations = torch.nn.functional.softmax(x, dim=1).squeeze(0)
class_id = torch.argmax(activations)
return class_id
class SoftmaxAngleProtoLoss(nn.Module):
"""

View File

@ -0,0 +1,154 @@
import numpy as np
import torch
import torchaudio
from coqpit import Coqpit
from torch import nn
from TTS.encoder.losses import AngleProtoLoss, GE2ELoss, SoftmaxAngleProtoLoss
from TTS.utils.generic_utils import set_init_dict
from TTS.utils.io import load_fsspec
class PreEmphasis(nn.Module):
def __init__(self, coefficient=0.97):
super().__init__()
self.coefficient = coefficient
self.register_buffer("filter", torch.FloatTensor([-self.coefficient, 1.0]).unsqueeze(0).unsqueeze(0))
def forward(self, x):
assert len(x.size()) == 2
x = torch.nn.functional.pad(x.unsqueeze(1), (1, 0), "reflect")
return torch.nn.functional.conv1d(x, self.filter).squeeze(1)
class BaseEncoder(nn.Module):
"""Base `encoder` class. Every new `encoder` model must inherit this.
It defines common `encoder` specific functions.
"""
# pylint: disable=W0102
def __init__(self):
super(BaseEncoder, self).__init__()
def get_torch_mel_spectrogram_class(self, audio_config):
return torch.nn.Sequential(
PreEmphasis(audio_config["preemphasis"]),
# TorchSTFT(
# n_fft=audio_config["fft_size"],
# hop_length=audio_config["hop_length"],
# win_length=audio_config["win_length"],
# sample_rate=audio_config["sample_rate"],
# window="hamming_window",
# mel_fmin=0.0,
# mel_fmax=None,
# use_htk=True,
# do_amp_to_db=False,
# n_mels=audio_config["num_mels"],
# power=2.0,
# use_mel=True,
# mel_norm=None,
# )
torchaudio.transforms.MelSpectrogram(
sample_rate=audio_config["sample_rate"],
n_fft=audio_config["fft_size"],
win_length=audio_config["win_length"],
hop_length=audio_config["hop_length"],
window_fn=torch.hamming_window,
n_mels=audio_config["num_mels"],
),
)
@torch.no_grad()
def inference(self, x, l2_norm=True):
return self.forward(x, l2_norm)
@torch.no_grad()
def compute_embedding(self, x, num_frames=250, num_eval=10, return_mean=True, l2_norm=True):
"""
Generate embeddings for a batch of utterances
x: 1xTxD
"""
# map to the waveform size
if self.use_torch_spec:
num_frames = num_frames * self.audio_config["hop_length"]
max_len = x.shape[1]
if max_len < num_frames:
num_frames = max_len
offsets = np.linspace(0, max_len - num_frames, num=num_eval)
frames_batch = []
for offset in offsets:
offset = int(offset)
end_offset = int(offset + num_frames)
frames = x[:, offset:end_offset]
frames_batch.append(frames)
frames_batch = torch.cat(frames_batch, dim=0)
embeddings = self.inference(frames_batch, l2_norm=l2_norm)
if return_mean:
embeddings = torch.mean(embeddings, dim=0, keepdim=True)
return embeddings
def get_criterion(self, c: Coqpit, num_classes=None):
if c.loss == "ge2e":
criterion = GE2ELoss(loss_method="softmax")
elif c.loss == "angleproto":
criterion = AngleProtoLoss()
elif c.loss == "softmaxproto":
criterion = SoftmaxAngleProtoLoss(c.model_params["proj_dim"], num_classes)
else:
raise Exception("The %s not is a loss supported" % c.loss)
return criterion
def load_checkpoint(
self, config: Coqpit, checkpoint_path: str, eval: bool = False, use_cuda: bool = False, criterion=None
):
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"))
try:
self.load_state_dict(state["model"])
except (KeyError, RuntimeError) as error:
# if in eval mode, raise the error
if eval:
raise error
print(" > Partial model initialization.")
model_dict = self.state_dict()
model_dict = set_init_dict(model_dict, state["model"], config)
self.load_state_dict(model_dict)
del model_dict
# load the criterion for restore_path
if criterion is not None and "criterion" in state:
try:
criterion.load_state_dict(state["criterion"])
except (KeyError, RuntimeError) as error:
print(" > Criterion load ignored because of:", error)
# instantiate and load the criterion for the encoder classifier at inference time
if (
eval
and criterion is None
and "criterion" in state
and getattr(config, "map_classid_to_classname", None) is not None
):
criterion = self.get_criterion(config, len(config.map_classid_to_classname))
criterion.load_state_dict(state["criterion"])
if use_cuda:
self.cuda()
if criterion is not None:
criterion = criterion.cuda()
if eval:
self.eval()
assert not self.training
if not eval:
return criterion, state["step"]
return criterion
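
For illustration, a standalone sketch of the torch-side feature pipeline that `get_torch_mel_spectrogram_class` assembles above (PreEmphasis followed by torchaudio's MelSpectrogram). The audio parameters here are placeholder values, not a recommended configuration.

```python
# Illustrative check of the on-the-fly spectrogram pipeline built in BaseEncoder.
import torch
import torchaudio

from TTS.encoder.models.base_encoder import PreEmphasis

audio_config = {
    "preemphasis": 0.97,
    "sample_rate": 16000,
    "fft_size": 512,
    "win_length": 400,
    "hop_length": 160,
    "num_mels": 64,
}

torch_spec = torch.nn.Sequential(
    PreEmphasis(audio_config["preemphasis"]),
    torchaudio.transforms.MelSpectrogram(
        sample_rate=audio_config["sample_rate"],
        n_fft=audio_config["fft_size"],
        win_length=audio_config["win_length"],
        hop_length=audio_config["hop_length"],
        window_fn=torch.hamming_window,
        n_mels=audio_config["num_mels"],
    ),
)

wav = torch.randn(2, 16000)  # (batch, samples): one second of noise at 16 kHz
mel = torch_spec(wav)        # (batch, num_mels, frames)
print(mel.shape)
```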

View File

@ -0,0 +1,99 @@
import torch
from torch import nn
from TTS.encoder.models.base_encoder import BaseEncoder
class LSTMWithProjection(nn.Module):
def __init__(self, input_size, hidden_size, proj_size):
super().__init__()
self.input_size = input_size
self.hidden_size = hidden_size
self.proj_size = proj_size
self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
self.linear = nn.Linear(hidden_size, proj_size, bias=False)
def forward(self, x):
self.lstm.flatten_parameters()
o, (_, _) = self.lstm(x)
return self.linear(o)
class LSTMWithoutProjection(nn.Module):
def __init__(self, input_dim, lstm_dim, proj_dim, num_lstm_layers):
super().__init__()
self.lstm = nn.LSTM(input_size=input_dim, hidden_size=lstm_dim, num_layers=num_lstm_layers, batch_first=True)
self.linear = nn.Linear(lstm_dim, proj_dim, bias=True)
self.relu = nn.ReLU()
def forward(self, x):
_, (hidden, _) = self.lstm(x)
return self.relu(self.linear(hidden[-1]))
class LSTMSpeakerEncoder(BaseEncoder):
def __init__(
self,
input_dim,
proj_dim=256,
lstm_dim=768,
num_lstm_layers=3,
use_lstm_with_projection=True,
use_torch_spec=False,
audio_config=None,
):
super().__init__()
self.use_lstm_with_projection = use_lstm_with_projection
self.use_torch_spec = use_torch_spec
self.audio_config = audio_config
self.proj_dim = proj_dim
layers = []
# choose LSTM layer
if use_lstm_with_projection:
layers.append(LSTMWithProjection(input_dim, lstm_dim, proj_dim))
for _ in range(num_lstm_layers - 1):
layers.append(LSTMWithProjection(proj_dim, lstm_dim, proj_dim))
self.layers = nn.Sequential(*layers)
else:
self.layers = LSTMWithoutProjection(input_dim, lstm_dim, proj_dim, num_lstm_layers)
self.instancenorm = nn.InstanceNorm1d(input_dim)
if self.use_torch_spec:
self.torch_spec = self.get_torch_mel_spectrogram_class(audio_config)
else:
self.torch_spec = None
self._init_layers()
def _init_layers(self):
for name, param in self.layers.named_parameters():
if "bias" in name:
nn.init.constant_(param, 0.0)
elif "weight" in name:
nn.init.xavier_normal_(param)
def forward(self, x, l2_norm=True):
"""Forward pass of the model.
Args:
x (Tensor): Raw waveform signal or spectrogram frames. If the input is a waveform, `use_torch_spec` must be `True`
to compute the spectrogram on-the-fly.
l2_norm (bool): Whether to L2-normalize the outputs.
Shapes:
- x: :math:`(N, 1, T_{in})` or :math:`(N, D_{spec}, T_{in})`
"""
with torch.no_grad():
with torch.cuda.amp.autocast(enabled=False):
if self.use_torch_spec:
x.squeeze_(1)
x = self.torch_spec(x)
x = self.instancenorm(x).transpose(1, 2)
d = self.layers(x)
if self.use_lstm_with_projection:
d = d[:, -1]
if l2_norm:
d = torch.nn.functional.normalize(d, p=2, dim=1)
return d
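
A minimal inference sketch for the LSTM encoder above, with `use_torch_spec` left at its default so the model expects pre-computed spectrogram frames; the tensor sizes are illustrative.

```python
# Hypothetical embedding extraction with LSTMSpeakerEncoder (illustrative sizes).
import torch

from TTS.encoder.models.lstm import LSTMSpeakerEncoder

model = LSTMSpeakerEncoder(input_dim=80, proj_dim=256, lstm_dim=768, num_lstm_layers=3)
model.eval()

# (N, D_spec, T): a batch of 4 utterances with 80 mel bands and 250 frames.
x = torch.randn(4, 80, 250)
d = model.inference(x, l2_norm=True)  # inference() is already wrapped in torch.no_grad()
print(d.shape)  # torch.Size([4, 256]); rows are L2-normalized embeddings
```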

View File

@ -1,23 +1,8 @@
import numpy as np
import torch
import torchaudio
from torch import nn
# from TTS.utils.audio import TorchSTFT
from TTS.utils.io import load_fsspec
class PreEmphasis(nn.Module):
def __init__(self, coefficient=0.97):
super().__init__()
self.coefficient = coefficient
self.register_buffer("filter", torch.FloatTensor([-self.coefficient, 1.0]).unsqueeze(0).unsqueeze(0))
def forward(self, x):
assert len(x.size()) == 2
x = torch.nn.functional.pad(x.unsqueeze(1), (1, 0), "reflect")
return torch.nn.functional.conv1d(x, self.filter).squeeze(1)
from TTS.encoder.models.base_encoder import BaseEncoder
class SELayer(nn.Module):
@ -71,7 +56,7 @@ class SEBasicBlock(nn.Module):
return out
class ResNetSpeakerEncoder(nn.Module):
class ResNetSpeakerEncoder(BaseEncoder):
"""Implementation of the model H/ASP without batch normalization in speaker embedding. This model was proposed in: https://arxiv.org/abs/2009.14153
Adapted from: https://github.com/clovaai/voxceleb_trainer
"""
@ -110,32 +95,7 @@ class ResNetSpeakerEncoder(nn.Module):
self.instancenorm = nn.InstanceNorm1d(input_dim)
if self.use_torch_spec:
self.torch_spec = torch.nn.Sequential(
PreEmphasis(audio_config["preemphasis"]),
# TorchSTFT(
# n_fft=audio_config["fft_size"],
# hop_length=audio_config["hop_length"],
# win_length=audio_config["win_length"],
# sample_rate=audio_config["sample_rate"],
# window="hamming_window",
# mel_fmin=0.0,
# mel_fmax=None,
# use_htk=True,
# do_amp_to_db=False,
# n_mels=audio_config["num_mels"],
# power=2.0,
# use_mel=True,
# mel_norm=None,
# )
torchaudio.transforms.MelSpectrogram(
sample_rate=audio_config["sample_rate"],
n_fft=audio_config["fft_size"],
win_length=audio_config["win_length"],
hop_length=audio_config["hop_length"],
window_fn=torch.hamming_window,
n_mels=audio_config["num_mels"],
),
)
self.torch_spec = self.get_torch_mel_spectrogram_class(audio_config)
else:
self.torch_spec = None
@ -238,47 +198,3 @@ class ResNetSpeakerEncoder(nn.Module):
if l2_norm:
x = torch.nn.functional.normalize(x, p=2, dim=1)
return x
@torch.no_grad()
def inference(self, x, l2_norm=False):
return self.forward(x, l2_norm)
@torch.no_grad()
def compute_embedding(self, x, num_frames=250, num_eval=10, return_mean=True, l2_norm=True):
"""
Generate embeddings for a batch of utterances
x: 1xTxD
"""
# map to the waveform size
if self.use_torch_spec:
num_frames = num_frames * self.audio_config["hop_length"]
max_len = x.shape[1]
if max_len < num_frames:
num_frames = max_len
offsets = np.linspace(0, max_len - num_frames, num=num_eval)
frames_batch = []
for offset in offsets:
offset = int(offset)
end_offset = int(offset + num_frames)
frames = x[:, offset:end_offset]
frames_batch.append(frames)
frames_batch = torch.cat(frames_batch, dim=0)
embeddings = self.inference(frames_batch, l2_norm=l2_norm)
if return_mean:
embeddings = torch.mean(embeddings, dim=0, keepdim=True)
return embeddings
def load_checkpoint(self, config: dict, checkpoint_path: str, eval: bool = False, use_cuda: bool = False):
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"))
self.load_state_dict(state["model"])
if use_cuda:
self.cuda()
if eval:
self.eval()
assert not self.training

View File

@ -3,60 +3,15 @@ import glob
import os
import random
import re
from multiprocessing import Manager
import numpy as np
from scipy import signal
from TTS.speaker_encoder.models.lstm import LSTMSpeakerEncoder
from TTS.speaker_encoder.models.resnet import ResNetSpeakerEncoder
from TTS.encoder.models.lstm import LSTMSpeakerEncoder
from TTS.encoder.models.resnet import ResNetSpeakerEncoder
from TTS.utils.io import save_fsspec
class Storage(object):
def __init__(self, maxsize, storage_batchs, num_speakers_in_batch, num_threads=8):
# use multiprocessing for threading safe
self.storage = Manager().list()
self.maxsize = maxsize
self.num_speakers_in_batch = num_speakers_in_batch
self.num_threads = num_threads
self.ignore_last_batch = False
if storage_batchs >= 3:
self.ignore_last_batch = True
# used for fast random sample
self.safe_storage_size = self.maxsize - self.num_threads
if self.ignore_last_batch:
self.safe_storage_size -= self.num_speakers_in_batch
def __len__(self):
return len(self.storage)
def full(self):
return len(self.storage) >= self.maxsize
def append(self, item):
# if storage is full, remove an item
if self.full():
self.storage.pop(0)
self.storage.append(item)
def get_random_sample(self):
# safe storage size considering all threads remove one item from storage in same time
storage_size = len(self.storage) - self.num_threads
if self.ignore_last_batch:
storage_size -= self.num_speakers_in_batch
return self.storage[random.randint(0, storage_size)]
def get_random_sample_fast(self):
"""Call this method only when storage is full"""
return self.storage[random.randint(0, self.safe_storage_size)]
class AugmentWAV(object):
def __init__(self, ap, augmentation_config):
@ -170,7 +125,7 @@ def to_camel(text):
return re.sub(r"(?!^)_([a-zA-Z])", lambda m: m.group(1).upper(), text)
def setup_speaker_encoder_model(config: "Coqpit"):
def setup_encoder_model(config: "Coqpit"):
if config.model_params["model_name"].lower() == "lstm":
model = LSTMSpeakerEncoder(
config.model_params["input_dim"],
@ -192,7 +147,7 @@ def setup_speaker_encoder_model(config: "Coqpit"):
def save_checkpoint(model, optimizer, criterion, model_loss, out_path, current_step, epoch):
checkpoint_path = "checkpoint_{}.pth.tar".format(current_step)
checkpoint_path = "checkpoint_{}.pth".format(current_step)
checkpoint_path = os.path.join(out_path, checkpoint_path)
print(" | | > Checkpoint saving : {}".format(checkpoint_path))
@ -209,7 +164,7 @@ def save_checkpoint(model, optimizer, criterion, model_loss, out_path, current_s
save_fsspec(state, checkpoint_path)
def save_best_model(model, optimizer, criterion, model_loss, best_loss, out_path, current_step):
def save_best_model(model, optimizer, criterion, model_loss, best_loss, out_path, current_step, epoch):
if model_loss < best_loss:
new_state_dict = model.state_dict()
state = {
@ -217,11 +172,12 @@ def save_best_model(model, optimizer, criterion, model_loss, best_loss, out_path
"optimizer": optimizer.state_dict(),
"criterion": criterion.state_dict(),
"step": current_step,
"epoch": epoch,
"loss": model_loss,
"date": datetime.date.today().strftime("%B %d, %Y"),
}
best_loss = model_loss
bestmodel_path = "best_model.pth.tar"
bestmodel_path = "best_model.pth"
bestmodel_path = os.path.join(out_path, bestmodel_path)
print("\n > BEST MODEL ({0:.5f}) : {1:}".format(model_loss, bestmodel_path))
save_fsspec(state, bestmodel_path)

View File

@ -5,7 +5,7 @@ from TTS.utils.io import save_fsspec
def save_checkpoint(model, optimizer, model_loss, out_path, current_step):
checkpoint_path = "checkpoint_{}.pth.tar".format(current_step)
checkpoint_path = "checkpoint_{}.pth".format(current_step)
checkpoint_path = os.path.join(out_path, checkpoint_path)
print(" | | > Checkpoint saving : {}".format(checkpoint_path))
@ -31,7 +31,7 @@ def save_best_model(model, optimizer, model_loss, best_loss, out_path, current_s
"date": datetime.date.today().strftime("%B %d, %Y"),
}
best_loss = model_loss
bestmodel_path = "best_model.pth.tar"
bestmodel_path = "best_model.pth"
bestmodel_path = os.path.join(out_path, bestmodel_path)
print("\n > BEST MODEL ({0:.5f}) : {1:}".format(model_loss, bestmodel_path))
save_fsspec(state, bestmodel_path)

View File

@ -0,0 +1,114 @@
import random
from torch.utils.data.sampler import Sampler, SubsetRandomSampler
class SubsetSampler(Sampler):
"""
Samples elements sequentially from a given list of indices.
Args:
indices (list): a sequence of indices
"""
def __init__(self, indices):
super().__init__(indices)
self.indices = indices
def __iter__(self):
return (self.indices[i] for i in range(len(self.indices)))
def __len__(self):
return len(self.indices)
class PerfectBatchSampler(Sampler):
"""
Samples a mini-batch of indices for balanced class batching.
Args:
dataset_items(list): dataset items to sample from.
classes (list): list of classes of dataset_items to sample from.
batch_size (int): total number of samples to be sampled in a mini-batch.
num_gpus (int): number of GPUs in data parallel mode.
shuffle (bool): if True, samples randomly, otherwise samples sequentially.
drop_last (bool): if True, drops last incomplete batch.
"""
def __init__(
self,
dataset_items,
classes,
batch_size,
num_classes_in_batch,
num_gpus=1,
shuffle=True,
drop_last=False,
label_key="class_name",
):
super().__init__(dataset_items)
assert (
batch_size % (num_classes_in_batch * num_gpus) == 0
), "Batch size must be divisible by number of classes times the number of data parallel devices (if enabled)."
label_indices = {}
for idx, item in enumerate(dataset_items):
label = item[label_key]
if label not in label_indices.keys():
label_indices[label] = [idx]
else:
label_indices[label].append(idx)
if shuffle:
self._samplers = [SubsetRandomSampler(label_indices[key]) for key in classes]
else:
self._samplers = [SubsetSampler(label_indices[key]) for key in classes]
self._batch_size = batch_size
self._drop_last = drop_last
self._dp_devices = num_gpus
self._num_classes_in_batch = num_classes_in_batch
def __iter__(self):
batch = []
if self._num_classes_in_batch != len(self._samplers):
valid_samplers_idx = random.sample(range(len(self._samplers)), self._num_classes_in_batch)
else:
valid_samplers_idx = None
iters = [iter(s) for s in self._samplers]
done = False
while True:
b = []
for i, it in enumerate(iters):
if valid_samplers_idx is not None and i not in valid_samplers_idx:
continue
idx = next(it, None)
if idx is None:
done = True
break
b.append(idx)
if done:
break
batch += b
if len(batch) == self._batch_size:
yield batch
batch = []
if valid_samplers_idx is not None:
valid_samplers_idx = random.sample(range(len(self._samplers)), self._num_classes_in_batch)
if not self._drop_last:
if len(batch) > 0:
groups = len(batch) // self._num_classes_in_batch
if groups % self._dp_devices == 0:
yield batch
else:
batch = batch[: (groups // self._dp_devices) * self._dp_devices * self._num_classes_in_batch]
if len(batch) > 0:
yield batch
def __len__(self):
class_batch_size = self._batch_size // self._num_classes_in_batch
return min(((len(s) + class_batch_size - 1) // class_batch_size) for s in self._samplers)
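
A minimal sketch pairing PerfectBatchSampler with the EncoderDataset from the earlier sketch via `batch_sampler`. The module path `TTS.encoder.utils.samplers` is an assumption (the file name is not visible in this hunk), and `config`/`dataset` are the objects built above.

```python
# Hypothetical balanced-batch loading with PerfectBatchSampler (illustrative only).
from torch.utils.data import DataLoader

from TTS.encoder.utils.samplers import PerfectBatchSampler  # assumed module path

classes = dataset.get_class_list()  # EncoderDataset from the earlier sketch
sampler = PerfectBatchSampler(
    dataset.items,
    classes,
    batch_size=config.num_classes_in_batch * config.num_utter_per_class,
    num_classes_in_batch=config.num_classes_in_batch,
    num_gpus=1,
    shuffle=True,
    drop_last=True,
)

# batch_sampler yields lists of indices, so every batch mixes exactly
# num_classes_in_batch classes; collate_fn then builds the feature/label tensors.
loader = DataLoader(
    dataset,
    batch_sampler=sampler,
    collate_fn=dataset.collate_fn,
    num_workers=config.num_loader_workers,
)
feats, labels = next(iter(loader))
```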

View File

@ -29,14 +29,18 @@ colormap = (
)
def plot_embeddings(embeddings, num_utter_per_speaker):
embeddings = embeddings[: 10 * num_utter_per_speaker]
def plot_embeddings(embeddings, num_classes_in_batch):
num_utter_per_class = embeddings.shape[0] // num_classes_in_batch
# if necessary get just the first 10 classes
if num_classes_in_batch > 10:
num_classes_in_batch = 10
embeddings = embeddings[: num_classes_in_batch * num_utter_per_class]
model = umap.UMAP()
projection = model.fit_transform(embeddings)
num_speakers = embeddings.shape[0] // num_utter_per_speaker
ground_truth = np.repeat(np.arange(num_speakers), num_utter_per_speaker)
ground_truth = np.repeat(np.arange(num_classes_in_batch), num_utter_per_class)
colors = [colormap[i] for i in ground_truth]
fig, ax = plt.subplots(figsize=(16, 10))
_ = ax.scatter(projection[:, 0], projection[:, 1], c=colors)
plt.gca().set_aspect("equal", "datalim")
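
A small call sketch for the updated plotting helper, assuming it lives in the encoder visual utilities module and that umap-learn and matplotlib are installed; the embedding array is random and purely illustrative.

```python
# Hypothetical call to the updated plot_embeddings helper (random data).
import numpy as np

from TTS.encoder.utils.visual import plot_embeddings  # assumed module path

# 8 classes x 4 utterances per class, 256-dimensional embeddings.
embeddings = np.random.rand(8 * 4, 256)
plot_embeddings(embeddings, num_classes_in_batch=8)
```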

View File

@ -1,46 +1,34 @@
from abc import ABC, abstractmethod
from typing import Dict, List, Tuple
from abc import abstractmethod
from typing import Dict
import torch
from coqpit import Coqpit
from torch import nn
from trainer import TrainerModel
# pylint: skip-file
class BaseTrainerModel(ABC, nn.Module):
"""Abstract 🐸TTS class. Every new 🐸TTS model must inherit this."""
class BaseTrainerModel(TrainerModel):
"""BaseTrainerModel model expanding TrainerModel with required functions by 🐸TTS.
Every new 🐸TTS model must inherit it.
"""
@staticmethod
@abstractmethod
def init_from_config(config: Coqpit):
"""Init the model from given config.
"""Init the model and all its attributes from the given config.
Override this depending on your model.
"""
...
@abstractmethod
def forward(self, input: torch.Tensor, *args, aux_input={}, **kwargs) -> Dict:
"""Forward ... for the model mainly used in training.
You can be flexible here and use different number of arguments and argument names since it is intended to be
used by `train_step()` without exposing it out of the model.
Args:
input (torch.Tensor): Input tensor.
aux_input (Dict): Auxiliary model inputs like embeddings, durations or any other sorts of inputs.
Returns:
Dict: Model outputs. Main model output must be named as "model_outputs".
"""
outputs_dict = {"model_outputs": None}
...
return outputs_dict
@abstractmethod
def inference(self, input: torch.Tensor, aux_input={}) -> Dict:
"""Forward ... for inference.
"""Forward pass for inference.
It must return a dictionary with the main model output and all the auxiliary outputs. The key ```model_outputs```
is considered to be the main output and you can add any other auxiliary outputs as you want.
We don't use `*kwargs` since it is problematic with the TorchScript API.
@ -55,78 +43,9 @@ class BaseTrainerModel(ABC, nn.Module):
...
return outputs_dict
def format_batch(self, batch: Dict) -> Dict:
"""Format batch returned by the data loader before sending it to the model.
If not implemented, model uses the batch as is.
Can be used for data augmentation, feature extraction, etc.
"""
return batch
def format_batch_on_device(self, batch: Dict) -> Dict:
"""Format batch on device before sending it to the model.
If not implemented, model uses the batch as is.
Can be used for data augmentation, feature extraction, etc.
"""
return batch
@abstractmethod
def train_step(self, batch: Dict, criterion: nn.Module) -> Tuple[Dict, Dict]:
"""Perform a single training step. Run the model forward ... and compute losses.
Args:
batch (Dict): Input tensors.
criterion (nn.Module): Loss layer designed for the model.
Returns:
Tuple[Dict, Dict]: Model outputs and computed losses.
"""
outputs_dict = {}
loss_dict = {} # this returns from the criterion
...
return outputs_dict, loss_dict
def train_log(self, batch: Dict, outputs: Dict, logger: "Logger", assets: Dict, steps: int) -> None:
"""Create visualizations and waveform examples for training.
For example, here you can plot spectrograms and generate sample waveforms from these spectrograms to
be projected onto Tensorboard.
Args:
ap (AudioProcessor): audio processor used at training.
batch (Dict): Model inputs used at the previous training step.
outputs (Dict): Model outputs generated at the previous training step.
Returns:
Tuple[Dict, np.ndarray]: training plots and output waveform.
"""
...
@abstractmethod
def eval_step(self, batch: Dict, criterion: nn.Module) -> Tuple[Dict, Dict]:
"""Perform a single evaluation step. Run the model forward ... and compute losses. In most cases, you can
call `train_step()` with no changes.
Args:
batch (Dict): Input tensors.
criterion (nn.Module): Loss layer designed for the model.
Returns:
Tuple[Dict, Dict]: Model outputs and computed losses.
"""
outputs_dict = {}
loss_dict = {} # this returns from the criterion
...
return outputs_dict, loss_dict
def eval_log(self, batch: Dict, outputs: Dict, logger: "Logger", assets: Dict, steps: int) -> None:
"""The same as `train_log()`"""
...
@abstractmethod
def load_checkpoint(self, config: Coqpit, checkpoint_path: str, eval: bool = False, strict: bool = True) -> None:
"""Load a checkpoint and get ready for training or inference.
"""Load a model checkpoint gile and get ready for training or inference.
Args:
config (Coqpit): Model configuration.
@ -135,36 +54,3 @@ class BaseTrainerModel(ABC, nn.Module):
strict (bool, optional): Match all checkpoint keys to the model's keys. Defaults to True.
"""
...
@staticmethod
@abstractmethod
def init_from_config(config: Coqpit, samples: List[Dict] = None, verbose=False) -> "BaseTrainerModel":
"""Init the model from given config.
Override this depending on your model.
"""
...
@abstractmethod
def get_data_loader(
self, config: Coqpit, assets: Dict, is_eval: True, data_items: List, verbose: bool, num_gpus: int
):
...
# def get_optimizer(self) -> Union["Optimizer", List["Optimizer"]]:
# """Setup an return optimizer or optimizers."""
# ...
# def get_lr(self) -> Union[float, List[float]]:
# """Return learning rate(s).
# Returns:
# Union[float, List[float]]: Model's initial learning rates.
# """
# ...
# def get_scheduler(self, optimizer: torch.optim.Optimizer):
# ...
# def get_criterion(self):
# ...

View File

@ -21,4 +21,4 @@ Run the server with the official models on a GPU.
```CUDA_VISIBLE_DEVICES="0" python TTS/server/server.py --model_name tts_models/en/ljspeech/tacotron2-DCA --vocoder_name vocoder_models/en/ljspeech/multiband-melgan --use_cuda True```
Run the server with custom models.
```python TTS/server/server.py --tts_checkpoint /path/to/tts/model.pth.tar --tts_config /path/to/tts/config.json --vocoder_checkpoint /path/to/vocoder/model.pth.tar --vocoder_config /path/to/vocoder/config.json```
```python TTS/server/server.py --tts_checkpoint /path/to/tts/model.pth --tts_config /path/to/tts/config.json --vocoder_checkpoint /path/to/vocoder/model.pth --vocoder_config /path/to/vocoder/config.json```

View File

@ -1,6 +1,6 @@
{
"tts_path":"/media/erogol/data_ssd/Models/libri_tts/5049/", // tts model root folder
"tts_file":"best_model.pth.tar", // tts checkpoint file
"tts_file":"best_model.pth", // tts checkpoint file
"tts_config":"config.json", // tts config.json file
"tts_speakers": null, // json file listing speaker ids. null if no speaker embedding.
"vocoder_config":null,

View File

@ -143,7 +143,7 @@ def index():
"index.html",
show_details=args.show_details,
use_multi_speaker=use_multi_speaker,
speaker_ids=speaker_manager.speaker_ids if speaker_manager is not None else None,
speaker_ids=speaker_manager.ids if speaker_manager is not None else None,
use_gst=use_gst,
)

View File

@ -1,118 +0,0 @@
{
"model_name": "lstm",
"run_name": "mueller91",
"run_description": "train speaker encoder with voxceleb1, voxceleb2 and libriSpeech ",
"audio":{
// Audio processing parameters
"num_mels": 40, // size of the mel spec frame.
"fft_size": 400, // number of stft frequency levels. Size of the linear spectogram frame.
"sample_rate": 16000, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled.
"win_length": 400, // stft window length in ms.
"hop_length": 160, // stft window hop-lengh in ms.
"frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used.
"frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used.
"preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis.
"min_level_db": -100, // normalization range
"ref_level_db": 20, // reference level db, theoretically 20db is the sound of air.
"power": 1.5, // value to sharpen wav signals after GL algorithm.
"griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation.
// Normalization parameters
"signal_norm": true, // normalize the spec values in range [0, 1]
"symmetric_norm": true, // move normalization to range [-1, 1]
"max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
"clip_norm": true, // clip normalized values into the range.
"mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
"mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!!
"do_trim_silence": true, // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
"trim_db": 60, // threshold for timming silence. Set this according to your dataset.
"stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored
},
"reinit_layers": [],
"loss": "angleproto", // "ge2e" to use Generalized End-to-End loss and "angleproto" to use Angular Prototypical loss (new SOTA)
"grad_clip": 3.0, // upper limit for gradients for clipping.
"epochs": 1000, // total number of epochs to train.
"lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate.
"lr_decay": false, // if true, Noam learning rate decaying is applied through training.
"warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr"
"tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
"steps_plot_stats": 10, // number of steps to plot embeddings.
"num_speakers_in_batch": 64, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
"num_utters_per_speaker": 10, //
"skip_speakers": false, // skip speakers with samples less than "num_utters_per_speaker"
"voice_len": 1.6, // number of seconds for each training instance
"num_loader_workers": 8, // number of training data loader processes. Don't set it too big. 4-8 are good values.
"wd": 0.000001, // Weight decay weight.
"checkpoint": true, // If true, it saves checkpoints per "save_step"
"save_step": 1000, // Number of training steps expected to save traning stats and checkpoints.
"print_step": 20, // Number of steps to log traning on console.
"output_path": "../../MozillaTTSOutput/checkpoints/voxceleb_librispeech/speaker_encoder/", // DATASET-RELATED: output path for all training outputs.
"model": {
"input_dim": 40,
"proj_dim": 256,
"lstm_dim": 768,
"num_lstm_layers": 3,
"use_lstm_with_projection": true
},
"audio_augmentation": {
"p": 0,
//add a gaussian noise to the data in order to increase robustness
"gaussian":{ // as the insertion of Gaussian noise is quick to be calculated, we added it after loading the wav file, this way, even audios that were reused with the cache can receive this noise
"p": 1, // propability of apply this method, 0 is disable
"min_amplitude": 0.0,
"max_amplitude": 1e-5
}
},
"storage": {
"sample_from_storage_p": 0.66, // the probability with which we'll sample from the DataSet in-memory storage
"storage_size": 15, // the size of the in-memory storage with respect to a single batch
"additive_noise": 1e-5 // add very small gaussian noise to the data in order to increase robustness
},
"datasets":
[
{
"name": "vctk_slim",
"path": "../../../audio-datasets/en/VCTK-Corpus/",
"meta_file_train": null,
"meta_file_val": null
},
{
"name": "libri_tts",
"path": "../../../audio-datasets/en/LibriTTS/train-clean-100",
"meta_file_train": null,
"meta_file_val": null
},
{
"name": "libri_tts",
"path": "../../../audio-datasets/en/LibriTTS/train-clean-360",
"meta_file_train": null,
"meta_file_val": null
},
{
"name": "libri_tts",
"path": "../../../audio-datasets/en/LibriTTS/train-other-500",
"meta_file_train": null,
"meta_file_val": null
},
{
"name": "voxceleb1",
"path": "../../../audio-datasets/en/voxceleb1/",
"meta_file_train": null,
"meta_file_val": null
},
{
"name": "voxceleb2",
"path": "../../../audio-datasets/en/voxceleb2/",
"meta_file_train": null,
"meta_file_val": null
},
{
"name": "common_voice",
"path": "../../../audio-datasets/en/MozillaCommonVoice",
"meta_file_train": "train.tsv",
"meta_file_val": "test.tsv"
}
]
}

View File

@ -1,956 +0,0 @@
{
"model": "speaker_encoder",
"run_name": "speaker_encoder",
"run_description": "resnet speaker encoder trained with commonvoice all languages dev and train, Voxceleb 1 dev and Voxceleb 2 dev",
// AUDIO PARAMETERS
"audio":{
// Audio processing parameters
"num_mels": 80, // size of the mel spec frame.
"fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame.
"sample_rate": 16000, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled.
"win_length": 1024, // stft window length in ms.
"hop_length": 256, // stft window hop-lengh in ms.
"frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used.
"frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used.
"preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis.
"min_level_db": -100, // normalization range
"ref_level_db": 20, // reference level db, theoretically 20db is the sound of air.
"power": 1.5, // value to sharpen wav signals after GL algorithm.
"griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation.
"stft_pad_mode": "reflect",
// Normalization parameters
"signal_norm": true, // normalize the spec values in range [0, 1]
"symmetric_norm": true, // move normalization to range [-1, 1]
"max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
"clip_norm": true, // clip normalized values into the range.
"mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
"mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!!
"spec_gain": 20.0,
"do_trim_silence": false, // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
"trim_db": 60, // threshold for timming silence. Set this according to your dataset.
"stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored
},
"reinit_layers": [],
"loss": "angleproto", // "ge2e" to use Generalized End-to-End loss, "angleproto" to use Angular Prototypical loss and "softmaxproto" to use Softmax with Angular Prototypical loss
"grad_clip": 3.0, // upper limit for gradients for clipping.
"max_train_step": 1000000, // total number of steps to train.
"lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate.
"lr_decay": false, // if true, Noam learning rate decaying is applied through training.
"warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr"
"tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
"steps_plot_stats": 100, // number of steps to plot embeddings.
// Speakers config
"num_speakers_in_batch": 200, // Batch size for training.
"num_utters_per_speaker": 2, //
"skip_speakers": true, // skip speakers with samples less than "num_utters_per_speaker"
"voice_len": 2, // number of seconds for each training instance
"num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values.
"wd": 0.000001, // Weight decay weight.
"checkpoint": true, // If true, it saves checkpoints per "save_step"
"save_step": 1000, // Number of training steps expected to save the best checkpoints in training.
"print_step": 50, // Number of steps to log traning on console.
"output_path": "../checkpoints/speaker_encoder/angleproto/resnet_voxceleb1_and_voxceleb2-and-common-voice-all-using-angleproto/", // DATASET-RELATED: output path for all training outputs.
"audio_augmentation": {
"p": 0.5, // propability of apply this method, 0 is disable rir and additive noise augmentation
"rir":{
"rir_path": "/workspace/store/ecasanova/ComParE/RIRS_NOISES/simulated_rirs/",
"conv_mode": "full"
},
"additive":{
"sounds_path": "/workspace/store/ecasanova/ComParE/musan/",
// list of each of the directories in your data augmentation, if a directory is in "sounds_path" but is not listed here it will be ignored
"speech":{
"min_snr_in_db": 13,
"max_snr_in_db": 20,
"min_num_noises": 2,
"max_num_noises": 3
},
"noise":{
"min_snr_in_db": 0,
"max_snr_in_db": 15,
"min_num_noises": 1,
"max_num_noises": 1
},
"music":{
"min_snr_in_db": 5,
"max_snr_in_db": 15,
"min_num_noises": 1,
"max_num_noises": 1
}
},
//add a gaussian noise to the data in order to increase robustness
"gaussian":{ // as the insertion of Gaussian noise is quick to be calculated, we added it after loading the wav file, this way, even audios that were reused with the cache can receive this noise
"p": 0.5, // propability of apply this method, 0 is disable
"min_amplitude": 0.0,
"max_amplitude": 1e-5
}
},
"model_params": {
"model_name": "resnet",
"input_dim": 80,
"proj_dim": 512
},
"storage": {
"sample_from_storage_p": 0.5, // the probability with which we'll sample from the DataSet in-memory storage
"storage_size": 35 // the size of the in-memory storage with respect to a single batch
},
"datasets":
[
{
"name": "voxceleb2",
"path": "/workspace/scratch/ecasanova/datasets/VoxCeleb/vox2_dev_aac/",
"meta_file_train": null,
"meta_file_val": null
},
{
"name": "voxceleb1",
"path": "/workspace/scratch/ecasanova/datasets/VoxCeleb/vox1_dev_wav/",
"meta_file_train": null,
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fi",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fi",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/zh-CN",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/zh-CN",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/rm-sursilv",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/rm-sursilv",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/lt",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/lt",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ka",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ka",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/sv-SE",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/sv-SE",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/pl",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/pl",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ru",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ru",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/mn",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/mn",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/nl",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/nl",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/sl",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/sl",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/es",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/es",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/pt",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/pt",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/hi",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/hi",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ja",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ja",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ia",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ia",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/br",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/br",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/id",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/id",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/dv",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/dv",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ta",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ta",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/or",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/or",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/zh-HK",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/zh-HK",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/de",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/de",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/uk",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/uk",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/en",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/en",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fa",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fa",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/vi",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/vi",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ab",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ab",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/sah",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/sah",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/vot",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/vot",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fr",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fr",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/tr",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/tr",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/lg",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/lg",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/mt",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/mt",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/rw",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/rw",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/hu",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/hu",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/rm-vallader",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/rm-vallader",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/el",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/el",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/tt",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/tt",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/zh-TW",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/zh-TW",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/et",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/et",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fy-NL",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fy-NL",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cs",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cs",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/as",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/as",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ro",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ro",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/eo",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/eo",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/pa-IN",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/pa-IN",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/th",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/th",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/it",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/it",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ga-IE",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ga-IE",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cnh",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cnh",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ky",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ky",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ar",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ar",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/eu",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/eu",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ca",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ca",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/kab",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/kab",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cy",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cy",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cv",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cv",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/hsb",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/hsb",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/lv",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/lv",
"meta_file_train": "dev.tsv",
"meta_file_val": null
}
]
}

View File

@@ -1,957 +0,0 @@
{
"model": "speaker_encoder",
"run_name": "speaker_encoder",
"run_description": "resnet speaker encoder trained with commonvoice all languages dev and train, Voxceleb 1 dev and Voxceleb 2 dev",
// AUDIO PARAMETERS
"audio":{
// Audio processing parameters
"num_mels": 80, // size of the mel spec frame.
"fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame.
"sample_rate": 16000, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled.
"win_length": 1024, // stft window length in ms.
"hop_length": 256, // stft window hop-lengh in ms.
"frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used.
"frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used.
"preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis.
"min_level_db": -100, // normalization range
"ref_level_db": 20, // reference level db, theoretically 20db is the sound of air.
"power": 1.5, // value to sharpen wav signals after GL algorithm.
"griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation.
"stft_pad_mode": "reflect",
// Normalization parameters
"signal_norm": true, // normalize the spec values in range [0, 1]
"symmetric_norm": true, // move normalization to range [-1, 1]
"max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
"clip_norm": true, // clip normalized values into the range.
"mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
"mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!!
"spec_gain": 20.0,
"do_trim_silence": false, // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
"trim_db": 60, // threshold for timming silence. Set this according to your dataset.
"stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored
},
"reinit_layers": [],
"loss": "softmaxproto", // "ge2e" to use Generalized End-to-End loss, "angleproto" to use Angular Prototypical loss and "softmaxproto" to use Softmax with Angular Prototypical loss
"grad_clip": 3.0, // upper limit for gradients for clipping.
"max_train_step": 1000000, // total number of steps to train.
"lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate.
"lr_decay": false, // if true, Noam learning rate decaying is applied through training.
"warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr"
"tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
"steps_plot_stats": 100, // number of steps to plot embeddings.
// Speakers config
"num_speakers_in_batch": 200, // Batch size for training.
"num_utters_per_speaker": 2, //
"skip_speakers": true, // skip speakers with samples less than "num_utters_per_speaker"
"voice_len": 2, // number of seconds for each training instance
"num_loader_workers": 8, // number of training data loader processes. Don't set it too big. 4-8 are good values.
"wd": 0.000001, // Weight decay weight.
"checkpoint": true, // If true, it saves checkpoints per "save_step"
"save_step": 1000, // Number of training steps expected to save the best checkpoints in training.
"print_step": 50, // Number of steps to log traning on console.
"output_path": "../../../checkpoints/speaker_encoder/resnet_voxceleb1_and_voxceleb2-and-common-voice-all/", // DATASET-RELATED: output path for all training outputs.
"audio_augmentation": {
"p": 0.5, // propability of apply this method, 0 is disable rir and additive noise augmentation
"rir":{
"rir_path": "/workspace/store/ecasanova/ComParE/RIRS_NOISES/simulated_rirs/",
"conv_mode": "full"
},
"additive":{
"sounds_path": "/workspace/store/ecasanova/ComParE/musan/",
// list each of the directories used for data augmentation; a directory inside "sounds_path" that is not listed here will be ignored
"speech":{
"min_snr_in_db": 13,
"max_snr_in_db": 20,
"min_num_noises": 2,
"max_num_noises": 3
},
"noise":{
"min_snr_in_db": 0,
"max_snr_in_db": 15,
"min_num_noises": 1,
"max_num_noises": 1
},
"music":{
"min_snr_in_db": 5,
"max_snr_in_db": 15,
"min_num_noises": 1,
"max_num_noises": 1
}
},
// add Gaussian noise to the data to increase robustness
"gaussian":{ // since Gaussian noise is cheap to compute, it is added after loading the wav file; this way, even audio reused from the cache can receive this noise
"p": 0.5, // probability of applying this method; 0 disables it
"min_amplitude": 0.0,
"max_amplitude": 1e-5
}
},
"model_params": {
"model_name": "resnet",
"input_dim": 80,
"proj_dim": 512
},
"storage": {
"sample_from_storage_p": 0.66, // the probability with which we'll sample from the DataSet in-memory storage
"storage_size": 35 // the size of the in-memory storage with respect to a single batch
},
"datasets":
[
{
"name": "voxceleb2",
"path": "/workspace/scratch/ecasanova/datasets/VoxCeleb/vox2_dev_aac/",
"meta_file_train": null,
"meta_file_val": null
},
{
"name": "voxceleb1",
"path": "/workspace/scratch/ecasanova/datasets/VoxCeleb/vox1_dev_wav/",
"meta_file_train": null,
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fi",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fi",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/zh-CN",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/zh-CN",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/rm-sursilv",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/rm-sursilv",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/lt",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/lt",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ka",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ka",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/sv-SE",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/sv-SE",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/pl",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/pl",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ru",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ru",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/mn",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/mn",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/nl",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/nl",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/sl",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/sl",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/es",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/es",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/pt",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/pt",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/hi",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/hi",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ja",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ja",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ia",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ia",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/br",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/br",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/id",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/id",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/dv",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/dv",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ta",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ta",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/or",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/or",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/zh-HK",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/zh-HK",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/de",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/de",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/uk",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/uk",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/en",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/en",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fa",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fa",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/vi",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/vi",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ab",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ab",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/sah",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/sah",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/vot",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/vot",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fr",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fr",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/tr",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/tr",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/lg",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/lg",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/mt",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/mt",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/rw",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/rw",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/hu",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/hu",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/rm-vallader",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/rm-vallader",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/el",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/el",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/tt",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/tt",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/zh-TW",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/zh-TW",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/et",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/et",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fy-NL",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fy-NL",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cs",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cs",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/as",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/as",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ro",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ro",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/eo",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/eo",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/pa-IN",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/pa-IN",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/th",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/th",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/it",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/it",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ga-IE",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ga-IE",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cnh",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cnh",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ky",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ky",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ar",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ar",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/eu",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/eu",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ca",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ca",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/kab",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/kab",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cy",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cy",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cv",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cv",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/hsb",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/hsb",
"meta_file_train": "dev.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/lv",
"meta_file_train": "train.tsv",
"meta_file_val": null
},
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/lv",
"meta_file_train": "dev.tsv",
"meta_file_val": null
}
]
}

View File

@@ -1,253 +0,0 @@
import random
import numpy as np
import torch
from torch.utils.data import Dataset
from TTS.speaker_encoder.utils.generic_utils import AugmentWAV, Storage
class SpeakerEncoderDataset(Dataset):
def __init__(
self,
ap,
meta_data,
voice_len=1.6,
num_speakers_in_batch=64,
storage_size=1,
sample_from_storage_p=0.5,
num_utter_per_speaker=10,
skip_speakers=False,
verbose=False,
augmentation_config=None,
):
"""
Args:
ap (TTS.tts.utils.AudioProcessor): audio processor object.
meta_data (list): list of dataset instances.
voice_len (float): voice segment length in seconds.
verbose (bool): print diagnostic information.
"""
super().__init__()
self.items = meta_data
self.sample_rate = ap.sample_rate
self.seq_len = int(voice_len * self.sample_rate)
self.num_speakers_in_batch = num_speakers_in_batch
self.num_utter_per_speaker = num_utter_per_speaker
self.skip_speakers = skip_speakers
self.ap = ap
self.verbose = verbose
self.__parse_items()
storage_max_size = storage_size * num_speakers_in_batch
self.storage = Storage(
maxsize=storage_max_size, storage_batchs=storage_size, num_speakers_in_batch=num_speakers_in_batch
)
self.sample_from_storage_p = float(sample_from_storage_p)
speakers_aux = list(self.speakers)
speakers_aux.sort()
self.speakerid_to_classid = {key: i for i, key in enumerate(speakers_aux)}
# Augmentation
self.augmentator = None
self.gaussian_augmentation_config = None
if augmentation_config:
self.data_augmentation_p = augmentation_config["p"]
if self.data_augmentation_p and ("additive" in augmentation_config or "rir" in augmentation_config):
self.augmentator = AugmentWAV(ap, augmentation_config)
if "gaussian" in augmentation_config.keys():
self.gaussian_augmentation_config = augmentation_config["gaussian"]
if self.verbose:
print("\n > DataLoader initialization")
print(f" | > Speakers per Batch: {num_speakers_in_batch}")
print(f" | > Storage Size: {storage_max_size} instances, each with {num_utter_per_speaker} utters")
print(f" | > Sample_from_storage_p : {self.sample_from_storage_p}")
print(f" | > Number of instances : {len(self.items)}")
print(f" | > Sequence length: {self.seq_len}")
print(f" | > Num speakers: {len(self.speakers)}")
def load_wav(self, filename):
audio = self.ap.load_wav(filename, sr=self.ap.sample_rate)
return audio
def load_data(self, idx):
item = self.items[idx]
wav = np.asarray(self.load_wav(item["audio_file"]), dtype=np.float32)
mel = self.ap.melspectrogram(wav).astype("float32")
# sanity checks on the loaded item
assert len(item["text"]) > 0, item["audio_file"]
assert wav.size > 0, item["audio_file"]
sample = {
"mel": mel,
"item_idx": item["audio_file"],
"speaker_name": item["speaker_name"],
}
return sample
def __parse_items(self):
self.speaker_to_utters = {}
for i in self.items:
path_ = i["audio_file"]
speaker_ = i["speaker_name"]
if speaker_ in self.speaker_to_utters.keys():
self.speaker_to_utters[speaker_].append(path_)
else:
self.speaker_to_utters[speaker_] = [
path_,
]
if self.skip_speakers:
self.speaker_to_utters = {
k: v for (k, v) in self.speaker_to_utters.items() if len(v) >= self.num_utter_per_speaker
}
self.speakers = [k for (k, v) in self.speaker_to_utters.items()]
def __len__(self):
return int(1e10)
def get_num_speakers(self):
return len(self.speakers)
def __sample_speaker(self, ignore_speakers=None):
speaker = random.sample(self.speakers, 1)[0]
# if a list of speaker ids is provided, make sure they are ignored
if ignore_speakers and self.speakerid_to_classid[speaker] in ignore_speakers:
while True:
speaker = random.sample(self.speakers, 1)[0]
if self.speakerid_to_classid[speaker] not in ignore_speakers:
break
if self.num_utter_per_speaker > len(self.speaker_to_utters[speaker]):
utters = random.choices(self.speaker_to_utters[speaker], k=self.num_utter_per_speaker)
else:
utters = random.sample(self.speaker_to_utters[speaker], self.num_utter_per_speaker)
return speaker, utters
def __sample_speaker_utterances(self, speaker):
"""
Sample all M utterances for the given speaker.
"""
wavs = []
labels = []
for _ in range(self.num_utter_per_speaker):
# TODO:dummy but works
while True:
# remove speakers that have num_utter less than 2
if len(self.speaker_to_utters[speaker]) > 1:
utter = random.sample(self.speaker_to_utters[speaker], 1)[0]
else:
if speaker in self.speakers:
self.speakers.remove(speaker)
speaker, _ = self.__sample_speaker()
continue
wav = self.load_wav(utter)
if wav.shape[0] - self.seq_len > 0:
break
if utter in self.speaker_to_utters[speaker]:
self.speaker_to_utters[speaker].remove(utter)
if self.augmentator is not None and self.data_augmentation_p:
if random.random() < self.data_augmentation_p:
wav = self.augmentator.apply_one(wav)
wavs.append(wav)
labels.append(self.speakerid_to_classid[speaker])
return wavs, labels
def __getitem__(self, idx):
speaker, _ = self.__sample_speaker()
speaker_id = self.speakerid_to_classid[speaker]
return speaker, speaker_id
def __load_from_disk_and_storage(self, speaker):
# don't sample from storage, but from HDD
wavs_, labels_ = self.__sample_speaker_utterances(speaker)
# put the newly loaded item into storage
self.storage.append((wavs_, labels_))
return wavs_, labels_
def collate_fn(self, batch):
# get the batch speaker_ids
batch = np.array(batch)
speakers_id_in_batch = set(batch[:, 1].astype(np.int32))
labels = []
feats = []
speakers = set()
for speaker, speaker_id in batch:
speaker_id = int(speaker_id)
# ensure that a speaker appears only once in the batch
if speaker_id in speakers:
# remove current speaker
if speaker_id in speakers_id_in_batch:
speakers_id_in_batch.remove(speaker_id)
speaker, _ = self.__sample_speaker(ignore_speakers=speakers_id_in_batch)
speaker_id = self.speakerid_to_classid[speaker]
speakers_id_in_batch.add(speaker_id)
if random.random() < self.sample_from_storage_p and self.storage.full():
# sample from storage (if full)
wavs_, labels_ = self.storage.get_random_sample_fast()
# force choosing the current speaker or another speaker not in the batch
# It's necessary for ideal training with AngleProto and GE2E losses
if labels_[0] in speakers_id_in_batch and labels_[0] != speaker_id:
attempts = 0
while True:
wavs_, labels_ = self.storage.get_random_sample_fast()
if labels_[0] == speaker_id or labels_[0] not in speakers_id_in_batch:
break
attempts += 1
# try 5 times; after that, load from disk
if attempts >= 5:
wavs_, labels_ = self.__load_from_disk_and_storage(speaker)
break
else:
# don't sample from storage, but from HDD
wavs_, labels_ = self.__load_from_disk_and_storage(speaker)
# append speaker for control
speakers.add(labels_[0])
# remove current speaker and append other
if speaker_id in speakers_id_in_batch:
speakers_id_in_batch.remove(speaker_id)
speakers_id_in_batch.add(labels_[0])
# get a random subset of each of the wavs and extract mel spectrograms.
feats_ = []
for wav in wavs_:
offset = random.randint(0, wav.shape[0] - self.seq_len)
wav = wav[offset : offset + self.seq_len]
# add random gaussian noise
if self.gaussian_augmentation_config and self.gaussian_augmentation_config["p"]:
if random.random() < self.gaussian_augmentation_config["p"]:
wav += np.random.normal(
self.gaussian_augmentation_config["min_amplitude"],
self.gaussian_augmentation_config["max_amplitude"],
size=len(wav),
)
mel = self.ap.melspectrogram(wav)
feats_.append(torch.FloatTensor(mel))
labels.append(torch.LongTensor(labels_))
feats.extend(feats_)
feats = torch.stack(feats)
labels = torch.stack(labels)
return feats, labels

View File

@@ -1,189 +0,0 @@
import numpy as np
import torch
import torchaudio
from torch import nn
from TTS.speaker_encoder.models.resnet import PreEmphasis
from TTS.utils.io import load_fsspec
class LSTMWithProjection(nn.Module):
def __init__(self, input_size, hidden_size, proj_size):
super().__init__()
self.input_size = input_size
self.hidden_size = hidden_size
self.proj_size = proj_size
self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
self.linear = nn.Linear(hidden_size, proj_size, bias=False)
def forward(self, x):
self.lstm.flatten_parameters()
o, (_, _) = self.lstm(x)
return self.linear(o)
class LSTMWithoutProjection(nn.Module):
def __init__(self, input_dim, lstm_dim, proj_dim, num_lstm_layers):
super().__init__()
self.lstm = nn.LSTM(input_size=input_dim, hidden_size=lstm_dim, num_layers=num_lstm_layers, batch_first=True)
self.linear = nn.Linear(lstm_dim, proj_dim, bias=True)
self.relu = nn.ReLU()
def forward(self, x):
_, (hidden, _) = self.lstm(x)
return self.relu(self.linear(hidden[-1]))
class LSTMSpeakerEncoder(nn.Module):
def __init__(
self,
input_dim,
proj_dim=256,
lstm_dim=768,
num_lstm_layers=3,
use_lstm_with_projection=True,
use_torch_spec=False,
audio_config=None,
):
super().__init__()
self.use_lstm_with_projection = use_lstm_with_projection
self.use_torch_spec = use_torch_spec
self.audio_config = audio_config
self.proj_dim = proj_dim
layers = []
# choose the LSTM layer type
if use_lstm_with_projection:
layers.append(LSTMWithProjection(input_dim, lstm_dim, proj_dim))
for _ in range(num_lstm_layers - 1):
layers.append(LSTMWithProjection(proj_dim, lstm_dim, proj_dim))
self.layers = nn.Sequential(*layers)
else:
self.layers = LSTMWithoutProjection(input_dim, lstm_dim, proj_dim, num_lstm_layers)
self.instancenorm = nn.InstanceNorm1d(input_dim)
if self.use_torch_spec:
self.torch_spec = torch.nn.Sequential(
PreEmphasis(audio_config["preemphasis"]),
# TorchSTFT(
# n_fft=audio_config["fft_size"],
# hop_length=audio_config["hop_length"],
# win_length=audio_config["win_length"],
# sample_rate=audio_config["sample_rate"],
# window="hamming_window",
# mel_fmin=0.0,
# mel_fmax=None,
# use_htk=True,
# do_amp_to_db=False,
# n_mels=audio_config["num_mels"],
# power=2.0,
# use_mel=True,
# mel_norm=None,
# )
torchaudio.transforms.MelSpectrogram(
sample_rate=audio_config["sample_rate"],
n_fft=audio_config["fft_size"],
win_length=audio_config["win_length"],
hop_length=audio_config["hop_length"],
window_fn=torch.hamming_window,
n_mels=audio_config["num_mels"],
),
)
else:
self.torch_spec = None
self._init_layers()
def _init_layers(self):
for name, param in self.layers.named_parameters():
if "bias" in name:
nn.init.constant_(param, 0.0)
elif "weight" in name:
nn.init.xavier_normal_(param)
def forward(self, x, l2_norm=True):
"""Forward pass of the model.
Args:
x (Tensor): Raw waveform signal or spectrogram frames. If the input is a waveform, `use_torch_spec` must be `True`
so that the spectrogram is computed on the fly.
l2_norm (bool): Whether to L2-normalize the outputs.
Shapes:
- x: :math:`(N, 1, T_{in})` or :math:`(N, D_{spec}, T_{in})`
"""
with torch.no_grad():
with torch.cuda.amp.autocast(enabled=False):
if self.use_torch_spec:
x.squeeze_(1)
x = self.torch_spec(x)
x = self.instancenorm(x).transpose(1, 2)
d = self.layers(x)
if self.use_lstm_with_projection:
d = d[:, -1]
if l2_norm:
d = torch.nn.functional.normalize(d, p=2, dim=1)
return d
@torch.no_grad()
def inference(self, x, l2_norm=True):
d = self.forward(x, l2_norm=l2_norm)
return d
def compute_embedding(self, x, num_frames=250, num_eval=10, return_mean=True):
"""
Generate embeddings for a batch of utterances
x: 1xTxD
"""
max_len = x.shape[1]
if max_len < num_frames:
num_frames = max_len
offsets = np.linspace(0, max_len - num_frames, num=num_eval)
frames_batch = []
for offset in offsets:
offset = int(offset)
end_offset = int(offset + num_frames)
frames = x[:, offset:end_offset]
frames_batch.append(frames)
frames_batch = torch.cat(frames_batch, dim=0)
embeddings = self.inference(frames_batch)
if return_mean:
embeddings = torch.mean(embeddings, dim=0, keepdim=True)
return embeddings
def batch_compute_embedding(self, x, seq_lens, num_frames=160, overlap=0.5):
"""
Generate embeddings for a batch of utterances
x: BxTxD
"""
num_overlap = int(num_frames * overlap)  # keep it an integer so the range() step below is valid
max_len = x.shape[1]
embed = None
num_iters = seq_lens / (num_frames - num_overlap)
cur_iter = 0
for offset in range(0, max_len, num_frames - num_overlap):
cur_iter += 1
end_offset = min(x.shape[1], offset + num_frames)
frames = x[:, offset:end_offset]
if embed is None:
embed = self.inference(frames)
else:
embed[cur_iter <= num_iters, :] += self.inference(frames[cur_iter <= num_iters, :, :])
return embed / num_iters
# pylint: disable=unused-argument, redefined-builtin
def load_checkpoint(self, config: dict, checkpoint_path: str, eval: bool = False, use_cuda: bool = False):
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"))
self.load_state_dict(state["model"])
if use_cuda:
self.cuda()
if eval:
self.eval()
assert not self.training

Binary file not shown.


View File

@@ -220,6 +220,18 @@ class BaseTTSConfig(BaseTrainingConfig):
eval_split_size (float):
If between 0.0 and 1.0 represents the proportion of the dataset to include in the evaluation set.
If > 1, represents the absolute number of evaluation samples. Defaults to 0.01 (1%).
use_speaker_weighted_sampler (bool):
Enable / Disable the batch balancer by speaker. Defaults to ```False```.
speaker_weighted_sampler_alpha (float):
Number that controls the influence of the speaker sampler weights. Defaults to ```1.0```.
use_language_weighted_sampler (bool):
Enable / Disable the batch balancer by language. Defaults to ```False```.
language_weighted_sampler_alpha (float):
Number that controls the influence of the language sampler weights. Defaults to ```1.0```.
"""
audio: BaseAudioConfig = field(default_factory=BaseAudioConfig)
@@ -252,7 +264,7 @@ class BaseTTSConfig(BaseTrainingConfig):
# dataset
datasets: List[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()])
# optimizer
optimizer: str = None
optimizer: str = "radam"
optimizer_params: dict = None
# scheduler
lr_scheduler: str = ""
@@ -262,3 +274,8 @@ class BaseTTSConfig(BaseTrainingConfig):
# evaluation
eval_split_max_size: int = None
eval_split_size: float = 0.01
# weighted samplers
use_speaker_weighted_sampler: bool = False
speaker_weighted_sampler_alpha: float = 1.0
use_language_weighted_sampler: bool = False
language_weighted_sampler_alpha: float = 1.0
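For reference, the new sampler flags can be set on any config that inherits from `BaseTTSConfig`. A minimal sketch, assuming a `VitsConfig`-style config class and purely illustrative alpha values:

```python
from TTS.tts.configs.vits_config import VitsConfig

# balance batches across speakers and languages; each alpha scales how strongly
# that balancer's weights influence the WeightedRandomSampler
config = VitsConfig(
    use_speaker_weighted_sampler=True,
    speaker_weighted_sampler_alpha=1.0,
    use_language_weighted_sampler=True,
    language_weighted_sampler_alpha=2.0,
)
```

When both flags are enabled, `BaseTTS.get_sampler` (see the hunk further below) sums the two weight vectors, so either balancer can be emphasized through its alpha.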

View File

@@ -246,7 +246,7 @@ def libri_tts(root_path, meta_files=None, ignored_speakers=None):
continue
items.append({"text": text, "audio_file": wav_file, "speaker_name": f"LTTS_{speaker_name}"})
for item in items:
assert os.path.exists(item[1]), f" [!] wav files don't exist - {item[1]}"
assert os.path.exists(item["audio_file"]), f" [!] wav files don't exist - {item['audio_file']}"
return items
@@ -328,27 +328,49 @@ def vctk(root_path, meta_files=None, wavs_path="wav48_silence_trimmed", mic="mic
else:
wav_file = os.path.join(root_path, wavs_path, speaker_id, file_id + f"_{mic}.{file_ext}")
if os.path.exists(wav_file):
items.append([text, wav_file, "VCTK_" + speaker_id])
items.append({"text": text, "audio_file": wav_file, "speaker_name": "VCTK_" + speaker_id})
else:
print(f" [!] wav files don't exist - {wav_file}")
return items
def vctk_old(root_path, meta_files=None, wavs_path="wav48"):
def vctk_old(root_path, meta_files=None, wavs_path="wav48", ignored_speakers=None):
"""homepages.inf.ed.ac.uk/jyamagis/release/VCTK-Corpus.tar.gz"""
test_speakers = meta_files
items = []
meta_files = glob(f"{os.path.join(root_path,'txt')}/**/*.txt", recursive=True)
for meta_file in meta_files:
_, speaker_id, txt_file = os.path.relpath(meta_file, root_path).split(os.sep)
file_id = txt_file.split(".")[0]
if isinstance(test_speakers, list): # if is list ignore this speakers ids
if speaker_id in test_speakers:
# ignore speakers
if isinstance(ignored_speakers, list):
if speaker_id in ignored_speakers:
continue
with open(meta_file, "r", encoding="utf-8") as file_text:
text = file_text.readlines()[0]
wav_file = os.path.join(root_path, wavs_path, speaker_id, file_id + ".wav")
items.append([text, wav_file, "VCTK_old_" + speaker_id])
items.append({"text": text, "audio_file": wav_file, "speaker_name": "VCTK_old_" + speaker_id})
return items
def open_bible(root_path, meta_files="train", ignore_digits_sentences=True, ignored_speakers=None):
"""ToDo: Refer the paper when available"""
items = []
split_dir = meta_files
meta_files = glob(f"{os.path.join(root_path, split_dir)}/**/*.txt", recursive=True)
for meta_file in meta_files:
_, speaker_id, txt_file = os.path.relpath(meta_file, root_path).split(os.sep)
file_id = txt_file.split(".")[0]
# ignore speakers
if isinstance(ignored_speakers, list):
if speaker_id in ignored_speakers:
continue
with open(meta_file, "r", encoding="utf-8") as file_text:
text = file_text.readline().replace("\n", "")
# ignore sentences that contain digits
if ignore_digits_sentences and any(map(str.isdigit, text)):
continue
wav_file = os.path.join(root_path, split_dir, speaker_id, file_id + ".flac")
items.append({"text": text, "audio_file": wav_file, "speaker_name": "OB_" + speaker_id})
return items
@@ -419,6 +441,26 @@ def _voxcel_x(root_path, meta_file, voxcel_idx):
return [x.strip().split("|") for x in f.readlines()]
def emotion(root_path, meta_file, ignored_speakers=None):
"""Generic emotion dataset"""
txt_file = os.path.join(root_path, meta_file)
items = []
with open(txt_file, "r", encoding="utf-8") as ttf:
for line in ttf:
if line.startswith("file_path"):
continue
cols = line.split(",")
wav_file = os.path.join(root_path, cols[0])
speaker_id = cols[1]
emotion_id = cols[2].replace("\n", "")
# ignore speakers
if isinstance(ignored_speakers, list):
if speaker_id in ignored_speakers:
continue
items.append({"audio_file": wav_file, "speaker_name": speaker_id, "emotion_name": emotion_id})
return items
def baker(root_path: str, meta_file: str, **kwargs) -> List[List[str]]: # pylint: disable=unused-argument
"""Normalizes the Baker meta data file to TTS format

View File

@@ -7,12 +7,13 @@ import torch.distributed as dist
from coqpit import Coqpit
from torch import nn
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler
from torch.utils.data.sampler import WeightedRandomSampler
from trainer.torch import DistributedSampler, DistributedSamplerWrapper
from TTS.model import BaseTrainerModel
from TTS.tts.datasets.dataset import TTSDataset
from TTS.tts.utils.languages import LanguageManager, get_language_weighted_sampler
from TTS.tts.utils.speakers import SpeakerManager, get_speaker_weighted_sampler
from TTS.tts.utils.languages import LanguageManager, get_language_balancer_weights
from TTS.tts.utils.speakers import SpeakerManager, get_speaker_balancer_weights, get_speaker_manager
from TTS.tts.utils.synthesis import synthesis
from TTS.tts.utils.visual import plot_alignment, plot_spectrogram
@@ -135,18 +136,18 @@ class BaseTTS(BaseTrainerModel):
if hasattr(self, "speaker_manager"):
if config.use_d_vector_file:
if speaker_name is None:
d_vector = self.speaker_manager.get_random_d_vector()
d_vector = self.speaker_manager.get_random_embeddings()
else:
d_vector = self.speaker_manager.get_d_vector_by_speaker(speaker_name)
d_vector = self.speaker_manager.get_d_vector_by_name(speaker_name)
elif config.use_speaker_embedding:
if speaker_name is None:
speaker_id = self.speaker_manager.get_random_speaker_id()
speaker_id = self.speaker_manager.get_random_id()
else:
speaker_id = self.speaker_manager.speaker_ids[speaker_name]
speaker_id = self.speaker_manager.ids[speaker_name]
# get language id
if hasattr(self, "language_manager") and config.use_language_embedding and language_name is not None:
language_id = self.language_manager.language_id_mapping[language_name]
language_id = self.language_manager.ids[language_name]
return {
"text": text,
@@ -232,6 +233,36 @@ class BaseTTS(BaseTrainerModel):
"language_ids": language_ids,
}
def get_sampler(self, config: Coqpit, dataset: TTSDataset, num_gpus=1):
weights = None
data_items = dataset.samples
if getattr(config, "use_language_weighted_sampler", False):
alpha = getattr(config, "language_weighted_sampler_alpha", 1.0)
print(" > Using Language weighted sampler with alpha:", alpha)
weights = get_language_balancer_weights(data_items) * alpha
if getattr(config, "use_speaker_weighted_sampler", False):
alpha = getattr(config, "speaker_weighted_sampler_alpha", 1.0)
print(" > Using Speaker weighted sampler with alpha:", alpha)
if weights is not None:
weights += get_speaker_balancer_weights(data_items) * alpha
else:
weights = get_speaker_balancer_weights(data_items) * alpha
if weights is not None:
sampler = WeightedRandomSampler(weights, len(weights))
else:
sampler = None
# sampler for DDP
if sampler is None:
sampler = DistributedSampler(dataset) if num_gpus > 1 else None
else: # If a sampler is already defined use this sampler and DDP sampler together
sampler = DistributedSamplerWrapper(sampler) if num_gpus > 1 else sampler
return sampler
def get_data_loader(
self,
config: Coqpit,
@@ -248,23 +279,19 @@
# setup multi-speaker attributes
if hasattr(self, "speaker_manager") and self.speaker_manager is not None:
if hasattr(config, "model_args"):
speaker_id_mapping = (
self.speaker_manager.speaker_ids if config.model_args.use_speaker_embedding else None
)
d_vector_mapping = self.speaker_manager.d_vectors if config.model_args.use_d_vector_file else None
speaker_id_mapping = self.speaker_manager.ids if config.model_args.use_speaker_embedding else None
d_vector_mapping = self.speaker_manager.embeddings if config.model_args.use_d_vector_file else None
config.use_d_vector_file = config.model_args.use_d_vector_file
else:
speaker_id_mapping = self.speaker_manager.speaker_ids if config.use_speaker_embedding else None
d_vector_mapping = self.speaker_manager.d_vectors if config.use_d_vector_file else None
speaker_id_mapping = self.speaker_manager.ids if config.use_speaker_embedding else None
d_vector_mapping = self.speaker_manager.embeddings if config.use_d_vector_file else None
else:
speaker_id_mapping = None
d_vector_mapping = None
# setup multi-lingual attributes
if hasattr(self, "language_manager") and self.language_manager is not None:
language_id_mapping = (
self.language_manager.language_id_mapping if self.args.use_language_embedding else None
)
language_id_mapping = self.language_manager.ids if self.args.use_language_embedding else None
else:
language_id_mapping = None
@ -300,25 +327,8 @@ class BaseTTS(BaseTrainerModel):
# sort input sequences from short to long
dataset.preprocess_samples()
# sampler for DDP
sampler = DistributedSampler(dataset) if num_gpus > 1 else None
# Weighted samplers
# TODO: make this DDP amenable
assert not (
num_gpus > 1 and getattr(config, "use_language_weighted_sampler", False)
), "language_weighted_sampler is not supported with DistributedSampler"
assert not (
num_gpus > 1 and getattr(config, "use_speaker_weighted_sampler", False)
), "speaker_weighted_sampler is not supported with DistributedSampler"
if sampler is None:
if getattr(config, "use_language_weighted_sampler", False):
print(" > Using Language weighted sampler")
sampler = get_language_weighted_sampler(dataset.samples)
elif getattr(config, "use_speaker_weighted_sampler", False):
print(" > Using Language weighted sampler")
sampler = get_speaker_weighted_sampler(dataset.samples)
# get samplers
sampler = self.get_sampler(config, dataset, num_gpus)
loader = DataLoader(
dataset,
@ -338,13 +348,13 @@ class BaseTTS(BaseTrainerModel):
d_vector = None
if self.config.use_d_vector_file:
d_vector = [self.speaker_manager.d_vectors[name]["embedding"] for name in self.speaker_manager.d_vectors]
d_vector = [self.speaker_manager.embeddings[name]["embedding"] for name in self.speaker_manager.embeddings]
d_vector = (random.sample(sorted(d_vector), 1),)
aux_inputs = {
"speaker_id": None
if not self.config.use_speaker_embedding
else random.sample(sorted(self.speaker_manager.speaker_ids.values()), 1),
else random.sample(sorted(self.speaker_manager.ids.values()), 1),
"d_vector": d_vector,
"style_wav": None, # TODO: handle GST style input
}
@ -391,7 +401,7 @@ class BaseTTS(BaseTrainerModel):
"""Save the speaker.json and language_ids.json at the beginning of the training. Also update both paths."""
if self.speaker_manager is not None:
output_path = os.path.join(trainer.output_path, "speakers.json")
self.speaker_manager.save_speaker_ids_to_file(output_path)
self.speaker_manager.save_ids_to_file(output_path)
trainer.config.speakers_file = output_path
# some models don't have `model_args` set
if hasattr(trainer.config, "model_args"):
@ -402,7 +412,7 @@ class BaseTTS(BaseTrainerModel):
if hasattr(self, "language_manager") and self.language_manager is not None:
output_path = os.path.join(trainer.output_path, "language_ids.json")
self.language_manager.save_language_ids_to_file(output_path)
self.language_manager.save_ids_to_file(output_path)
trainer.config.language_ids_file = output_path
if hasattr(trainer.config, "model_args"):
trainer.config.model_args.language_ids_file = output_path

View File

@ -124,7 +124,7 @@ class GlowTTS(BaseTTS):
)
if self.speaker_manager is not None:
assert (
config.d_vector_dim == self.speaker_manager.d_vector_dim
config.d_vector_dim == self.speaker_manager.embedding_dim
), " [!] d-vector dimension mismatch b/w config and speaker manager."
# init speaker embedding layer
if config.use_speaker_embedding and not config.use_d_vector_file:

View File

@ -13,7 +13,6 @@ from torch import nn
from torch.cuda.amp.autocast_mode import autocast
from torch.nn import functional as F
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler
from trainer.trainer_utils import get_optimizer, get_scheduler
from TTS.tts.configs.shared_configs import CharactersConfig
@ -24,8 +23,8 @@ from TTS.tts.layers.vits.networks import PosteriorEncoder, ResidualCouplingBlock
from TTS.tts.layers.vits.stochastic_duration_predictor import StochasticDurationPredictor
from TTS.tts.models.base_tts import BaseTTS
from TTS.tts.utils.helpers import generate_path, maximum_path, rand_segments, segment, sequence_mask
from TTS.tts.utils.languages import LanguageManager, get_language_weighted_sampler
from TTS.tts.utils.speakers import SpeakerManager, get_speaker_weighted_sampler
from TTS.tts.utils.languages import LanguageManager
from TTS.tts.utils.speakers import SpeakerManager
from TTS.tts.utils.synthesis import synthesis
from TTS.tts.utils.text.characters import BaseCharacters, _characters, _pad, _phonemes, _punctuations
from TTS.tts.utils.text.tokenizer import TTSTokenizer
@ -653,28 +652,28 @@ class Vits(BaseTTS):
# TODO: make this a function
if self.args.use_speaker_encoder_as_loss:
if self.speaker_manager.speaker_encoder is None and (
if self.speaker_manager.encoder is None and (
not self.args.speaker_encoder_model_path or not self.args.speaker_encoder_config_path
):
raise RuntimeError(
" [!] To use the speaker consistency loss (SCL) you need to specify speaker_encoder_model_path and speaker_encoder_config_path !!"
)
self.speaker_manager.speaker_encoder.eval()
self.speaker_manager.encoder.eval()
print(" > External Speaker Encoder Loaded !!")
if (
hasattr(self.speaker_manager.speaker_encoder, "audio_config")
and self.config.audio["sample_rate"] != self.speaker_manager.speaker_encoder.audio_config["sample_rate"]
hasattr(self.speaker_manager.encoder, "audio_config")
and self.config.audio["sample_rate"] != self.speaker_manager.encoder.audio_config["sample_rate"]
):
self.audio_transform = torchaudio.transforms.Resample(
orig_freq=self.audio_config["sample_rate"],
new_freq=self.speaker_manager.speaker_encoder.audio_config["sample_rate"],
new_freq=self.speaker_manager.encoder.audio_config["sample_rate"],
)
# pylint: disable=W0101,W0105
self.audio_transform = torchaudio.transforms.Resample(
orig_freq=self.config.audio.sample_rate,
new_freq=self.speaker_manager.speaker_encoder.audio_config["sample_rate"],
new_freq=self.speaker_manager.encoder.audio_config["sample_rate"],
)
def _init_speaker_embedding(self):
@ -707,7 +706,6 @@ class Vits(BaseTTS):
torch.nn.init.xavier_uniform_(self.emb_l.weight)
else:
self.embedded_language_dim = 0
self.emb_l = None
def get_aux_input(self, aux_input: Dict):
sid, g, lid = self._set_cond_input(aux_input)
@ -889,7 +887,7 @@ class Vits(BaseTTS):
pad_short=True,
)
if self.args.use_speaker_encoder_as_loss and self.speaker_manager.speaker_encoder is not None:
if self.args.use_speaker_encoder_as_loss and self.speaker_manager.encoder is not None:
# concate generated and GT waveforms
wavs_batch = torch.cat((wav_seg, o), dim=0)
@ -898,7 +896,7 @@ class Vits(BaseTTS):
if self.audio_transform is not None:
wavs_batch = self.audio_transform(wavs_batch)
pred_embs = self.speaker_manager.speaker_encoder.forward(wavs_batch, l2_norm=True)
pred_embs = self.speaker_manager.encoder.forward(wavs_batch, l2_norm=True)
# split generated and GT speaker embeddings
gt_spk_emb, syn_spk_emb = torch.chunk(pred_embs, 2, dim=0)
@ -996,6 +994,34 @@ class Vits(BaseTTS):
outputs = {"model_outputs": o, "alignments": attn.squeeze(1), "z": z, "z_p": z_p, "m_p": m_p, "logs_p": logs_p}
return outputs
@torch.no_grad()
def inference_voice_conversion(
self, reference_wav, speaker_id=None, d_vector=None, reference_speaker_id=None, reference_d_vector=None
):
"""Inference for voice conversion
Args:
reference_wav (Tensor): Reference waveform. Tensor of shape [B, T]
speaker_id (Tensor): speaker_id of the target speaker. Tensor of shape [B]
d_vector (Tensor): d_vector embedding of target speaker. Tensor of shape `[B, C]`
reference_speaker_id (Tensor): speaker_id of the reference_wav speaker. Tensor of shape [B]
reference_d_vector (Tensor): d_vector embedding of the reference_wav speaker. Tensor of shape `[B, C]`
"""
# compute spectrograms
y = wav_to_spec(
reference_wav,
self.config.audio.fft_size,
self.config.audio.hop_length,
self.config.audio.win_length,
center=False,
).transpose(1, 2)
y_lengths = torch.tensor([y.size(-1)]).to(y.device)
speaker_cond_src = reference_speaker_id if reference_speaker_id is not None else reference_d_vector
speaker_cond_tgt = speaker_id if speaker_id is not None else d_vector
# print(y.shape, y_lengths.shape)
wav, _, _ = self.voice_conversion(y, y_lengths, speaker_cond_src, speaker_cond_tgt)
return wav
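A rough usage sketch of this method with d-vectors, following the shapes given in the docstring above (`model` is assumed to be a trained multi-speaker Vits, `manager` its speaker manager, and the wav path a placeholder; the `transfer_voice()` helper added to `synthesis.py` further below wraps this call and handles the tensor conversion):

```python
import torch
import torchaudio

# load a mono reference clip as a [1, T] tensor (placeholder path)
ref_wav, _ = torchaudio.load("source_speech.wav")

# [1, C] d-vectors taken from the speaker manager (speaker names are placeholders)
src_dvec = torch.FloatTensor(manager.get_mean_embedding("source_speaker")).unsqueeze(0)
tgt_dvec = torch.FloatTensor(manager.get_mean_embedding("target_speaker")).unsqueeze(0)

# convert the reference clip so it sounds like the target speaker
wav = model.inference_voice_conversion(
    reference_wav=ref_wav,
    d_vector=tgt_dvec,
    reference_d_vector=src_dvec,
)
```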
def voice_conversion(self, y, y_lengths, speaker_cond_src, speaker_cond_tgt):
"""Forward pass for voice conversion
@ -1008,12 +1034,11 @@ class Vits(BaseTTS):
speaker_cond_tgt (Tensor): Target speaker ID. Tensor of shape [B,]
"""
assert self.num_speakers > 0, "num_speakers has to be larger than 0."
# speaker embedding
if self.args.use_speaker_embedding and not self.args.use_d_vector_file:
g_src = self.emb_g(speaker_cond_src).unsqueeze(-1)
g_tgt = self.emb_g(speaker_cond_tgt).unsqueeze(-1)
elif self.args.use_speaker_embedding and self.args.use_d_vector_file:
elif not self.args.use_speaker_embedding and self.args.use_d_vector_file:
g_src = F.normalize(speaker_cond_src).unsqueeze(-1)
g_tgt = F.normalize(speaker_cond_tgt).unsqueeze(-1)
else:
@ -1198,18 +1223,18 @@ class Vits(BaseTTS):
if hasattr(self, "speaker_manager"):
if config.use_d_vector_file:
if speaker_name is None:
d_vector = self.speaker_manager.get_random_d_vector()
d_vector = self.speaker_manager.get_random_embeddings()
else:
d_vector = self.speaker_manager.get_mean_d_vector(speaker_name, num_samples=1, randomize=False)
d_vector = self.speaker_manager.get_mean_embedding(speaker_name, num_samples=None, randomize=False)
elif config.use_speaker_embedding:
if speaker_name is None:
speaker_id = self.speaker_manager.get_random_speaker_id()
speaker_id = self.speaker_manager.get_random_id()
else:
speaker_id = self.speaker_manager.speaker_ids[speaker_name]
speaker_id = self.speaker_manager.ids[speaker_name]
# get language id
if hasattr(self, "language_manager") and config.use_language_embedding and language_name is not None:
language_id = self.language_manager.language_id_mapping[language_name]
language_id = self.language_manager.ids[language_name]
return {
"text": text,
@ -1264,26 +1289,22 @@ class Vits(BaseTTS):
d_vectors = None
# get numerical speaker ids from speaker names
if self.speaker_manager is not None and self.speaker_manager.speaker_ids and self.args.use_speaker_embedding:
speaker_ids = [self.speaker_manager.speaker_ids[sn] for sn in batch["speaker_names"]]
if self.speaker_manager is not None and self.speaker_manager.ids and self.args.use_speaker_embedding:
speaker_ids = [self.speaker_manager.ids[sn] for sn in batch["speaker_names"]]
if speaker_ids is not None:
speaker_ids = torch.LongTensor(speaker_ids)
batch["speaker_ids"] = speaker_ids
# get d_vectors from audio file names
if self.speaker_manager is not None and self.speaker_manager.d_vectors and self.args.use_d_vector_file:
d_vector_mapping = self.speaker_manager.d_vectors
if self.speaker_manager is not None and self.speaker_manager.embeddings and self.args.use_d_vector_file:
d_vector_mapping = self.speaker_manager.embeddings
d_vectors = [d_vector_mapping[w]["embedding"] for w in batch["audio_files"]]
d_vectors = torch.FloatTensor(d_vectors)
# get language ids from language names
if (
self.language_manager is not None
and self.language_manager.language_id_mapping
and self.args.use_language_embedding
):
language_ids = [self.language_manager.language_id_mapping[ln] for ln in batch["language_names"]]
if self.language_manager is not None and self.language_manager.ids and self.args.use_language_embedding:
language_ids = [self.language_manager.ids[ln] for ln in batch["language_names"]]
if language_ids is not None:
language_ids = torch.LongTensor(language_ids)
@ -1354,31 +1375,15 @@ class Vits(BaseTTS):
# sort input sequences from short to long
dataset.preprocess_samples()
# sampler for DDP
sampler = DistributedSampler(dataset) if num_gpus > 1 else None
# Weighted samplers
# TODO: make this DDP amenable
assert not (
num_gpus > 1 and getattr(config, "use_language_weighted_sampler", False)
), "language_weighted_sampler is not supported with DistributedSampler"
assert not (
num_gpus > 1 and getattr(config, "use_speaker_weighted_sampler", False)
), "speaker_weighted_sampler is not supported with DistributedSampler"
if sampler is None:
if getattr(config, "use_language_weighted_sampler", False):
print(" > Using Language weighted sampler")
sampler = get_language_weighted_sampler(dataset.samples)
elif getattr(config, "use_speaker_weighted_sampler", False):
print(" > Using Language weighted sampler")
sampler = get_speaker_weighted_sampler(dataset.samples)
# get samplers
sampler = self.get_sampler(config, dataset, num_gpus)
loader = DataLoader(
dataset,
batch_size=config.eval_batch_size if is_eval else config.batch_size,
shuffle=False, # shuffle is done in the dataset.
drop_last=False, # setting this False might cause issues in AMP training.
sampler=sampler,
collate_fn=dataset.collate_fn,
num_workers=config.num_eval_loader_workers if is_eval else config.num_loader_workers,
pin_memory=False,
@ -1481,7 +1486,7 @@ class Vits(BaseTTS):
language_manager = LanguageManager.init_from_config(config)
if config.model_args.speaker_encoder_model_path:
speaker_manager.init_speaker_encoder(
speaker_manager.init_encoder(
config.model_args.speaker_encoder_model_path, config.model_args.speaker_encoder_config_path
)
return Vits(new_config, ap, tokenizer, speaker_manager, language_manager)

View File

@ -1,17 +1,16 @@
import json
import os
from typing import Dict, List
from typing import Any, Dict, List
import fsspec
import numpy as np
import torch
from coqpit import Coqpit
from torch.utils.data.sampler import WeightedRandomSampler
from TTS.config import check_config_and_model_args
from TTS.tts.utils.managers import BaseIDManager
class LanguageManager:
class LanguageManager(BaseIDManager):
"""Manage the languages for multi-lingual 🐸TTS models. Load a datafile and parse the information
in a way that can be queried by language.
@ -26,37 +25,23 @@ class LanguageManager:
>>> language_id_mapper = manager.language_ids
"""
language_id_mapping: Dict = {}
def __init__(
self,
language_ids_file_path: str = "",
config: Coqpit = None,
):
self.language_id_mapping = {}
if language_ids_file_path:
self.set_language_ids_from_file(language_ids_file_path)
super().__init__(id_file_path=language_ids_file_path)
if config:
self.set_language_ids_from_config(config)
@staticmethod
def _load_json(json_file_path: str) -> Dict:
with fsspec.open(json_file_path, "r") as f:
return json.load(f)
@staticmethod
def _save_json(json_file_path: str, data: dict) -> None:
with fsspec.open(json_file_path, "w") as f:
json.dump(data, f, indent=4)
@property
def num_languages(self) -> int:
return len(list(self.language_id_mapping.keys()))
return len(list(self.ids.keys()))
@property
def language_names(self) -> List:
return list(self.language_id_mapping.keys())
return list(self.ids.keys())
@staticmethod
def parse_language_ids_from_config(c: Coqpit) -> Dict:
@ -80,25 +65,24 @@ class LanguageManager:
"""Set language IDs from config samples.
Args:
items (List): Data sampled returned by `load_meta_data()`.
c (Coqpit): Config.
"""
self.language_id_mapping = self.parse_language_ids_from_config(c)
self.ids = self.parse_language_ids_from_config(c)
def set_language_ids_from_file(self, file_path: str) -> None:
"""Load language ids from a json file.
@staticmethod
def parse_ids_from_data(items: List, parse_key: str) -> Any:
raise NotImplementedError
Args:
file_path (str): Path to the target json file.
"""
self.language_id_mapping = self._load_json(file_path)
def set_ids_from_data(self, items: List, parse_key: str) -> Any:
raise NotImplementedError
def save_language_ids_to_file(self, file_path: str) -> None:
def save_ids_to_file(self, file_path: str) -> None:
"""Save language IDs to a json file.
Args:
file_path (str): Path to the output file.
"""
self._save_json(file_path, self.language_id_mapping)
self._save_json(file_path, self.ids)
@staticmethod
def init_from_config(config: Coqpit) -> "LanguageManager":
@ -128,11 +112,14 @@ def _set_file_path(path):
return None
def get_language_weighted_sampler(items: list):
def get_language_balancer_weights(items: list):
language_names = np.array([item["language"] for item in items])
unique_language_names = np.unique(language_names).tolist()
language_ids = [unique_language_names.index(l) for l in language_names]
language_count = np.array([len(np.where(language_names == l)[0]) for l in unique_language_names])
weight_language = 1.0 / language_count
dataset_samples_weight = torch.from_numpy(np.array([weight_language[l] for l in language_ids])).double()
return WeightedRandomSampler(dataset_samples_weight, len(dataset_samples_weight))
# get weight for each sample
dataset_samples_weight = np.array([weight_language[l] for l in language_ids])
# normalize
dataset_samples_weight = dataset_samples_weight / np.linalg.norm(dataset_samples_weight)
return torch.from_numpy(dataset_samples_weight).float()
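A quick numeric check of the weights this returns (worked out by hand; the item list is a placeholder): with two `en` clips and one `pt` clip the raw per-sample weights are `[1/2, 1/2, 1]`, which after L2 normalization makes the `pt` clip roughly twice as likely to be drawn as each `en` clip.

```python
from TTS.tts.utils.languages import get_language_balancer_weights

items = [{"language": "en"}, {"language": "en"}, {"language": "pt"}]
weights = get_language_balancer_weights(items)
print(weights)  # tensor([0.4082, 0.4082, 0.8165]) == [0.5, 0.5, 1.0] / sqrt(1.5)
```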

285
TTS/tts/utils/managers.py Normal file
View File

@ -0,0 +1,285 @@
import json
import random
from typing import Any, Dict, List, Tuple, Union
import fsspec
import numpy as np
import torch
from TTS.config import load_config
from TTS.encoder.utils.generic_utils import setup_encoder_model
from TTS.utils.audio import AudioProcessor
class BaseIDManager:
"""Base `ID` Manager class. Every new `ID` manager must inherit this.
It defines common `ID` manager specific functions.
"""
def __init__(self, id_file_path: str = ""):
self.ids = {}
if id_file_path:
self.load_ids_from_file(id_file_path)
@staticmethod
def _load_json(json_file_path: str) -> Dict:
with fsspec.open(json_file_path, "r") as f:
return json.load(f)
@staticmethod
def _save_json(json_file_path: str, data: dict) -> None:
with fsspec.open(json_file_path, "w") as f:
json.dump(data, f, indent=4)
def set_ids_from_data(self, items: List, parse_key: str) -> None:
"""Set IDs from data samples.
Args:
items (List): Data samples returned by `load_tts_samples()`.
"""
self.ids = self.parse_ids_from_data(items, parse_key=parse_key)
def load_ids_from_file(self, file_path: str) -> None:
"""Set IDs from a file.
Args:
file_path (str): Path to the file.
"""
self.ids = self._load_json(file_path)
def save_ids_to_file(self, file_path: str) -> None:
"""Save IDs to a json file.
Args:
file_path (str): Path to the output file.
"""
self._save_json(file_path, self.ids)
def get_random_id(self) -> Any:
"""Get a random embedding.
Args:
Returns:
np.ndarray: embedding.
"""
if self.ids:
return self.ids[random.choices(list(self.ids.keys()))[0]]
return None
@staticmethod
def parse_ids_from_data(items: List, parse_key: str) -> Tuple[Dict]:
"""Parse IDs from data samples retured by `load_tts_samples()`.
Args:
items (list): Data sampled returned by `load_tts_samples()`.
parse_key (str): The key to being used to parse the data.
Returns:
Tuple[Dict]: speaker IDs.
"""
classes = sorted({item[parse_key] for item in items})
ids = {name: i for i, name in enumerate(classes)}
return ids
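A small illustration of the resulting mapping (hypothetical sample list): names are sorted alphabetically and numbered from zero.

```python
from TTS.tts.utils.managers import BaseIDManager

items = [{"speaker_name": "mary"}, {"speaker_name": "bob"}, {"speaker_name": "mary"}]
print(BaseIDManager.parse_ids_from_data(items, parse_key="speaker_name"))
# {'bob': 0, 'mary': 1}
```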
class EmbeddingManager(BaseIDManager):
"""Base `Embedding` Manager class. Every new `Embedding` manager must inherit this.
It defines common `Embedding` manager specific functions.
"""
def __init__(
self,
embedding_file_path: str = "",
id_file_path: str = "",
encoder_model_path: str = "",
encoder_config_path: str = "",
use_cuda: bool = False,
):
super().__init__(id_file_path=id_file_path)
self.embeddings = {}
self.embeddings_by_names = {}
self.clip_ids = []
self.encoder = None
self.encoder_ap = None
self.use_cuda = use_cuda
if embedding_file_path:
self.load_embeddings_from_file(embedding_file_path)
if encoder_model_path and encoder_config_path:
self.init_encoder(encoder_model_path, encoder_config_path)
@property
def embedding_dim(self):
"""Dimensionality of embeddings. If embeddings are not loaded, returns zero."""
if self.embeddings:
return len(self.embeddings[list(self.embeddings.keys())[0]]["embedding"])
return 0
def save_embeddings_to_file(self, file_path: str) -> None:
"""Save embeddings to a json file.
Args:
file_path (str): Path to the output file.
"""
self._save_json(file_path, self.embeddings)
def load_embeddings_from_file(self, file_path: str) -> None:
"""Load embeddings from a json file.
Args:
file_path (str): Path to the target json file.
"""
self.embeddings = self._load_json(file_path)
speakers = sorted({x["name"] for x in self.embeddings.values()})
self.ids = {name: i for i, name in enumerate(speakers)}
self.clip_ids = list(set(sorted(clip_name for clip_name in self.embeddings.keys())))
# cache embeddings_by_names for fast inference using a bigger speakers.json
self.embeddings_by_names = self.get_embeddings_by_names()
def get_embedding_by_clip(self, clip_idx: str) -> List:
"""Get embedding by clip ID.
Args:
clip_idx (str): Target clip ID.
Returns:
List: embedding as a list.
"""
return self.embeddings[clip_idx]["embedding"]
def get_embeddings_by_name(self, idx: str) -> List[List]:
"""Get all embeddings of a speaker.
Args:
idx (str): Target name.
Returns:
List[List]: all the embeddings of the given speaker.
"""
return self.embeddings_by_names[idx]
def get_embeddings_by_names(self) -> Dict:
"""Get all embeddings by names.
Returns:
Dict: all the embeddings of each speaker.
"""
embeddings_by_names = {}
for x in self.embeddings.values():
if x["name"] not in embeddings_by_names.keys():
embeddings_by_names[x["name"]] = [x["embedding"]]
else:
embeddings_by_names[x["name"]].append(x["embedding"])
return embeddings_by_names
def get_mean_embedding(self, idx: str, num_samples: int = None, randomize: bool = False) -> np.ndarray:
"""Get mean embedding of a idx.
Args:
idx (str): Target name.
num_samples (int, optional): Number of samples to be averaged. Defaults to None.
randomize (bool, optional): Pick random `num_samples` of embeddings. Defaults to False.
Returns:
np.ndarray: Mean embedding.
"""
embeddings = self.get_embeddings_by_name(idx)
if num_samples is None:
embeddings = np.stack(embeddings).mean(0)
else:
assert len(embeddings) >= num_samples, f" [!] {idx} has number of samples < {num_samples}"
if randomize:
embeddings = np.stack(random.choices(embeddings, k=num_samples)).mean(0)
else:
embeddings = np.stack(embeddings[:num_samples]).mean(0)
return embeddings
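A hedged usage sketch (the `speakers.json` path and speaker name are placeholders): load precomputed embeddings and average some of them for a single speaker.

```python
from TTS.tts.utils.managers import EmbeddingManager

manager = EmbeddingManager(embedding_file_path="speakers.json")
# mean over all clips of the speaker
mean_emb = manager.get_mean_embedding("VCTK_p225")
# mean over 5 randomly picked clips of the speaker
rand_emb = manager.get_mean_embedding("VCTK_p225", num_samples=5, randomize=True)
```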
def get_random_embedding(self) -> Any:
"""Get a random embedding.
Args:
Returns:
np.ndarray: embedding.
"""
if self.embeddings:
return self.embeddings[random.choices(list(self.embeddings.keys()))[0]]["embedding"]
return None
def get_clips(self) -> List:
return sorted(self.embeddings.keys())
def init_encoder(self, model_path: str, config_path: str) -> None:
"""Initialize a speaker encoder model.
Args:
model_path (str): Model file path.
config_path (str): Model config file path.
"""
self.encoder_config = load_config(config_path)
self.encoder = setup_encoder_model(self.encoder_config)
self.encoder_criterion = self.encoder.load_checkpoint(
self.encoder_config, model_path, eval=True, use_cuda=self.use_cuda
)
self.encoder_ap = AudioProcessor(**self.encoder_config.audio)
def compute_embedding_from_clip(self, wav_file: Union[str, List[str]]) -> list:
"""Compute a embedding from a given audio file.
Args:
wav_file (Union[str, List[str]]): Target file path.
Returns:
list: Computed embedding.
"""
def _compute(wav_file: str):
waveform = self.encoder_ap.load_wav(wav_file, sr=self.encoder_ap.sample_rate)
if not self.encoder_config.model_params.get("use_torch_spec", False):
m_input = self.encoder_ap.melspectrogram(waveform)
m_input = torch.from_numpy(m_input)
else:
m_input = torch.from_numpy(waveform)
if self.use_cuda:
m_input = m_input.cuda()
m_input = m_input.unsqueeze(0)
embedding = self.encoder.compute_embedding(m_input)
return embedding
if isinstance(wav_file, list):
# compute the mean embedding
embeddings = None
for wf in wav_file:
embedding = _compute(wf)
if embeddings is None:
embeddings = embedding
else:
embeddings += embedding
return (embeddings / len(wav_file))[0].tolist()
embedding = _compute(wav_file)
return embedding[0].tolist()
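A hedged sketch of computing embeddings directly from audio with an external encoder (the model, config and wav paths are placeholders):

```python
from TTS.tts.utils.managers import EmbeddingManager

manager = EmbeddingManager(
    encoder_model_path="model_se.pth",
    encoder_config_path="config_se.json",
    use_cuda=False,
)
emb = manager.compute_embedding_from_clip("sample.wav")                     # single clip
mean_emb = manager.compute_embedding_from_clip(["clip1.wav", "clip2.wav"])  # mean over clips
```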
def compute_embeddings(self, feats: Union[torch.Tensor, np.ndarray]) -> List:
"""Compute embedding from features.
Args:
feats (Union[torch.Tensor, np.ndarray]): Input features.
Returns:
List: computed embedding.
"""
if isinstance(feats, np.ndarray):
feats = torch.from_numpy(feats)
if feats.ndim == 2:
feats = feats.unsqueeze(0)
if self.use_cuda:
feats = feats.cuda()
return self.encoder.compute_embedding(feats)

View File

@ -1,20 +1,17 @@
import json
import os
import random
from typing import Any, Dict, List, Tuple, Union
from typing import Any, Dict, List, Union
import fsspec
import numpy as np
import torch
from coqpit import Coqpit
from torch.utils.data.sampler import WeightedRandomSampler
from TTS.config import get_from_config_or_model_args_with_default, load_config
from TTS.speaker_encoder.utils.generic_utils import setup_speaker_encoder_model
from TTS.utils.audio import AudioProcessor
from TTS.config import get_from_config_or_model_args_with_default
from TTS.tts.utils.managers import EmbeddingManager
class SpeakerManager:
class SpeakerManager(EmbeddingManager):
"""Manage the speakers for multi-speaker 🐸TTS models. Load a datafile and parse the information
in a way that can be queried by speaker or clip.
@ -51,7 +48,7 @@ class SpeakerManager:
>>> # load a sample audio and compute embedding
>>> waveform = ap.load_wav(sample_wav_path)
>>> mel = ap.melspectrogram(waveform)
>>> d_vector = manager.compute_d_vector(mel.T)
>>> d_vector = manager.compute_embeddings(mel.T)
"""
def __init__(
@ -63,260 +60,27 @@ class SpeakerManager:
encoder_config_path: str = "",
use_cuda: bool = False,
):
self.d_vectors = {}
self.speaker_ids = {}
self.clip_ids = []
self.speaker_encoder = None
self.speaker_encoder_ap = None
self.use_cuda = use_cuda
super().__init__(
embedding_file_path=d_vectors_file_path,
id_file_path=speaker_id_file_path,
encoder_model_path=encoder_model_path,
encoder_config_path=encoder_config_path,
use_cuda=use_cuda,
)
if data_items:
self.speaker_ids, _ = self.parse_speakers_from_data(data_items)
if d_vectors_file_path:
self.set_d_vectors_from_file(d_vectors_file_path)
if speaker_id_file_path:
self.set_speaker_ids_from_file(speaker_id_file_path)
if encoder_model_path and encoder_config_path:
self.init_speaker_encoder(encoder_model_path, encoder_config_path)
@staticmethod
def _load_json(json_file_path: str) -> Dict:
with fsspec.open(json_file_path, "r") as f:
return json.load(f)
@staticmethod
def _save_json(json_file_path: str, data: dict) -> None:
with fsspec.open(json_file_path, "w") as f:
json.dump(data, f, indent=4)
self.set_ids_from_data(data_items, parse_key="speaker_name")
@property
def num_speakers(self):
return len(self.speaker_ids)
return len(self.ids)
@property
def speaker_names(self):
return list(self.speaker_ids.keys())
@property
def d_vector_dim(self):
"""Dimensionality of d_vectors. If d_vectors are not loaded, returns zero."""
if self.d_vectors:
return len(self.d_vectors[list(self.d_vectors.keys())[0]]["embedding"])
return 0
@staticmethod
def parse_speakers_from_data(items: list) -> Tuple[Dict, int]:
"""Parse speaker IDs from data samples retured by `load_tts_samples()`.
Args:
items (list): Data sampled returned by `load_tts_samples()`.
Returns:
Tuple[Dict, int]: speaker IDs and number of speakers.
"""
speakers = sorted({item["speaker_name"] for item in items})
speaker_ids = {name: i for i, name in enumerate(speakers)}
num_speakers = len(speaker_ids)
return speaker_ids, num_speakers
def set_speaker_ids_from_data(self, items: List) -> None:
"""Set speaker IDs from data samples.
Args:
items (List): Data sampled returned by `load_tts_samples()`.
"""
self.speaker_ids, _ = self.parse_speakers_from_data(items)
def set_speaker_ids_from_file(self, file_path: str) -> None:
"""Set speaker IDs from a file.
Args:
file_path (str): Path to the file.
"""
self.speaker_ids = self._load_json(file_path)
def save_speaker_ids_to_file(self, file_path: str) -> None:
"""Save speaker IDs to a json file.
Args:
file_path (str): Path to the output file.
"""
self._save_json(file_path, self.speaker_ids)
def save_d_vectors_to_file(self, file_path: str) -> None:
"""Save d_vectors to a json file.
Args:
file_path (str): Path to the output file.
"""
self._save_json(file_path, self.d_vectors)
def set_d_vectors_from_file(self, file_path: str) -> None:
"""Load d_vectors from a json file.
Args:
file_path (str): Path to the target json file.
"""
self.d_vectors = self._load_json(file_path)
speakers = sorted({x["name"] for x in self.d_vectors.values()})
self.speaker_ids = {name: i for i, name in enumerate(speakers)}
self.clip_ids = list(set(sorted(clip_name for clip_name in self.d_vectors.keys())))
def get_d_vector_by_clip(self, clip_idx: str) -> List:
"""Get d_vector by clip ID.
Args:
clip_idx (str): Target clip ID.
Returns:
List: d_vector as a list.
"""
return self.d_vectors[clip_idx]["embedding"]
def get_d_vectors_by_speaker(self, speaker_idx: str) -> List[List]:
"""Get all d_vectors of a speaker.
Args:
speaker_idx (str): Target speaker ID.
Returns:
List[List]: all the d_vectors of the given speaker.
"""
return [x["embedding"] for x in self.d_vectors.values() if x["name"] == speaker_idx]
def get_mean_d_vector(self, speaker_idx: str, num_samples: int = None, randomize: bool = False) -> np.ndarray:
"""Get mean d_vector of a speaker ID.
Args:
speaker_idx (str): Target speaker ID.
num_samples (int, optional): Number of samples to be averaged. Defaults to None.
randomize (bool, optional): Pick random `num_samples` of d_vectors. Defaults to False.
Returns:
np.ndarray: Mean d_vector.
"""
d_vectors = self.get_d_vectors_by_speaker(speaker_idx)
if num_samples is None:
d_vectors = np.stack(d_vectors).mean(0)
else:
assert len(d_vectors) >= num_samples, f" [!] speaker {speaker_idx} has number of samples < {num_samples}"
if randomize:
d_vectors = np.stack(random.choices(d_vectors, k=num_samples)).mean(0)
else:
d_vectors = np.stack(d_vectors[:num_samples]).mean(0)
return d_vectors
def get_random_speaker_id(self) -> Any:
"""Get a random d_vector.
Args:
Returns:
np.ndarray: d_vector.
"""
if self.speaker_ids:
return self.speaker_ids[random.choices(list(self.speaker_ids.keys()))[0]]
return None
def get_random_d_vector(self) -> Any:
"""Get a random D ID.
Args:
Returns:
np.ndarray: d_vector.
"""
if self.d_vectors:
return self.d_vectors[random.choices(list(self.d_vectors.keys()))[0]]["embedding"]
return None
return list(self.ids.keys())
def get_speakers(self) -> List:
return self.speaker_ids
def get_clips(self) -> List:
return sorted(self.d_vectors.keys())
def init_speaker_encoder(self, model_path: str, config_path: str) -> None:
"""Initialize a speaker encoder model.
Args:
model_path (str): Model file path.
config_path (str): Model config file path.
"""
self.speaker_encoder_config = load_config(config_path)
self.speaker_encoder = setup_speaker_encoder_model(self.speaker_encoder_config)
self.speaker_encoder.load_checkpoint(config_path, model_path, eval=True, use_cuda=self.use_cuda)
self.speaker_encoder_ap = AudioProcessor(**self.speaker_encoder_config.audio)
def compute_d_vector_from_clip(self, wav_file: Union[str, List[str]]) -> list:
"""Compute a d_vector from a given audio file.
Args:
wav_file (Union[str, List[str]]): Target file path.
Returns:
list: Computed d_vector.
"""
def _compute(wav_file: str):
waveform = self.speaker_encoder_ap.load_wav(wav_file, sr=self.speaker_encoder_ap.sample_rate)
if not self.speaker_encoder_config.model_params.get("use_torch_spec", False):
m_input = self.speaker_encoder_ap.melspectrogram(waveform)
m_input = torch.from_numpy(m_input)
else:
m_input = torch.from_numpy(waveform)
if self.use_cuda:
m_input = m_input.cuda()
m_input = m_input.unsqueeze(0)
d_vector = self.speaker_encoder.compute_embedding(m_input)
return d_vector
if isinstance(wav_file, list):
# compute the mean d_vector
d_vectors = None
for wf in wav_file:
d_vector = _compute(wf)
if d_vectors is None:
d_vectors = d_vector
else:
d_vectors += d_vector
return (d_vectors / len(wav_file))[0].tolist()
d_vector = _compute(wav_file)
return d_vector[0].tolist()
def compute_d_vector(self, feats: Union[torch.Tensor, np.ndarray]) -> List:
"""Compute d_vector from features.
Args:
feats (Union[torch.Tensor, np.ndarray]): Input features.
Returns:
List: computed d_vector.
"""
if isinstance(feats, np.ndarray):
feats = torch.from_numpy(feats)
if feats.ndim == 2:
feats = feats.unsqueeze(0)
if self.use_cuda:
feats = feats.cuda()
return self.speaker_encoder.compute_embedding(feats)
def run_umap(self):
# TODO: implement speaker encoder
raise NotImplementedError
def plot_embeddings(self):
# TODO: implement speaker encoder
raise NotImplementedError
return self.ids
@staticmethod
def init_from_config(config: "Coqpit", samples: Union[List[List], List[Dict]] = None) -> "SpeakerManager":
@ -402,7 +166,7 @@ def get_speaker_manager(c: Coqpit, data: List = None, restore_path: str = None,
speaker_manager = SpeakerManager()
if c.use_speaker_embedding:
if data is not None:
speaker_manager.set_speaker_ids_from_data(data)
speaker_manager.set_ids_from_data(data, parse_key="speaker_name")
if restore_path:
speakers_file = _set_file_path(restore_path)
# restoring speaker manager from a previous run.
@ -414,27 +178,27 @@ def get_speaker_manager(c: Coqpit, data: List = None, restore_path: str = None,
raise RuntimeError(
"You must copy the file speakers.json to restore_path, or set a valid file in CONFIG.d_vector_file"
)
speaker_manager.load_d_vectors_file(c.d_vector_file)
speaker_manager.set_d_vectors_from_file(speakers_file)
speaker_manager.load_embeddings_from_file(c.d_vector_file)
speaker_manager.load_embeddings_from_file(speakers_file)
elif not c.use_d_vector_file: # restore speaker manager with speaker ID file.
speaker_ids_from_data = speaker_manager.speaker_ids
speaker_manager.set_speaker_ids_from_file(speakers_file)
speaker_ids_from_data = speaker_manager.ids
speaker_manager.load_ids_from_file(speakers_file)
assert all(
speaker in speaker_manager.speaker_ids for speaker in speaker_ids_from_data
speaker in speaker_manager.ids for speaker in speaker_ids_from_data
), " [!] You cannot introduce new speakers to a pre-trained model."
elif c.use_d_vector_file and c.d_vector_file:
# new speaker manager with external speaker embeddings.
speaker_manager.set_d_vectors_from_file(c.d_vector_file)
speaker_manager.load_embeddings_from_file(c.d_vector_file)
elif c.use_d_vector_file and not c.d_vector_file:
raise "use_d_vector_file is True, so you need pass a external speaker embedding file."
elif c.use_speaker_embedding and "speakers_file" in c and c.speakers_file:
# new speaker manager with speaker IDs file.
speaker_manager.set_speaker_ids_from_file(c.speakers_file)
speaker_manager.load_ids_from_file(c.speakers_file)
if speaker_manager.num_speakers > 0:
print(
" > Speaker manager is loaded with {} speakers: {}".format(
speaker_manager.num_speakers, ", ".join(speaker_manager.speaker_ids)
speaker_manager.num_speakers, ", ".join(speaker_manager.ids)
)
)
@ -443,17 +207,19 @@ def get_speaker_manager(c: Coqpit, data: List = None, restore_path: str = None,
out_file_path = os.path.join(out_path, "speakers.json")
print(f" > Saving `speakers.json` to {out_file_path}.")
if c.use_d_vector_file and c.d_vector_file:
speaker_manager.save_d_vectors_to_file(out_file_path)
speaker_manager.save_embeddings_to_file(out_file_path)
else:
speaker_manager.save_speaker_ids_to_file(out_file_path)
speaker_manager.save_ids_to_file(out_file_path)
return speaker_manager
def get_speaker_weighted_sampler(items: list):
def get_speaker_balancer_weights(items: list):
speaker_names = np.array([item["speaker_name"] for item in items])
unique_speaker_names = np.unique(speaker_names).tolist()
speaker_ids = [unique_speaker_names.index(l) for l in speaker_names]
speaker_count = np.array([len(np.where(speaker_names == l)[0]) for l in unique_speaker_names])
weight_speaker = 1.0 / speaker_count
dataset_samples_weight = torch.from_numpy(np.array([weight_speaker[l] for l in speaker_ids])).double()
return WeightedRandomSampler(dataset_samples_weight, len(dataset_samples_weight))
dataset_samples_weight = np.array([weight_speaker[l] for l in speaker_ids])
# normalize
dataset_samples_weight = dataset_samples_weight / np.linalg.norm(dataset_samples_weight)
return torch.from_numpy(dataset_samples_weight).float()

View File

@ -205,3 +205,84 @@ def synthesis(
"outputs": outputs,
}
return return_dict
def transfer_voice(
model,
CONFIG,
use_cuda,
reference_wav,
speaker_id=None,
d_vector=None,
reference_speaker_id=None,
reference_d_vector=None,
do_trim_silence=False,
use_griffin_lim=False,
):
"""Synthesize voice for the given text using Griffin-Lim vocoder or just compute output features to be passed to
the vocoder model.
Args:
model (TTS.tts.models):
The TTS model to synthesize audio with.
CONFIG (Coqpit):
Model configuration.
use_cuda (bool):
Enable/disable CUDA.
reference_wav (str):
Path of the reference wav to be used for voice conversion.
speaker_id (int):
Speaker ID passed to the speaker embedding layer in multi-speaker model. Defaults to None.
d_vector (torch.Tensor):
d-vector for multi-speaker models in shape :math:`[1, D]`. Defaults to None.
reference_speaker_id (int):
Reference Speaker ID passed to the speaker embedding layer in multi-speaker model. Defaults to None.
reference_d_vector (torch.Tensor):
Reference d-vector for multi-speaker models in shape :math:`[1, D]`. Defaults to None.
use_griffin_lim (bool):
decode the model output with Griffin-Lim when it is a spectrogram. Defaults to False.
do_trim_silence (bool):
trim silence after synthesis. Defaults to False.
"""
# pass tensors to backend
if speaker_id is not None:
speaker_id = id_to_torch(speaker_id, cuda=use_cuda)
if d_vector is not None:
d_vector = embedding_to_torch(d_vector, cuda=use_cuda)
if reference_d_vector is not None:
reference_d_vector = embedding_to_torch(reference_d_vector, cuda=use_cuda)
# load reference_wav audio
reference_wav = embedding_to_torch(model.ap.load_wav(reference_wav, sr=model.ap.sample_rate), cuda=use_cuda)
if hasattr(model, "module"):
_func = model.module.inference_voice_conversion
else:
_func = model.inference_voice_conversion
model_outputs = _func(reference_wav, speaker_id, d_vector, reference_speaker_id, reference_d_vector)
# convert outputs to numpy
# plot results
wav = None
model_outputs = model_outputs.squeeze()
if model_outputs.ndim == 2: # [T, C_spec]
if use_griffin_lim:
wav = inv_spectrogram(model_outputs, model.ap, CONFIG)
# trim silence
if do_trim_silence:
wav = trim_silence(wav, model.ap)
else: # [T,]
wav = model_outputs
return wav
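A rough sketch of calling this helper directly with d-vectors (assumes `model` is a loaded Vits and `config` its Coqpit config; the d-vectors and wav path are placeholders; in practice the `Synthesizer.tts()` changes further below wire this up):

```python
from TTS.tts.utils.synthesis import transfer_voice

wav = transfer_voice(
    model=model,
    CONFIG=config,
    use_cuda=False,
    reference_wav="source_speech.wav",   # voice to be converted
    d_vector=target_dvector,             # [1, D] embedding of the target speaker
    reference_d_vector=source_dvector,   # [1, D] embedding of the reference speaker
)
```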

View File

@ -12,16 +12,9 @@ GRUUT_LANGS = list(Gruut.supported_languages())
# Dict setting default phonemizers for each language
DEF_LANG_TO_PHONEMIZER = {
"ja-jp": JA_JP_Phonemizer.name(),
"zh-cn": ZH_CN_Phonemizer.name(),
}
# Add Gruut languages
_ = [Gruut.name()] * len(GRUUT_LANGS)
_new_dict = dict(list(zip(GRUUT_LANGS, _)))
DEF_LANG_TO_PHONEMIZER.update(_new_dict)
DEF_LANG_TO_PHONEMIZER = dict(list(zip(GRUUT_LANGS, _)))
# Add ESpeak languages and override any existing ones
@ -29,7 +22,10 @@ _ = [ESpeak.name()] * len(ESPEAK_LANGS)
_new_dict = dict(list(zip(list(ESPEAK_LANGS), _)))
DEF_LANG_TO_PHONEMIZER.update(_new_dict)
# Force default for some languages
DEF_LANG_TO_PHONEMIZER["en"] = DEF_LANG_TO_PHONEMIZER["en-us"]
DEF_LANG_TO_PHONEMIZER["ja-jp"] = JA_JP_Phonemizer.name()
DEF_LANG_TO_PHONEMIZER["zh-cn"] = ZH_CN_Phonemizer.name()
def get_phonemizer_by_name(name: str, **kwargs) -> BasePhonemizer:

View File

@ -191,6 +191,7 @@ class TTSTokenizer:
phonemizer = get_phonemizer_by_name(
DEF_LANG_TO_PHONEMIZER[config.phoneme_language], **phonemizer_kwargs
)
new_config.phonemizer = phonemizer.name()
except KeyError as e:
raise ValueError(
f"""No phonemizer found for language {config.phoneme_language}.

View File

@ -371,7 +371,9 @@ class AudioProcessor(object):
self.hop_length = hop_length
self.win_length = win_length
assert min_level_db != 0.0, " [!] min_level_db is 0"
assert self.win_length <= self.fft_size, " [!] win_length cannot be larger than fft_size"
assert (
self.win_length <= self.fft_size
), f" [!] win_length cannot be larger than fft_size - {self.win_length} vs {self.fft_size}"
members = vars(self)
if verbose:
print(" > Setting up Audio Processor...")

View File

@ -67,7 +67,7 @@ def get_experiment_folder_path(root_path, model_name):
def remove_experiment_folder(experiment_path):
"""Check folder if there is a checkpoint, otherwise remove the folder"""
fs = fsspec.get_mapper(experiment_path).fs
checkpoint_files = fs.glob(experiment_path + "/*.pth.tar")
checkpoint_files = fs.glob(experiment_path + "/*.pth")
if not checkpoint_files:
if fs.exists(experiment_path):
fs.rm(experiment_path, recursive=True)

View File

@ -140,7 +140,7 @@ def save_checkpoint(
output_folder,
**kwargs,
):
file_name = "checkpoint_{}.pth.tar".format(current_step)
file_name = "checkpoint_{}.pth".format(current_step)
checkpoint_path = os.path.join(output_folder, file_name)
print("\n > CHECKPOINT : {}".format(checkpoint_path))
save_model(
@ -170,7 +170,7 @@ def save_best_model(
**kwargs,
):
if current_loss < best_loss:
best_model_name = f"best_model_{current_step}.pth.tar"
best_model_name = f"best_model_{current_step}.pth"
checkpoint_path = os.path.join(out_path, best_model_name)
print(" > BEST MODEL : {}".format(checkpoint_path))
save_model(
@ -187,12 +187,12 @@ def save_best_model(
fs = fsspec.get_mapper(out_path).fs
# only delete previous if current is saved successfully
if not keep_all_best or (current_step < keep_after):
model_names = fs.glob(os.path.join(out_path, "best_model*.pth.tar"))
model_names = fs.glob(os.path.join(out_path, "best_model*.pth"))
for model_name in model_names:
if os.path.basename(model_name) != best_model_name:
fs.rm(model_name)
# create a shortcut which always points to the currently best model
shortcut_name = "best_model.pth.tar"
shortcut_name = "best_model.pth"
shortcut_path = os.path.join(out_path, shortcut_name)
fs.copy(checkpoint_path, shortcut_path)
best_loss = current_loss

View File

@ -4,12 +4,24 @@ import os
import zipfile
from pathlib import Path
from shutil import copyfile, rmtree
from typing import Dict, Tuple
import requests
from TTS.config import load_config
from TTS.utils.generic_utils import get_user_data_dir
LICENSE_URLS = {
"cc by-nc-nd 4.0": "https://creativecommons.org/licenses/by-nc-nd/4.0/",
"mpl": "https://www.mozilla.org/en-US/MPL/2.0/",
"mpl2": "https://www.mozilla.org/en-US/MPL/2.0/",
"mpl 2.0": "https://www.mozilla.org/en-US/MPL/2.0/",
"mit": "https://choosealicense.com/licenses/mit/",
"apache 2.0": "https://choosealicense.com/licenses/apache-2.0/",
"apache2": "https://choosealicense.com/licenses/apache-2.0/",
"cc-by-sa 4.0": "https://creativecommons.org/licenses/by-sa/4.0/",
}
class ModelManager(object):
"""Manage TTS models defined in .models.json.
@ -107,6 +119,22 @@ class ModelManager(object):
for dataset in self.models_dict[model_type][lang]:
print(f" >: {model_type}/{lang}/{dataset}")
@staticmethod
def print_model_license(model_item: Dict):
"""Print the license of a model
Args:
model_item (dict): model item in the models.json
"""
if "license" in model_item and model_item["license"].strip() != "":
print(f" > Model's license - {model_item['license']}")
if model_item["license"].lower() in LICENSE_URLS:
print(f" > Check {LICENSE_URLS[model_item['license'].lower()]} for more info.")
else:
print(" > Check https://opensource.org/licenses for more info.")
else:
print(" > Model's license - No license information available")
def download_model(self, model_name):
"""Download model files given the full model name.
Model name is in the format
@ -114,7 +142,7 @@ class ModelManager(object):
e.g. 'tts_model/en/ljspeech/tacotron'
Every model must have the following files:
- *.pth.tar : pytorch model checkpoint file.
- *.pth : pytorch model checkpoint file.
- config.json : model config file.
- scale_stats.npy (if exist): scale values for preprocessing.
@ -127,9 +155,6 @@ class ModelManager(object):
model_item = self.models_dict[model_type][lang][dataset][model]
# set the model specific output path
output_path = os.path.join(self.output_prefix, model_full_name)
output_model_path = os.path.join(output_path, "model_file.pth.tar")
output_config_path = os.path.join(output_path, "config.json")
if os.path.exists(output_path):
print(f" > {model_name} is already downloaded.")
else:
@ -137,10 +162,52 @@ class ModelManager(object):
print(f" > Downloading model to {output_path}")
# download from github release
self._download_zip_file(model_item["github_rls_url"], output_path)
self.print_model_license(model_item=model_item)
# find downloaded files
output_model_path, output_config_path = self._find_files(output_path)
# update paths in the config.json
self._update_paths(output_path, output_config_path)
return output_model_path, output_config_path, model_item
@staticmethod
def _find_files(output_path: str) -> Tuple[str, str]:
"""Find the model and config files in the output path
Args:
output_path (str): path to the model files
Returns:
Tuple[str, str]: path to the model file and config file
"""
model_file = None
config_file = None
for file_name in os.listdir(output_path):
if file_name in ["model_file.pth", "model_file.pth.tar", "model.pth"]:
model_file = os.path.join(output_path, file_name)
elif file_name == "config.json":
config_file = os.path.join(output_path, file_name)
if model_file is None:
raise ValueError(" [!] Model file not found in the output path")
if config_file is None:
raise ValueError(" [!] Config file not found in the output path")
return model_file, config_file
@staticmethod
def _find_speaker_encoder(output_path: str) -> str:
"""Find the speaker encoder file in the output path
Args:
output_path (str): path to the model files
Returns:
str: path to the speaker encoder file
"""
speaker_encoder_file = None
for file_name in os.listdir(output_path):
if file_name in ["model_se.pth", "model_se.pth.tar"]:
speaker_encoder_file = os.path.join(output_path, file_name)
return speaker_encoder_file
def _update_paths(self, output_path: str, config_path: str) -> None:
"""Update paths for certain files in config.json after download.
@ -152,7 +219,7 @@ class ModelManager(object):
output_d_vector_file_path = os.path.join(output_path, "speakers.json")
output_speaker_ids_file_path = os.path.join(output_path, "speaker_ids.json")
speaker_encoder_config_path = os.path.join(output_path, "config_se.json")
speaker_encoder_model_path = os.path.join(output_path, "model_se.pth.tar")
speaker_encoder_model_path = self._find_speaker_encoder(output_path)
# update the scale_path.npy file path in the model config.json
self._update_path("audio.stats_path", output_stats_path, config_path)
@ -174,7 +241,7 @@ class ModelManager(object):
@staticmethod
def _update_path(field_name, new_path, config_path):
"""Update the path in the model config.json for the current environment after download"""
if os.path.exists(new_path):
if new_path and os.path.exists(new_path):
config = load_config(config_path)
field_names = field_name.split(".")
if len(field_names) > 1:

View File

@ -10,7 +10,7 @@ from TTS.tts.models import setup_model as setup_tts_model
# pylint: disable=unused-wildcard-import
# pylint: disable=wildcard-import
from TTS.tts.utils.synthesis import synthesis, trim_silence
from TTS.tts.utils.synthesis import synthesis, transfer_voice, trim_silence
from TTS.utils.audio import AudioProcessor
from TTS.vocoder.models import setup_model as setup_vocoder_model
from TTS.vocoder.utils.generic_utils import interpolate_vocoder_input
@ -109,15 +109,21 @@ class Synthesizer(object):
"""
# pylint: disable=global-statement
self.tts_config = load_config(tts_config_path)
self.use_phonemes = self.tts_config.use_phonemes
if self.tts_config["use_phonemes"] and self.tts_config["phonemizer"] is None:
raise ValueError("Phonemizer is not defined in the TTS config.")
self.tts_model = setup_tts_model(config=self.tts_config)
if not self.encoder_checkpoint:
self._set_speaker_encoder_paths_from_tts_config()
self.tts_model.load_checkpoint(self.tts_config, tts_checkpoint, eval=True)
if use_cuda:
self.tts_model.cuda()
if self.encoder_checkpoint and hasattr(self.tts_model, "speaker_manager"):
self.tts_model.speaker_manager.init_encoder(self.encoder_checkpoint, self.encoder_config)
def _set_speaker_encoder_paths_from_tts_config(self):
"""Set the encoder paths from the tts model config for models with speaker encoders."""
if hasattr(self.tts_config, "model_args") and hasattr(
@ -183,11 +189,13 @@ class Synthesizer(object):
def tts(
self,
text: str,
text: str = "",
speaker_name: str = "",
language_name: str = "",
speaker_wav: Union[str, List[str]] = None,
style_wav=None,
reference_wav=None,
reference_speaker_name=None,
) -> List[int]:
"""🐸 TTS magic. Run all the models and generate speech.
@ -197,33 +205,43 @@ class Synthesizer(object):
language_name (str, optional): language id for multi-language models. Defaults to "".
speaker_wav (Union[str, List[str]], optional): path to the speaker wav. Defaults to None.
style_wav ([type], optional): style waveform for GST. Defaults to None.
reference_wav ([type], optional): reference waveform for voice conversion. Defaults to None.
reference_speaker_name ([type], optional): speaker id of the reference waveform. Defaults to None.
Returns:
List[int]: [description]
"""
start_time = time.time()
wavs = []
sens = self.split_into_sentences(text)
print(" > Text splitted to sentences.")
print(sens)
if not text and not reference_wav:
raise ValueError(
"You need to define either `text` (for sythesis) or a `reference_wav` (for voice conversion) to use the Coqui TTS API."
)
if text:
sens = self.split_into_sentences(text)
print(" > Text splitted to sentences.")
print(sens)
# handle multi-speaker
speaker_embedding = None
speaker_id = None
if self.tts_speakers_file or hasattr(self.tts_model.speaker_manager, "speaker_ids"):
if self.tts_speakers_file or hasattr(self.tts_model.speaker_manager, "ids"):
if speaker_name and isinstance(speaker_name, str):
if self.tts_config.use_d_vector_file:
# get the speaker embedding from the saved d_vectors.
speaker_embedding = self.tts_model.speaker_manager.get_d_vectors_by_speaker(speaker_name)[0]
# get the average speaker embedding from the saved d_vectors.
speaker_embedding = self.tts_model.speaker_manager.get_mean_embedding(
speaker_name, num_samples=None, randomize=False
)
speaker_embedding = np.array(speaker_embedding)[None, :] # [1 x embedding_dim]
else:
# get speaker idx from the speaker name
speaker_id = self.tts_model.speaker_manager.speaker_ids[speaker_name]
speaker_id = self.tts_model.speaker_manager.ids[speaker_name]
elif not speaker_name and not speaker_wav:
raise ValueError(
" [!] Look like you use a multi-speaker model. "
"You need to define either a `speaker_name` or a `style_wav` to use a multi-speaker model."
"You need to define either a `speaker_name` or a `speaker_wav` to use a multi-speaker model."
)
else:
speaker_embedding = None
@ -240,7 +258,7 @@ class Synthesizer(object):
hasattr(self.tts_model, "language_manager") and self.tts_model.language_manager is not None
):
if language_name and isinstance(language_name, str):
language_id = self.tts_model.language_manager.language_id_mapping[language_name]
language_id = self.tts_model.language_manager.ids[language_name]
elif not language_name:
raise ValueError(
@ -256,26 +274,93 @@ class Synthesizer(object):
# compute a new d_vector from the given clip.
if speaker_wav is not None:
speaker_embedding = self.tts_model.speaker_manager.compute_d_vector_from_clip(speaker_wav)
speaker_embedding = self.tts_model.speaker_manager.compute_embedding_from_clip(speaker_wav)
use_gl = self.vocoder_model is None
for sen in sens:
# synthesize voice
outputs = synthesis(
if not reference_wav:
for sen in sens:
# synthesize voice
outputs = synthesis(
model=self.tts_model,
text=sen,
CONFIG=self.tts_config,
use_cuda=self.use_cuda,
speaker_id=speaker_id,
language_id=language_id,
style_wav=style_wav,
use_griffin_lim=use_gl,
d_vector=speaker_embedding,
)
waveform = outputs["wav"]
mel_postnet_spec = outputs["outputs"]["model_outputs"][0].detach().cpu().numpy()
if not use_gl:
# denormalize tts output based on tts audio config
mel_postnet_spec = self.tts_model.ap.denormalize(mel_postnet_spec.T).T
device_type = "cuda" if self.use_cuda else "cpu"
# renormalize spectrogram based on vocoder config
vocoder_input = self.vocoder_ap.normalize(mel_postnet_spec.T)
# compute scale factor for possible sample rate mismatch
scale_factor = [
1,
self.vocoder_config["audio"]["sample_rate"] / self.tts_model.ap.sample_rate,
]
if scale_factor[1] != 1:
print(" > interpolating tts model output.")
vocoder_input = interpolate_vocoder_input(scale_factor, vocoder_input)
else:
vocoder_input = torch.tensor(vocoder_input).unsqueeze(0) # pylint: disable=not-callable
# run vocoder model
# [1, T, C]
waveform = self.vocoder_model.inference(vocoder_input.to(device_type))
if self.use_cuda and not use_gl:
waveform = waveform.cpu()
if not use_gl:
waveform = waveform.numpy()
waveform = waveform.squeeze()
# trim silence
if self.tts_config.audio["do_trim_silence"] is True:
waveform = trim_silence(waveform, self.tts_model.ap)
wavs += list(waveform)
wavs += [0] * 10000
else:
# get the speaker embedding or speaker id for the reference wav file
reference_speaker_embedding = None
reference_speaker_id = None
if self.tts_speakers_file or hasattr(self.tts_model.speaker_manager, "speaker_ids"):
if reference_speaker_name and isinstance(reference_speaker_name, str):
if self.tts_config.use_d_vector_file:
# get the speaker embedding from the saved d_vectors.
reference_speaker_embedding = self.tts_model.speaker_manager.get_embeddings_by_name(
reference_speaker_name
)[0]
reference_speaker_embedding = np.array(reference_speaker_embedding)[
None, :
] # [1 x embedding_dim]
else:
# get speaker idx from the speaker name
reference_speaker_id = self.tts_model.speaker_manager.ids[reference_speaker_name]
else:
reference_speaker_embedding = self.tts_model.speaker_manager.compute_embedding_from_clip(
reference_wav
)
outputs = transfer_voice(
model=self.tts_model,
text=sen,
CONFIG=self.tts_config,
use_cuda=self.use_cuda,
reference_wav=reference_wav,
speaker_id=speaker_id,
language_id=language_id,
style_wav=style_wav,
use_griffin_lim=use_gl,
d_vector=speaker_embedding,
use_griffin_lim=use_gl,
reference_speaker_id=reference_speaker_id,
reference_d_vector=reference_speaker_embedding,
)
waveform = outputs["wav"]
mel_postnet_spec = outputs["outputs"]["model_outputs"][0].detach().cpu().numpy()
waveform = outputs
if not use_gl:
mel_postnet_spec = outputs[0].detach().cpu().numpy()
# denormalize tts output based on tts audio config
mel_postnet_spec = self.tts_model.ap.denormalize(mel_postnet_spec.T).T
device_type = "cuda" if self.use_cuda else "cpu"
@ -294,18 +379,11 @@ class Synthesizer(object):
# run vocoder model
# [1, T, C]
waveform = self.vocoder_model.inference(vocoder_input.to(device_type))
if self.use_cuda and not use_gl:
if self.use_cuda:
waveform = waveform.cpu()
if not use_gl:
waveform = waveform.numpy()
waveform = waveform.squeeze()
# trim silence
if self.tts_config.audio["do_trim_silence"] is True:
waveform = trim_silence(waveform, self.tts_model.ap)
wavs += list(waveform)
wavs += [0] * 10000
wavs = waveform.squeeze()
# compute stats
process_time = time.time() - start_time
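An end-user sketch of the new voice-conversion path through the `Synthesizer` (checkpoint, config and wav paths are placeholders; the keyword arguments follow the `tts()` signature in this diff):

```python
from TTS.utils.synthesizer import Synthesizer

synthesizer = Synthesizer(
    tts_checkpoint="model_file.pth",
    tts_config_path="config.json",
    use_cuda=False,
)
# convert the voice in `source_speech.wav` to the model's `target_speaker`
wav = synthesizer.tts(
    reference_wav="source_speech.wav",
    speaker_name="target_speaker",
    reference_speaker_name="source_speaker",  # optional; otherwise the d-vector is computed from the clip
)
synthesizer.save_wav(wav, "converted.wav")
```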

View File

@ -1,144 +1,81 @@
# This code is adapted from: https://github.com/wiseman/py-webrtcvad/blob/master/example.py
import collections
import contextlib
import wave
import webrtcvad
import torch
import torchaudio
def read_wave(path):
"""Reads a .wav file.
def read_audio(path):
wav, sr = torchaudio.load(path)
Takes the path, and returns (PCM audio data, sample rate).
"""
with contextlib.closing(wave.open(path, "rb")) as wf:
num_channels = wf.getnchannels()
assert num_channels == 1
sample_width = wf.getsampwidth()
assert sample_width == 2
sample_rate = wf.getframerate()
assert sample_rate in (8000, 16000, 32000, 48000)
pcm_data = wf.readframes(wf.getnframes())
return pcm_data, sample_rate
if wav.size(0) > 1:
wav = wav.mean(dim=0, keepdim=True)
return wav.squeeze(0), sr
def write_wave(path, audio, sample_rate):
"""Writes a .wav file.
Takes path, PCM audio data, and sample rate.
"""
with contextlib.closing(wave.open(path, "wb")) as wf:
wf.setnchannels(1)
wf.setsampwidth(2)
wf.setframerate(sample_rate)
wf.writeframes(audio)
def resample_wav(wav, sr, new_sr):
wav = wav.unsqueeze(0)
transform = torchaudio.transforms.Resample(orig_freq=sr, new_freq=new_sr)
wav = transform(wav)
return wav.squeeze(0)
class Frame(object):
"""Represents a "frame" of audio data."""
def map_timestamps_to_new_sr(vad_sr, new_sr, timestamps, just_begging_end=False):
factor = new_sr / vad_sr
new_timestamps = []
if just_begging_end and timestamps:
# get just the start and end timestamps
new_dict = {"start": int(timestamps[0]["start"] * factor), "end": int(timestamps[-1]["end"] * factor)}
new_timestamps.append(new_dict)
else:
for ts in timestamps:
# map to the new SR
new_dict = {"start": int(ts["start"] * factor), "end": int(ts["end"] * factor)}
new_timestamps.append(new_dict)
def __init__(self, _bytes, timestamp, duration):
self.bytes = _bytes
self.timestamp = timestamp
self.duration = duration
return new_timestamps
def frame_generator(frame_duration_ms, audio, sample_rate):
"""Generates audio frames from PCM audio data.
def get_vad_model_and_utils(use_cuda=False):
model, utils = torch.hub.load(repo_or_dir="snakers4/silero-vad", model="silero_vad", force_reload=True, onnx=False)
if use_cuda:
model = model.cuda()
Takes the desired frame duration in milliseconds, the PCM data, and
the sample rate.
Yields Frames of the requested duration.
"""
n = int(sample_rate * (frame_duration_ms / 1000.0) * 2)
offset = 0
timestamp = 0.0
duration = (float(n) / sample_rate) / 2.0
while offset + n < len(audio):
yield Frame(audio[offset : offset + n], timestamp, duration)
timestamp += duration
offset += n
get_speech_timestamps, save_audio, _, _, collect_chunks = utils
return model, get_speech_timestamps, save_audio, collect_chunks
def vad_collector(sample_rate, frame_duration_ms, padding_duration_ms, vad, frames):
"""Filters out non-voiced audio frames.
def remove_silence(
model_and_utils, audio_path, out_path, vad_sample_rate=8000, trim_just_beginning_and_end=True, use_cuda=False
):
Given a webrtcvad.Vad and a source of audio frames, yields only
the voiced audio.
# get the VAD model and utils functions
model, get_speech_timestamps, save_audio, collect_chunks = model_and_utils
Uses a padded, sliding window algorithm over the audio frames.
When more than 90% of the frames in the window are voiced (as
reported by the VAD), the collector triggers and begins yielding
audio frames. Then the collector waits until 90% of the frames in
the window are unvoiced to detrigger.
# read ground truth wav and resample the audio for the VAD
wav, gt_sample_rate = read_audio(audio_path)
The window is padded at the front and back to provide a small
amount of silence or the beginnings/endings of speech around the
voiced frames.
# if needed, resample the audio for the VAD model
if gt_sample_rate != vad_sample_rate:
wav_vad = resample_wav(wav, gt_sample_rate, vad_sample_rate)
else:
wav_vad = wav
Arguments:
if use_cuda:
wav_vad = wav_vad.cuda()
sample_rate - The audio sample rate, in Hz.
frame_duration_ms - The frame duration in milliseconds.
padding_duration_ms - The amount to pad the window, in milliseconds.
vad - An instance of webrtcvad.Vad.
frames - a source of audio frames (sequence or generator).
# get speech timestamps from full audio file
speech_timestamps = get_speech_timestamps(wav_vad, model, sampling_rate=vad_sample_rate, window_size_samples=768)
Returns: A generator that yields PCM audio data.
"""
num_padding_frames = int(padding_duration_ms / frame_duration_ms)
# We use a deque for our sliding window/ring buffer.
ring_buffer = collections.deque(maxlen=num_padding_frames)
# We have two states: TRIGGERED and NOTTRIGGERED. We start in the
# NOTTRIGGERED state.
triggered = False
# map the current speech_timestamps to the sample rate of the ground truth audio
new_speech_timestamps = map_timestamps_to_new_sr(
vad_sample_rate, gt_sample_rate, speech_timestamps, trim_just_beginning_and_end
)
voiced_frames = []
for frame in frames:
is_speech = vad.is_speech(frame.bytes, sample_rate)
# if have speech timestamps else save the wav
if new_speech_timestamps:
wav = collect_chunks(new_speech_timestamps, wav)
else:
print(f"> The file {audio_path} probably does not have speech please check it !!")
# sys.stdout.write('1' if is_speech else '0')
if not triggered:
ring_buffer.append((frame, is_speech))
num_voiced = len([f for f, speech in ring_buffer if speech])
# If we're NOTTRIGGERED and more than 90% of the frames in
# the ring buffer are voiced frames, then enter the
# TRIGGERED state.
if num_voiced > 0.9 * ring_buffer.maxlen:
triggered = True
# sys.stdout.write('+(%s)' % (ring_buffer[0][0].timestamp,))
# We want to yield all the audio we see from now until
# we are NOTTRIGGERED, but we have to start with the
# audio that's already in the ring buffer.
for f, _ in ring_buffer:
voiced_frames.append(f)
ring_buffer.clear()
else:
# We're in the TRIGGERED state, so collect the audio data
# and add it to the ring buffer.
voiced_frames.append(frame)
ring_buffer.append((frame, is_speech))
num_unvoiced = len([f for f, speech in ring_buffer if not speech])
# If more than 90% of the frames in the ring buffer are
# unvoiced, then enter NOTTRIGGERED and yield whatever
# audio we've collected.
if num_unvoiced > 0.9 * ring_buffer.maxlen:
# sys.stdout.write('-(%s)' % (frame.timestamp + frame.duration))
triggered = False
yield b"".join([f.bytes for f in voiced_frames])
ring_buffer.clear()
voiced_frames = []
# If we have any leftover voiced audio when we run out of input,
# yield it.
if voiced_frames:
yield b"".join([f.bytes for f in voiced_frames])
def get_vad_speech_segments(audio, sample_rate, aggressiveness=2, padding_duration_ms=300):
vad = webrtcvad.Vad(int(aggressiveness))
frames = list(frame_generator(30, audio, sample_rate))
segments = vad_collector(sample_rate, 30, padding_duration_ms, vad, frames)
return segments
# save audio
save_audio(out_path, wav, sampling_rate=gt_sample_rate)
return out_path
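
A short usage sketch for the helpers above. The import path and the wav file names are assumptions for illustration; the function and argument names come from the code itself.

```python
# assumed import path for the module above; adjust to where it lives in your checkout
from TTS.utils.vad import get_vad_model_and_utils, remove_silence

# downloads the silero-vad model through torch.hub on the first call
model_and_utils = get_vad_model_and_utils(use_cuda=False)

# trim leading/trailing silence from a (hypothetical) input file
out_path = remove_silence(
    model_and_utils,
    audio_path="speaker_0001.wav",
    out_path="speaker_0001_trimmed.wav",
    vad_sample_rate=8000,
    trim_just_beginning_and_end=True,
)
print(f"trimmed file written to {out_path}")
```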

View File

@ -29,7 +29,7 @@ You can continue a previous training run by the following command.
You can fine-tune a pre-trained model with the following command.
```CUDA_VISIBLE_DEVICES='0' python tts/bin/train_vocoder.py --restore_path path/to/your/model.pth.tar```
```CUDA_VISIBLE_DEVICES='0' python tts/bin/train_vocoder.py --restore_path path/to/your/model.pth```
Restoring a model starts a new training run in a new folder; it only loads the model weights from the given checkpoint file. Continuing a training run, on the other hand, resumes in the same directory where the previous run left off.

View File

@ -0,0 +1,23 @@
{% extends "!page.html" %}
{% block scripts %}
{{ super() }}
<!-- DocsQA integration start -->
<script src="https://cdn.jsdelivr.net/npm/qabot@0.4"></script>
<qa-bot
token="qAFjWNovwHUXKKkVhy4AN6tawSwCMfdb3HJNPLVM23ACdrBGxmBNObM="
title="🐸💬TTS Bot"
description="A library for advanced Text-to-Speech generation"
style="bottom: calc(1.25em + 80px);"
>
<template>
<dl>
<dt>You can ask questions about TTS. Try</dt>
<dd>What is VITS?</dd>
<dd>How to train a TTS model?</dd>
<dd>What is the format of training data?</dd>
</dl>
</template>
</qa-bot>
<!-- DocsQA integration end -->
{% endblock %}

View File

@ -93,13 +93,13 @@ them and fine-tune it for your own dataset. This will help you in two main ways:
```bash
CUDA_VISIBLE_DEVICES="0" python recipes/ljspeech/glow_tts/train_glowtts.py \
--restore_path /home/ubuntu/.local/share/tts/tts_models--en--ljspeech--glow-tts/model_file.pth.tar
--restore_path /home/ubuntu/.local/share/tts/tts_models--en--ljspeech--glow-tts/model_file.pth
```
```bash
CUDA_VISIBLE_DEVICES="0" python TTS/bin/train_tts.py \
--config_path /home/ubuntu/.local/share/tts/tts_models--en--ljspeech--glow-tts/config.json \
--restore_path /home/ubuntu/.local/share/tts/tts_models--en--ljspeech--glow-tts/model_file.pth.tar
--restore_path /home/ubuntu/.local/share/tts/tts_models--en--ljspeech--glow-tts/model_file.pth
```
As stated above, you can also use command-line arguments to change the model configuration.
@ -107,7 +107,7 @@ them and fine-tune it for your own dataset. This will help you in two main ways:
```bash
CUDA_VISIBLE_DEVICES="0" python recipes/ljspeech/glow_tts/train_glowtts.py \
--restore_path /home/ubuntu/.local/share/tts/tts_models--en--ljspeech--glow-tts/model_file.pth.tar
--restore_path /home/ubuntu/.local/share/tts/tts_models--en--ljspeech--glow-tts/model_file.pth
--coqpit.run_name "glow-tts-finetune" \
--coqpit.lr 0.00001
```

View File

@ -44,7 +44,7 @@ Run your own TTS model (Using Griffin-Lim Vocoder)
```bash
tts --text "Text for TTS" \
--model_path path/to/model.pth.tar \
--model_path path/to/model.pth \
--config_path path/to/config.json \
--out_path folder/to/save/output.wav
```
@ -54,9 +54,9 @@ Run your own TTS and Vocoder models
```bash
tts --text "Text for TTS" \
--config_path path/to/config.json \
--model_path path/to/model.pth.tar \
--model_path path/to/model.pth \
--out_path folder/to/save/output.wav \
--vocoder_path path/to/vocoder.pth.tar \
--vocoder_path path/to/vocoder.pth \
--vocoder_config_path path/to/vocoder_config.json
```

View File

@ -1,17 +1,3 @@
# Trainer API
The {class}`TTS.trainer.Trainer` provides a lightweight, extensible, and feature-complete training run-time. We optimized it for 🐸, but it can also be used for any DL training in other domains. It supports distributed multi-GPU and mixed-precision (apex or torch.amp) training.
## Trainer
```{eval-rst}
.. autoclass:: TTS.trainer.Trainer
:members:
```
## TrainingArgs
```{eval-rst}
.. autoclass:: TTS.trainer.TrainingArgs
:members:
```
We made the trainer a separate project: https://github.com/coqui-ai/Trainer
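
For reference, a minimal sketch of the standalone trainer's entry points as the recipes in this repository use them (this assumes `pip install trainer` provides the package; model and config construction is elided and left to the caller):

```python
from trainer import Trainer, TrainerArgs


def run_training(config, model, output_path, train_samples, eval_samples):
    # same call pattern as the recipes in this repository
    trainer = Trainer(
        TrainerArgs(),
        config,
        output_path,
        model=model,
        train_samples=train_samples,
        eval_samples=eval_samples,
    )
    trainer.fit()
```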

View File

@ -33,7 +33,7 @@
If you like to run a multi-gpu training using DDP back-end,
```bash
$ CUDA_VISIBLE_DEVICES="0, 1, 2" python TTS/bin/distribute.py --script <path_to_your_script>/train_glowtts.py
$ CUDA_VISIBLE_DEVICES="0, 1, 2" python -m trainer.distribute --script <path_to_your_script>/train_glowtts.py
```
The example above runs a multi-gpu training using GPUs `0, 1, 2`.
@ -122,7 +122,7 @@
```bash
$ tts --text "Text for TTS" \
--model_path path/to/checkpoint_x.pth.tar \
--model_path path/to/checkpoint_x.pth \
--config_path path/to/config.json \
--out_path folder/to/save/output.wav
```

View File

@ -50,13 +50,13 @@ A breakdown of a simple script that trains a GlowTTS model on the LJspeech datas
- Fine-tune a model.
```bash
CUDA_VISIBLE_DEVICES=0 python train.py --restore_path path/to/model/checkpoint.pth.tar
CUDA_VISIBLE_DEVICES=0 python train.py --restore_path path/to/model/checkpoint.pth
```
- Run multi-gpu training.
```bash
CUDA_VISIBLE_DEVICES=0,1,2 python TTS/bin/distribute.py --script train.py
CUDA_VISIBLE_DEVICES=0,1,2 python -m trainer.distribute --script train.py
```
### CLI Way

View File

@ -66,7 +66,7 @@
"DATASET = \"ljspeech\"\n",
"METADATA_FILE = \"metadata.csv\"\n",
"CONFIG_PATH = \"/home/ubuntu/.local/share/tts/tts_models--en--ljspeech--tacotron2-DDC_ph/config.json\"\n",
"MODEL_FILE = \"/home/ubuntu/.local/share/tts/tts_models--en--ljspeech--tacotron2-DDC_ph/model_file.pth.tar\"\n",
"MODEL_FILE = \"/home/ubuntu/.local/share/tts/tts_models--en--ljspeech--tacotron2-DDC_ph/model_file.pth\"\n",
"BATCH_SIZE = 32\n",
"\n",
"QUANTIZED_WAV = False\n",

View File

@ -66,7 +66,7 @@
"outputs": [],
"source": [
"MODEL_RUN_PATH = \"/media/erogol/data_ssd/Models/libri_tts/speaker_encoder/libritts_360-half-October-31-2019_04+54PM-19d2f5f/\"\n",
"MODEL_PATH = MODEL_RUN_PATH + \"best_model.pth.tar\"\n",
"MODEL_PATH = MODEL_RUN_PATH + \"best_model.pth\"\n",
"CONFIG_PATH = MODEL_RUN_PATH + \"config.json\"\n",
"\n",
"# My single speaker locations\n",

View File

@ -73,7 +73,7 @@
"\n",
"# Set constants\n",
"ROOT_PATH = '/home/erogol/Models/LJSpeech/ljspeech-May-20-2020_12+29PM-1835628/'\n",
"MODEL_PATH = ROOT_PATH + '/best_model.pth.tar'\n",
"MODEL_PATH = ROOT_PATH + '/best_model.pth'\n",
"CONFIG_PATH = ROOT_PATH + '/config.json'\n",
"OUT_FOLDER = './hard_sentences/'\n",
"CONFIG = load_config(CONFIG_PATH)\n",

View File

@ -416,7 +416,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.1"
"version": "3.9.5"
}
},
"nbformat": 4,

View File

@ -3,6 +3,10 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false"
},
"outputs": [],
"source": [
"%matplotlib inline\n",
"\n",
@ -12,21 +16,51 @@
"\n",
"import IPython.display as ipd\n",
"import glob"
],
"outputs": [],
"metadata": {
"Collapsed": "false"
}
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false"
},
"outputs": [],
"source": [
"config_path = \"/home/erogol/gdrive/Projects/TTS/recipes/ljspeech/align_tts/config_transformer2.json\"\n",
"data_path = \"/home/erogol/gdrive/Datasets/LJSpeech-1.1/\"\n",
"\n",
"file_paths = glob.glob(data_path + \"/**/*.wav\", recursive=True)\n",
"CONFIG = load_config(config_path)\n",
"from TTS.config.shared_configs import BaseAudioConfig\n",
"CONFIG = BaseAudioConfig()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## ✍️ Set these values "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data_path = \"/root/wav48_silence_trimmed/\"\n",
"file_ext = \".flac\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Read audio files"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"file_paths = glob.glob(data_path + f\"/**/*{file_ext}\", recursive=True)\n",
"\n",
"# Change this to the index of the desired file listed below\n",
"sample_file_index = 10\n",
@ -35,44 +69,45 @@
"\n",
"print(\"File list, by index:\")\n",
"dict(enumerate(file_paths))"
],
"outputs": [],
"metadata": {
"Collapsed": "false"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"Collapsed": "false"
},
"source": [
"### Setup Audio Processor\n",
"## ✍️ Set Audio Processor\n",
"Play with the AP parameters until you find a good fit with the synthesis speech below.\n",
"\n",
"The default values are loaded from your config.json file, so you only need to\n",
"uncomment and modify values below that you'd like to tune."
],
"metadata": {
"Collapsed": "false"
}
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false"
},
"outputs": [],
"source": [
"tune_params={\n",
"# 'audio_processor': 'audio',\n",
"# 'num_mels': 80, # In general, you don't need to change this. \n",
"# 'fft_size': 1024, # In general, you don't need to change this.\n",
"# 'sample_rate': 22050, # This must match the sample rate of the dataset.\n",
"# 'hop_length': 256, # In general, you don't need to change this.\n",
"# 'win_length': 1024, # In general, you don't need to change this.\n",
"# 'preemphasis': 0.98, # In general, 0 gives better voice recovery but makes training harder. If your model does not train, try 0.97 - 0.99.\n",
"# 'min_level_db': -100,\n",
"# 'ref_level_db': 0, # The base DB; increase until all background noise is removed in the spectrogram, then lower until you hear better speech below.\n",
"# 'power': 1.5, # Change this value and listen to the synthesized voice. 1.2 - 1.5 are resonable values.\n",
"# 'griffin_lim_iters': 60, # Quality does not improve for values > 60\n",
"# 'mel_fmin': 0.0, # Adjust this and check mel-spectrogram-based voice synthesis below.\n",
"# 'mel_fmax': 8000.0, # Adjust this and check mel-spectrogram-based voice synthesis below.\n",
"# 'do_trim_silence': True # If you dataset has some silience at the beginning or end, this trims it. Check the AP.load_wav() below,if it causes any difference for the loaded audio file.\n",
" 'num_mels': 80, # In general, you don't need to change this. \n",
" 'fft_size': 2400, # In general, you don't need to change this.\n",
" 'frame_length_ms': 50, \n",
" 'frame_shift_ms': 12.5,\n",
" 'sample_rate': 48000, # This must match the sample rate of the dataset.\n",
" 'hop_length': None, # In general, you don't need to change this.\n",
" 'win_length': 1024, # In general, you don't need to change this.\n",
" 'preemphasis': 0.98, # In general, 0 gives better voice recovery but makes training harder. If your model does not train, try 0.97 - 0.99.\n",
" 'min_level_db': -100,\n",
" 'ref_level_db': 0, # The base DB; increase until all background noise is removed in the spectrogram, then lower until you hear better speech below.\n",
" 'power': 1.5, # Change this value and listen to the synthesized voice. 1.2 - 1.5 are resonable values.\n",
" 'griffin_lim_iters': 60, # Quality does not improve for values > 60\n",
" 'mel_fmin': 0.0, # Adjust this and check mel-spectrogram-based voice synthesis below.\n",
" 'mel_fmax': 8000.0, # Adjust this and check mel-spectrogram-based voice synthesis below.\n",
" 'do_trim_silence': True # If you dataset has some silience at the beginning or end, this trims it. Check the AP.load_wav() below,if it causes any difference for the loaded audio file.\n",
"}\n",
"\n",
"# These options have to be forced off in order to avoid errors about the \n",
@ -86,59 +121,57 @@
"}\n",
"\n",
"# Override select parts of loaded config with parameters above\n",
"tuned_config = CONFIG.audio.copy()\n",
"tuned_config = CONFIG.copy()\n",
"tuned_config.update(reset)\n",
"tuned_config.update(tune_params)\n",
"\n",
"AP = AudioProcessor(**tuned_config);"
],
"outputs": [],
"metadata": {
"Collapsed": "false"
}
]
},
{
"cell_type": "markdown",
"source": [
"### Check audio loading "
],
"metadata": {
"Collapsed": "false"
}
},
"source": [
"### Check audio loading "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false"
},
"outputs": [],
"source": [
"wav = AP.load_wav(SAMPLE_FILE_PATH)\n",
"ipd.Audio(data=wav, rate=AP.sample_rate) "
],
"outputs": [],
"metadata": {
"Collapsed": "false"
}
]
},
{
"cell_type": "markdown",
"source": [
"### Generate Mel-Spectrogram and Re-synthesis with GL"
],
"metadata": {
"Collapsed": "false"
}
},
"source": [
"### Generate Mel-Spectrogram and Re-synthesis with GL"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"AP.power = 1.5"
],
"outputs": [],
"metadata": {}
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"mel = AP.melspectrogram(wav)\n",
"print(\"Max:\", mel.max())\n",
@ -148,24 +181,24 @@
"\n",
"wav_gen = AP.inv_melspectrogram(mel)\n",
"ipd.Audio(wav_gen, rate=AP.sample_rate)"
],
"outputs": [],
"metadata": {
"Collapsed": "false"
}
]
},
{
"cell_type": "markdown",
"source": [
"### Generate Linear-Spectrogram and Re-synthesis with GL"
],
"metadata": {
"Collapsed": "false"
}
},
"source": [
"### Generate Linear-Spectrogram and Re-synthesis with GL"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false"
},
"outputs": [],
"source": [
"spec = AP.spectrogram(wav)\n",
"print(\"Max:\", spec.max())\n",
@ -175,26 +208,26 @@
"\n",
"wav_gen = AP.inv_spectrogram(spec)\n",
"ipd.Audio(wav_gen, rate=AP.sample_rate)"
],
"outputs": [],
"metadata": {
"Collapsed": "false"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"Collapsed": "false"
},
"source": [
"### Compare values for a certain parameter\n",
"\n",
"Optimize your parameters by comparing different values per parameter at a time."
],
"metadata": {
"Collapsed": "false"
}
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false"
},
"outputs": [],
"source": [
"from librosa import display\n",
"from matplotlib import pylab as plt\n",
@ -234,39 +267,39 @@
" val = values[idx]\n",
" print(\" > {} = {}\".format(attribute, val))\n",
" IPython.display.display(IPython.display.Audio(wav_gen, rate=AP.sample_rate))"
],
"outputs": [],
"metadata": {
"Collapsed": "false"
}
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"Collapsed": "false"
},
"outputs": [],
"source": [
"compare_values(\"preemphasis\", [0, 0.5, 0.97, 0.98, 0.99])"
],
"outputs": [],
"metadata": {
"Collapsed": "false"
}
]
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"compare_values(\"ref_level_db\", [2, 5, 10, 15, 20, 25, 30, 35, 40, 1000])"
],
"outputs": [],
"metadata": {
"Collapsed": "false"
}
},
"outputs": [],
"source": [
"compare_values(\"ref_level_db\", [2, 5, 10, 15, 20, 25, 30, 35, 40, 1000])"
]
}
],
"metadata": {
"interpreter": {
"hash": "27648abe09795c3a768a281b31f7524fcf66a207e733f8ecda3a4e1fd1059fb0"
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3.8.5 64-bit ('torch': conda)"
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
@ -278,12 +311,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
},
"interpreter": {
"hash": "27648abe09795c3a768a281b31f7524fcf66a207e733f8ecda3a4e1fd1059fb0"
"version": "3.9.5"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
}

View File

@ -6,7 +6,7 @@ max-line-length=120
[tool.black]
line-length = 120
target-version = ['py38']
target-version = ['py39']
exclude = '''
(

View File

@ -49,7 +49,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config)
# You can define your custom sample loader returning the list of samples.
# Or define your custom formatter and pass it to the `load_tts_samples`.
# Check `TTS.tts.datasets.load_tts_samples` for more details.
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
train_samples, eval_samples = load_tts_samples(
dataset_config,
eval_split=True,
eval_split_max_size=config.eval_split_max_size,
eval_split_size=config.eval_split_size,
)
# init model
model = AlignTTS(config, ap, tokenizer)
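
The recipe above notes that a custom formatter can be passed to `load_tts_samples` instead of a built-in one. As a rough sketch (the metadata layout, the `formatter=` keyword, and the exact item keys are assumptions for illustration; compare with the built-in formatters in `TTS.tts.datasets.formatters` for your version):

```python
import os


def my_dataset_formatter(root_path, meta_file, **kwargs):  # hypothetical formatter
    """Assumes `meta_file` holds `wav_name|transcription` lines, one sample per line."""
    items = []
    speaker_name = "my_speaker"
    with open(os.path.join(root_path, meta_file), "r", encoding="utf-8") as f:
        for line in f:
            wav_name, text = line.strip().split("|")
            items.append(
                {
                    "text": text,
                    "audio_file": os.path.join(root_path, "wavs", wav_name + ".wav"),
                    "speaker_name": speaker_name,
                }
            )
    return items


# then pass it to the loader in place of the built-in formatter
train_samples, eval_samples = load_tts_samples(
    dataset_config,
    eval_split=True,
    formatter=my_dataset_formatter,
    eval_split_max_size=config.eval_split_max_size,
    eval_split_size=config.eval_split_size,
)
```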

View File

@ -84,7 +84,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config)
# You can define your custom sample loader returning the list of samples.
# Or define your custom formatter and pass it to the `load_tts_samples`.
# Check `TTS.tts.datasets.load_tts_samples` for more details.
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
train_samples, eval_samples = load_tts_samples(
dataset_config,
eval_split=True,
eval_split_max_size=config.eval_split_max_size,
eval_split_size=config.eval_split_size,
)
# init the model
model = ForwardTTS(config, ap, tokenizer, speaker_manager=None)

View File

@ -83,7 +83,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config)
# You can define your custom sample loader returning the list of samples.
# Or define your custom formatter and pass it to the `load_tts_samples`.
# Check `TTS.tts.datasets.load_tts_samples` for more details.
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
train_samples, eval_samples = load_tts_samples(
dataset_config,
eval_split=True,
eval_split_max_size=config.eval_split_max_size,
eval_split_size=config.eval_split_size,
)
# init the model
model = ForwardTTS(config, ap, tokenizer)

View File

@ -60,7 +60,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config)
# You can define your custom sample loader returning the list of samples.
# Or define your custom formatter and pass it to the `load_tts_samples`.
# Check `TTS.tts.datasets.load_tts_samples` for more details.
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
train_samples, eval_samples = load_tts_samples(
dataset_config,
eval_split=True,
eval_split_max_size=config.eval_split_max_size,
eval_split_size=config.eval_split_size,
)
# INITIALIZE THE MODEL
# Models take a config object and a speaker manager as input

View File

@ -37,16 +37,10 @@ ap = AudioProcessor(**config.audio.to_dict())
eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size)
# init model
model = GAN(config)
model = GAN(config, ap)
# init the trainer and 🚀
trainer = Trainer(
TrainerArgs(),
config,
output_path,
model=model,
train_samples=train_samples,
eval_samples=eval_samples,
training_assets={"audio_processor": ap},
TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
)
trainer.fit()

View File

@ -37,16 +37,10 @@ ap = AudioProcessor(**config.audio.to_dict())
eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size)
# init model
model = GAN(config)
model = GAN(config, ap)
# init the trainer and 🚀
trainer = Trainer(
TrainerArgs(),
config,
output_path,
model=model,
train_samples=train_samples,
eval_samples=eval_samples,
training_assets={"audio_processor": ap},
TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
)
trainer.fit()

View File

@ -67,7 +67,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config)
# You can define your custom sample loader returning the list of samples.
# Or define your custom formatter and pass it to the `load_tts_samples`.
# Check `TTS.tts.datasets.load_tts_samples` for more details.
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
train_samples, eval_samples = load_tts_samples(
dataset_config,
eval_split=True,
eval_split_max_size=config.eval_split_max_size,
eval_split_size=config.eval_split_size,
)
# init model
model = ForwardTTS(config, ap, tokenizer)

View File

@ -77,7 +77,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config)
# You can define your custom sample loader returning the list of samples.
# Or define your custom formatter and pass it to the `load_tts_samples`.
# Check `TTS.tts.datasets.load_tts_samples` for more details.
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
train_samples, eval_samples = load_tts_samples(
dataset_config,
eval_split=True,
eval_split_max_size=config.eval_split_max_size,
eval_split_size=config.eval_split_size,
)
# INITIALIZE THE MODEL
# Models take a config object and a speaker manager as input

View File

@ -74,7 +74,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config)
# You can define your custom sample loader returning the list of samples.
# Or define your custom formatter and pass it to the `load_tts_samples`.
# Check `TTS.tts.datasets.load_tts_samples` for more details.
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
train_samples, eval_samples = load_tts_samples(
dataset_config,
eval_split=True,
eval_split_max_size=config.eval_split_max_size,
eval_split_size=config.eval_split_size,
)
# INITIALIZE THE MODEL
# Models take a config object and a speaker manager as input
@ -84,12 +89,6 @@ model = Tacotron2(config, ap, tokenizer, speaker_manager=None)
# init the trainer and 🚀
trainer = Trainer(
TrainerArgs(),
config,
output_path,
model=model,
train_samples=train_samples,
eval_samples=eval_samples,
training_assets={"audio_processor": ap},
TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
)
trainer.fit()

View File

@ -36,16 +36,10 @@ ap = AudioProcessor(**config.audio.to_dict())
eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size)
# init model
model = GAN(config)
model = GAN(config, ap)
# init the trainer and 🚀
trainer = Trainer(
TrainerArgs(),
config,
output_path,
model=model,
train_samples=train_samples,
eval_samples=eval_samples,
training_assets={"audio_processor": ap},
TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
)
trainer.fit()

View File

@ -69,7 +69,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config)
# You can define your custom sample loader returning the list of samples.
# Or define your custom formatter and pass it to the `load_tts_samples`.
# Check `TTS.tts.datasets.load_tts_samples` for more details.
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
train_samples, eval_samples = load_tts_samples(
dataset_config,
eval_split=True,
eval_split_max_size=config.eval_split_max_size,
eval_split_size=config.eval_split_size,
)
# init model
model = Vits(config, ap, tokenizer, speaker_manager=None)

View File

@ -7,9 +7,10 @@ from TTS.config.shared_configs import BaseAudioConfig
from TTS.tts.configs.shared_configs import BaseDatasetConfig
from TTS.tts.configs.vits_config import VitsConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.vits import Vits, VitsArgs
from TTS.tts.models.vits import CharactersConfig, Vits, VitsArgs
from TTS.tts.utils.languages import LanguageManager
from TTS.tts.utils.speakers import SpeakerManager
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor
output_path = os.path.dirname(os.path.abspath(__file__))
@ -73,15 +74,16 @@ config = VitsConfig(
max_audio_len=160000,
output_path=output_path,
datasets=dataset_config,
characters={
"pad": "_",
"eos": "&",
"bos": "*",
"characters": "'(),-.:;¿?abcdefghijklmnopqrstuvwxyzµßàáâäåæçèéêëìíîïñòóôöùúûüąćęłńœśşźżƒабвгдежзийклмнопрстуфхцчшщъыьэюяёєіїґӧ «°±µ»$%&‘’‚“`”„",
"punctuations": "'(),-.:;¿? ",
"phonemes": None,
"unique": True,
},
characters=CharactersConfig(
characters_class="TTS.tts.models.vits.VitsCharacters",
pad="<PAD>",
eos="<EOS>",
bos="<BOS>",
blank="<BLNK>",
characters="'(),-.:;¿?abcdefghijklmnopqrstuvwxyzµßàáâäåæçèéêëìíîïñòóôöùúûüąćęłńœśşźżƒабвгдежзийклмнопрстуфхцчшщъыьэюяёєіїґӧ «°±µ»$%&‘’‚“`”„",
punctuations="'(),-.:;¿? ",
phonemes=None,
),
test_sentences=[
[
"It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
@ -100,32 +102,39 @@ config = VitsConfig(
],
)
# force the conversion of the custom characters to a config attribute
config.from_dict(config.to_dict())
# init audio processor
ap = AudioProcessor(**config.audio.to_dict())
# load training samples
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
train_samples, eval_samples = load_tts_samples(
dataset_config,
eval_split=True,
eval_split_max_size=config.eval_split_max_size,
eval_split_size=config.eval_split_size,
)
# init speaker manager for multi-speaker training
# it maps speaker-id to speaker-name in the model and data-loader
speaker_manager = SpeakerManager()
speaker_manager.set_speaker_ids_from_data(train_samples + eval_samples)
speaker_manager.set_ids_from_data(train_samples + eval_samples, parse_key="speaker_name")
config.model_args.num_speakers = speaker_manager.num_speakers
language_manager = LanguageManager(config=config)
config.model_args.num_languages = language_manager.num_languages
# INITIALIZE THE TOKENIZER
# Tokenizer is used to convert text to sequences of token IDs.
# config is updated with the default characters if not defined in the config.
tokenizer, config = TTSTokenizer.init_from_config(config)
# init model
model = Vits(config, speaker_manager, language_manager)
model = Vits(config, ap, tokenizer, speaker_manager, language_manager)
# init the trainer and 🚀
trainer = Trainer(
TrainerArgs(),
config,
output_path,
model=model,
train_samples=train_samples,
eval_samples=eval_samples,
training_assets={"audio_processor": ap},
TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
)
trainer.fit()

View File

@ -71,12 +71,17 @@ tokenizer, config = TTSTokenizer.init_from_config(config)
# You can define your custom sample loader returning the list of samples.
# Or define your custom formatter and pass it to the `load_tts_samples`.
# Check `TTS.tts.datasets.load_tts_samples` for more details.
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
train_samples, eval_samples = load_tts_samples(
dataset_config,
eval_split=True,
eval_split_max_size=config.eval_split_max_size,
eval_split_size=config.eval_split_size,
)
# init speaker manager for multi-speaker training
# it maps speaker-id to speaker-name in the model and data-loader
speaker_manager = SpeakerManager()
speaker_manager.set_speaker_ids_from_data(train_samples + eval_samples)
speaker_manager.set_ids_from_data(train_samples + eval_samples, parse_key="speaker_name")
config.model_args.num_speakers = speaker_manager.num_speakers
# init model

View File

@ -69,12 +69,17 @@ tokenizer, config = TTSTokenizer.init_from_config(config)
# You can define your custom sample loader returning the list of samples.
# Or define your custom formatter and pass it to the `load_tts_samples`.
# Check `TTS.tts.datasets.load_tts_samples` for more details.
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
train_samples, eval_samples = load_tts_samples(
dataset_config,
eval_split=True,
eval_split_max_size=config.eval_split_max_size,
eval_split_size=config.eval_split_size,
)
# init speaker manager for multi-speaker training
# it maps speaker-id to speaker-name in the model and data-loader
speaker_manager = SpeakerManager()
speaker_manager.set_speaker_ids_from_data(train_samples + eval_samples)
speaker_manager.set_ids_from_data(train_samples + eval_samples, parse_key="speaker_name")
config.model_args.num_speakers = speaker_manager.num_speakers
# init model

View File

@ -69,12 +69,17 @@ tokenizer, config = TTSTokenizer.init_from_config(config)
# You can define your custom sample loader returning the list of samples.
# Or define your custom formatter and pass it to the `load_tts_samples`.
# Check `TTS.tts.datasets.load_tts_samples` for more details.
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
train_samples, eval_samples = load_tts_samples(
dataset_config,
eval_split=True,
eval_split_max_size=config.eval_split_max_size,
eval_split_size=config.eval_split_size,
)
# init speaker manager for multi-speaker training
# it maps speaker-id to speaker-name in the model and data-loader
speaker_manager = SpeakerManager()
speaker_manager.set_speaker_ids_from_data(train_samples + eval_samples)
speaker_manager.set_ids_from_data(train_samples + eval_samples, parse_key="speaker_name")
config.num_speakers = speaker_manager.num_speakers
# init model

View File

@ -69,12 +69,17 @@ tokenizer, config = TTSTokenizer.init_from_config(config)
# You can define your custom sample loader returning the list of samples.
# Or define your custom formatter and pass it to the `load_tts_samples`.
# Check `TTS.tts.datasets.load_tts_samples` for more details.
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
train_samples, eval_samples = load_tts_samples(
dataset_config,
eval_split=True,
eval_split_max_size=config.eval_split_max_size,
eval_split_size=config.eval_split_size,
)
# init speaker manager for multi-speaker training
# it maps speaker-id to speaker-name in the model and data-loader
speaker_manager = SpeakerManager()
speaker_manager.set_speaker_ids_from_data(train_samples + eval_samples)
speaker_manager.set_ids_from_data(train_samples + eval_samples, parse_key="speaker_name")
config.model_args.num_speakers = speaker_manager.num_speakers
# init model

Some files were not shown because too many files have changed in this diff.