Merge pull request #1942 from coqui-ai/dev

v0.9.0
Eren Gölge 2022-11-16 16:50:57 +01:00 committed by GitHub
commit 56ba616a03
165 changed files with 101831 additions and 443 deletions

View File

@ -1,2 +1,9 @@
.git/
Dockerfile
build/
dist/
TTS.egg-info/
tests/outputs/*
tests/train_outputs/*
__pycache__/
*.pyc

View File

@ -21,9 +21,9 @@ jobs:
python-version: [3.7, 3.8, 3.9, "3.10"]
experimental: [false]
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: coqui-ai/setup-python@pip-cache-key-py-ver
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
architecture: x64

View File

@ -21,9 +21,9 @@ jobs:
python-version: [3.7, 3.8, 3.9, "3.10"]
experimental: [false]
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: coqui-ai/setup-python@pip-cache-key-py-ver
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
architecture: x64

View File

@ -15,8 +15,8 @@ jobs:
matrix:
arch: ["amd64"]
base:
- "nvcr.io/nvidia/pytorch:22.03-py3" # GPU enabled
- "ubuntu:20.04" # CPU only
- "nvidia/cuda:11.8.0-base-ubuntu22.04" # GPU enabled
- "python:3.10.8-slim" # CPU only
steps:
- uses: actions/checkout@v2
- name: Log in to the Container registry
@ -32,7 +32,7 @@ jobs:
base="ghcr.io/coqui-ai/tts"
tags="" # PR build
if [[ ${{ matrix.base }} = "ubuntu:20.04" ]]; then
if [[ ${{ matrix.base }} = "python:3.10.8-slim" ]]; then
base="ghcr.io/coqui-ai/tts-cpu"
fi

View File

@ -21,9 +21,9 @@ jobs:
python-version: [3.7, 3.8, 3.9, "3.10"]
experimental: [false]
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: coqui-ai/setup-python@pip-cache-key-py-ver
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
architecture: x64

View File

@ -21,9 +21,9 @@ jobs:
python-version: [3.9]
experimental: [false]
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: coqui-ai/setup-python@pip-cache-key-py-ver
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
architecture: x64

View File

@ -21,9 +21,9 @@ jobs:
python-version: [3.7, 3.8, 3.9, "3.10"]
experimental: [false]
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: coqui-ai/setup-python@pip-cache-key-py-ver
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
architecture: x64

View File

@ -21,9 +21,9 @@ jobs:
python-version: [3.7, 3.8, 3.9, "3.10"]
experimental: [false]
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: coqui-ai/setup-python@pip-cache-key-py-ver
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
architecture: x64

View File

@ -21,9 +21,9 @@ jobs:
python-version: [3.7, 3.8, 3.9, "3.10"]
experimental: [false]
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: coqui-ai/setup-python@pip-cache-key-py-ver
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
architecture: x64

.github/workflows/zoo_tests0.yml vendored Normal file
View File

@ -0,0 +1,52 @@
name: zoo-tests-0
on:
push:
branches:
- main
pull_request:
types: [opened, synchronize, reopened]
jobs:
check_skip:
runs-on: ubuntu-latest
if: "! contains(github.event.head_commit.message, '[ci skip]')"
steps:
- run: echo "${{ github.event.head_commit.message }}"
test:
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
python-version: [3.7, 3.8, 3.9, "3.10"]
experimental: [false]
steps:
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
architecture: x64
cache: 'pip'
cache-dependency-path: 'requirements*'
- name: check OS
run: cat /etc/os-release
- name: Install dependencies
run: |
sudo apt-get update
sudo apt-get install -y git make gcc
sudo apt-get install espeak espeak-ng
make system-deps
- name: Install/upgrade Python setup deps
run: python3 -m pip install --upgrade pip setuptools wheel
- name: Replace scarf urls
run: |
sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json
- name: Install TTS
run: |
python3 -m pip install .[all]
python3 setup.py egg_info
- name: Unit tests
run: |
nose2 -F -v -B TTS tests.zoo_tests.test_models.test_models_offset_0_step_3
nose2 -F -v -B TTS tests.zoo_tests.test_models.test_voice_conversion

View File

@ -1,4 +1,4 @@
name: zoo-tests
name: zoo-tests-1
on:
push:
@ -21,9 +21,9 @@ jobs:
python-version: [3.7, 3.8, 3.9, "3.10"]
experimental: [false]
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: coqui-ai/setup-python@pip-cache-key-py-ver
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
architecture: x64
@ -47,4 +47,4 @@ jobs:
python3 -m pip install .[all]
python3 setup.py egg_info
- name: Unit tests
run: make test_zoo
run: nose2 -F -v -B --with-coverage --coverage TTS tests.zoo_tests.test_models.test_models_offset_1_step_3

.github/workflows/zoo_tests2.yml vendored Normal file
View File

@ -0,0 +1,50 @@
name: zoo-tests-2
on:
push:
branches:
- main
pull_request:
types: [opened, synchronize, reopened]
jobs:
check_skip:
runs-on: ubuntu-latest
if: "! contains(github.event.head_commit.message, '[ci skip]')"
steps:
- run: echo "${{ github.event.head_commit.message }}"
test:
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
python-version: [3.7, 3.8, 3.9, "3.10"]
experimental: [false]
steps:
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
architecture: x64
cache: 'pip'
cache-dependency-path: 'requirements*'
- name: check OS
run: cat /etc/os-release
- name: Install dependencies
run: |
sudo apt-get update
sudo apt-get install -y git make gcc
sudo apt-get install espeak espeak-ng
make system-deps
- name: Install/upgrade Python setup deps
run: python3 -m pip install --upgrade pip setuptools wheel
- name: Replace scarf urls
run: |
sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json
- name: Install TTS
run: |
python3 -m pip install .[all]
python3 setup.py egg_info
- name: Unit tests
run: nose2 -F -v -B --with-coverage --coverage TTS tests.zoo_tests.test_models.test_models_offset_2_step_3

View File

@ -1,20 +1,12 @@
ARG BASE=nvcr.io/nvidia/pytorch:22.03-py3
ARG BASE=nvidia/cuda:11.8.0-base-ubuntu22.04
FROM ${BASE}
RUN apt-get update && apt-get install -y --no-install-recommends gcc g++ make python3 python3-dev python3-pip python3-venv python3-wheel espeak espeak-ng libsndfile1-dev && rm -rf /var/lib/apt/lists/*
RUN pip install llvmlite --ignore-installed
# Create and activate virtual env
ENV VIRTUAL_ENV=/venv
RUN python3 -m venv $VIRTUAL_ENV
ENV PATH="$VIRTUAL_ENV/bin:$PATH"
RUN pip install -U pip setuptools wheel
RUN apt-get update && apt-get upgrade -y
RUN apt-get install -y --no-install-recommends gcc g++ make python3 python3-dev python3-pip python3-venv python3-wheel espeak-ng libsndfile1-dev && rm -rf /var/lib/apt/lists/*
RUN pip3 install llvmlite --ignore-installed
WORKDIR /root
COPY requirements.txt /root
COPY requirements.dev.txt /root
COPY requirements.notebooks.txt /root
RUN ["/bin/bash", "-c", "pip install -r <(cat requirements.txt requirements.dev.txt requirements.notebooks.txt)"]
COPY . /root
RUN pip3 install torch torchaudio --extra-index-url https://download.pytorch.org/whl/cu118
RUN make install
ENTRYPOINT ["tts"]
CMD ["--help"]

View File

@ -1,9 +1,16 @@
# <img src="https://raw.githubusercontent.com/coqui-ai/TTS/main/images/coqui-log-green-TTS.png" height="56"/>
<img src="https://raw.githubusercontent.com/coqui-ai/TTS/main/images/coqui-log-green-TTS.png" height="56"/>
----
### 📣 Clone your voice with a single click on [🐸Coqui.ai](https://app.coqui.ai/auth/signin)
### 📣 🐸Coqui Studio is launching soon!! Join our [waiting list](https://coqui.ai/)!!
----
🐸TTS is a library for advanced Text-to-Speech generation. It's built on the latest research and designed to achieve the best trade-off among ease of training, speed, and quality.
🐸TTS comes with pretrained models and tools for measuring dataset quality, and it is already used in **20+ languages** for products and research projects.
[![Gitter](https://badges.gitter.im/coqui-ai/TTS.svg)](https://gitter.im/coqui-ai/TTS?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge)
[![Discord](https://img.shields.io/discord/1037326658807533628?color=%239B59B6&label=chat%20on%20discord)](https://discord.gg/5eXr5seRrv)
[![License](<https://img.shields.io/badge/License-MPL%202.0-brightgreen.svg>)](https://opensource.org/licenses/MPL-2.0)
[![PyPI version](https://badge.fury.io/py/TTS.svg)](https://badge.fury.io/py/TTS)
[![Covenant](https://camo.githubusercontent.com/7d620efaa3eac1c5b060ece5d6aacfcc8b81a74a04d05cd0398689c01c4463bb/68747470733a2f2f696d672e736869656c64732e696f2f62616467652f436f6e7472696275746f72253230436f76656e616e742d76322e3025323061646f707465642d6666363962342e737667)](https://github.com/coqui-ai/TTS/blob/master/CODE_OF_CONDUCT.md)
@ -36,12 +43,12 @@ Please use our dedicated channels for questions and discussion. Help is much mor
| ------------------------------- | --------------------------------------- |
| 🚨 **Bug Reports** | [GitHub Issue Tracker] |
| 🎁 **Feature Requests & Ideas** | [GitHub Issue Tracker] |
| 👩‍💻 **Usage Questions** | [Github Discussions] |
| 🗯 **General Discussion** | [Github Discussions] or [Gitter Room] |
| 👩‍💻 **Usage Questions** | [GitHub Discussions] |
| 🗯 **General Discussion** | [GitHub Discussions] or [Discord] |
[github issue tracker]: https://github.com/coqui-ai/tts/issues
[github discussions]: https://github.com/coqui-ai/TTS/discussions
[gitter room]: https://gitter.im/coqui-ai/TTS?utm_source=share-link&utm_medium=link&utm_campaign=share-link
[discord]: https://discord.gg/5eXr5seRrv
[Tutorials and Examples]: https://github.com/coqui-ai/TTS/wiki/TTS-Notebooks-and-Tutorials
@ -75,7 +82,7 @@ Underlined "TTS*" and "Judy*" are 🐸TTS models
- Modular (but not too much) code base enabling easy implementation of new ideas.
## Implemented Models
### Text-to-Spectrogram
### Spectrogram models
- Tacotron: [paper](https://arxiv.org/abs/1703.10135)
- Tacotron2: [paper](https://arxiv.org/abs/1712.05884)
- Glow-TTS: [paper](https://arxiv.org/abs/2005.11129)
@ -83,9 +90,12 @@ Underlined "TTS*" and "Judy*" are 🐸TTS models
- Align-TTS: [paper](https://arxiv.org/abs/2003.01950)
- FastPitch: [paper](https://arxiv.org/pdf/2006.06873.pdf)
- FastSpeech: [paper](https://arxiv.org/abs/1905.09263)
- SC-GlowTTS: [paper](https://arxiv.org/abs/2104.05557)
- Capacitron: [paper](https://arxiv.org/abs/1906.03402)
### End-to-End Models
- VITS: [paper](https://arxiv.org/pdf/2106.06103)
- YourTTS: [paper](https://arxiv.org/abs/2112.02418)
### Attention Methods
- Guided Attention: [paper](https://arxiv.org/abs/1710.08969)
@ -136,6 +146,21 @@ $ make install
If you are on Windows, 👑@GuyPaddock wrote installation instructions [here](https://stackoverflow.com/questions/66726331/how-can-i-run-mozilla-tts-coqui-tts-training-with-cuda-on-a-windows-system).
## Docker Image
You can also try TTS without installing it by using the Docker image.
Simply run the following commands to list the available models and start a server:
```bash
docker run --rm -it -p 5002:5002 --entrypoint /bin/bash ghcr.io/coqui-ai/tts-cpu
python3 TTS/server/server.py --list_models  # To get the list of available models
python3 TTS/server/server.py --model_name tts_models/en/vctk/vits # To start a server
```
You can then enjoy the TTS server [here](http://[::1]:5002/).
More details about the Docker images (like GPU support) can be found [here](https://tts.readthedocs.io/en/latest/docker_images.html).
## Use TTS
### Single Speaker Models
@ -147,12 +172,12 @@ If you are on Windows, 👑@GuyPaddock wrote installation instructions [here](ht
```
- Get model info (for both tts_models and vocoder_models):
- Query by type/name:
The model_info_by_name uses the name as it appears in the --list_models output.
```
$ tts --model_info_by_name "<model_type>/<language>/<dataset>/<model_name>"
```
For example:
```
$ tts --model_info_by_name tts_models/tr/common-voice/glow-tts
```
@ -160,16 +185,16 @@ If you are on Windows, 👑@GuyPaddock wrote installation instructions [here](ht
$ tts --model_info_by_name vocoder_models/en/ljspeech/hifigan_v2
```
- Query by type/idx:
The model_query_idx uses the corresponding idx from --list_models.
```
$ tts --model_info_by_idx "<model_type>/<model_query_idx>"
```
For example:
```
$ tts --model_info_by_idx tts_models/3
```
- Run TTS with default models:
```
@ -208,7 +233,7 @@ If you are on Windows, 👑@GuyPaddock wrote installation instructions [here](ht
- Run your own TTS and Vocoder models:
```
$ tts --text "Text for TTS" --model_path path/to/config.json --config_path path/to/model.pth --out_path output/path/speech.wav
$ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav
--vocoder_path path/to/vocoder.pth --vocoder_config_path path/to/vocoder_config.json
```
@ -229,7 +254,7 @@ If you are on Windows, 👑@GuyPaddock wrote installation instructions [here](ht
- Run your own multi-speaker TTS model:
```
$ tts --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/config.json --config_path path/to/model.pth --speakers_file_path path/to/speaker.json --speaker_idx <speaker_id>
$ tts --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/model.pth --config_path path/to/config.json --speakers_file_path path/to/speaker.json --speaker_idx <speaker_id>
```
## Directory Structure
@ -239,8 +264,6 @@ If you are on Windows, 👑@GuyPaddock wrote installation instructions [here](ht
|- TTS
|- bin/ (folder for all the executables.)
|- train*.py (train your target model.)
|- distribute.py (train your TTS model using Multiple GPUs.)
|- compute_statistics.py (compute dataset statistics for normalization.)
|- ...
|- tts/ (text to speech models)
|- layers/ (model layer definitions)

View File

@ -12,6 +12,61 @@
}
}
},
"bg": {
"cv": {
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--bg--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"cs": {
"cv": {
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--cs--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"da": {
"cv": {
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--da--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"et": {
"cv": {
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--et--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"ga": {
"cv": {
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--ga--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"en": {
"ek1": {
"tacotron2": {
@ -79,6 +134,14 @@
"license": "apache 2.0",
"contact": "egolge@coqui.com"
},
"vits--neon": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--en--ljspeech--vits.zip",
"default_vocoder": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause",
"contact": null,
"commit": null
},
"fast_pitch": {
"description": "FastPitch model trained on LJSpeech using the Aligner Network",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--fast_pitch.zip",
@ -130,10 +193,10 @@
"license": "apache 2.0",
"contact": "adamfroghyar@gmail.com"
},
"capacitron-t2-c150": {
"capacitron-t2-c150_v2": {
"description": "Capacitron additions to Tacotron 2 with Capacity at 150 as in https://arxiv.org/pdf/1906.03402.pdf",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.0_models/tts_models--en--blizzard2013--capacitron-t2-c150.zip",
"commit": "d6284e7",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.1_models/tts_models--en--blizzard2013--capacitron-t2-c150_v2.zip",
"commit": "a67039d",
"default_vocoder": "vocoder_models/en/blizzard2013/hifigan_v2",
"author": "Adam Froghyar @a-froghyar",
"license": "apache 2.0",
@ -151,18 +214,36 @@
"license": "MPL",
"contact": "egolge@coqui.com"
}
}
},
"css10":{
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--es--css10--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"fr": {
"mai": {
"tacotron2-DDC": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--fr--mai--tacotron2-DDC.zip",
"default_vocoder": "vocoder_models/universal/libri-tts/fullband-melgan",
"commit": "",
"commit": null,
"author": "Eren Gölge @erogol",
"license": "MPL",
"contact": "egolge@coqui.com"
}
},
"css10":{
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--fr--css10--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"uk":{
@ -174,6 +255,13 @@
"license": "MIT",
"contact": "",
"default_vocoder": "vocoder_models/uk/mai/multiband-melgan"
},
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--uk--mai--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
@ -198,6 +286,15 @@
"stats_file": null,
"commit": "540d811"
}
},
"css10":{
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--nl--css10--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"de": {
@ -224,6 +321,15 @@
"license": "apache 2.0",
"commit": "unknown"
}
},
"css10": {
"vits-neon":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--de--css10--vits.zip",
"default_vocoder": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause",
"commit": null
}
}
},
"ja": {
@ -359,6 +465,149 @@
"commit": "1b22f03"
}
}
},
"hu": {
"css10": {
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--hu--css10--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"el": {
"cv": {
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--el--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"fi": {
"css10": {
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--fi--css10--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"hr": {
"cv": {
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--hr--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"lt": {
"cv": {
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--lt--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"lv": {
"cv": {
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--lv--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"mt": {
"cv": {
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--mt--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"pl": {
"mai_female": {
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--pl--mai_female--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"pt": {
"cv": {
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--pt--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"ro": {
"cv": {
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--ro--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"sk": {
"cv": {
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--sk--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"sl": {
"cv": {
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--sl--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"sv": {
"cv": {
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--sv--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
}
},
"vocoder_models": {
@ -512,4 +761,4 @@
}
}
}
}
}
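
The model zoo gains a batch of single-speaker VITS models contributed by @NeonGeckoCom. As a rough sketch of fetching one of the new entries programmatically, the snippet below uses the `ModelManager` that `TTS/bin/synthesize.py` instantiates elsewhere in this diff; the local path to `.models.json` and the chosen model name are illustrative assumptions rather than part of this change.

```python
# Sketch: download one of the newly listed models with ModelManager.
# Assumes a repository checkout (TTS/.models.json) and the download_model()
# behaviour used by TTS/bin/synthesize.py; the model name comes from the entries above.
from pathlib import Path

from TTS.utils.manage import ModelManager

models_file = Path("TTS") / ".models.json"  # location inside a repo checkout (assumption)
manager = ModelManager(models_file, progress_bar=True)  # progress_bar is new in this release
model_path, config_path, model_item = manager.download_model("tts_models/bg/cv/vits")
print(model_path, config_path)
```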

View File

@ -1 +1 @@
0.8.0
0.9.0

View File

@ -6,38 +6,87 @@ import torch
from tqdm import tqdm
from TTS.config import load_config
from TTS.config.shared_configs import BaseDatasetConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.utils.managers import save_file
from TTS.tts.utils.speakers import SpeakerManager
parser = argparse.ArgumentParser(
description="""Compute embedding vectors for each wav file in a dataset.\n\n"""
description="""Compute embedding vectors for each audio file in a dataset and store them keyed by `{dataset_name}#{file_path}` in a .pth file\n\n"""
"""
Example runs:
python TTS/bin/compute_embeddings.py speaker_encoder_model.pth speaker_encoder_config.json dataset_config.json
python TTS/bin/compute_embeddings.py --model_path speaker_encoder_model.pth --config_path speaker_encoder_config.json --config_dataset_path dataset_config.json
python TTS/bin/compute_embeddings.py --model_path speaker_encoder_model.pth --config_path speaker_encoder_config.json --formatter_name vctk --dataset_path /path/to/vctk/dataset --dataset_name my_vctk --metafile /path/to/vctk/metafile.csv
""",
formatter_class=RawTextHelpFormatter,
)
parser.add_argument("model_path", type=str, help="Path to model checkpoint file.")
parser.add_argument("config_path", type=str, help="Path to model config file.")
parser.add_argument("config_dataset_path", type=str, help="Path to dataset config file.")
parser.add_argument(
"--model_path",
type=str,
help="Path to model checkpoint file. It defaults to the released speaker encoder.",
default="https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar",
)
parser.add_argument(
"--config_path",
type=str,
help="Path to model config file. It defaults to the released speaker encoder config.",
default="https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json",
)
parser.add_argument(
"--config_dataset_path",
type=str,
help="Path to dataset config file. You either need to provide this or `formatter_name`, `dataset_name` and `dataset_path` arguments.",
default=None,
)
parser.add_argument("--output_path", type=str, help="Path for output `pth` or `json` file.", default="speakers.pth")
parser.add_argument("--old_file", type=str, help="Previous embedding file to only compute new audios.", default=None)
parser.add_argument("--disable_cuda", type=bool, help="Flag to disable cuda.", default=False)
parser.add_argument("--no_eval", type=bool, help="Do not compute eval?. Default False", default=False)
parser.add_argument(
"--formatter_name",
type=str,
help="Name of the formatter to use. You either need to provide this or `config_dataset_path`",
default=None,
)
parser.add_argument(
"--dataset_name",
type=str,
help="Name of the dataset to use. You either need to provide this or `config_dataset_path`",
default=None,
)
parser.add_argument(
"--dataset_path",
type=str,
help="Path to the dataset. You either need to provide this or `config_dataset_path`",
default=None,
)
parser.add_argument(
"--metafile",
type=str,
help="Path to the meta file. If not set, dataset formatter uses the default metafile if it is defined in the formatter. You either need to provide this or `config_dataset_path`",
default=None,
)
args = parser.parse_args()
use_cuda = torch.cuda.is_available() and not args.disable_cuda
c_dataset = load_config(args.config_dataset_path)
if args.config_dataset_path is not None:
c_dataset = load_config(args.config_dataset_path)
meta_data_train, meta_data_eval = load_tts_samples(c_dataset.datasets, eval_split=not args.no_eval)
else:
c_dataset = BaseDatasetConfig()
c_dataset.formatter = args.formatter_name
c_dataset.dataset_name = args.dataset_name
c_dataset.path = args.dataset_path
c_dataset.meta_file_train = args.metafile if args.metafile else None
meta_data_train, meta_data_eval = load_tts_samples(c_dataset, eval_split=not args.no_eval)
meta_data_train, meta_data_eval = load_tts_samples(c_dataset.datasets, eval_split=not args.no_eval)
if meta_data_eval is None:
wav_files = meta_data_train
samples = meta_data_train
else:
wav_files = meta_data_train + meta_data_eval
samples = meta_data_train + meta_data_eval
encoder_manager = SpeakerManager(
encoder_model_path=args.model_path,
@ -50,25 +99,23 @@ class_name_key = encoder_manager.encoder_config.class_name_key
# compute speaker embeddings
speaker_mapping = {}
for idx, wav_file in enumerate(tqdm(wav_files)):
if isinstance(wav_file, dict):
class_name = wav_file[class_name_key]
wav_file = wav_file["audio_file"]
else:
class_name = None
for idx, fields in enumerate(tqdm(samples)):
class_name = fields[class_name_key]
audio_file = fields["audio_file"]
embedding_key = fields["audio_unique_name"]
root_path = fields["root_path"]
wav_file_name = os.path.basename(wav_file)
if args.old_file is not None and wav_file_name in encoder_manager.clip_ids:
if args.old_file is not None and embedding_key in encoder_manager.clip_ids:
# get the embedding from the old file
embedd = encoder_manager.get_embedding_by_clip(wav_file_name)
embedd = encoder_manager.get_embedding_by_clip(embedding_key)
else:
# extract the embedding
embedd = encoder_manager.compute_embedding_from_clip(wav_file)
embedd = encoder_manager.compute_embedding_from_clip(audio_file)
# create speaker_mapping if target dataset is defined
speaker_mapping[wav_file_name] = {}
speaker_mapping[wav_file_name]["name"] = class_name
speaker_mapping[wav_file_name]["embedding"] = embedd
speaker_mapping[embedding_key] = {}
speaker_mapping[embedding_key]["name"] = class_name
speaker_mapping[embedding_key]["embedding"] = embedd
if speaker_mapping:
# save speaker_mapping if target dataset is defined
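
The rewritten script keys every embedding by the new `{dataset_name}#{relative file path}` scheme and saves the mapping to a `.pth` file (default `speakers.pth`). A minimal sketch of inspecting that output, assuming the default `--output_path` and that the `.pth` file is readable with `torch.load`:

```python
# Sketch: inspect the embedding file written by compute_embeddings.py.
# Assumes the default output path "speakers.pth"; keys follow "{dataset_name}#{relative path}".
import torch

speaker_mapping = torch.load("speakers.pth", map_location="cpu")
for key, entry in list(speaker_mapping.items())[:3]:
    print(key, entry["name"], len(entry["embedding"]))
```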

View File

@ -37,7 +37,7 @@ def setup_loader(ap, r, verbose=False):
precompute_num_workers=0,
use_noise_augment=False,
verbose=verbose,
speaker_id_mapping=speaker_manager.ids if c.use_speaker_embedding else None,
speaker_id_mapping=speaker_manager.name_to_id if c.use_speaker_embedding else None,
d_vector_mapping=speaker_manager.embeddings if c.use_d_vector_file else None,
)

View File

@ -7,30 +7,25 @@ from tqdm.contrib.concurrent import process_map
from TTS.config import load_config
from TTS.tts.datasets import load_tts_samples
from TTS.tts.utils.text.phonemizers.gruut_wrapper import Gruut
phonemizer = Gruut(language="en-us")
from TTS.tts.utils.text.phonemizers import Gruut
def compute_phonemes(item):
try:
text = item[0]
ph = phonemizer.phonemize(text).split("|")
except:
return []
return list(set(ph))
text = item["text"]
ph = phonemizer.phonemize(text).replace("|", "")
return set(list(ph))
def main():
# pylint: disable=W0601
global c
global c, phonemizer
# pylint: disable=bad-option-value
parser = argparse.ArgumentParser(
description="""Find all the unique characters or phonemes in a dataset.\n\n"""
"""
Example runs:
python TTS/bin/find_unique_chars.py --config_path config.json
python TTS/bin/find_unique_phonemes.py --config_path config.json
""",
formatter_class=RawTextHelpFormatter,
)
@ -46,15 +41,24 @@ def main():
items = train_items + eval_items
print("Num items:", len(items))
is_lang_def = all(item["language"] for item in items)
language_list = [item["language"] for item in items]
is_lang_def = all(language_list)
if not c.phoneme_language or not is_lang_def:
raise ValueError("Phoneme language must be defined in config.")
if not language_list.count(language_list[0]) == len(language_list):
raise ValueError(
"Currently, just one phoneme language per config file is supported !! Please split the dataset config into different configs and run it individually for each language !!"
)
phonemizer = Gruut(language=language_list[0], keep_puncs=True)
phonemes = process_map(compute_phonemes, items, max_workers=multiprocessing.cpu_count(), chunksize=15)
phones = []
for ph in phonemes:
phones.extend(ph)
phones = set(phones)
lower_phones = filter(lambda c: c.islower(), phones)
phones_force_lower = [c.lower() for c in phones]
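
The script now builds a single Gruut phonemizer for the dataset language and keeps punctuation. A standalone sketch of that setup, using only calls shown in this diff (the language code and sample sentence are illustrative):

```python
# Sketch of the per-language phonemizer setup used above.
# Requires the gruut extra; the language and text are illustrative.
from TTS.tts.utils.text.phonemizers import Gruut

phonemizer = Gruut(language="en-us", keep_puncs=True)
phonemes = set(phonemizer.phonemize("Hello, world!").replace("|", ""))
print(sorted(phonemes))
```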

View File

@ -17,7 +17,7 @@ def adjust_path_and_remove_silence(audio_path):
# create all directory structure
pathlib.Path(output_path).parent.mkdir(parents=True, exist_ok=True)
# remove the silence and save the audio
output_path = remove_silence(
output_path, is_speech = remove_silence(
model_and_utils,
audio_path,
output_path,
@ -25,26 +25,34 @@ def adjust_path_and_remove_silence(audio_path):
use_cuda=args.use_cuda,
)
return output_path
return output_path, is_speech
def preprocess_audios():
files = sorted(glob.glob(os.path.join(args.input_dir, args.glob), recursive=True))
print("> Number of files: ", len(files))
if not args.force:
print("> Ignoring files that already exist in the output directory.")
print("> Ignoring files that already exist in the output idrectory.")
if args.trim_just_beginning_and_end:
print("> Trimming just the beginning and the end with nonspeech parts.")
else:
print("> Trimming all nonspeech parts.")
filtered_files = []
if files:
# create threads
# num_threads = multiprocessing.cpu_count()
# process_map(adjust_path_and_remove_silence, files, max_workers=num_threads, chunksize=15)
for f in tqdm(files):
adjust_path_and_remove_silence(f)
output_path, is_speech = adjust_path_and_remove_silence(f)
if not is_speech:
filtered_files.append(output_path)
# write files that do not have speech
with open(os.path.join(args.output_dir, "filtered_files.txt"), "w", encoding="utf-8") as f:
for file in filtered_files:
f.write(file + "\n")
else:
print("> No files Found !")

View File

@ -238,6 +238,13 @@ If you don't specify any models, then it uses LJSpeech based English model.
help="speaker ID of the reference_wav speaker (If not provided the embedding will be computed using the Speaker Encoder).",
default=None,
)
parser.add_argument(
"--progress_bar",
type=str2bool,
help="If true shows a progress bar for the model download. Defaults to True",
default=True,
)
args = parser.parse_args()
# print the description if either text or list_models is not set
@ -255,7 +262,7 @@ If you don't specify any models, then it uses LJSpeech based English model.
# load model manager
path = Path(__file__).parent / "../.models.json"
manager = ModelManager(path)
manager = ModelManager(path, progress_bar=args.progress_bar)
model_path = None
config_path = None
@ -323,7 +330,7 @@ If you don't specify any models, then it uses LJSpeech based English model.
print(
" > Available speaker ids: (Set --speaker_idx flag to one of these values to use the multi-speaker model."
)
print(synthesizer.tts_model.speaker_manager.ids)
print(synthesizer.tts_model.speaker_manager.name_to_id)
return
# query language ids of a multi-lingual model.
@ -331,7 +338,7 @@ If you don't specify any models, then it uses LJSpeech based English model.
print(
" > Available language ids: (Set --language_idx flag to one of these values to use the multi-lingual model."
)
print(synthesizer.tts_model.language_manager.ids)
print(synthesizer.tts_model.language_manager.name_to_id)
return
# check the arguments against a multi-speaker model.

View File

@ -62,7 +62,7 @@ class BaseAudioConfig(Coqpit):
Maximum frequency of the F0 frames. Defaults to ```640```.
pitch_fmin (float, optional):
Minimum frequency of the F0 frames. Defaults to ```0```.
Minimum frequency of the F0 frames. Defaults to ```1```.
trim_db (int):
Silence threshold used for silence trimming. Defaults to 45.
@ -144,7 +144,7 @@ class BaseAudioConfig(Coqpit):
do_amp_to_db_mel: bool = True
# f0 params
pitch_fmax: float = 640.0
pitch_fmin: float = 0.0
pitch_fmin: float = 1.0
# normalization params
signal_norm: bool = True
min_level_db: int = -100
@ -193,21 +193,24 @@ class BaseDatasetConfig(Coqpit):
"""Base config for TTS datasets.
Args:
name (str):
Dataset name that defines the preprocessor in use. Defaults to None.
formatter (str):
Formatter name that defines used formatter in ```TTS.tts.datasets.formatter```. Defaults to `""`.
dataset_name (str):
Unique name for the dataset. Defaults to `""`.
path (str):
Root path to the dataset files. Defaults to None.
Root path to the dataset files. Defaults to `""`.
meta_file_train (str):
Name of the dataset meta file. Or a list of speakers to be ignored at training for multi-speaker datasets.
Defaults to None.
Defaults to `""`.
ignored_speakers (List):
List of speakers IDs that are not used at the training. Default None.
language (str):
Language code of the dataset. If defined, it overrides `phoneme_language`. Defaults to None.
Language code of the dataset. If defined, it overrides `phoneme_language`. Defaults to `""`.
meta_file_val (str):
Name of the dataset meta file that defines the instances used at validation.
@ -217,7 +220,8 @@ class BaseDatasetConfig(Coqpit):
train the duration predictor.
"""
name: str = ""
formatter: str = ""
dataset_name: str = ""
path: str = ""
meta_file_train: str = ""
ignored_speakers: List[str] = None
@ -230,7 +234,7 @@ class BaseDatasetConfig(Coqpit):
):
"""Check config fields"""
c = asdict(self)
check_argument("name", c, restricted=True)
check_argument("formatter", c, restricted=True)
check_argument("path", c, restricted=True)
check_argument("meta_file_train", c, restricted=True)
check_argument("meta_file_val", c, restricted=False)

View File

@ -107,11 +107,18 @@ class BaseEncoder(nn.Module):
return criterion
def load_checkpoint(
self, config: Coqpit, checkpoint_path: str, eval: bool = False, use_cuda: bool = False, criterion=None
self,
config: Coqpit,
checkpoint_path: str,
eval: bool = False,
use_cuda: bool = False,
criterion=None,
cache=False,
):
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"))
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
try:
self.load_state_dict(state["model"])
print(" > Model fully restored. ")
except (KeyError, RuntimeError) as error:
# If eval raise the error
if eval:

View File

@ -44,13 +44,16 @@ class BaseTrainerModel(TrainerModel):
return outputs_dict
@abstractmethod
def load_checkpoint(self, config: Coqpit, checkpoint_path: str, eval: bool = False, strict: bool = True) -> None:
def load_checkpoint(
self, config: Coqpit, checkpoint_path: str, eval: bool = False, strict: bool = True, cache=False
) -> None:
"""Load a model checkpoint gile and get ready for training or inference.
Args:
config (Coqpit): Model configuration.
checkpoint_path (str): Path to the model checkpoint file.
eval (bool, optional): If true, init model for inference else for training. Defaults to False.
strcit (bool, optional): Match all checkpoint keys to model's keys. Defaults to True.
strict (bool, optional): Match all checkpoint keys to model's keys. Defaults to True.
cache (bool, optional): If True, cache the file locally for subsequent calls. It is cached under `get_user_data_dir()/tts_cache`. Defaults to False.
"""
...

View File

@ -5,6 +5,7 @@ import json
import os
import sys
from pathlib import Path
from threading import Lock
from typing import Union
from flask import Flask, render_template, request, send_file
@ -146,7 +147,7 @@ def index():
"index.html",
show_details=args.show_details,
use_multi_speaker=use_multi_speaker,
speaker_ids=speaker_manager.ids if speaker_manager is not None else None,
speaker_ids=speaker_manager.name_to_id if speaker_manager is not None else None,
use_gst=use_gst,
)
@ -168,17 +169,21 @@ def details():
)
lock = Lock()
@app.route("/api/tts", methods=["GET"])
def tts():
text = request.args.get("text")
speaker_idx = request.args.get("speaker_id", "")
style_wav = request.args.get("style_wav", "")
style_wav = style_wav_uri_to_dict(style_wav)
print(" > Model input: {}".format(text))
print(" > Speaker Idx: {}".format(speaker_idx))
wavs = synthesizer.tts(text, speaker_name=speaker_idx, style_wav=style_wav)
out = io.BytesIO()
synthesizer.save_wav(wavs, out)
with lock:
text = request.args.get("text")
speaker_idx = request.args.get("speaker_id", "")
style_wav = request.args.get("style_wav", "")
style_wav = style_wav_uri_to_dict(style_wav)
print(" > Model input: {}".format(text))
print(" > Speaker Idx: {}".format(speaker_idx))
wavs = synthesizer.tts(text, speaker_name=speaker_idx, style_wav=style_wav)
out = io.BytesIO()
synthesizer.save_wav(wavs, out)
return send_file(out, mimetype="audio/wav")
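
With the new `Lock`, concurrent requests to `/api/tts` are synthesized one at a time instead of racing on the shared synthesizer. A rough client-side sketch, assuming the server runs on the default port 5002 and the `requests` package is available:

```python
# Sketch: call the demo server's GET /api/tts endpoint.
# Assumes `python3 TTS/server/server.py --model_name ...` is running on localhost:5002.
import requests

resp = requests.get(
    "http://localhost:5002/api/tts",
    params={"text": "Hello from Coqui TTS.", "speaker_id": "", "style_wav": ""},
)
resp.raise_for_status()
with open("speech.wav", "wb") as f:
    f.write(resp.content)
```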

View File

@ -1,3 +1,4 @@
import os
import sys
from collections import Counter
from pathlib import Path
@ -12,20 +13,16 @@ from TTS.tts.datasets.formatters import *
def split_dataset(items, eval_split_max_size=None, eval_split_size=0.01):
"""Split a dataset into train and eval. Consider speaker distribution in multi-speaker training.
Args:
items (List[List]):
    A list of samples. Each sample is a list of `[audio_path, text, speaker_id]`.
eval_split_max_size (int):
    Maximum number of samples to be used for evaluation in a proportional split. Defaults to None (disabled).
eval_split_size (float):
    If between 0.0 and 1.0, represents the proportion of the dataset to include in the evaluation set.
    If > 1, represents the absolute number of evaluation samples. Defaults to 0.01 (1%).
"""
speakers = [item["speaker_name"] for item in items]
is_multi_speaker = len(set(speakers)) > 1
@ -59,6 +56,17 @@ def split_dataset(items, eval_split_max_size=None, eval_split_size=0.01):
return items[:eval_split_size], items[eval_split_size:]
def add_extra_keys(metadata, language, dataset_name):
for item in metadata:
# add language name
item["language"] = language
# add unique audio name
relfilepath = os.path.splitext(os.path.relpath(item["audio_file"], item["root_path"]))[0]
audio_unique_name = f"{dataset_name}#{relfilepath}"
item["audio_unique_name"] = audio_unique_name
return metadata
def load_tts_samples(
datasets: Union[List[Dict], Dict],
eval_split=True,
@ -97,7 +105,8 @@ def load_tts_samples(
if not isinstance(datasets, list):
datasets = [datasets]
for dataset in datasets:
name = dataset["name"]
formatter_name = dataset["formatter"]
dataset_name = dataset["dataset_name"]
root_path = dataset["path"]
meta_file_train = dataset["meta_file_train"]
meta_file_val = dataset["meta_file_val"]
@ -106,17 +115,19 @@ def load_tts_samples(
# setup the right data processor
if formatter is None:
formatter = _get_formatter_by_name(name)
formatter = _get_formatter_by_name(formatter_name)
# load train set
meta_data_train = formatter(root_path, meta_file_train, ignored_speakers=ignored_speakers)
meta_data_train = [{**item, **{"language": language}} for item in meta_data_train]
assert len(meta_data_train) > 0, f" [!] No training samples found in {root_path}/{meta_file_train}"
meta_data_train = add_extra_keys(meta_data_train, language, dataset_name)
print(f" | > Found {len(meta_data_train)} files in {Path(root_path).resolve()}")
# load evaluation split if set
if eval_split:
if meta_file_val:
meta_data_eval = formatter(root_path, meta_file_val, ignored_speakers=ignored_speakers)
meta_data_eval = [{**item, **{"language": language}} for item in meta_data_eval]
meta_data_eval = add_extra_keys(meta_data_eval, language, dataset_name)
else:
meta_data_eval, meta_data_train = split_dataset(meta_data_train, eval_split_max_size, eval_split_size)
meta_data_eval_all += meta_data_eval

View File

@ -1,3 +1,4 @@
import base64
import collections
import os
import random
@ -34,6 +35,12 @@ def noise_augment_audio(wav):
return wav + (1.0 / 32768.0) * np.random.rand(*wav.shape)
def string2filename(string):
# generate a safe and reversible filename based on a string
filename = base64.urlsafe_b64encode(string.encode("utf-8")).decode("utf-8", "ignore")
return filename
class TTSDataset(Dataset):
def __init__(
self,
@ -201,7 +208,7 @@ class TTSDataset(Dataset):
def get_f0(self, idx):
out_dict = self.f0_dataset[idx]
item = self.samples[idx]
assert item["audio_file"] == out_dict["audio_file"]
assert item["audio_unique_name"] == out_dict["audio_unique_name"]
return out_dict
@staticmethod
@ -256,6 +263,7 @@ class TTSDataset(Dataset):
"speaker_name": item["speaker_name"],
"language_name": item["language"],
"wav_file_name": os.path.basename(item["audio_file"]),
"audio_unique_name": item["audio_unique_name"],
}
return sample
@ -397,8 +405,8 @@ class TTSDataset(Dataset):
language_ids = None
# get pre-computed d-vectors
if self.d_vector_mapping is not None:
wav_files_names = list(batch["wav_file_name"])
d_vectors = [self.d_vector_mapping[w]["embedding"] for w in wav_files_names]
embedding_keys = list(batch["audio_unique_name"])
d_vectors = [self.d_vector_mapping[w]["embedding"] for w in embedding_keys]
else:
d_vectors = None
@ -560,19 +568,18 @@ class PhonemeDataset(Dataset):
def __getitem__(self, index):
item = self.samples[index]
ids = self.compute_or_load(item["audio_file"], item["text"])
ids = self.compute_or_load(string2filename(item["audio_unique_name"]), item["text"])
ph_hat = self.tokenizer.ids_to_text(ids)
return {"text": item["text"], "ph_hat": ph_hat, "token_ids": ids, "token_ids_len": len(ids)}
def __len__(self):
return len(self.samples)
def compute_or_load(self, wav_file, text):
def compute_or_load(self, file_name, text):
"""Compute phonemes for the given text.
If the phonemes are already cached, load them from cache.
"""
file_name = os.path.splitext(os.path.basename(wav_file))[0]
file_ext = "_phoneme.npy"
cache_path = os.path.join(self.cache_path, file_name + file_ext)
try:
@ -669,11 +676,11 @@ class F0Dataset:
def __getitem__(self, idx):
item = self.samples[idx]
f0 = self.compute_or_load(item["audio_file"])
f0 = self.compute_or_load(item["audio_file"], string2filename(item["audio_unique_name"]))
if self.normalize_f0:
assert self.mean is not None and self.std is not None, " [!] Mean and STD is not available"
f0 = self.normalize(f0)
return {"audio_file": item["audio_file"], "f0": f0}
return {"audio_unique_name": item["audio_unique_name"], "f0": f0}
def __len__(self):
return len(self.samples)
@ -705,8 +712,7 @@ class F0Dataset:
return self.pad_id
@staticmethod
def create_pitch_file_path(wav_file, cache_path):
file_name = os.path.splitext(os.path.basename(wav_file))[0]
def create_pitch_file_path(file_name, cache_path):
pitch_file = os.path.join(cache_path, file_name + "_pitch.npy")
return pitch_file
@ -744,11 +750,11 @@ class F0Dataset:
pitch[zero_idxs] = 0.0
return pitch
def compute_or_load(self, wav_file):
def compute_or_load(self, wav_file, audio_unique_name):
"""
compute pitch and return a numpy array of pitch values
"""
pitch_file = self.create_pitch_file_path(wav_file, self.cache_path)
pitch_file = self.create_pitch_file_path(audio_unique_name, self.cache_path)
if not os.path.exists(pitch_file):
pitch = self._compute_and_save_pitch(self.ap, wav_file, pitch_file)
else:
@ -756,14 +762,14 @@ class F0Dataset:
return pitch.astype(np.float32)
def collate_fn(self, batch):
audio_file = [item["audio_file"] for item in batch]
audio_unique_name = [item["audio_unique_name"] for item in batch]
f0s = [item["f0"] for item in batch]
f0_lens = [len(item["f0"]) for item in batch]
f0_lens_max = max(f0_lens)
f0s_torch = torch.LongTensor(len(f0s), f0_lens_max).fill_(self.get_pad_id())
for i, f0_len in enumerate(f0_lens):
f0s_torch[i, :f0_len] = torch.LongTensor(f0s[i])
return {"audio_file": audio_file, "f0": f0s_torch, "f0_lens": f0_lens}
return {"audio_unique_name": audio_unique_name, "f0": f0s_torch, "f0_lens": f0_lens}
def print_logs(self, level: int = 0) -> None:
indent = "\t" * level

View File

@ -15,6 +15,15 @@ from tqdm import tqdm
def coqui(root_path, meta_file, ignored_speakers=None):
"""Interal dataset formatter."""
filepath = os.path.join(root_path, meta_file)
# ensure there are 4 columns for every line
with open(filepath, "r", encoding="utf8") as f:
lines = f.readlines()
num_cols = len(lines[0].split("|")) # take the first row as reference
for idx, line in enumerate(lines[1:]):
if len(line.split("|")) != num_cols:
print(f" > Missing column in line {idx + 1} -> {line.strip()}")
# load metadata
metadata = pd.read_csv(os.path.join(root_path, meta_file), sep="|")
assert all(x in metadata.columns for x in ["audio_file", "text"])
speaker_name = None if "speaker_name" in metadata.columns else "coqui"
@ -97,9 +106,9 @@ def mailabs(root_path, meta_files=None, ignored_speakers=None):
meta_files (str): list of meta files to be used in the training. If None, finds all the csv files
recursively. Defaults to None
"""
speaker_regex = re.compile("by_book/(male|female)/(?P<speaker_name>[^/]+)/")
speaker_regex = re.compile(f"by_book{os.sep}(male|female){os.sep}(?P<speaker_name>[^{os.sep}]+){os.sep}")
if not meta_files:
csv_files = glob(root_path + "/**/metadata.csv", recursive=True)
csv_files = glob(root_path + f"{os.sep}**{os.sep}metadata.csv", recursive=True)
else:
csv_files = meta_files
@ -578,3 +587,17 @@ def kokoro(root_path, meta_file, **kwargs): # pylint: disable=unused-argument
text = cols[2].replace(" ", "")
items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path})
return items
def kss(root_path, meta_file, **kwargs): # pylint: disable=unused-argument
"""Korean single-speaker dataset from https://www.kaggle.com/datasets/bryanpark/korean-single-speaker-speech-dataset"""
txt_file = os.path.join(root_path, meta_file)
items = []
speaker_name = "kss"
with open(txt_file, "r", encoding="utf-8") as ttf:
for line in ttf:
cols = line.split("|")
wav_file = os.path.join(root_path, cols[0])
text = cols[2]  # cols[1] => "6월" (digit form), cols[2] => "유월" (spelled-out form)
items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name})
return items

View File

@ -398,9 +398,9 @@ class AlignTTS(BaseTTS):
logger.eval_audios(steps, audios, self.ap.sample_rate)
def load_checkpoint(
self, config, checkpoint_path, eval=False
self, config, checkpoint_path, eval=False, cache=False
): # pylint: disable=unused-argument, redefined-builtin
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"))
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
self.load_state_dict(state["model"])
if eval:
self.eval()

View File

@ -92,16 +92,17 @@ class BaseTacotron(BaseTTS):
pass
def load_checkpoint(
self, config, checkpoint_path, eval=False
self, config, checkpoint_path, eval=False, cache=False
): # pylint: disable=unused-argument, redefined-builtin
"""Load model checkpoint and set up internals.
Args:
config (Coqpi): model configuration.
checkpoint_path (str): path to checkpoint file.
eval (bool): whether to load model for evaluation.
eval (bool, optional): whether to load model for evaluation.
cache (bool, optional): If True, cache the file locally for subsequent calls. It is cached under `get_user_data_dir()/tts_cache`. Defaults to False.
"""
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"))
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
self.load_state_dict(state["model"])
# TODO: set r in run-time by taking it from the new config
if "r" in state:

View File

@ -144,11 +144,11 @@ class BaseTTS(BaseTrainerModel):
if speaker_name is None:
speaker_id = self.speaker_manager.get_random_id()
else:
speaker_id = self.speaker_manager.ids[speaker_name]
speaker_id = self.speaker_manager.name_to_id[speaker_name]
# get language id
if hasattr(self, "language_manager") and config.use_language_embedding and language_name is not None:
language_id = self.language_manager.ids[language_name]
language_id = self.language_manager.name_to_id[language_name]
return {
"text": text,
@ -288,11 +288,13 @@ class BaseTTS(BaseTrainerModel):
# setup multi-speaker attributes
if hasattr(self, "speaker_manager") and self.speaker_manager is not None:
if hasattr(config, "model_args"):
speaker_id_mapping = self.speaker_manager.ids if config.model_args.use_speaker_embedding else None
speaker_id_mapping = (
self.speaker_manager.name_to_id if config.model_args.use_speaker_embedding else None
)
d_vector_mapping = self.speaker_manager.embeddings if config.model_args.use_d_vector_file else None
config.use_d_vector_file = config.model_args.use_d_vector_file
else:
speaker_id_mapping = self.speaker_manager.ids if config.use_speaker_embedding else None
speaker_id_mapping = self.speaker_manager.name_to_id if config.use_speaker_embedding else None
d_vector_mapping = self.speaker_manager.embeddings if config.use_d_vector_file else None
else:
speaker_id_mapping = None
@ -300,7 +302,7 @@ class BaseTTS(BaseTrainerModel):
# setup multi-lingual attributes
if hasattr(self, "language_manager") and self.language_manager is not None:
language_id_mapping = self.language_manager.ids if self.args.use_language_embedding else None
language_id_mapping = self.language_manager.name_to_id if self.args.use_language_embedding else None
else:
language_id_mapping = None
@ -342,7 +344,7 @@ class BaseTTS(BaseTrainerModel):
loader = DataLoader(
dataset,
batch_size=config.eval_batch_size if is_eval else config.batch_size,
shuffle=False, # shuffle is done in the dataset.
shuffle=True, # if there is no other sampler
collate_fn=dataset.collate_fn,
drop_last=False, # setting this False might cause issues in AMP training.
sampler=sampler,
@ -363,7 +365,7 @@ class BaseTTS(BaseTrainerModel):
aux_inputs = {
"speaker_id": None
if not self.config.use_speaker_embedding
else random.sample(sorted(self.speaker_manager.ids.values()), 1),
else random.sample(sorted(self.speaker_manager.name_to_id.values()), 1),
"d_vector": d_vector,
"style_wav": None, # TODO: handle GST style input
}

View File

@ -16,6 +16,7 @@ from TTS.tts.utils.helpers import average_over_durations, generate_path, maximum
from TTS.tts.utils.speakers import SpeakerManager
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.tts.utils.visual import plot_alignment, plot_avg_pitch, plot_spectrogram
from TTS.utils.io import load_fsspec
@dataclass
@ -707,9 +708,9 @@ class ForwardTTS(BaseTTS):
logger.eval_audios(steps, audios, self.ap.sample_rate)
def load_checkpoint(
self, config, checkpoint_path, eval=False
self, config, checkpoint_path, eval=False, cache=False
): # pylint: disable=unused-argument, redefined-builtin
state = torch.load(checkpoint_path, map_location=torch.device("cpu"))
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
self.load_state_dict(state["model"])
if eval:
self.eval()

View File

@ -284,6 +284,7 @@ class VitsDataset(TTSDataset):
"wav_file": wav_filename,
"speaker_name": item["speaker_name"],
"language_name": item["language"],
"audio_unique_name": item["audio_unique_name"],
}
@property
@ -308,6 +309,7 @@ class VitsDataset(TTSDataset):
- language_names: :math:`[B]`
- audiofile_paths: :math:`[B]`
- raw_texts: :math:`[B]`
- audio_unique_names: :math:`[B]`
"""
# convert list of dicts to dict of lists
B = len(batch)
@ -348,6 +350,7 @@ class VitsDataset(TTSDataset):
"language_names": batch["language_name"],
"audio_files": batch["wav_file"],
"raw_text": batch["raw_text"],
"audio_unique_names": batch["audio_unique_name"],
}
@ -718,6 +721,10 @@ class Vits(BaseTTS):
use_spectral_norm=self.args.use_spectral_norm_disriminator,
)
@property
def device(self):
return next(self.parameters()).device
def init_multispeaker(self, config: Coqpit):
"""Initialize multi-speaker modules of a model. A model can be trained either with a speaker embedding layer
or with external `d_vectors` computed from a speaker encoder model.
@ -755,17 +762,12 @@ class Vits(BaseTTS):
if (
hasattr(self.speaker_manager.encoder, "audio_config")
and self.config.audio["sample_rate"] != self.speaker_manager.encoder.audio_config["sample_rate"]
and self.config.audio.sample_rate != self.speaker_manager.encoder.audio_config["sample_rate"]
):
self.audio_transform = torchaudio.transforms.Resample(
orig_freq=self.audio_config["sample_rate"],
orig_freq=self.config.audio.sample_rate,
new_freq=self.speaker_manager.encoder.audio_config["sample_rate"],
)
# pylint: disable=W0101,W0105
self.audio_transform = torchaudio.transforms.Resample(
orig_freq=self.config.audio.sample_rate,
new_freq=self.speaker_manager.encoder.audio_config["sample_rate"],
)
def _init_speaker_embedding(self):
# pylint: disable=attribute-defined-outside-init
@ -808,6 +810,13 @@ class Vits(BaseTTS):
orig_freq=self.config.audio["sample_rate"], new_freq=self.args.encoder_sample_rate
) # pylint: disable=W0201
def on_epoch_start(self, trainer): # pylint: disable=W0613
"""Freeze layers at the beginning of an epoch"""
self._freeze_layers()
# set the device of speaker encoder
if self.args.use_speaker_encoder_as_loss:
self.speaker_manager.encoder = self.speaker_manager.encoder.to(self.device)
def on_init_end(self, trainer): # pylint: disable=W0613
"""Reinit layes if needed"""
if self.args.reinit_DP:
@ -1185,7 +1194,6 @@ class Vits(BaseTTS):
y_lengths = torch.tensor([y.size(-1)]).to(y.device)
speaker_cond_src = reference_speaker_id if reference_speaker_id is not None else reference_d_vector
speaker_cond_tgt = speaker_id if speaker_id is not None else d_vector
# print(y.shape, y_lengths.shape)
wav, _, _ = self.voice_conversion(y, y_lengths, speaker_cond_src, speaker_cond_tgt)
return wav
@ -1229,8 +1237,6 @@ class Vits(BaseTTS):
Tuple[Dict, Dict]: Model outputs and computed losses.
"""
self._freeze_layers()
spec_lens = batch["spec_lens"]
if optimizer_idx == 0:
@ -1402,11 +1408,11 @@ class Vits(BaseTTS):
if speaker_name is None:
speaker_id = self.speaker_manager.get_random_id()
else:
speaker_id = self.speaker_manager.ids[speaker_name]
speaker_id = self.speaker_manager.name_to_id[speaker_name]
# get language id
if hasattr(self, "language_manager") and config.use_language_embedding and language_name is not None:
language_id = self.language_manager.ids[language_name]
language_id = self.language_manager.name_to_id[language_name]
return {
"text": text,
@ -1461,8 +1467,8 @@ class Vits(BaseTTS):
d_vectors = None
# get numerical speaker ids from speaker names
if self.speaker_manager is not None and self.speaker_manager.ids and self.args.use_speaker_embedding:
speaker_ids = [self.speaker_manager.ids[sn] for sn in batch["speaker_names"]]
if self.speaker_manager is not None and self.speaker_manager.name_to_id and self.args.use_speaker_embedding:
speaker_ids = [self.speaker_manager.name_to_id[sn] for sn in batch["speaker_names"]]
if speaker_ids is not None:
speaker_ids = torch.LongTensor(speaker_ids)
@ -1471,12 +1477,12 @@ class Vits(BaseTTS):
# get d_vectors from audio file names
if self.speaker_manager is not None and self.speaker_manager.embeddings and self.args.use_d_vector_file:
d_vector_mapping = self.speaker_manager.embeddings
d_vectors = [d_vector_mapping[w]["embedding"] for w in batch["audio_files"]]
d_vectors = [d_vector_mapping[w]["embedding"] for w in batch["audio_unique_names"]]
d_vectors = torch.FloatTensor(d_vectors)
# get language ids from language names
if self.language_manager is not None and self.language_manager.ids and self.args.use_language_embedding:
language_ids = [self.language_manager.ids[ln] for ln in batch["language_names"]]
if self.language_manager is not None and self.language_manager.name_to_id and self.args.use_language_embedding:
language_ids = [self.language_manager.name_to_id[ln] for ln in batch["language_names"]]
if language_ids is not None:
language_ids = torch.LongTensor(language_ids)
@ -1680,14 +1686,10 @@ class Vits(BaseTTS):
return [VitsDiscriminatorLoss(self.config), VitsGeneratorLoss(self.config)]
def load_checkpoint(
self,
config,
checkpoint_path,
eval=False,
strict=True,
self, config, checkpoint_path, eval=False, strict=True, cache=False
): # pylint: disable=unused-argument, redefined-builtin
"""Load the model checkpoint and setup for training or inference"""
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"))
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
# compat band-aid for the pre-trained models to not use the encoder baked into the model
# TODO: consider baking the speaker encoder into the model and calling it from there,
# as it is probably easier for model distribution.

View File

@ -37,11 +37,11 @@ class LanguageManager(BaseIDManager):
@property
def num_languages(self) -> int:
return len(list(self.ids.keys()))
return len(list(self.name_to_id.keys()))
@property
def language_names(self) -> List:
return list(self.ids.keys())
return list(self.name_to_id.keys())
@staticmethod
def parse_language_ids_from_config(c: Coqpit) -> Dict:
@ -67,7 +67,7 @@ class LanguageManager(BaseIDManager):
Args:
c (Coqpit): Config.
"""
self.ids = self.parse_language_ids_from_config(c)
self.name_to_id = self.parse_language_ids_from_config(c)
@staticmethod
def parse_ids_from_data(items: List, parse_key: str) -> Any:
@ -82,7 +82,7 @@ class LanguageManager(BaseIDManager):
Args:
file_path (str): Path to the output file.
"""
self._save_json(file_path, self.ids)
self._save_json(file_path, self.name_to_id)
@staticmethod
def init_from_config(config: Coqpit) -> "LanguageManager":

View File

@ -39,7 +39,7 @@ class BaseIDManager:
"""
def __init__(self, id_file_path: str = ""):
self.ids = {}
self.name_to_id = {}
if id_file_path:
self.load_ids_from_file(id_file_path)
@ -60,7 +60,7 @@ class BaseIDManager:
Args:
items (List): Data sampled returned by `load_tts_samples()`.
"""
self.ids = self.parse_ids_from_data(items, parse_key=parse_key)
self.name_to_id = self.parse_ids_from_data(items, parse_key=parse_key)
def load_ids_from_file(self, file_path: str) -> None:
"""Set IDs from a file.
@ -68,7 +68,7 @@ class BaseIDManager:
Args:
file_path (str): Path to the file.
"""
self.ids = load_file(file_path)
self.name_to_id = load_file(file_path)
def save_ids_to_file(self, file_path: str) -> None:
"""Save IDs to a json file.
@ -76,7 +76,7 @@ class BaseIDManager:
Args:
file_path (str): Path to the output file.
"""
save_file(self.ids, file_path)
save_file(self.name_to_id, file_path)
def get_random_id(self) -> Any:
"""Get a random embedding.
@ -86,8 +86,8 @@ class BaseIDManager:
Returns:
Any: ID.
"""
if self.ids:
return self.ids[random.choices(list(self.ids.keys()))[0]]
if self.name_to_id:
return self.name_to_id[random.choices(list(self.name_to_id.keys()))[0]]
return None
@ -109,11 +109,27 @@ class BaseIDManager:
class EmbeddingManager(BaseIDManager):
"""Base `Embedding` Manager class. Every new `Embedding` manager must inherit this.
It defines common `Embedding` manager specific functions.
It expects embeddings files in the following format:
::
{
'audio_file_key':{
'name': 'category_name',
'embedding': [<embedding_values>]
},
...
}
`audio_file_key` is a unique key for the audio file in the dataset. It can be the path to the file or any other unique key.
`embedding` is the embedding vector of the audio file.
`name` can be the name of the speaker of the audio file.
"""
def __init__(
self,
embedding_file_path: str = "",
embedding_file_path: Union[str, List[str]] = "",
id_file_path: str = "",
encoder_model_path: str = "",
encoder_config_path: str = "",
@ -129,11 +145,24 @@ class EmbeddingManager(BaseIDManager):
self.use_cuda = use_cuda
if embedding_file_path:
self.load_embeddings_from_file(embedding_file_path)
if isinstance(embedding_file_path, list):
self.load_embeddings_from_list_of_files(embedding_file_path)
else:
self.load_embeddings_from_file(embedding_file_path)
if encoder_model_path and encoder_config_path:
self.init_encoder(encoder_model_path, encoder_config_path, use_cuda)
@property
def num_embeddings(self):
"""Get number of embeddings."""
return len(self.embeddings)
@property
def num_names(self):
"""Get number of embeddings."""
return len(self.embeddings_by_names)
@property
def embedding_dim(self):
"""Dimensionality of embeddings. If embeddings are not loaded, returns zero."""
@ -141,6 +170,11 @@ class EmbeddingManager(BaseIDManager):
return len(self.embeddings[list(self.embeddings.keys())[0]]["embedding"])
return 0
@property
def embedding_names(self):
"""Get embedding names."""
return list(self.embeddings_by_names.keys())
def save_embeddings_to_file(self, file_path: str) -> None:
"""Save embeddings to a json file.
@ -149,20 +183,57 @@ class EmbeddingManager(BaseIDManager):
"""
save_file(self.embeddings, file_path)
@staticmethod
def read_embeddings_from_file(file_path: str):
"""Load embeddings from a json file.
Args:
file_path (str): Path to the file.
"""
embeddings = load_file(file_path)
speakers = sorted({x["name"] for x in embeddings.values()})
name_to_id = {name: i for i, name in enumerate(speakers)}
clip_ids = list(set(sorted(clip_name for clip_name in embeddings.keys())))
# cache embeddings_by_names for fast inference using a bigger speakers.json
embeddings_by_names = {}
for x in embeddings.values():
if x["name"] not in embeddings_by_names.keys():
embeddings_by_names[x["name"]] = [x["embedding"]]
else:
embeddings_by_names[x["name"]].append(x["embedding"])
return name_to_id, clip_ids, embeddings, embeddings_by_names
def load_embeddings_from_file(self, file_path: str) -> None:
"""Load embeddings from a json file.
Args:
file_path (str): Path to the target json file.
"""
self.embeddings = load_file(file_path)
self.name_to_id, self.clip_ids, self.embeddings, self.embeddings_by_names = self.read_embeddings_from_file(
file_path
)
speakers = sorted({x["name"] for x in self.embeddings.values()})
self.ids = {name: i for i, name in enumerate(speakers)}
def load_embeddings_from_list_of_files(self, file_paths: List[str]) -> None:
"""Load embeddings from a list of json files and don't allow duplicate keys.
self.clip_ids = list(set(sorted(clip_name for clip_name in self.embeddings.keys())))
# cache embeddings_by_names for fast inference using a bigger speakers.json
self.embeddings_by_names = self.get_embeddings_by_names()
Args:
file_paths (List[str]): List of paths to the target json files.
"""
self.name_to_id = {}
self.clip_ids = []
self.embeddings_by_names = {}
self.embeddings = {}
for file_path in file_paths:
ids, clip_ids, embeddings, embeddings_by_names = self.read_embeddings_from_file(file_path)
# check colliding keys
duplicates = set(self.embeddings.keys()) & set(embeddings.keys())
if duplicates:
raise ValueError(f" [!] Duplicate embedding names <{duplicates}> in {file_path}")
# store values
self.name_to_id.update(ids)
self.clip_ids.extend(clip_ids)
self.embeddings_by_names.update(embeddings_by_names)
self.embeddings.update(embeddings)
def get_embedding_by_clip(self, clip_idx: str) -> List:
"""Get embedding by clip ID.

View File

@ -73,14 +73,14 @@ class SpeakerManager(EmbeddingManager):
@property
def num_speakers(self):
return len(self.ids)
return len(self.name_to_id)
@property
def speaker_names(self):
return list(self.ids.keys())
return list(self.name_to_id.keys())
def get_speakers(self) -> List:
return self.ids
return self.name_to_id
@staticmethod
def init_from_config(config: "Coqpit", samples: Union[List[List], List[Dict]] = None) -> "SpeakerManager":
@ -182,10 +182,10 @@ def get_speaker_manager(c: Coqpit, data: List = None, restore_path: str = None,
speaker_manager.load_embeddings_from_file(c.d_vector_file)
speaker_manager.load_embeddings_from_file(speakers_file)
elif not c.use_d_vector_file: # restore speaker manager with speaker ID file.
speaker_ids_from_data = speaker_manager.ids
speaker_ids_from_data = speaker_manager.name_to_id
speaker_manager.load_ids_from_file(speakers_file)
assert all(
speaker in speaker_manager.ids for speaker in speaker_ids_from_data
speaker in speaker_manager.name_to_id for speaker in speaker_ids_from_data
), " [!] You cannot introduce new speakers to a pre-trained model."
elif c.use_d_vector_file and c.d_vector_file:
# new speaker manager with external speaker embeddings.
@ -199,7 +199,7 @@ def get_speaker_manager(c: Coqpit, data: List = None, restore_path: str = None,
if speaker_manager.num_speakers > 0:
print(
" > Speaker manager is loaded with {} speakers: {}".format(
speaker_manager.num_speakers, ", ".join(speaker_manager.ids)
speaker_manager.num_speakers, ", ".join(speaker_manager.name_to_id)
)
)

View File

@ -295,7 +295,12 @@ def transfer_voice(
reference_d_vector = embedding_to_torch(reference_d_vector, cuda=use_cuda)
# load reference_wav audio
reference_wav = embedding_to_torch(model.ap.load_wav(reference_wav, sr=model.ap.sample_rate), cuda=use_cuda)
reference_wav = embedding_to_torch(
model.ap.load_wav(
reference_wav, sr=model.args.encoder_sample_rate if model.args.encoder_sample_rate else model.ap.sample_rate
),
cuda=use_cuda,
)
if hasattr(model, "module"):
_func = model.module.inference_voice_conversion

View File

View File

@ -0,0 +1,44 @@
# coding: utf-8
# Add the word you want to the dictionary.
etc_dictionary = {"1+1": "원플러스원", "2+1": "투플러스원"}
english_dictionary = {
"KOREA": "코리아",
"IDOL": "아이돌",
"IT": "아이티",
"IQ": "아이큐",
"UP": "",
"DOWN": "다운",
"PC": "피씨",
"CCTV": "씨씨티비",
"SNS": "에스엔에스",
"AI": "에이아이",
"CEO": "씨이오",
"A": "에이",
"B": "",
"C": "",
"D": "",
"E": "",
"F": "에프",
"G": "",
"H": "에이치",
"I": "아이",
"J": "제이",
"K": "케이",
"L": "",
"M": "",
"N": "",
"O": "",
"P": "",
"Q": "",
"R": "",
"S": "에스",
"T": "",
"U": "",
"V": "브이",
"W": "더블유",
"X": "엑스",
"Y": "와이",
"Z": "제트",
}

View File

@ -0,0 +1,32 @@
# coding: utf-8
# Code based on https://github.com/carpedm20/multi-speaker-tacotron-tensorflow/blob/master/text/korean.py
import re
from TTS.tts.utils.text.korean.ko_dictionary import english_dictionary, etc_dictionary
def normalize(text):
text = text.strip()
text = re.sub("[⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]", "", text)
text = normalize_with_dictionary(text, etc_dictionary)
text = normalize_english(text)
text = text.lower()
return text
def normalize_with_dictionary(text, dic):
if any(key in text for key in dic.keys()):
pattern = re.compile("|".join(re.escape(key) for key in dic.keys()))
return pattern.sub(lambda x: dic[x.group()], text)
return text
def normalize_english(text):
def fn(m):
word = m.group()
if word in english_dictionary:
return english_dictionary.get(word)
return word
text = re.sub("([A-Za-z]+)", fn, text)
return text
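A brief usage sketch of the normalization helpers above; the outputs in the comments follow from the dictionary substitutions shown in `ko_dictionary.py` and are noted as expectations, not verified results:

```python
# Assumes the modules added in this PR are importable.
from TTS.tts.utils.text.korean.ko_dictionary import etc_dictionary
from TTS.tts.utils.text.korean.korean import normalize, normalize_with_dictionary

print(normalize_with_dictionary("1+1", etc_dictionary))  # expected: 원플러스원
print(normalize("IT 1+1"))                               # expected: 아이티 원플러스원
```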

View File

@ -0,0 +1,36 @@
from jamo import hangul_to_jamo
from TTS.tts.utils.text.korean.korean import normalize
g2p = None
def korean_text_to_phonemes(text, character: str = "hangeul") -> str:
"""
The input and output values look the same, but they are different in Unicode.
example :
input = '하늘' (Unicode : \ud558\ub298), (하 + 늘)
output = '하늘' (Unicode : \u1112\u1161\u1102\u1173\u11af), (ᄒ + ᅡ + ᄂ + ᅳ + ᆯ)
"""
global g2p # pylint: disable=global-statement
if g2p is None:
from g2pkk import G2p
g2p = G2p()
if character == "english":
from anyascii import anyascii
text = normalize(text)
text = g2p(text)
text = anyascii(text)
return text
text = normalize(text)
text = g2p(text)
text = list(hangul_to_jamo(text)) # '하늘' --> ['ᄒ', 'ᅡ', 'ᄂ', 'ᅳ', 'ᆯ']
return "".join(text)

View File

@ -1,53 +1,57 @@
from TTS.tts.utils.text.phonemizers.base import BasePhonemizer
from TTS.tts.utils.text.phonemizers.espeak_wrapper import ESpeak
from TTS.tts.utils.text.phonemizers.gruut_wrapper import Gruut
from TTS.tts.utils.text.phonemizers.ja_jp_phonemizer import JA_JP_Phonemizer
from TTS.tts.utils.text.phonemizers.zh_cn_phonemizer import ZH_CN_Phonemizer
PHONEMIZERS = {b.name(): b for b in (ESpeak, Gruut, JA_JP_Phonemizer)}
ESPEAK_LANGS = list(ESpeak.supported_languages().keys())
GRUUT_LANGS = list(Gruut.supported_languages())
# Dict setting default phonemizers for each language
# Add Gruut languages
_ = [Gruut.name()] * len(GRUUT_LANGS)
DEF_LANG_TO_PHONEMIZER = dict(list(zip(GRUUT_LANGS, _)))
# Add ESpeak languages and override any existing ones
_ = [ESpeak.name()] * len(ESPEAK_LANGS)
_new_dict = dict(list(zip(list(ESPEAK_LANGS), _)))
DEF_LANG_TO_PHONEMIZER.update(_new_dict)
# Force default for some languages
DEF_LANG_TO_PHONEMIZER["en"] = DEF_LANG_TO_PHONEMIZER["en-us"]
DEF_LANG_TO_PHONEMIZER["ja-jp"] = JA_JP_Phonemizer.name()
DEF_LANG_TO_PHONEMIZER["zh-cn"] = ZH_CN_Phonemizer.name()
def get_phonemizer_by_name(name: str, **kwargs) -> BasePhonemizer:
"""Initiate a phonemizer by name
Args:
name (str):
Name of the phonemizer that should match `phonemizer.name()`.
kwargs (dict):
Extra keyword arguments that should be passed to the phonemizer.
"""
if name == "espeak":
return ESpeak(**kwargs)
if name == "gruut":
return Gruut(**kwargs)
if name == "zh_cn_phonemizer":
return ZH_CN_Phonemizer(**kwargs)
if name == "ja_jp_phonemizer":
return JA_JP_Phonemizer(**kwargs)
raise ValueError(f"Phonemizer {name} not found")
if __name__ == "__main__":
print(DEF_LANG_TO_PHONEMIZER)
from TTS.tts.utils.text.phonemizers.base import BasePhonemizer
from TTS.tts.utils.text.phonemizers.espeak_wrapper import ESpeak
from TTS.tts.utils.text.phonemizers.gruut_wrapper import Gruut
from TTS.tts.utils.text.phonemizers.ja_jp_phonemizer import JA_JP_Phonemizer
from TTS.tts.utils.text.phonemizers.ko_kr_phonemizer import KO_KR_Phonemizer
from TTS.tts.utils.text.phonemizers.zh_cn_phonemizer import ZH_CN_Phonemizer
PHONEMIZERS = {b.name(): b for b in (ESpeak, Gruut, JA_JP_Phonemizer)}
ESPEAK_LANGS = list(ESpeak.supported_languages().keys())
GRUUT_LANGS = list(Gruut.supported_languages())
# Dict setting default phonemizers for each language
# Add Gruut languages
_ = [Gruut.name()] * len(GRUUT_LANGS)
DEF_LANG_TO_PHONEMIZER = dict(list(zip(GRUUT_LANGS, _)))
# Add ESpeak languages and override any existing ones
_ = [ESpeak.name()] * len(ESPEAK_LANGS)
_new_dict = dict(list(zip(list(ESPEAK_LANGS), _)))
DEF_LANG_TO_PHONEMIZER.update(_new_dict)
# Force default for some languages
DEF_LANG_TO_PHONEMIZER["en"] = DEF_LANG_TO_PHONEMIZER["en-us"]
DEF_LANG_TO_PHONEMIZER["ja-jp"] = JA_JP_Phonemizer.name()
DEF_LANG_TO_PHONEMIZER["zh-cn"] = ZH_CN_Phonemizer.name()
DEF_LANG_TO_PHONEMIZER["ko-kr"] = KO_KR_Phonemizer.name()
def get_phonemizer_by_name(name: str, **kwargs) -> BasePhonemizer:
"""Initiate a phonemizer by name
Args:
name (str):
Name of the phonemizer that should match `phonemizer.name()`.
kwargs (dict):
Extra keyword arguments that should be passed to the phonemizer.
"""
if name == "espeak":
return ESpeak(**kwargs)
if name == "gruut":
return Gruut(**kwargs)
if name == "zh_cn_phonemizer":
return ZH_CN_Phonemizer(**kwargs)
if name == "ja_jp_phonemizer":
return JA_JP_Phonemizer(**kwargs)
if name == "ko_kr_phonemizer":
return KO_KR_Phonemizer(**kwargs)
raise ValueError(f"Phonemizer {name} not found")
if __name__ == "__main__":
print(DEF_LANG_TO_PHONEMIZER)
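A hedged usage sketch of the updated registry, resolving the new Korean phonemizer by name (requires the `jamo` and `g2pkk` dependencies added elsewhere in this PR):

```python
# Sketch: look up and use the phonemizer registered above.
from TTS.tts.utils.text.phonemizers import DEF_LANG_TO_PHONEMIZER, get_phonemizer_by_name

print(DEF_LANG_TO_PHONEMIZER["ko-kr"])  # expected: ko_kr_phonemizer
phonemizer = get_phonemizer_by_name("ko_kr_phonemizer")
print(phonemizer.phonemize("이 문장은 음성합성 테스트를 위한 문장입니다.", separator="|"))
```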

View File

@ -94,6 +94,8 @@ class ESpeak(BasePhonemizer):
# band-aid for backwards compatibility
if language == "en":
language = "en-us"
if language == "zh-cn":
language = "cmn"
super().__init__(language, punctuations=punctuations, keep_puncs=keep_puncs)
if backend is not None:

View File

@ -0,0 +1,65 @@
from typing import Dict
from TTS.tts.utils.text.korean.phonemizer import korean_text_to_phonemes
from TTS.tts.utils.text.phonemizers.base import BasePhonemizer
_DEF_KO_PUNCS = "、.,[]()?!〽~『』「」【】"
class KO_KR_Phonemizer(BasePhonemizer):
"""🐸TTS ko_kr_phonemizer using functions in `TTS.tts.utils.text.korean.phonemizer`
TODO: Add Korean to character (ᄀᄁᄂᄃᄄᄅᄆᄇᄈᄉᄊᄋᄌᄍᄎᄏᄐᄑ하ᅢᅣᅤᅥᅦᅧᅨᅩᅪᅫᅬᅭᅮᅯᅰᅱᅲᅳᅴᅵᆨᆩᆪᆫᆬᆭᆮᆯᆰᆱᆲᆳᆴᆵᆶᆷᆸᆹᆺᆻᆼᆽᆾᆿᇀᇁᇂ)
Example:
>>> from TTS.tts.utils.text.phonemizers import KO_KR_Phonemizer
>>> phonemizer = KO_KR_Phonemizer()
>>> phonemizer.phonemize("이 문장은 음성합성 테스트를 위한 문장입니다.", separator="|")
'ᄋ|ᅵ| |ᄆ|ᅮ|ᆫ|ᄌ|ᅡ|ᆼ|ᄋ|ᅳ| |ᄂ|ᅳ|ᆷ|ᄉ|ᅥ|ᆼ|ᄒ|ᅡ|ᆸ|ᄊ|ᅥ|ᆼ| |ᄐ|ᅦ|ᄉ|ᅳ|ᄐ|ᅳ|ᄅ|ᅳ| |ᄅ|ᅱ|ᄒ|ᅡ|ᆫ| |ᄆ|ᅮ|ᆫ|ᄌ|ᅡ|ᆼ|ᄋ|ᅵ|ᆷ|ᄂ|ᅵ|ᄃ|ᅡ|.'
>>> from TTS.tts.utils.text.phonemizers import KO_KR_Phonemizer
>>> phonemizer = KO_KR_Phonemizer()
>>> phonemizer.phonemize("이 문장은 음성합성 테스트를 위한 문장입니다.", separator="|", character='english')
'I| |M|u|n|J|a|n|g|E|u| |N|e|u|m|S|e|o|n|g|H|a|b|S|s|e|o|n|g| |T|e|S|e|u|T|e|u|L|e|u| |L|w|i|H|a|n| |M|u|n|J|a|n|g|I|m|N|i|D|a|.'
"""
language = "ko-kr"
def __init__(self, punctuations=_DEF_KO_PUNCS, keep_puncs=True, **kwargs): # pylint: disable=unused-argument
super().__init__(self.language, punctuations=punctuations, keep_puncs=keep_puncs)
@staticmethod
def name():
return "ko_kr_phonemizer"
def _phonemize(self, text: str, separator: str = "", character: str = "hangeul") -> str:
ph = korean_text_to_phonemes(text, character=character)
if separator is not None or separator != "":
return separator.join(ph)
return ph
def phonemize(self, text: str, separator: str = "", character: str = "hangeul") -> str:
return self._phonemize(text, separator, character)
@staticmethod
def supported_languages() -> Dict:
return {"ko-kr": "hangeul(korean)"}
def version(self) -> str:
return "0.0.2"
def is_available(self) -> bool:
return True
if __name__ == "__main__":
texts = "이 문장은 음성합성 테스트를 위한 문장입니다."
e = KO_KR_Phonemizer()
print(e.supported_languages())
print(e.version())
print(e.language)
print(e.name())
print(e.is_available())
print(e.phonemize(texts))

View File

@ -42,7 +42,7 @@ class ZH_CN_Phonemizer(BasePhonemizer):
@staticmethod
def supported_languages() -> Dict:
return {"zh-cn": "Japanese (Japan)"}
return {"zh-cn": "Chinese (China)"}
def version(self) -> str:
return "0.0.1"

View File

@ -2,9 +2,9 @@ from typing import Tuple
import librosa
import numpy as np
import pyworld as pw
import scipy
import soundfile as sf
from librosa import pyin
# For using kwargs
# pylint: disable=unused-argument
@ -242,12 +242,28 @@ def compute_stft_paddings(
def compute_f0(
*, x: np.ndarray = None, pitch_fmax: float = None, hop_length: int = None, sample_rate: int = None, **kwargs
*,
x: np.ndarray = None,
pitch_fmax: float = None,
pitch_fmin: float = None,
hop_length: int = None,
win_length: int = None,
sample_rate: int = None,
stft_pad_mode: str = "reflect",
center: bool = True,
**kwargs,
) -> np.ndarray:
"""Compute pitch (f0) of a waveform using the same parameters used for computing melspectrogram.
Args:
x (np.ndarray): Waveform. Shape :math:`[T_wav,]`
pitch_fmax (float): Pitch max value.
pitch_fmin (float): Pitch min value.
hop_length (int): Number of frames between STFT columns.
win_length (int): STFT window length.
sample_rate (int): Audio sampling rate.
stft_pad_mode (str): Padding mode for STFT.
center (bool): Centered padding.
Returns:
np.ndarray: Pitch. Shape :math:`[T_pitch,]`. :math:`T_pitch == T_wav / hop_length`
@ -255,20 +271,35 @@ def compute_f0(
Examples:
>>> WAV_FILE = filename = librosa.util.example_audio_file()
>>> from TTS.config import BaseAudioConfig
>>> from TTS.utils.audio.processor import AudioProcessor
>>> conf = BaseAudioConfig(pitch_fmax=8000)
>>> from TTS.utils.audio import AudioProcessor
>>> conf = BaseAudioConfig(pitch_fmax=640, pitch_fmin=1)
>>> ap = AudioProcessor(**conf)
>>> wav = ap.load_wav(WAV_FILE, sr=22050)[:5 * 22050]
>>> wav = ap.load_wav(WAV_FILE, sr=ap.sample_rate)[:5 * ap.sample_rate]
>>> pitch = ap.compute_f0(wav)
"""
assert pitch_fmax is not None, " [!] Set `pitch_fmax` before calling `compute_f0`."
assert pitch_fmin is not None, " [!] Set `pitch_fmin` before calling `compute_f0`."
f0, t = pw.dio(
x.astype(np.double),
fs=sample_rate,
f0_ceil=pitch_fmax,
frame_period=1000 * hop_length / sample_rate,
f0, voiced_mask, _ = pyin(
y=x.astype(np.double),
fmin=pitch_fmin,
fmax=pitch_fmax,
sr=sample_rate,
frame_length=win_length,
win_length=win_length // 2,
hop_length=hop_length,
pad_mode=stft_pad_mode,
center=center,
n_thresholds=100,
beta_parameters=(2, 18),
boltzmann_parameter=2,
resolution=0.1,
max_transition_rate=35.92,
switch_prob=0.01,
no_trough_prob=0.01,
)
f0 = pw.stonemask(x.astype(np.double), f0, t, sample_rate)
f0[~voiced_mask] = 0.0
return f0
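For context, a self-contained sketch of the `librosa.pyin` call that replaces `pyworld` here, run on a synthetic tone; the parameter values mirror the ones above but are otherwise arbitrary:

```python
import numpy as np
from librosa import pyin

sr, hop_length, win_length = 22050, 256, 1024
t = np.arange(sr) / sr
x = np.sin(2 * np.pi * 220.0 * t)  # one second of a 220 Hz tone

# fmin=1 / fmax=640 mirror the pitch_fmin / pitch_fmax values used above;
# such a wide search range makes pyin slower but still valid.
f0, voiced_mask, _ = pyin(
    y=x.astype(np.double),
    fmin=1,
    fmax=640,
    sr=sr,
    frame_length=win_length,
    win_length=win_length // 2,
    hop_length=hop_length,
    center=True,
)
f0[~voiced_mask] = 0.0  # zero out unvoiced frames, as in the function above
print(f0.shape)         # roughly len(x) / hop_length frames, ~220 Hz where voiced
```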

View File

@ -2,12 +2,12 @@ from typing import Dict, Tuple
import librosa
import numpy as np
import pyworld as pw
import scipy.io.wavfile
import scipy.signal
import soundfile as sf
from TTS.tts.utils.helpers import StandardScaler
from TTS.utils.audio.numpy_transforms import compute_f0
# pylint: disable=too-many-public-methods
@ -573,23 +573,28 @@ class AudioProcessor(object):
>>> WAV_FILE = filename = librosa.util.example_audio_file()
>>> from TTS.config import BaseAudioConfig
>>> from TTS.utils.audio import AudioProcessor
>>> conf = BaseAudioConfig(pitch_fmax=8000)
>>> conf = BaseAudioConfig(pitch_fmax=640, pitch_fmin=1)
>>> ap = AudioProcessor(**conf)
>>> wav = ap.load_wav(WAV_FILE, sr=22050)[:5 * 22050]
>>> wav = ap.load_wav(WAV_FILE, sr=ap.sample_rate)[:5 * ap.sample_rate]
>>> pitch = ap.compute_f0(wav)
"""
assert self.pitch_fmax is not None, " [!] Set `pitch_fmax` before calling `compute_f0`."
assert self.pitch_fmin is not None, " [!] Set `pitch_fmin` before calling `compute_f0`."
# align F0 length to the spectrogram length
if len(x) % self.hop_length == 0:
x = np.pad(x, (0, self.hop_length // 2), mode="reflect")
x = np.pad(x, (0, self.hop_length // 2), mode=self.stft_pad_mode)
f0, t = pw.dio(
x.astype(np.double),
fs=self.sample_rate,
f0_ceil=self.pitch_fmax,
frame_period=1000 * self.hop_length / self.sample_rate,
f0 = compute_f0(
x=x,
pitch_fmax=self.pitch_fmax,
pitch_fmin=self.pitch_fmin,
hop_length=self.hop_length,
win_length=self.win_length,
sample_rate=self.sample_rate,
stft_pad_mode=self.stft_pad_mode,
center=True,
)
f0 = pw.stonemask(x.astype(np.double), f0, t, self.sample_rate)
return f0
### Audio Processing ###

View File

@ -38,9 +38,9 @@ class CapacitronOptimizer:
self.param_groups = self.primary_optimizer.param_groups
self.primary_optimizer.step()
def zero_grad(self):
self.primary_optimizer.zero_grad()
self.secondary_optimizer.zero_grad()
def zero_grad(self, set_to_none=False):
self.primary_optimizer.zero_grad(set_to_none)
self.secondary_optimizer.zero_grad(set_to_none)
def load_state_dict(self, state_dict):
self.primary_optimizer.load_state_dict(state_dict[0])
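A short illustration of what forwarding `set_to_none` changes: with `set_to_none=True`, PyTorch drops the gradient tensors instead of zero-filling them, which is why the wrapper now accepts and passes the flag through:

```python
import torch

p = torch.nn.Parameter(torch.ones(3))
opt = torch.optim.SGD([p], lr=0.1)

p.sum().backward()
opt.zero_grad(set_to_none=False)
print(p.grad)  # tensor([0., 0., 0.]) - the gradient buffer is kept and zeroed

p.sum().backward()
opt.zero_grad(set_to_none=True)
print(p.grad)  # None - the gradient tensor is freed
```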

View File

@ -9,6 +9,8 @@ import fsspec
import torch
from coqpit import Coqpit
from TTS.utils.generic_utils import get_user_data_dir
class RenamingUnpickler(pickle_tts.Unpickler):
"""Overload default pickler to solve module renaming problem"""
@ -57,6 +59,7 @@ def copy_model_files(config: Coqpit, out_path, new_fields=None):
def load_fsspec(
path: str,
map_location: Union[str, Callable, torch.device, Dict[Union[str, torch.device], Union[str, torch.device]]] = None,
cache: bool = True,
**kwargs,
) -> Any:
"""Like torch.load but can load from other locations (e.g. s3:// , gs://).
@ -64,21 +67,33 @@ def load_fsspec(
Args:
path: Any path or url supported by fsspec.
map_location: torch.device or str.
cache: If True, cache a remote file locally for subsequent calls. It is cached under `get_user_data_dir()/tts_cache`. Defaults to True.
**kwargs: Keyword arguments forwarded to torch.load.
Returns:
Object stored in path.
"""
with fsspec.open(path, "rb") as f:
return torch.load(f, map_location=map_location, **kwargs)
is_local = os.path.isdir(path) or os.path.isfile(path)
if cache and not is_local:
with fsspec.open(
f"filecache::{path}",
filecache={"cache_storage": str(get_user_data_dir("tts_cache"))},
mode="rb",
) as f:
return torch.load(f, map_location=map_location, **kwargs)
else:
with fsspec.open(path, "rb") as f:
return torch.load(f, map_location=map_location, **kwargs)
def load_checkpoint(model, checkpoint_path, use_cuda=False, eval=False): # pylint: disable=redefined-builtin
def load_checkpoint(
model, checkpoint_path, use_cuda=False, eval=False, cache=False
): # pylint: disable=redefined-builtin
try:
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"))
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
except ModuleNotFoundError:
pickle_tts.Unpickler = RenamingUnpickler
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), pickle_module=pickle_tts)
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), pickle_module=pickle_tts, cache=cache)
model.load_state_dict(state["model"])
if use_cuda:
model.cuda()
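A hedged sketch of the `filecache::` chaining the new `cache` flag relies on; the checkpoint URL is a placeholder and the cache directory only approximates what `get_user_data_dir("tts_cache")` resolves to on Linux:

```python
# Sketch only: the URL below is a placeholder, not a real release asset.
import os

import fsspec
import torch

url = "https://example.com/path/to/model_file.pth"
cache_dir = os.path.expanduser("~/.local/share/tts_cache")  # approximate Linux location

with fsspec.open(f"filecache::{url}", filecache={"cache_storage": cache_dir}, mode="rb") as f:
    state = torch.load(f, map_location="cpu")  # later calls reuse the cached local copy
```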

View File

@ -32,11 +32,14 @@ class ModelManager(object):
home path.
Args:
models_file (str): path to .model.json
models_file (str): path to .model.json file. Defaults to None.
output_prefix (str): prefix to `tts` to download models. Defaults to None
progress_bar (bool): print a progress bar when downloading a file. Defaults to False.
"""
def __init__(self, models_file=None, output_prefix=None):
def __init__(self, models_file=None, output_prefix=None, progress_bar=False):
super().__init__()
self.progress_bar = progress_bar
if output_prefix is None:
self.output_prefix = get_user_data_dir("tts")
else:
@ -236,7 +239,7 @@ class ModelManager(object):
os.makedirs(output_path, exist_ok=True)
print(f" > Downloading model to {output_path}")
# download from github release
self._download_zip_file(model_item["github_rls_url"], output_path)
self._download_zip_file(model_item["github_rls_url"], output_path, self.progress_bar)
self.print_model_license(model_item=model_item)
# find downloaded files
output_model_path, output_config_path = self._find_files(output_path)
@ -334,7 +337,7 @@ class ModelManager(object):
config.save_json(config_path)
@staticmethod
def _download_zip_file(file_url, output_folder):
def _download_zip_file(file_url, output_folder, progress_bar):
"""Download the github releases"""
# download the file
r = requests.get(file_url, stream=True)
@ -342,11 +345,13 @@ class ModelManager(object):
try:
total_size_in_bytes = int(r.headers.get("content-length", 0))
block_size = 1024 # 1 Kibibyte
progress_bar = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True)
if progress_bar:
progress_bar = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True)
temp_zip_name = os.path.join(output_folder, file_url.split("/")[-1])
with open(temp_zip_name, "wb") as file:
for data in r.iter_content(block_size):
progress_bar.update(len(data))
if progress_bar:
progress_bar.update(len(data))
file.write(data)
with zipfile.ZipFile(temp_zip_name) as z:
z.extractall(output_folder)

View File

@ -212,8 +212,13 @@ class Synthesizer(object):
# handle multi-speaker
speaker_embedding = None
speaker_id = None
if self.tts_speakers_file or hasattr(self.tts_model.speaker_manager, "ids"):
if speaker_name and isinstance(speaker_name, str):
if self.tts_speakers_file or hasattr(self.tts_model.speaker_manager, "name_to_id"):
# handle Neon models with single speaker.
if len(self.tts_model.speaker_manager.name_to_id) == 1:
speaker_id = list(self.tts_model.speaker_manager.name_to_id.values())[0]
elif speaker_name and isinstance(speaker_name, str):
if self.tts_config.use_d_vector_file:
# get the average speaker embedding from the saved d_vectors.
speaker_embedding = self.tts_model.speaker_manager.get_mean_embedding(
@ -222,7 +227,7 @@ class Synthesizer(object):
speaker_embedding = np.array(speaker_embedding)[None, :] # [1 x embedding_dim]
else:
# get speaker idx from the speaker name
speaker_id = self.tts_model.speaker_manager.ids[speaker_name]
speaker_id = self.tts_model.speaker_manager.name_to_id[speaker_name]
elif not speaker_name and not speaker_wav:
raise ValueError(
@ -243,8 +248,12 @@ class Synthesizer(object):
if self.tts_languages_file or (
hasattr(self.tts_model, "language_manager") and self.tts_model.language_manager is not None
):
if language_name and isinstance(language_name, str):
language_id = self.tts_model.language_manager.ids[language_name]
if len(self.tts_model.language_manager.name_to_id) == 1:
language_id = list(self.tts_model.language_manager.name_to_id.values())[0]
elif language_name and isinstance(language_name, str):
language_id = self.tts_model.language_manager.name_to_id[language_name]
elif not language_name:
raise ValueError(
@ -316,7 +325,7 @@ class Synthesizer(object):
# get the speaker embedding or speaker id for the reference wav file
reference_speaker_embedding = None
reference_speaker_id = None
if self.tts_speakers_file or hasattr(self.tts_model.speaker_manager, "ids"):
if self.tts_speakers_file or hasattr(self.tts_model.speaker_manager, "name_to_id"):
if reference_speaker_name and isinstance(reference_speaker_name, str):
if self.tts_config.use_d_vector_file:
# get the speaker embedding from the saved d_vectors.
@ -328,12 +337,11 @@ class Synthesizer(object):
] # [1 x embedding_dim]
else:
# get speaker idx from the speaker name
reference_speaker_id = self.tts_model.speaker_manager.ids[reference_speaker_name]
reference_speaker_id = self.tts_model.speaker_manager.name_to_id[reference_speaker_name]
else:
reference_speaker_embedding = self.tts_model.speaker_manager.compute_embedding_from_clip(
reference_wav
)
outputs = transfer_voice(
model=self.tts_model,
CONFIG=self.tts_config,

View File

@ -1,3 +1,4 @@
import soundfile as sf
import torch
import torchaudio
@ -48,7 +49,7 @@ def remove_silence(
):
# get the VAD model and utils functions
model, get_speech_timestamps, save_audio, collect_chunks = model_and_utils
model, get_speech_timestamps, _, collect_chunks = model_and_utils
# read ground truth wav and resample the audio for the VAD
wav, gt_sample_rate = read_audio(audio_path)
@ -73,9 +74,11 @@ def remove_silence(
# if have speech timestamps else save the wav
if new_speech_timestamps:
wav = collect_chunks(new_speech_timestamps, wav)
is_speech = True
else:
print(f"> The file {audio_path} probably does not have speech please check it !!")
is_speech = False
# save audio
save_audio(out_path, wav, sampling_rate=gt_sample_rate)
return out_path
sf.write(out_path, wav, gt_sample_rate, subtype="PCM_16")
return out_path, is_speech
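A small sketch of the new 16-bit PCM write path and the `(out_path, is_speech)` return contract; the waveform is synthetic and the VAD step itself is omitted:

```python
import numpy as np
import soundfile as sf

sr = 16000
wav = 0.1 * np.sin(2 * np.pi * 440.0 * np.arange(sr) / sr).astype(np.float32)

out_path = "trimmed.wav"
sf.write(out_path, wav, sr, subtype="PCM_16")  # same write call as above

is_speech = True              # set to False when no speech timestamps were found
print((out_path, is_speech))  # callers of remove_silence now unpack both values
```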

View File

@ -22,14 +22,12 @@ class HifiganConfig(BaseGANVocoderConfig):
generator_model_params (dict): Parameters of the generator model. Defaults to
`
{
"use_mel": True,
"sample_rate": 22050,
"n_fft": 1024,
"hop_length": 256,
"win_length": 1024,
"n_mels": 80,
"mel_fmin": 0.0,
"mel_fmax": None,
"upsample_factors": [8, 8, 2, 2],
"upsample_kernel_sizes": [16, 16, 4, 4],
"upsample_initial_channel": 512,
"resblock_kernel_sizes": [3, 7, 11],
"resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
"resblock_type": "1",
}
`
batch_size (int):

View File

@ -231,6 +231,7 @@ class GAN(BaseVocoder):
config: Coqpit,
checkpoint_path: str,
eval: bool = False, # pylint: disable=unused-argument, redefined-builtin
cache: bool = False,
) -> None:
"""Load a GAN checkpoint and initialize model parameters.
@ -239,7 +240,7 @@ class GAN(BaseVocoder):
checkpoint_path (str): Checkpoint file path.
eval (bool, optional): If True, load the model for inference. Defaults to False.
"""
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"))
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
# band-aid for older than v0.0.15 GAN models
if "model_disc" in state:
self.model_g.load_checkpoint(config, checkpoint_path, eval)

View File

@ -290,9 +290,9 @@ class HifiganGenerator(torch.nn.Module):
remove_weight_norm(self.conv_post)
def load_checkpoint(
self, config, checkpoint_path, eval=False
self, config, checkpoint_path, eval=False, cache=False
): # pylint: disable=unused-argument, redefined-builtin
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"))
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
self.load_state_dict(state["model"])
if eval:
self.eval()

View File

@ -85,9 +85,9 @@ class MelganGenerator(nn.Module):
layer.remove_weight_norm()
def load_checkpoint(
self, config, checkpoint_path, eval=False
self, config, checkpoint_path, eval=False, cache=False
): # pylint: disable=unused-argument, redefined-builtin
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"))
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
self.load_state_dict(state["model"])
if eval:
self.eval()

View File

@ -153,9 +153,9 @@ class ParallelWaveganGenerator(torch.nn.Module):
return self._get_receptive_field_size(self.layers, self.stacks, self.kernel_size)
def load_checkpoint(
self, config, checkpoint_path, eval=False
self, config, checkpoint_path, eval=False, cache=False
): # pylint: disable=unused-argument, redefined-builtin
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"))
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
self.load_state_dict(state["model"])
if eval:
self.eval()

View File

@ -218,9 +218,9 @@ class Wavegrad(BaseVocoder):
self.y_conv = weight_norm(self.y_conv)
def load_checkpoint(
self, config, checkpoint_path, eval=False
self, config, checkpoint_path, eval=False, cache=False
): # pylint: disable=unused-argument, redefined-builtin
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"))
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
self.load_state_dict(state["model"])
if eval:
self.eval()

View File

@ -542,9 +542,9 @@ class Wavernn(BaseVocoder):
return unfolded
def load_checkpoint(
self, config, checkpoint_path, eval=False
self, config, checkpoint_path, eval=False, cache=False
): # pylint: disable=unused-argument, redefined-builtin
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"))
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
self.load_state_dict(state["model"])
if eval:
self.eval()

View File

@ -0,0 +1,56 @@
(docker_images)=
## Docker images
We provide docker images so you can test TTS without having to set up your own environment.
### Using premade images
You can use premade images built automatically from the latest TTS version.
#### CPU version
```bash
docker pull ghcr.io/coqui-ai/tts-cpu
```
#### GPU version
```bash
docker pull ghcr.io/coqui-ai/tts
```
### Building your own image
```bash
docker build -t tts .
```
## Basic inference
Basic usage: generate an audio file from text passed as an argument.
You can pass any tts argument after the image name.
### CPU version
```bash
docker run --rm -v ~/tts-output:/root/tts-output ghcr.io/coqui-ai/tts-cpu --text "Hello." --out_path /root/tts-output/hello.wav
```
### GPU version
For the GPU version, you need to have the latest NVIDIA drivers installed.
With `nvidia-smi` you can check the supported CUDA version; it must be >= 11.8.
```bash
docker run --rm --gpus all -v ~/tts-output:/root/tts-output ghcr.io/coqui-ai/tts --text "Hello." --out_path /root/tts-output/hello.wav --use_cuda true
```
## Start a server
Starting a TTS server:
Start the container and get a shell inside it.
### CPU version
```bash
docker run --rm -it -p 5002:5002 --entrypoint /bin/bash ghcr.io/coqui-ai/tts-cpu
python3 TTS/server/server.py --list_models #To get the list of available models
python3 TTS/server/server.py --model_name tts_models/en/vctk/vits
```
### GPU version
```bash
docker run --rm -it -p 5002:5002 --gpus all --entrypoint /bin/bash ghcr.io/coqui-ai/tts
python3 TTS/server/server.py --list_models #To get the list of available models
python3 TTS/server/server.py --model_name tts_models/en/vctk/vits --use_cuda true
```
Click [here](http://[::1]:5002/) and have fun with the server!
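If you prefer to script against the running container instead of using the browser, a rough sketch with `requests` is below; the `/api/tts` endpoint and its `text` parameter are assumptions based on `TTS/server/server.py`, so check that file if the call does not match:

```python
# Rough sketch, assuming the server started above is listening on port 5002
# and exposes GET /api/tts?text=... (verify against TTS/server/server.py).
import requests

resp = requests.get("http://localhost:5002/api/tts", params={"text": "Hello."}, timeout=120)
resp.raise_for_status()
with open("hello.wav", "wb") as f:
    f.write(resp.content)
```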

View File

@ -53,7 +53,7 @@ We tried to collect common issues and questions we receive about 🐸TTS. It is
"mixed_precision": false,
"output_path": "recipes/ljspeech/glow_tts/",
"test_sentences": ["Test this sentence.", "This test sentence.", "Sentence this test."],
"datasets":[{"name": "ljspeech", "meta_file_train":"metadata.csv", "path": "recipes/ljspeech/LJSpeech-1.1/"}]
"datasets":[{"formatter": "ljspeech", "meta_file_train":"metadata.csv", "path": "recipes/ljspeech/LJSpeech-1.1/"}]
}
```

View File

@ -88,7 +88,7 @@ from TTS.tts.datasets import load_tts_samples
# dataset config for one of the pre-defined datasets
dataset_config = BaseDatasetConfig(
name="vctk", meta_file_train="", language="en-us", path="dataset-path")
formatter="vctk", meta_file_train="", language="en-us", path="dataset-path")
)
# load training samples

View File

@ -20,6 +20,7 @@
:caption: Using 🐸TTS
inference
docker_images
implementing_a_new_model
training_a_model
finetuning

View File

@ -12,7 +12,7 @@ Currently we provide the following pre-configured architectures:
- **FastPitch:**
It uses the same FastSpeech architecture that us conditioned on fundemental frequency (f0) contours with the
It uses the same FastSpeech architecture that is conditioned on fundamental frequency (f0) contours with the
promise of more expressive speech.
- **SpeedySpeech:**

View File

@ -84,7 +84,7 @@ We still support running training from CLI like in the old days. The same traini
"print_eval": true,
"mixed_precision": false,
"output_path": "recipes/ljspeech/glow_tts/",
"datasets":[{"name": "ljspeech", "meta_file_train":"metadata.csv", "path": "recipes/ljspeech/LJSpeech-1.1/"}]
"datasets":[{"formatter": "ljspeech", "meta_file_train":"metadata.csv", "path": "recipes/ljspeech/LJSpeech-1.1/"}]
}
```
@ -120,6 +120,3 @@ $ tts-server -h # see the help
$ tts-server --list_models # list the available models.
```
![server.gif](https://github.com/coqui-ai/TTS/raw/main/images/demo_server.gif)

View File

@ -74,7 +74,7 @@
"<span style=\"color:purple;font-size:15px\">\n",
"/MyTTSDataset <br /> \n",
"&emsp;| <br /> \n",
"&emsp;| -> metadata.txt<br /> \n",
"&emsp;| -> metadata.csv<br /> \n",
"&emsp;| -> /wavs<br /> \n",
"&emsp;&emsp;| -> audio1.wav<br /> \n",
"&emsp;&emsp;| -> audio2.wav<br /> \n",

View File

@ -15,7 +15,7 @@ output_path = os.path.dirname(os.path.abspath(__file__))
data_path = "/srv/data/"
# Using LJSpeech like dataset processing for the blizzard dataset
dataset_config = BaseDatasetConfig(name="ljspeech", meta_file_train="metadata.csv", path=data_path)
dataset_config = BaseDatasetConfig(formatter="ljspeech", meta_file_train="metadata.csv", path=data_path)
audio_config = BaseAudioConfig(
sample_rate=24000,

View File

@ -16,7 +16,7 @@ data_path = "/srv/data/blizzard2013/segmented"
# Using LJSpeech like dataset processing for the blizzard dataset
dataset_config = BaseDatasetConfig(
name="ljspeech",
formatter="ljspeech",
meta_file_train="metadata.csv",
path=data_path,
)

View File

@ -1,7 +1,7 @@
{
"datasets": [
{
"name": "kokoro",
"formatter": "kokoro",
"path": "DEFINE THIS",
"meta_file_train": "metadata.csv",
"meta_file_val": null
@ -119,7 +119,7 @@
"phonemes": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
},
"use_speaker_embedding": false,
"use_gst": false,
"use_gst": false,
"use_external_speaker_embedding_file": false,
"external_speaker_embedding_file": "../../speakers-vctk-en.json"
}

View File

@ -13,7 +13,7 @@ output_path = os.path.dirname(os.path.abspath(__file__))
# init configs
dataset_config = BaseDatasetConfig(
name="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/")
formatter="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/")
)
config = AlignTTSConfig(
batch_size=32,

View File

@ -14,7 +14,7 @@ output_path = os.path.dirname(os.path.abspath(__file__))
# init configs
dataset_config = BaseDatasetConfig(
name="ljspeech",
formatter="ljspeech",
meta_file_train="metadata.csv",
# meta_file_attn_mask=os.path.join(output_path, "../LJSpeech-1.1/metadata_attn_mask.txt"),
path=os.path.join(output_path, "../LJSpeech-1.1/"),

View File

@ -14,7 +14,7 @@ output_path = os.path.dirname(os.path.abspath(__file__))
# init configs
dataset_config = BaseDatasetConfig(
name="ljspeech",
formatter="ljspeech",
meta_file_train="metadata.csv",
# meta_file_attn_mask=os.path.join(output_path, "../LJSpeech-1.1/metadata_attn_mask.txt"),
path=os.path.join(output_path, "../LJSpeech-1.1/"),

View File

@ -21,7 +21,7 @@ output_path = os.path.dirname(os.path.abspath(__file__))
# Set LJSpeech as our target dataset and define its path.
# You can also use a simple Dict to define the dataset and pass it to your custom formatter.
dataset_config = BaseDatasetConfig(
name="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/")
formatter="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/")
)
# INITIALIZE THE TRAINING CONFIGURATION

View File

@ -11,7 +11,7 @@ from TTS.utils.audio import AudioProcessor
output_path = os.path.dirname(os.path.abspath(__file__))
dataset_config = BaseDatasetConfig(
name="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/")
formatter="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/")
)
audio_config = BaseAudioConfig(

View File

@ -16,7 +16,7 @@ data_path = "/srv/data/"
# Using LJSpeech like dataset processing for the blizzard dataset
dataset_config = BaseDatasetConfig(
name="ljspeech",
formatter="ljspeech",
meta_file_train="metadata.csv",
path=data_path,
)

View File

@ -16,7 +16,7 @@ output_path = os.path.dirname(os.path.abspath(__file__))
# init configs
dataset_config = BaseDatasetConfig(
name="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/")
formatter="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/")
)
audio_config = BaseAudioConfig(

View File

@ -16,7 +16,7 @@ output_path = os.path.dirname(os.path.abspath(__file__))
# init configs
dataset_config = BaseDatasetConfig(
name="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/")
formatter="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/")
)
audio_config = BaseAudioConfig(

View File

@ -11,7 +11,7 @@ from TTS.utils.audio import AudioProcessor
output_path = os.path.dirname(os.path.abspath(__file__))
dataset_config = BaseDatasetConfig(
name="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/")
formatter="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/")
)
audio_config = VitsAudioConfig(
sample_rate=22050, win_length=1024, hop_length=256, num_mels=80, mel_fmin=0, mel_fmax=None

View File

@ -17,7 +17,7 @@ output_path = os.path.dirname(os.path.abspath(__file__))
mailabs_path = "/home/julian/workspace/mailabs/**"
dataset_paths = glob(mailabs_path)
dataset_config = [
BaseDatasetConfig(name="mailabs", meta_file_train=None, path=path, language=path.split("/")[-1])
BaseDatasetConfig(formatter="mailabs", meta_file_train=None, path=path, language=path.split("/")[-1])
for path in dataset_paths
]

View File

@ -14,7 +14,7 @@ output_path = os.path.dirname(os.path.abspath(__file__))
# init configs
dataset_config = BaseDatasetConfig(
name="thorsten", meta_file_train="metadata.csv", path=os.path.join(output_path, "../thorsten-de/")
formatter="thorsten", meta_file_train="metadata.csv", path=os.path.join(output_path, "../thorsten-de/")
)
# download dataset if not already present

View File

@ -22,7 +22,7 @@ output_path = os.path.dirname(os.path.abspath(__file__))
# Set LJSpeech as our target dataset and define its path.
# You can also use a simple Dict to define the dataset and pass it to your custom formatter.
dataset_config = BaseDatasetConfig(
name="thorsten", meta_file_train="metadata.csv", path=os.path.join(output_path, "../thorsten-de/")
formatter="thorsten", meta_file_train="metadata.csv", path=os.path.join(output_path, "../thorsten-de/")
)
# download dataset if not already present

View File

@ -12,7 +12,7 @@ from TTS.utils.downloaders import download_thorsten_de
output_path = os.path.dirname(os.path.abspath(__file__))
dataset_config = BaseDatasetConfig(
name="thorsten", meta_file_train="metadata.csv", path=os.path.join(output_path, "../thorsten-de/")
formatter="thorsten", meta_file_train="metadata.csv", path=os.path.join(output_path, "../thorsten-de/")
)
# download dataset if not already present

View File

@ -16,7 +16,7 @@ output_path = os.path.dirname(os.path.abspath(__file__))
# init configs
dataset_config = BaseDatasetConfig(
name="thorsten", meta_file_train="metadata.csv", path=os.path.join(output_path, "../thorsten-de/")
formatter="thorsten", meta_file_train="metadata.csv", path=os.path.join(output_path, "../thorsten-de/")
)
# download dataset if not already present

View File

@ -12,7 +12,7 @@ from TTS.utils.downloaders import download_thorsten_de
output_path = os.path.dirname(os.path.abspath(__file__))
dataset_config = BaseDatasetConfig(
name="thorsten", meta_file_train="metadata.csv", path=os.path.join(output_path, "../thorsten-de/")
formatter="thorsten", meta_file_train="metadata.csv", path=os.path.join(output_path, "../thorsten-de/")
)
# download dataset if not already present

View File

@ -11,7 +11,7 @@ from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor
output_path = os.path.dirname(os.path.abspath(__file__))
dataset_config = BaseDatasetConfig(name="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/"))
dataset_config = BaseDatasetConfig(formatter="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/"))
audio_config = BaseAudioConfig(
sample_rate=22050,

View File

@ -11,7 +11,7 @@ from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor
output_path = os.path.dirname(os.path.abspath(__file__))
dataset_config = BaseDatasetConfig(name="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/"))
dataset_config = BaseDatasetConfig(formatter="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/"))
audio_config = BaseAudioConfig(
sample_rate=22050,

View File

@ -22,7 +22,7 @@ if not os.path.exists(dataset_path):
download_vctk(dataset_path)
# define dataset config
dataset_config = BaseDatasetConfig(name="vctk", meta_file_train="", path=dataset_path)
dataset_config = BaseDatasetConfig(formatter="vctk", meta_file_train="", path=dataset_path)
# define audio config
# ❗ resample the dataset externally using `TTS/bin/resample.py` and set `resample=False` for faster training

View File

@ -0,0 +1,139 @@
import os
from TTS.encoder.configs.speaker_encoder_config import SpeakerEncoderConfig
# from TTS.encoder.configs.emotion_encoder_config import EmotionEncoderConfig
from TTS.tts.configs.shared_configs import BaseDatasetConfig
CURRENT_PATH = os.getcwd()
# change the root path to the TTS root path
os.chdir("../../../")
### Definitions ###
# dataset
VCTK_PATH = "/raid/datasets/VCTK_NEW_16khz_removed_silence_silero_vad/" # download: https://datashare.ed.ac.uk/bitstream/handle/10283/3443/VCTK-Corpus-0.92.zipdddddddddd
RIR_SIMULATED_PATH = "/raid/datasets/DA/RIRS_NOISES/simulated_rirs/" # download: https://www.openslr.org/17/
MUSAN_PATH = "/raid/datasets/DA/musan/" # download: https://www.openslr.org/17/
# training
OUTPUT_PATH = os.path.join(
CURRENT_PATH, "resnet_speaker_encoder_training_output/"
) # path to save the train logs and checkpoint
CONFIG_OUT_PATH = os.path.join(OUTPUT_PATH, "config_se.json")
RESTORE_PATH = None # Checkpoint to use for transfer learning if None ignore
# instance the config
# to speaker encoder
config = SpeakerEncoderConfig()
# to emotion encoder
# config = EmotionEncoderConfig()
#### DATASET CONFIG ####
# The formatter needs to return the key "speaker_name" for the speaker encoder and "emotion_name" for the emotion encoder
dataset_config = BaseDatasetConfig(formatter="vctk", meta_file_train="", language="en-us", path=VCTK_PATH)
# add the dataset to the config
config.datasets = [dataset_config]
#### TRAINING CONFIG ####
# The encoder data loader balances dataset items equally to improve training and to meet the loss requirements
# It has two parameters that control the final batch size: the total number of speakers used in each batch and the number of samples per speaker
# total number of speakers per batch during training
config.num_classes_in_batch = 100
# number of utterances per class/speaker in the batch during training
config.num_utter_per_class = 4
# final batch size = config.num_classes_in_batch * config.num_utter_per_class
# total number of speakers per batch during evaluation
config.eval_num_classes_in_batch = 100
# number of utterances per class/speaker in the batch during evaluation
config.eval_num_utter_per_class = 4
# number of data loader workers
config.num_loader_workers = 8
config.num_val_loader_workers = 8
# number of epochs
config.epochs = 10000
# loss to be used in training
config.loss = "softmaxproto"
# run eval
config.run_eval = False
# output path for the checkpoints
config.output_path = OUTPUT_PATH
# Save local checkpoint every save_step steps
config.save_step = 2000
### Model Config ###
config.model_params = {
"model_name": "resnet", # supported "lstm" and "resnet"
"input_dim": 64,
"use_torch_spec": True,
"log_input": True,
"proj_dim": 512, # embedding dim
}
### Audio Config ###
# To speed up training, the model divides the audio into small parts. This parameter defines the length in seconds of these "parts"
config.voice_len = 2.0
# all others configs
config.audio = {
"fft_size": 512,
"win_length": 400,
"hop_length": 160,
"frame_shift_ms": None,
"frame_length_ms": None,
"stft_pad_mode": "reflect",
"sample_rate": 16000,
"resample": False,
"preemphasis": 0.97,
"ref_level_db": 20,
"do_sound_norm": False,
"do_trim_silence": False,
"trim_db": 60,
"power": 1.5,
"griffin_lim_iters": 60,
"num_mels": 64,
"mel_fmin": 0.0,
"mel_fmax": 8000.0,
"spec_gain": 20,
"signal_norm": False,
"min_level_db": -100,
"symmetric_norm": False,
"max_norm": 4.0,
"clip_norm": False,
"stats_path": None,
"do_rms_norm": True,
"db_level": -27.0,
}
### Augmentation Config ###
config.audio_augmentation = {
# additive noise and room impulse response (RIR) simulation similar to: https://arxiv.org/pdf/2009.14153.pdf
"p": 0.5, # probability to the use of one of the augmentation - 0 means disabled
"rir": {"rir_path": RIR_SIMULATED_PATH, "conv_mode": "full"}, # download: https://www.openslr.org/17/
"additive": {
"sounds_path": MUSAN_PATH,
"speech": {"min_snr_in_db": 13, "max_snr_in_db": 20, "min_num_noises": 1, "max_num_noises": 1},
"noise": {"min_snr_in_db": 0, "max_snr_in_db": 15, "min_num_noises": 1, "max_num_noises": 1},
"music": {"min_snr_in_db": 5, "max_snr_in_db": 15, "min_num_noises": 1, "max_num_noises": 1},
},
"gaussian": {"p": 0.7, "min_amplitude": 0.0, "max_amplitude": 1e-05},
}
config.save_json(CONFIG_OUT_PATH)
print(CONFIG_OUT_PATH)
if RESTORE_PATH is not None:
command = f"python TTS/bin/train_encoder.py --config_path {CONFIG_OUT_PATH} --restore_path {RESTORE_PATH}"
else:
command = f"python TTS/bin/train_encoder.py --config_path {CONFIG_OUT_PATH}"
os.system(command)

View File

@ -11,7 +11,7 @@ from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor
output_path = os.path.dirname(os.path.abspath(__file__))
dataset_config = BaseDatasetConfig(name="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/"))
dataset_config = BaseDatasetConfig(formatter="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/"))
audio_config = BaseAudioConfig(
sample_rate=22050,

View File

@ -12,7 +12,7 @@ from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor
output_path = os.path.dirname(os.path.abspath(__file__))
dataset_config = BaseDatasetConfig(name="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/"))
dataset_config = BaseDatasetConfig(formatter="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/"))
audio_config = BaseAudioConfig(
sample_rate=22050,

View File

@ -12,7 +12,7 @@ from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor
output_path = os.path.dirname(os.path.abspath(__file__))
dataset_config = BaseDatasetConfig(name="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/"))
dataset_config = BaseDatasetConfig(formatter="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/"))
audio_config = BaseAudioConfig(
sample_rate=22050,

View File

@ -12,7 +12,7 @@ from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor
output_path = os.path.dirname(os.path.abspath(__file__))
dataset_config = BaseDatasetConfig(name="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/"))
dataset_config = BaseDatasetConfig(formatter="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/"))
audio_config = BaseAudioConfig(
sample_rate=22050,

View File

@ -12,7 +12,7 @@ from TTS.utils.audio import AudioProcessor
output_path = os.path.dirname(os.path.abspath(__file__))
dataset_config = BaseDatasetConfig(
name="vctk", meta_file_train="", language="en-us", path=os.path.join(output_path, "../VCTK/")
formatter="vctk", meta_file_train="", language="en-us", path=os.path.join(output_path, "../VCTK/")
)

View File

@ -23,7 +23,6 @@ umap-learn==0.5.1
pandas
# deps for training
matplotlib
pyworld==0.2.10 # > 0.2.10 is not p3.10.x compatible
# coqui stack
trainer
# config management
@ -35,4 +34,8 @@ pypinyin
mecab-python3==1.0.5
unidic-lite==1.0.8
# gruut+supported langs
gruut[cs,de,es,fr,it,nl,pt,ru,sv]==2.2.3
gruut[de]==2.2.3
# deps for korean
jamo
nltk
g2pkk>=0.1.1

View File

@ -33,7 +33,9 @@ def get_tests_data_path():
def get_tests_output_path():
"""Returns the path to the directory for test outputs."""
return os.path.join(get_tests_path(), "outputs")
path = os.path.join(get_tests_path(), "outputs")
os.makedirs(path, exist_ok=True)
return path
def run_cli(command):
@ -42,7 +44,7 @@ def run_cli(command):
def get_test_data_config():
return BaseDatasetConfig(name="ljspeech", path="tests/data/ljspeech/", meta_file_train="metadata.csv")
return BaseDatasetConfig(formatter="ljspeech", path="tests/data/ljspeech/", meta_file_train="metadata.csv")
def assertHasAttr(test_obj, obj, intendedAttr):

View File

@ -10,7 +10,7 @@ OUT_PATH = os.path.join(get_tests_output_path(), "audio_tests")
WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav")
os.makedirs(OUT_PATH, exist_ok=True)
conf = BaseAudioConfig(mel_fmax=8000)
conf = BaseAudioConfig(mel_fmax=8000, pitch_fmax=640, pitch_fmin=1)
# pylint: disable=protected-access

View File

@ -0,0 +1,92 @@
import os
import unittest
import numpy as np
import torch
from tests import get_tests_input_path
from TTS.config import load_config
from TTS.encoder.utils.generic_utils import setup_encoder_model
from TTS.encoder.utils.io import save_checkpoint
from TTS.tts.utils.managers import EmbeddingManager
from TTS.utils.audio import AudioProcessor
encoder_config_path = os.path.join(get_tests_input_path(), "test_speaker_encoder_config.json")
encoder_model_path = os.path.join(get_tests_input_path(), "checkpoint_0.pth")
sample_wav_path = os.path.join(get_tests_input_path(), "../data/ljspeech/wavs/LJ001-0001.wav")
sample_wav_path2 = os.path.join(get_tests_input_path(), "../data/ljspeech/wavs/LJ001-0002.wav")
embedding_file_path = os.path.join(get_tests_input_path(), "../data/dummy_speakers.json")
embeddings_file_path2 = os.path.join(get_tests_input_path(), "../data/dummy_speakers2.json")
embeddings_file_pth_path = os.path.join(get_tests_input_path(), "../data/dummy_speakers.pth")
class EmbeddingManagerTest(unittest.TestCase):
"""Test emEeddingManager for loading embedding files and computing embeddings from waveforms"""
@staticmethod
def test_speaker_embedding():
# load config
config = load_config(encoder_config_path)
config.audio.resample = True
# create a dummy speaker encoder
model = setup_encoder_model(config)
save_checkpoint(model, None, None, get_tests_input_path(), 0)
# load audio processor and speaker encoder
manager = EmbeddingManager(encoder_model_path=encoder_model_path, encoder_config_path=encoder_config_path)
# load a sample audio and compute embedding
ap = AudioProcessor(**config.audio)
waveform = ap.load_wav(sample_wav_path)
mel = ap.melspectrogram(waveform)
embedding = manager.compute_embeddings(mel)
assert embedding.shape[1] == 256
# compute embedding directly from an input file
embedding = manager.compute_embedding_from_clip(sample_wav_path)
embedding2 = manager.compute_embedding_from_clip(sample_wav_path)
embedding = torch.FloatTensor(embedding)
embedding2 = torch.FloatTensor(embedding2)
assert embedding.shape[0] == 256
assert (embedding - embedding2).sum() == 0.0
# compute embedding from a list of wav files.
embedding3 = manager.compute_embedding_from_clip([sample_wav_path, sample_wav_path2])
embedding3 = torch.FloatTensor(embedding3)
assert embedding3.shape[0] == 256
assert (embedding - embedding3).sum() != 0.0
# remove dummy model
os.remove(encoder_model_path)
def test_embedding_file_processing(self): # pylint: disable=no-self-use
manager = EmbeddingManager(embedding_file_path=embeddings_file_pth_path)
# test embedding querying
embedding = manager.get_embedding_by_clip(manager.clip_ids[0])
assert len(embedding) == 256
embeddings = manager.get_embeddings_by_name(manager.embedding_names[0])
assert len(embeddings[0]) == 256
embedding1 = manager.get_mean_embedding(manager.embedding_names[0], num_samples=2, randomize=True)
assert len(embedding1) == 256
embedding2 = manager.get_mean_embedding(manager.embedding_names[0], num_samples=2, randomize=False)
assert len(embedding2) == 256
assert np.sum(np.array(embedding1) - np.array(embedding2)) != 0
def test_embedding_file_loading(self):
# test loading a json file
manager = EmbeddingManager(embedding_file_path=embedding_file_path)
self.assertEqual(manager.num_embeddings, 384)
self.assertEqual(manager.embedding_dim, 256)
# test loading a pth file
manager = EmbeddingManager(embedding_file_path=embeddings_file_pth_path)
self.assertEqual(manager.num_embeddings, 384)
self.assertEqual(manager.embedding_dim, 256)
# test loading pth files with duplicate embedding keys
with self.assertRaises(Exception) as context:
manager = EmbeddingManager(embedding_file_path=[embeddings_file_pth_path, embeddings_file_pth_path])
self.assertTrue("Duplicate embedding names" in str(context.exception))
# test loading embedding files with different embedding keys
manager = EmbeddingManager(embedding_file_path=[embeddings_file_pth_path, embeddings_file_path2])
self.assertEqual(manager.embedding_dim, 256)
self.assertEqual(manager.num_embeddings, 384 * 2)
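The new test above doubles as a reference for the EmbeddingManager API. Here is a minimal usage sketch assembled only from the calls exercised in the test; the file paths below are placeholders, not files shipped with the repository.

from TTS.tts.utils.managers import EmbeddingManager

# Query precomputed speaker embeddings from a file (path is a placeholder).
manager = EmbeddingManager(embedding_file_path="dummy_speakers.pth")
print(manager.num_embeddings, manager.embedding_dim)
mean_emb = manager.get_mean_embedding(manager.embedding_names[0], num_samples=2, randomize=False)

# Or compute an embedding directly from a wav clip with a trained speaker encoder (paths are placeholders).
manager = EmbeddingManager(
    encoder_model_path="checkpoint_0.pth",
    encoder_config_path="test_speaker_encoder_config.json",
)
embedding = manager.compute_embedding_from_clip("LJ001-0001.wav")  # 256-dim, per the assertions above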

View File

@@ -15,7 +15,7 @@ class TestExtractTTSSpectrograms(unittest.TestCase):
def test_GlowTTS():
# set paths
config_path = os.path.join(get_tests_input_path(), "test_glow_tts.json")
checkpoint_path = os.path.join(get_tests_output_path(), "checkpoint_test.pth")
checkpoint_path = os.path.join(get_tests_output_path(), "glowtts.pth")
output_path = os.path.join(get_tests_output_path(), "output_extract_tts_spectrograms/")
# load config
c = load_config(config_path)
@@ -33,7 +33,7 @@ class TestExtractTTSSpectrograms(unittest.TestCase):
def test_Tacotron2():
# set paths
config_path = os.path.join(get_tests_input_path(), "test_tacotron2_config.json")
checkpoint_path = os.path.join(get_tests_output_path(), "checkpoint_test.pth")
checkpoint_path = os.path.join(get_tests_output_path(), "tacotron2.pth")
output_path = os.path.join(get_tests_output_path(), "output_extract_tts_spectrograms/")
# load config
c = load_config(config_path)
@@ -51,7 +51,7 @@ class TestExtractTTSSpectrograms(unittest.TestCase):
def test_Tacotron():
# set paths
config_path = os.path.join(get_tests_input_path(), "test_tacotron_config.json")
checkpoint_path = os.path.join(get_tests_output_path(), "checkpoint_test.pth")
checkpoint_path = os.path.join(get_tests_output_path(), "tacotron.pth")
output_path = os.path.join(get_tests_output_path(), "output_extract_tts_spectrograms/")
# load config
c = load_config(config_path)

View File

@@ -12,20 +12,22 @@ torch.manual_seed(1)
config_path = os.path.join(get_tests_output_path(), "test_model_config.json")
dataset_config_en = BaseDatasetConfig(
name="ljspeech",
formatter="ljspeech",
meta_file_train="metadata.csv",
meta_file_val="metadata.csv",
path="tests/data/ljspeech",
language="en",
)
"""
dataset_config_pt = BaseDatasetConfig(
name="ljspeech",
formatter="ljspeech",
meta_file_train="metadata.csv",
meta_file_val="metadata.csv",
path="tests/data/ljspeech",
language="pt-br",
)
"""
# pylint: disable=protected-access
class TestFindUniquePhonemes(unittest.TestCase):
@@ -46,7 +48,7 @@ class TestFindUniquePhonemes(unittest.TestCase):
epochs=1,
print_step=1,
print_eval=True,
datasets=[dataset_config_en, dataset_config_pt],
datasets=[dataset_config_en],
)
config.save_json(config_path)
@@ -70,7 +72,7 @@ class TestFindUniquePhonemes(unittest.TestCase):
epochs=1,
print_step=1,
print_eval=True,
datasets=[dataset_config_en, dataset_config_pt],
datasets=[dataset_config_en],
)
config.save_json(config_path)

View File

@@ -31,7 +31,8 @@ class TestNumpyTransforms(unittest.TestCase):
mel_fmin: int = 0
hop_length: int = 256
win_length: int = 1024
pitch_fmax: int = 450
pitch_fmax: int = 640
pitch_fmin: int = 1
trim_db: int = -1
min_silence_sec: float = 0.01
gain: float = 1.0

Some files were not shown because too many files have changed in this diff.