Merge pull request #1942 from coqui-ai/dev

v0.9.0
Commit 56ba616a03 by Eren Gölge on 2022-11-16 16:50:57 +01:00, committed by GitHub
165 changed files with 101831 additions and 443 deletions


@@ -1,2 +1,9 @@
 .git/
 Dockerfile
+build/
+dist/
+TTS.egg-info/
+tests/outputs/*
+tests/train_outputs/*
+__pycache__/
+*.pyc


@@ -21,9 +21,9 @@ jobs:
       python-version: [3.7, 3.8, 3.9, "3.10"]
       experimental: [false]
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
       - name: Set up Python ${{ matrix.python-version }}
-        uses: coqui-ai/setup-python@pip-cache-key-py-ver
+        uses: actions/setup-python@v4
         with:
           python-version: ${{ matrix.python-version }}
           architecture: x64


@@ -21,9 +21,9 @@ jobs:
       python-version: [3.7, 3.8, 3.9, "3.10"]
       experimental: [false]
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
       - name: Set up Python ${{ matrix.python-version }}
-        uses: coqui-ai/setup-python@pip-cache-key-py-ver
+        uses: actions/setup-python@v4
         with:
           python-version: ${{ matrix.python-version }}
           architecture: x64


@@ -15,8 +15,8 @@ jobs:
       matrix:
         arch: ["amd64"]
         base:
-          - "nvcr.io/nvidia/pytorch:22.03-py3" # GPU enabled
-          - "ubuntu:20.04" # CPU only
+          - "nvidia/cuda:11.8.0-base-ubuntu22.04" # GPU enabled
+          - "python:3.10.8-slim" # CPU only
     steps:
       - uses: actions/checkout@v2
       - name: Log in to the Container registry
@@ -32,7 +32,7 @@ jobs:
           base="ghcr.io/coqui-ai/tts"
           tags="" # PR build
-          if [[ ${{ matrix.base }} = "ubuntu:20.04" ]]; then
+          if [[ ${{ matrix.base }} = "python:3.10.8-slim" ]]; then
             base="ghcr.io/coqui-ai/tts-cpu"
           fi


@@ -21,9 +21,9 @@ jobs:
       python-version: [3.7, 3.8, 3.9, "3.10"]
       experimental: [false]
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
       - name: Set up Python ${{ matrix.python-version }}
-        uses: coqui-ai/setup-python@pip-cache-key-py-ver
+        uses: actions/setup-python@v4
         with:
           python-version: ${{ matrix.python-version }}
           architecture: x64


@@ -21,9 +21,9 @@ jobs:
       python-version: [3.9]
       experimental: [false]
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
       - name: Set up Python ${{ matrix.python-version }}
-        uses: coqui-ai/setup-python@pip-cache-key-py-ver
+        uses: actions/setup-python@v4
         with:
           python-version: ${{ matrix.python-version }}
           architecture: x64


@@ -21,9 +21,9 @@ jobs:
       python-version: [3.7, 3.8, 3.9, "3.10"]
       experimental: [false]
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
       - name: Set up Python ${{ matrix.python-version }}
-        uses: coqui-ai/setup-python@pip-cache-key-py-ver
+        uses: actions/setup-python@v4
         with:
           python-version: ${{ matrix.python-version }}
           architecture: x64


@@ -21,9 +21,9 @@ jobs:
       python-version: [3.7, 3.8, 3.9, "3.10"]
       experimental: [false]
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
       - name: Set up Python ${{ matrix.python-version }}
-        uses: coqui-ai/setup-python@pip-cache-key-py-ver
+        uses: actions/setup-python@v4
         with:
           python-version: ${{ matrix.python-version }}
           architecture: x64


@@ -21,9 +21,9 @@ jobs:
       python-version: [3.7, 3.8, 3.9, "3.10"]
       experimental: [false]
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
      - name: Set up Python ${{ matrix.python-version }}
-        uses: coqui-ai/setup-python@pip-cache-key-py-ver
+        uses: actions/setup-python@v4
         with:
           python-version: ${{ matrix.python-version }}
           architecture: x64

.github/workflows/zoo_tests0.yml (new file)

@@ -0,0 +1,52 @@
name: zoo-tests-0
on:
push:
branches:
- main
pull_request:
types: [opened, synchronize, reopened]
jobs:
check_skip:
runs-on: ubuntu-latest
if: "! contains(github.event.head_commit.message, '[ci skip]')"
steps:
- run: echo "${{ github.event.head_commit.message }}"
test:
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
python-version: [3.7, 3.8, 3.9, "3.10"]
experimental: [false]
steps:
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
architecture: x64
cache: 'pip'
cache-dependency-path: 'requirements*'
- name: check OS
run: cat /etc/os-release
- name: Install dependencies
run: |
sudo apt-get update
sudo apt-get install -y git make gcc
sudo apt-get install espeak espeak-ng
make system-deps
- name: Install/upgrade Python setup deps
run: python3 -m pip install --upgrade pip setuptools wheel
- name: Replace scarf urls
run: |
sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json
- name: Install TTS
run: |
python3 -m pip install .[all]
python3 setup.py egg_info
- name: Unit tests
run: |
nose2 -F -v -B TTS tests.zoo_tests.test_models.test_models_offset_0_step_3
nose2 -F -v -B TTS tests.zoo_tests.test_models.test_voice_conversion


@@ -1,4 +1,4 @@
-name: zoo-tests
+name: zoo-tests-1
 on:
   push:
@@ -21,9 +21,9 @@ jobs:
       python-version: [3.7, 3.8, 3.9, "3.10"]
       experimental: [false]
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
       - name: Set up Python ${{ matrix.python-version }}
-        uses: coqui-ai/setup-python@pip-cache-key-py-ver
+        uses: actions/setup-python@v4
         with:
           python-version: ${{ matrix.python-version }}
           architecture: x64
@@ -47,4 +47,4 @@ jobs:
           python3 -m pip install .[all]
           python3 setup.py egg_info
       - name: Unit tests
-        run: make test_zoo
+        run: nose2 -F -v -B --with-coverage --coverage TTS tests.zoo_tests.test_models.test_models_offset_1_step_3

.github/workflows/zoo_tests2.yml (new file)

@@ -0,0 +1,50 @@
name: zoo-tests-2
on:
push:
branches:
- main
pull_request:
types: [opened, synchronize, reopened]
jobs:
check_skip:
runs-on: ubuntu-latest
if: "! contains(github.event.head_commit.message, '[ci skip]')"
steps:
- run: echo "${{ github.event.head_commit.message }}"
test:
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
python-version: [3.7, 3.8, 3.9, "3.10"]
experimental: [false]
steps:
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
architecture: x64
cache: 'pip'
cache-dependency-path: 'requirements*'
- name: check OS
run: cat /etc/os-release
- name: Install dependencies
run: |
sudo apt-get update
sudo apt-get install -y git make gcc
sudo apt-get install espeak espeak-ng
make system-deps
- name: Install/upgrade Python setup deps
run: python3 -m pip install --upgrade pip setuptools wheel
- name: Replace scarf urls
run: |
sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json
- name: Install TTS
run: |
python3 -m pip install .[all]
python3 setup.py egg_info
- name: Unit tests
run: nose2 -F -v -B --with-coverage --coverage TTS tests.zoo_tests.test_models.test_models_offset_2_step_3
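
For orientation, the three zoo-test workflows shard the released models across jobs by an offset and a step of 3 (test_models_offset_0_step_3, _1_, _2_). A minimal sketch of that kind of partitioning is below; the `shard()` helper and the shortened model list are illustrative, not the repository's actual test code.

```python
# Illustrative sketch of an offset/step shard over zoo model names.
# The names are entries added to TTS/.models.json in this release; the
# shard() helper is an assumption, not the project's test implementation.
MODELS = [
    "tts_models/bg/cv/vits",
    "tts_models/cs/cv/vits",
    "tts_models/da/cv/vits",
    "tts_models/et/cv/vits",
    "tts_models/ga/cv/vits",
    "tts_models/uk/mai/vits",
]

def shard(models, offset, step=3):
    """Every `step`-th model starting at `offset`, matching the 0/1/2 split above."""
    return models[offset::step]

for offset in range(3):
    print(f"offset {offset}:", shard(MODELS, offset))
```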


@@ -1,20 +1,12 @@
-ARG BASE=nvcr.io/nvidia/pytorch:22.03-py3
+ARG BASE=nvidia/cuda:11.8.0-base-ubuntu22.04
 FROM ${BASE}
-RUN apt-get update && apt-get install -y --no-install-recommends gcc g++ make python3 python3-dev python3-pip python3-venv python3-wheel espeak espeak-ng libsndfile1-dev && rm -rf /var/lib/apt/lists/*
-RUN pip install llvmlite --ignore-installed
-# Create and activate virtual env
-ENV VIRTUAL_ENV=/venv
-RUN python3 -m venv $VIRTUAL_ENV
-ENV PATH="$VIRTUAL_ENV/bin:$PATH"
-RUN pip install -U pip setuptools wheel
+RUN apt-get update && apt-get upgrade -y
+RUN apt-get install -y --no-install-recommends gcc g++ make python3 python3-dev python3-pip python3-venv python3-wheel espeak-ng libsndfile1-dev && rm -rf /var/lib/apt/lists/*
+RUN pip3 install llvmlite --ignore-installed
 WORKDIR /root
-COPY requirements.txt /root
-COPY requirements.dev.txt /root
-COPY requirements.notebooks.txt /root
-RUN ["/bin/bash", "-c", "pip install -r <(cat requirements.txt requirements.dev.txt requirements.notebooks.txt)"]
 COPY . /root
+RUN pip3 install torch torchaudio --extra-index-url https://download.pytorch.org/whl/cu118
 RUN make install
 ENTRYPOINT ["tts"]
 CMD ["--help"]


@@ -1,9 +1,16 @@
-# <img src="https://raw.githubusercontent.com/coqui-ai/TTS/main/images/coqui-log-green-TTS.png" height="56"/>
+<img src="https://raw.githubusercontent.com/coqui-ai/TTS/main/images/coqui-log-green-TTS.png" height="56"/>
+----
+### 📣 Clone your voice with a single click on [🐸Coqui.ai](https://app.coqui.ai/auth/signin)
+### 📣 🐸Coqui Studio is launching soon!! Join our [waiting list](https://coqui.ai/)!!
+----
 🐸TTS is a library for advanced Text-to-Speech generation. It's built on the latest research, was designed to achieve the best trade-off among ease-of-training, speed and quality.
 🐸TTS comes with pretrained models, tools for measuring dataset quality and already used in **20+ languages** for products and research projects.
-[![Gitter](https://badges.gitter.im/coqui-ai/TTS.svg)](https://gitter.im/coqui-ai/TTS?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge)
+[![Dicord](https://img.shields.io/discord/1037326658807533628?color=%239B59B6&label=chat%20on%20discord)](https://discord.gg/5eXr5seRrv)
 [![License](<https://img.shields.io/badge/License-MPL%202.0-brightgreen.svg>)](https://opensource.org/licenses/MPL-2.0)
 [![PyPI version](https://badge.fury.io/py/TTS.svg)](https://badge.fury.io/py/TTS)
 [![Covenant](https://camo.githubusercontent.com/7d620efaa3eac1c5b060ece5d6aacfcc8b81a74a04d05cd0398689c01c4463bb/68747470733a2f2f696d672e736869656c64732e696f2f62616467652f436f6e7472696275746f72253230436f76656e616e742d76322e3025323061646f707465642d6666363962342e737667)](https://github.com/coqui-ai/TTS/blob/master/CODE_OF_CONDUCT.md)
@@ -36,12 +43,12 @@ Please use our dedicated channels for questions and discussion. Help is much mor
 | ------------------------------- | --------------------------------------- |
 | 🚨 **Bug Reports** | [GitHub Issue Tracker] |
 | 🎁 **Feature Requests & Ideas** | [GitHub Issue Tracker] |
-| 👩‍💻 **Usage Questions** | [Github Discussions] |
+| 👩‍💻 **Usage Questions** | [GitHub Discussions] |
-| 🗯 **General Discussion** | [Github Discussions] or [Gitter Room] |
+| 🗯 **General Discussion** | [GitHub Discussions] or [Discord] |
 [github issue tracker]: https://github.com/coqui-ai/tts/issues
 [github discussions]: https://github.com/coqui-ai/TTS/discussions
-[gitter room]: https://gitter.im/coqui-ai/TTS?utm_source=share-link&utm_medium=link&utm_campaign=share-link
+[discord]: https://discord.gg/5eXr5seRrv
 [Tutorials and Examples]: https://github.com/coqui-ai/TTS/wiki/TTS-Notebooks-and-Tutorials
@@ -75,7 +82,7 @@ Underlined "TTS*" and "Judy*" are 🐸TTS models
 - Modular (but not too much) code base enabling easy implementation of new ideas.
 ## Implemented Models
-### Text-to-Spectrogram
+### Spectrogram models
 - Tacotron: [paper](https://arxiv.org/abs/1703.10135)
 - Tacotron2: [paper](https://arxiv.org/abs/1712.05884)
 - Glow-TTS: [paper](https://arxiv.org/abs/2005.11129)
@@ -83,9 +90,12 @@ Underlined "TTS*" and "Judy*" are 🐸TTS models
 - Align-TTS: [paper](https://arxiv.org/abs/2003.01950)
 - FastPitch: [paper](https://arxiv.org/pdf/2006.06873.pdf)
 - FastSpeech: [paper](https://arxiv.org/abs/1905.09263)
+- SC-GlowTTS: [paper](https://arxiv.org/abs/2104.05557)
+- Capacitron: [paper](https://arxiv.org/abs/1906.03402)
 ### End-to-End Models
 - VITS: [paper](https://arxiv.org/pdf/2106.06103)
+- YourTTS: [paper](https://arxiv.org/abs/2112.02418)
 ### Attention Methods
 - Guided Attention: [paper](https://arxiv.org/abs/1710.08969)
@@ -136,6 +146,21 @@ $ make install
 If you are on Windows, 👑@GuyPaddock wrote installation instructions [here](https://stackoverflow.com/questions/66726331/how-can-i-run-mozilla-tts-coqui-tts-training-with-cuda-on-a-windows-system).
+## Docker Image
+You can also try TTS without install with the docker image.
+Simply run the following command and you will be able to run TTS without installing it.
+```bash
+docker run --rm -it -p 5002:5002 --entrypoint /bin/bash ghcr.io/coqui-ai/tts-cpu
+python3 TTS/server/server.py --list_models #To get the list of available models
+python3 TTS/server/server.py --model_name tts_models/en/vctk/vits # To start a server
+```
+You can then enjoy the TTS server [here](http://[::1]:5002/)
+More details about the docker images (like GPU support) can be found [here](https://tts.readthedocs.io/en/latest/docker_images.html)
 ## Use TTS
 ### Single Speaker Models
@@ -208,7 +233,7 @@ If you are on Windows, 👑@GuyPaddock wrote installation instructions [here](ht
 - Run your own TTS and Vocoder models:
 ```
-$ tts --text "Text for TTS" --model_path path/to/config.json --config_path path/to/model.pth --out_path output/path/speech.wav
+$ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav
     --vocoder_path path/to/vocoder.pth --vocoder_config_path path/to/vocoder_config.json
 ```
@@ -229,7 +254,7 @@ If you are on Windows, 👑@GuyPaddock wrote installation instructions [here](ht
 - Run your own multi-speaker TTS model:
 ```
-$ tts --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/config.json --config_path path/to/model.pth --speakers_file_path path/to/speaker.json --speaker_idx <speaker_id>
+$ tts --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/model.pth --config_path path/to/config.json --speakers_file_path path/to/speaker.json --speaker_idx <speaker_id>
 ```
 ## Directory Structure
@@ -239,8 +264,6 @@ If you are on Windows, 👑@GuyPaddock wrote installation instructions [here](ht
 |- TTS
 |- bin/ (folder for all the executables.)
 |- train*.py (train your target model.)
-|- distribute.py (train your TTS model using Multiple GPUs.)
-|- compute_statistics.py (compute dataset statistics for normalization.)
 |- ...
 |- tts/ (text to speech models)
 |- layers/ (model layer definitions)


@@ -12,6 +12,61 @@
}
}
},
"bg": {
"cv": {
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--bg--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"cs": {
"cv": {
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--cs--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"da": {
"cv": {
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--da--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"et": {
"cv": {
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--et--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"ga": {
"cv": {
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--ga--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"en": {
"ek1": {
"tacotron2": {
@@ -79,6 +134,14 @@
"license": "apache 2.0",
"contact": "egolge@coqui.com"
},
"vits--neon": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--en--ljspeech--vits.zip",
"default_vocoder": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause",
"contact": null,
"commit": null
},
"fast_pitch": {
"description": "FastPitch model trained on LJSpeech using the Aligner Network",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--fast_pitch.zip",
@@ -130,10 +193,10 @@
"license": "apache 2.0",
"contact": "adamfroghyar@gmail.com"
},
-"capacitron-t2-c150": {
+"capacitron-t2-c150_v2": {
"description": "Capacitron additions to Tacotron 2 with Capacity at 150 as in https://arxiv.org/pdf/1906.03402.pdf",
-"github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.0_models/tts_models--en--blizzard2013--capacitron-t2-c150.zip",
+"github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.1_models/tts_models--en--blizzard2013--capacitron-t2-c150_v2.zip",
-"commit": "d6284e7",
+"commit": "a67039d",
"default_vocoder": "vocoder_models/en/blizzard2013/hifigan_v2",
"author": "Adam Froghyar @a-froghyar",
"license": "apache 2.0",
@@ -151,6 +214,15 @@
"license": "MPL",
"contact": "egolge@coqui.com"
}
},
"css10":{
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--es--css10--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"fr": {
@@ -158,11 +230,20 @@
"tacotron2-DDC": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--fr--mai--tacotron2-DDC.zip",
"default_vocoder": "vocoder_models/universal/libri-tts/fullband-melgan",
-"commit": "",
+"commit": null,
"author": "Eren Gölge @erogol",
"license": "MPL",
"contact": "egolge@coqui.com"
}
},
"css10":{
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--fr--css10--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"uk":{
@@ -174,6 +255,13 @@
"license": "MIT",
"contact": "",
"default_vocoder": "vocoder_models/uk/mai/multiband-melgan"
},
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--uk--mai--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
@@ -198,6 +286,15 @@
"stats_file": null,
"commit": "540d811"
}
},
"css10":{
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--nl--css10--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"de": {
@@ -224,6 +321,15 @@
"license": "apache 2.0",
"commit": "unknown"
}
},
"css10": {
"vits-neon":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--de--css10--vits.zip",
"default_vocoder": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause",
"commit": null
}
}
},
"ja": {
@@ -359,6 +465,149 @@
"commit": "1b22f03"
}
}
},
"hu": {
"css10": {
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--hu--css10--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"el": {
"cv": {
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--el--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"fi": {
"css10": {
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--fi--css10--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"hr": {
"cv": {
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--hr--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"lt": {
"cv": {
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--lt--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"lv": {
"cv": {
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--lv--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"mt": {
"cv": {
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--mt--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"pl": {
"mai_female": {
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--pl--mai_female--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"pt": {
"cv": {
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--pt--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"ro": {
"cv": {
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--ro--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"sk": {
"cv": {
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--sk--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"sl": {
"cv": {
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--sl--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"sv": {
"cv": {
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--sv--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
}
},
"vocoder_models": {

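The manifest above follows a `{model_type: {language: {dataset: {model: metadata}}}}` nesting. As a quick sanity check of which TTS models a given `.models.json` exposes, a small stdlib-only sketch (the relative file path is an assumption about where the script is run from):

```python
import json

# Assumes the script runs from the repository root, where the manifest lives at TTS/.models.json.
with open("TTS/.models.json", encoding="utf-8") as f:
    manifest = json.load(f)

# Walk the nested structure and print canonical model names plus their licenses.
for lang, datasets in manifest["tts_models"].items():
    for dataset, models in datasets.items():
        for model, meta in models.items():
            print(f"tts_models/{lang}/{dataset}/{model}", "|", meta.get("license", "unknown"))
```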

@@ -1 +1 @@
-0.8.0
+0.9.0


@@ -6,38 +6,87 @@ import torch
 from tqdm import tqdm
 from TTS.config import load_config
+from TTS.config.shared_configs import BaseDatasetConfig
 from TTS.tts.datasets import load_tts_samples
 from TTS.tts.utils.managers import save_file
 from TTS.tts.utils.speakers import SpeakerManager
 parser = argparse.ArgumentParser(
-    description="""Compute embedding vectors for each wav file in a dataset.\n\n"""
+    description="""Compute embedding vectors for each audio file in a dataset and store them keyed by `{dataset_name}#{file_path}` in a .pth file\n\n"""
     """
     Example runs:
-    python TTS/bin/compute_embeddings.py speaker_encoder_model.pth speaker_encoder_config.json dataset_config.json
+    python TTS/bin/compute_embeddings.py --model_path speaker_encoder_model.pth --config_path speaker_encoder_config.json --config_dataset_path dataset_config.json
+    python TTS/bin/compute_embeddings.py --model_path speaker_encoder_model.pth --config_path speaker_encoder_config.json --fomatter vctk --dataset_path /path/to/vctk/dataset --dataset_name my_vctk --metafile /path/to/vctk/metafile.csv
     """,
     formatter_class=RawTextHelpFormatter,
 )
-parser.add_argument("model_path", type=str, help="Path to model checkpoint file.")
-parser.add_argument("config_path", type=str, help="Path to model config file.")
-parser.add_argument("config_dataset_path", type=str, help="Path to dataset config file.")
+parser.add_argument(
+    "--model_path",
+    type=str,
+    help="Path to model checkpoint file. It defaults to the released speaker encoder.",
+    default="https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar",
+)
+parser.add_argument(
+    "--config_path",
+    type=str,
+    help="Path to model config file. It defaults to the released speaker encoder config.",
+    default="https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json",
+)
+parser.add_argument(
+    "--config_dataset_path",
+    type=str,
+    help="Path to dataset config file. You either need to provide this or `formatter_name`, `dataset_name` and `dataset_path` arguments.",
+    default=None,
+)
 parser.add_argument("--output_path", type=str, help="Path for output `pth` or `json` file.", default="speakers.pth")
 parser.add_argument("--old_file", type=str, help="Previous embedding file to only compute new audios.", default=None)
 parser.add_argument("--disable_cuda", type=bool, help="Flag to disable cuda.", default=False)
 parser.add_argument("--no_eval", type=bool, help="Do not compute eval?. Default False", default=False)
+parser.add_argument(
+    "--formatter_name",
+    type=str,
+    help="Name of the formatter to use. You either need to provide this or `config_dataset_path`",
+    default=None,
+)
+parser.add_argument(
+    "--dataset_name",
+    type=str,
+    help="Name of the dataset to use. You either need to provide this or `config_dataset_path`",
+    default=None,
+)
+parser.add_argument(
+    "--dataset_path",
+    type=str,
+    help="Path to the dataset. You either need to provide this or `config_dataset_path`",
+    default=None,
+)
+parser.add_argument(
+    "--metafile",
+    type=str,
+    help="Path to the meta file. If not set, dataset formatter uses the default metafile if it is defined in the formatter. You either need to provide this or `config_dataset_path`",
+    default=None,
+)
 args = parser.parse_args()
 use_cuda = torch.cuda.is_available() and not args.disable_cuda
-c_dataset = load_config(args.config_dataset_path)
-meta_data_train, meta_data_eval = load_tts_samples(c_dataset.datasets, eval_split=not args.no_eval)
+if args.config_dataset_path is not None:
+    c_dataset = load_config(args.config_dataset_path)
+    meta_data_train, meta_data_eval = load_tts_samples(c_dataset.datasets, eval_split=not args.no_eval)
+else:
+    c_dataset = BaseDatasetConfig()
+    c_dataset.formatter = args.formatter_name
+    c_dataset.dataset_name = args.dataset_name
+    c_dataset.path = args.dataset_path
+    c_dataset.meta_file_train = args.metafile if args.metafile else None
+    meta_data_train, meta_data_eval = load_tts_samples(c_dataset, eval_split=not args.no_eval)
 if meta_data_eval is None:
-    wav_files = meta_data_train
+    samples = meta_data_train
 else:
-    wav_files = meta_data_train + meta_data_eval
+    samples = meta_data_train + meta_data_eval
 encoder_manager = SpeakerManager(
     encoder_model_path=args.model_path,
@@ -50,25 +99,23 @@ class_name_key = encoder_manager.encoder_config.class_name_key
 # compute speaker embeddings
 speaker_mapping = {}
-for idx, wav_file in enumerate(tqdm(wav_files)):
-    if isinstance(wav_file, dict):
-        class_name = wav_file[class_name_key]
-        wav_file = wav_file["audio_file"]
-    else:
-        class_name = None
+for idx, fields in enumerate(tqdm(samples)):
+    class_name = fields[class_name_key]
+    audio_file = fields["audio_file"]
+    embedding_key = fields["audio_unique_name"]
+    root_path = fields["root_path"]
-    wav_file_name = os.path.basename(wav_file)
-    if args.old_file is not None and wav_file_name in encoder_manager.clip_ids:
+    if args.old_file is not None and embedding_key in encoder_manager.clip_ids:
         # get the embedding from the old file
-        embedd = encoder_manager.get_embedding_by_clip(wav_file_name)
+        embedd = encoder_manager.get_embedding_by_clip(embedding_key)
     else:
         # extract the embedding
-        embedd = encoder_manager.compute_embedding_from_clip(wav_file)
+        embedd = encoder_manager.compute_embedding_from_clip(audio_file)
     # create speaker_mapping if target dataset is defined
-    speaker_mapping[wav_file_name] = {}
-    speaker_mapping[wav_file_name]["name"] = class_name
-    speaker_mapping[wav_file_name]["embedding"] = embedd
+    speaker_mapping[embedding_key] = {}
+    speaker_mapping[embedding_key]["name"] = class_name
+    speaker_mapping[embedding_key]["embedding"] = embedd
 if speaker_mapping:
     # save speaker_mapping if target dataset is defined
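
When no dataset config file is given, the updated script builds the dataset description in code. A minimal sketch of the same path, with placeholder paths and the `vctk` formatter name taken from the example run in the help text:

```python
from TTS.config.shared_configs import BaseDatasetConfig
from TTS.tts.datasets import load_tts_samples

# Placeholders: point these at a real dataset before running.
c_dataset = BaseDatasetConfig()
c_dataset.formatter = "vctk"                 # formatter name from TTS.tts.datasets.formatters
c_dataset.dataset_name = "my_vctk"           # used to build `{dataset_name}#{file_path}` embedding keys
c_dataset.path = "/path/to/vctk/dataset"
c_dataset.meta_file_train = None             # fall back to the formatter's default metafile

meta_data_train, meta_data_eval = load_tts_samples(c_dataset, eval_split=True)
samples = meta_data_train + (meta_data_eval or [])
print(f"{len(samples)} samples ready for embedding computation")
```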


@@ -37,7 +37,7 @@ def setup_loader(ap, r, verbose=False):
         precompute_num_workers=0,
         use_noise_augment=False,
         verbose=verbose,
-        speaker_id_mapping=speaker_manager.ids if c.use_speaker_embedding else None,
+        speaker_id_mapping=speaker_manager.name_to_id if c.use_speaker_embedding else None,
         d_vector_mapping=speaker_manager.embeddings if c.use_d_vector_file else None,
     )


@@ -7,30 +7,25 @@ from tqdm.contrib.concurrent import process_map
 from TTS.config import load_config
 from TTS.tts.datasets import load_tts_samples
-from TTS.tts.utils.text.phonemizers.gruut_wrapper import Gruut
+from TTS.tts.utils.text.phonemizers import Gruut
-phonemizer = Gruut(language="en-us")
 def compute_phonemes(item):
-    try:
-        text = item[0]
-        ph = phonemizer.phonemize(text).split("|")
-    except:
-        return []
-    return list(set(ph))
+    text = item["text"]
+    ph = phonemizer.phonemize(text).replace("|", "")
+    return set(list(ph))
 def main():
     # pylint: disable=W0601
-    global c
+    global c, phonemizer
     # pylint: disable=bad-option-value
     parser = argparse.ArgumentParser(
         description="""Find all the unique characters or phonemes in a dataset.\n\n"""
         """
         Example runs:
-        python TTS/bin/find_unique_chars.py --config_path config.json
+        python TTS/bin/find_unique_phonemes.py --config_path config.json
         """,
         formatter_class=RawTextHelpFormatter,
     )
@@ -46,15 +41,24 @@ def main():
     items = train_items + eval_items
     print("Num items:", len(items))
-    is_lang_def = all(item["language"] for item in items)
+    language_list = [item["language"] for item in items]
+    is_lang_def = all(language_list)
     if not c.phoneme_language or not is_lang_def:
         raise ValueError("Phoneme language must be defined in config.")
+    if not language_list.count(language_list[0]) == len(language_list):
+        raise ValueError(
+            "Currently, just one phoneme language per config file is supported !! Please split the dataset config into different configs and run it individually for each language !!"
+        )
+    phonemizer = Gruut(language=language_list[0], keep_puncs=True)
     phonemes = process_map(compute_phonemes, items, max_workers=multiprocessing.cpu_count(), chunksize=15)
     phones = []
     for ph in phonemes:
         phones.extend(ph)
     phones = set(phones)
     lower_phones = filter(lambda c: c.islower(), phones)
     phones_force_lower = [c.lower() for c in phones]
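
A short sketch of how the reworked script drives the Gruut wrapper once the (single) dataset language is known; the sample sentence is arbitrary:

```python
from TTS.tts.utils.text.phonemizers import Gruut

# One phonemizer per config, built from the dataset language as in main().
phonemizer = Gruut(language="en-us", keep_puncs=True)

text = "A quick phoneme coverage check."
# compute_phonemes() strips the "|" separators and keeps the unique symbols.
phonemes = set(phonemizer.phonemize(text).replace("|", ""))
print(sorted(phonemes))
```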


@@ -17,7 +17,7 @@ def adjust_path_and_remove_silence(audio_path):
     # create all directory structure
     pathlib.Path(output_path).parent.mkdir(parents=True, exist_ok=True)
     # remove the silence and save the audio
-    output_path = remove_silence(
+    output_path, is_speech = remove_silence(
         model_and_utils,
         audio_path,
         output_path,
@@ -25,26 +25,34 @@ def adjust_path_and_remove_silence(audio_path):
         use_cuda=args.use_cuda,
     )
-    return output_path
+    return output_path, is_speech
 def preprocess_audios():
     files = sorted(glob.glob(os.path.join(args.input_dir, args.glob), recursive=True))
     print("> Number of files: ", len(files))
     if not args.force:
-        print("> Ignoring files that already exist in the output directory.")
+        print("> Ignoring files that already exist in the output idrectory.")
     if args.trim_just_beginning_and_end:
         print("> Trimming just the beginning and the end with nonspeech parts.")
     else:
         print("> Trimming all nonspeech parts.")
+    filtered_files = []
     if files:
         # create threads
         # num_threads = multiprocessing.cpu_count()
         # process_map(adjust_path_and_remove_silence, files, max_workers=num_threads, chunksize=15)
         for f in tqdm(files):
-            adjust_path_and_remove_silence(f)
+            output_path, is_speech = adjust_path_and_remove_silence(f)
+            if not is_speech:
+                filtered_files.append(output_path)
+        # write files that do not have speech
+        with open(os.path.join(args.output_dir, "filtered_files.txt"), "w", encoding="utf-8") as f:
+            for file in filtered_files:
+                f.write(file + "\n")
     else:
         print("> No files Found !")


@@ -238,6 +238,13 @@ If you don't specify any models, then it uses LJSpeech based English model.
         help="speaker ID of the reference_wav speaker (If not provided the embedding will be computed using the Speaker Encoder).",
         default=None,
     )
+    parser.add_argument(
+        "--progress_bar",
+        type=str2bool,
+        help="If true shows a progress bar for the model download. Defaults to True",
+        default=True,
+    )
     args = parser.parse_args()
     # print the description if either text or list_models is not set
@@ -255,7 +262,7 @@ If you don't specify any models, then it uses LJSpeech based English model.
     # load model manager
     path = Path(__file__).parent / "../.models.json"
-    manager = ModelManager(path)
+    manager = ModelManager(path, progress_bar=args.progress_bar)
     model_path = None
     config_path = None
@@ -323,7 +330,7 @@ If you don't specify any models, then it uses LJSpeech based English model.
         print(
             " > Available speaker ids: (Set --speaker_idx flag to one of these values to use the multi-speaker model."
         )
-        print(synthesizer.tts_model.speaker_manager.ids)
+        print(synthesizer.tts_model.speaker_manager.name_to_id)
         return
     # query langauge ids of a multi-lingual model.
@@ -331,7 +338,7 @@ If you don't specify any models, then it uses LJSpeech based English model.
         print(
             " > Available language ids: (Set --language_idx flag to one of these values to use the multi-lingual model."
         )
-        print(synthesizer.tts_model.language_manager.ids)
+        print(synthesizer.tts_model.language_manager.name_to_id)
         return
     # check the arguments against a multi-speaker model.
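
The new `--progress_bar` flag is simply forwarded to `ModelManager`, so the same switch can be used from Python. A hedged sketch, assuming `ModelManager` is imported from `TTS.utils.manage` as `synthesize.py` does:

```python
from pathlib import Path

from TTS.utils.manage import ModelManager  # assumption: same import used by TTS/bin/synthesize.py

# Silence the download progress bar, e.g. for CI logs.
manager = ModelManager(Path("TTS/.models.json"), progress_bar=False)
manager.list_models()
```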


@@ -62,7 +62,7 @@ class BaseAudioConfig(Coqpit):
             Maximum frequency of the F0 frames. Defaults to ```640```.
         pitch_fmin (float, optional):
-            Minimum frequency of the F0 frames. Defaults to ```0```.
+            Minimum frequency of the F0 frames. Defaults to ```1```.
         trim_db (int):
             Silence threshold used for silence trimming. Defaults to 45.
@@ -144,7 +144,7 @@ class BaseAudioConfig(Coqpit):
     do_amp_to_db_mel: bool = True
     # f0 params
     pitch_fmax: float = 640.0
-    pitch_fmin: float = 0.0
+    pitch_fmin: float = 1.0
     # normalization params
     signal_norm: bool = True
     min_level_db: int = -100
@@ -193,21 +193,24 @@ class BaseDatasetConfig(Coqpit):
     """Base config for TTS datasets.
     Args:
-        name (str):
-            Dataset name that defines the preprocessor in use. Defaults to None.
+        formatter (str):
+            Formatter name that defines used formatter in ```TTS.tts.datasets.formatter```. Defaults to `""`.
+        dataset_name (str):
+            Unique name for the dataset. Defaults to `""`.
         path (str):
-            Root path to the dataset files. Defaults to None.
+            Root path to the dataset files. Defaults to `""`.
         meta_file_train (str):
             Name of the dataset meta file. Or a list of speakers to be ignored at training for multi-speaker datasets.
-            Defaults to None.
+            Defaults to `""`.
         ignored_speakers (List):
             List of speakers IDs that are not used at the training. Default None.
         language (str):
-            Language code of the dataset. If defined, it overrides `phoneme_language`. Defaults to None.
+            Language code of the dataset. If defined, it overrides `phoneme_language`. Defaults to `""`.
         meta_file_val (str):
             Name of the dataset meta file that defines the instances used at validation.
@@ -217,7 +220,8 @@ class BaseDatasetConfig(Coqpit):
         train the duration predictor.
     """
-    name: str = ""
+    formatter: str = ""
+    dataset_name: str = ""
     path: str = ""
     meta_file_train: str = ""
     ignored_speakers: List[str] = None
@@ -230,7 +234,7 @@ class BaseDatasetConfig(Coqpit):
     ):
         """Check config fields"""
         c = asdict(self)
-        check_argument("name", c, restricted=True)
+        check_argument("formatter", c, restricted=True)
         check_argument("path", c, restricted=True)
         check_argument("meta_file_train", c, restricted=True)
         check_argument("meta_file_val", c, restricted=False)
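
In practice the rename means dataset configs are now declared as in the sketch below (all values are placeholders):

```python
from TTS.config.shared_configs import BaseDatasetConfig

dataset_config = BaseDatasetConfig(
    formatter="ljspeech",           # was `name="ljspeech"` before this release
    dataset_name="ljspeech",        # new: a unique identifier for the dataset itself
    path="/path/to/LJSpeech-1.1/",  # placeholder
    meta_file_train="metadata.csv",
    language="en-us",
)
print(dataset_config.formatter, dataset_config.dataset_name)
```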


@@ -107,11 +107,18 @@ class BaseEncoder(nn.Module):
         return criterion
     def load_checkpoint(
-        self, config: Coqpit, checkpoint_path: str, eval: bool = False, use_cuda: bool = False, criterion=None
+        self,
+        config: Coqpit,
+        checkpoint_path: str,
+        eval: bool = False,
+        use_cuda: bool = False,
+        criterion=None,
+        cache=False,
     ):
-        state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"))
+        state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
         try:
             self.load_state_dict(state["model"])
+            print(" > Model fully restored. ")
         except (KeyError, RuntimeError) as error:
             # If eval raise the error
             if eval:


@@ -44,13 +44,16 @@ class BaseTrainerModel(TrainerModel):
         return outputs_dict
     @abstractmethod
-    def load_checkpoint(self, config: Coqpit, checkpoint_path: str, eval: bool = False, strict: bool = True) -> None:
+    def load_checkpoint(
+        self, config: Coqpit, checkpoint_path: str, eval: bool = False, strict: bool = True, cache=False
+    ) -> None:
         """Load a model checkpoint gile and get ready for training or inference.
         Args:
             config (Coqpit): Model configuration.
             checkpoint_path (str): Path to the model checkpoint file.
             eval (bool, optional): If true, init model for inference else for training. Defaults to False.
-            strcit (bool, optional): Match all checkpoint keys to model's keys. Defaults to True.
+            strict (bool, optional): Match all checkpoint keys to model's keys. Defaults to True.
+            cache (bool, optional): If True, cache the file locally for subsequent calls. It is cached under `get_user_data_dir()/tts_cache`. Defaults to False.
         """
         ...
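
From the caller's side the new `cache` flag looks like the sketch below; `setup_model` and both paths are assumptions for illustration, not part of this diff:

```python
from TTS.config import load_config
from TTS.tts.models import setup_model  # assumption: model factory used elsewhere in TTS

config = load_config("/path/to/config.json")  # placeholder
model = setup_model(config)
# cache=True keeps a local copy under get_user_data_dir()/tts_cache for later calls.
model.load_checkpoint(config, "/path/to/model.pth", eval=True, cache=True)
```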


@@ -5,6 +5,7 @@ import json
 import os
 import sys
 from pathlib import Path
+from threading import Lock
 from typing import Union
 from flask import Flask, render_template, request, send_file
@@ -146,7 +147,7 @@ def index():
         "index.html",
         show_details=args.show_details,
         use_multi_speaker=use_multi_speaker,
-        speaker_ids=speaker_manager.ids if speaker_manager is not None else None,
+        speaker_ids=speaker_manager.name_to_id if speaker_manager is not None else None,
         use_gst=use_gst,
     )
@@ -168,17 +169,21 @@ def details():
     )
+lock = Lock()
 @app.route("/api/tts", methods=["GET"])
 def tts():
-    text = request.args.get("text")
-    speaker_idx = request.args.get("speaker_id", "")
-    style_wav = request.args.get("style_wav", "")
-    style_wav = style_wav_uri_to_dict(style_wav)
-    print(" > Model input: {}".format(text))
-    print(" > Speaker Idx: {}".format(speaker_idx))
-    wavs = synthesizer.tts(text, speaker_name=speaker_idx, style_wav=style_wav)
-    out = io.BytesIO()
-    synthesizer.save_wav(wavs, out)
+    with lock:
+        text = request.args.get("text")
+        speaker_idx = request.args.get("speaker_id", "")
+        style_wav = request.args.get("style_wav", "")
+        style_wav = style_wav_uri_to_dict(style_wav)
+        print(" > Model input: {}".format(text))
+        print(" > Speaker Idx: {}".format(speaker_idx))
+        wavs = synthesizer.tts(text, speaker_name=speaker_idx, style_wav=style_wav)
+        out = io.BytesIO()
+        synthesizer.save_wav(wavs, out)
     return send_file(out, mimetype="audio/wav")
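
With requests to `/api/tts` now serialized behind the lock, clients can keep issuing plain GET requests. A hedged example using `requests` against a locally running server on the default port 5002; the model name in the comment is a placeholder:

```python
import requests

# Assumes e.g.: python3 TTS/server/server.py --model_name tts_models/en/ljspeech/tacotron2-DDC
resp = requests.get(
    "http://localhost:5002/api/tts",
    params={"text": "Hello from the Coqui TTS server.", "speaker_id": "", "style_wav": ""},
    timeout=300,
)
resp.raise_for_status()
with open("speech.wav", "wb") as f:
    f.write(resp.content)  # the endpoint returns audio/wav bytes
```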


@@ -1,3 +1,4 @@
+import os
 import sys
 from collections import Counter
 from pathlib import Path
@@ -12,20 +13,16 @@ from TTS.tts.datasets.formatters import *
 def split_dataset(items, eval_split_max_size=None, eval_split_size=0.01):
     """Split a dataset into train and eval. Consider speaker distribution in multi-speaker training.
     Args:
-        <<<<<<< HEAD
         items (List[List]):
             A list of samples. Each sample is a list of `[audio_path, text, speaker_id]`.
         eval_split_max_size (int):
             Number maximum of samples to be used for evaluation in proportion split. Defaults to None (Disabled).
         eval_split_size (float):
             If between 0.0 and 1.0 represents the proportion of the dataset to include in the evaluation set.
             If > 1, represents the absolute number of evaluation samples. Defaults to 0.01 (1%).
-        =======
-        items (List[List]): A list of samples. Each sample is a list of `[text, audio_path, speaker_id]`.
-        >>>>>>> Fix docstring
     """
     speakers = [item["speaker_name"] for item in items]
     is_multi_speaker = len(set(speakers)) > 1
@@ -59,6 +56,17 @@ def split_dataset(items, eval_split_max_size=None, eval_split_size=0.01):
     return items[:eval_split_size], items[eval_split_size:]
+def add_extra_keys(metadata, language, dataset_name):
+    for item in metadata:
+        # add language name
+        item["language"] = language
+        # add unique audio name
+        relfilepath = os.path.splitext(os.path.relpath(item["audio_file"], item["root_path"]))[0]
+        audio_unique_name = f"{dataset_name}#{relfilepath}"
+        item["audio_unique_name"] = audio_unique_name
+    return metadata
 def load_tts_samples(
     datasets: Union[List[Dict], Dict],
     eval_split=True,
@@ -97,7 +105,8 @@ def load_tts_samples(
     if not isinstance(datasets, list):
         datasets = [datasets]
     for dataset in datasets:
-        name = dataset["name"]
+        formatter_name = dataset["formatter"]
+        dataset_name = dataset["dataset_name"]
         root_path = dataset["path"]
         meta_file_train = dataset["meta_file_train"]
         meta_file_val = dataset["meta_file_val"]
@@ -106,17 +115,19 @@ def load_tts_samples(
         # setup the right data processor
         if formatter is None:
-            formatter = _get_formatter_by_name(name)
+            formatter = _get_formatter_by_name(formatter_name)
         # load train set
         meta_data_train = formatter(root_path, meta_file_train, ignored_speakers=ignored_speakers)
-        meta_data_train = [{**item, **{"language": language}} for item in meta_data_train]
+        assert len(meta_data_train) > 0, f" [!] No training samples found in {root_path}/{meta_file_train}"
+        meta_data_train = add_extra_keys(meta_data_train, language, dataset_name)
         print(f" | > Found {len(meta_data_train)} files in {Path(root_path).resolve()}")
         # load evaluation split if set
         if eval_split:
             if meta_file_val:
                 meta_data_eval = formatter(root_path, meta_file_val, ignored_speakers=ignored_speakers)
-                meta_data_eval = [{**item, **{"language": language}} for item in meta_data_eval]
+                meta_data_eval = add_extra_keys(meta_data_eval, language, dataset_name)
             else:
                 meta_data_eval, meta_data_train = split_dataset(meta_data_train, eval_split_max_size, eval_split_size)
             meta_data_eval_all += meta_data_eval
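
The `audio_unique_name` introduced here is simply the dataset name plus the extension-less path of the audio file relative to the dataset root. A small self-contained sketch of the same computation (file names are illustrative):

```python
import os

def audio_unique_name(audio_file, root_path, dataset_name):
    # Mirrors add_extra_keys(): `{dataset_name}#{relative path without extension}`.
    relfilepath = os.path.splitext(os.path.relpath(audio_file, root_path))[0]
    return f"{dataset_name}#{relfilepath}"

print(audio_unique_name("/data/vctk/wav48/p225/p225_001.wav", "/data/vctk", "my_vctk"))
# -> my_vctk#wav48/p225/p225_001
```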


@ -1,3 +1,4 @@
import base64
import collections import collections
import os import os
import random import random
@ -34,6 +35,12 @@ def noise_augment_audio(wav):
return wav + (1.0 / 32768.0) * np.random.rand(*wav.shape) return wav + (1.0 / 32768.0) * np.random.rand(*wav.shape)
def string2filename(string):
# generate a safe and reversible filename based on a string
filename = base64.urlsafe_b64encode(string.encode("utf-8")).decode("utf-8", "ignore")
return filename
class TTSDataset(Dataset): class TTSDataset(Dataset):
def __init__( def __init__(
self, self,
@ -201,7 +208,7 @@ class TTSDataset(Dataset):
def get_f0(self, idx): def get_f0(self, idx):
out_dict = self.f0_dataset[idx] out_dict = self.f0_dataset[idx]
item = self.samples[idx] item = self.samples[idx]
assert item["audio_file"] == out_dict["audio_file"] assert item["audio_unique_name"] == out_dict["audio_unique_name"]
return out_dict return out_dict
@staticmethod @staticmethod
@ -256,6 +263,7 @@ class TTSDataset(Dataset):
"speaker_name": item["speaker_name"], "speaker_name": item["speaker_name"],
"language_name": item["language"], "language_name": item["language"],
"wav_file_name": os.path.basename(item["audio_file"]), "wav_file_name": os.path.basename(item["audio_file"]),
"audio_unique_name": item["audio_unique_name"],
} }
return sample return sample
@ -397,8 +405,8 @@ class TTSDataset(Dataset):
language_ids = None language_ids = None
# get pre-computed d-vectors # get pre-computed d-vectors
if self.d_vector_mapping is not None: if self.d_vector_mapping is not None:
wav_files_names = list(batch["wav_file_name"]) embedding_keys = list(batch["audio_unique_name"])
d_vectors = [self.d_vector_mapping[w]["embedding"] for w in wav_files_names] d_vectors = [self.d_vector_mapping[w]["embedding"] for w in embedding_keys]
else: else:
d_vectors = None d_vectors = None
@ -560,19 +568,18 @@ class PhonemeDataset(Dataset):
def __getitem__(self, index): def __getitem__(self, index):
item = self.samples[index] item = self.samples[index]
ids = self.compute_or_load(item["audio_file"], item["text"]) ids = self.compute_or_load(string2filename(item["audio_unique_name"]), item["text"])
ph_hat = self.tokenizer.ids_to_text(ids) ph_hat = self.tokenizer.ids_to_text(ids)
return {"text": item["text"], "ph_hat": ph_hat, "token_ids": ids, "token_ids_len": len(ids)} return {"text": item["text"], "ph_hat": ph_hat, "token_ids": ids, "token_ids_len": len(ids)}
def __len__(self): def __len__(self):
return len(self.samples) return len(self.samples)
def compute_or_load(self, wav_file, text): def compute_or_load(self, file_name, text):
"""Compute phonemes for the given text. """Compute phonemes for the given text.
If the phonemes are already cached, load them from cache. If the phonemes are already cached, load them from cache.
""" """
file_name = os.path.splitext(os.path.basename(wav_file))[0]
file_ext = "_phoneme.npy" file_ext = "_phoneme.npy"
cache_path = os.path.join(self.cache_path, file_name + file_ext) cache_path = os.path.join(self.cache_path, file_name + file_ext)
try: try:
@ -669,11 +676,11 @@ class F0Dataset:
def __getitem__(self, idx): def __getitem__(self, idx):
item = self.samples[idx] item = self.samples[idx]
f0 = self.compute_or_load(item["audio_file"]) f0 = self.compute_or_load(item["audio_file"], string2filename(item["audio_unique_name"]))
if self.normalize_f0: if self.normalize_f0:
assert self.mean is not None and self.std is not None, " [!] Mean and STD is not available" assert self.mean is not None and self.std is not None, " [!] Mean and STD is not available"
f0 = self.normalize(f0) f0 = self.normalize(f0)
return {"audio_file": item["audio_file"], "f0": f0} return {"audio_unique_name": item["audio_unique_name"], "f0": f0}
def __len__(self): def __len__(self):
return len(self.samples) return len(self.samples)
@ -705,8 +712,7 @@ class F0Dataset:
return self.pad_id return self.pad_id
@staticmethod @staticmethod
def create_pitch_file_path(wav_file, cache_path): def create_pitch_file_path(file_name, cache_path):
file_name = os.path.splitext(os.path.basename(wav_file))[0]
pitch_file = os.path.join(cache_path, file_name + "_pitch.npy") pitch_file = os.path.join(cache_path, file_name + "_pitch.npy")
return pitch_file return pitch_file
@ -744,11 +750,11 @@ class F0Dataset:
pitch[zero_idxs] = 0.0 pitch[zero_idxs] = 0.0
return pitch return pitch
def compute_or_load(self, wav_file): def compute_or_load(self, wav_file, audio_unique_name):
""" """
compute pitch and return a numpy array of pitch values compute pitch and return a numpy array of pitch values
""" """
pitch_file = self.create_pitch_file_path(wav_file, self.cache_path) pitch_file = self.create_pitch_file_path(audio_unique_name, self.cache_path)
if not os.path.exists(pitch_file): if not os.path.exists(pitch_file):
pitch = self._compute_and_save_pitch(self.ap, wav_file, pitch_file) pitch = self._compute_and_save_pitch(self.ap, wav_file, pitch_file)
else: else:
@ -756,14 +762,14 @@ class F0Dataset:
return pitch.astype(np.float32) return pitch.astype(np.float32)
def collate_fn(self, batch): def collate_fn(self, batch):
audio_file = [item["audio_file"] for item in batch] audio_unique_name = [item["audio_unique_name"] for item in batch]
f0s = [item["f0"] for item in batch] f0s = [item["f0"] for item in batch]
f0_lens = [len(item["f0"]) for item in batch] f0_lens = [len(item["f0"]) for item in batch]
f0_lens_max = max(f0_lens) f0_lens_max = max(f0_lens)
f0s_torch = torch.LongTensor(len(f0s), f0_lens_max).fill_(self.get_pad_id()) f0s_torch = torch.LongTensor(len(f0s), f0_lens_max).fill_(self.get_pad_id())
for i, f0_len in enumerate(f0_lens): for i, f0_len in enumerate(f0_lens):
f0s_torch[i, :f0_len] = torch.LongTensor(f0s[i]) f0s_torch[i, :f0_len] = torch.LongTensor(f0s[i])
return {"audio_file": audio_file, "f0": f0s_torch, "f0_lens": f0_lens} return {"audio_unique_name": audio_unique_name, "f0": f0s_torch, "f0_lens": f0_lens}
def print_logs(self, level: int = 0) -> None: def print_logs(self, level: int = 0) -> None:
indent = "\t" * level indent = "\t" * level

View File

@ -15,6 +15,15 @@ from tqdm import tqdm
def coqui(root_path, meta_file, ignored_speakers=None): def coqui(root_path, meta_file, ignored_speakers=None):
"""Interal dataset formatter.""" """Interal dataset formatter."""
filepath = os.path.join(root_path, meta_file)
# ensure there are 4 columns for every line
with open(filepath, "r", encoding="utf8") as f:
lines = f.readlines()
num_cols = len(lines[0].split("|")) # take the first row as reference
for idx, line in enumerate(lines[1:]):
if len(line.split("|")) != num_cols:
print(f" > Missing column in line {idx + 1} -> {line.strip()}")
# load metadata
metadata = pd.read_csv(os.path.join(root_path, meta_file), sep="|") metadata = pd.read_csv(os.path.join(root_path, meta_file), sep="|")
assert all(x in metadata.columns for x in ["audio_file", "text"]) assert all(x in metadata.columns for x in ["audio_file", "text"])
speaker_name = None if "speaker_name" in metadata.columns else "coqui" speaker_name = None if "speaker_name" in metadata.columns else "coqui"
@ -97,9 +106,9 @@ def mailabs(root_path, meta_files=None, ignored_speakers=None):
meta_files (str): list of meta files to be used in the training. If None, finds all the csv files meta_files (str): list of meta files to be used in the training. If None, finds all the csv files
recursively. Defaults to None recursively. Defaults to None
""" """
speaker_regex = re.compile("by_book/(male|female)/(?P<speaker_name>[^/]+)/") speaker_regex = re.compile(f"by_book{os.sep}(male|female){os.sep}(?P<speaker_name>[^{os.sep}]+){os.sep}")
if not meta_files: if not meta_files:
csv_files = glob(root_path + "/**/metadata.csv", recursive=True) csv_files = glob(root_path + f"{os.sep}**{os.sep}metadata.csv", recursive=True)
else: else:
csv_files = meta_files csv_files = meta_files
@ -578,3 +587,17 @@ def kokoro(root_path, meta_file, **kwargs): # pylint: disable=unused-argument
text = cols[2].replace(" ", "") text = cols[2].replace(" ", "")
items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path}) items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path})
return items return items
def kss(root_path, meta_file, **kwargs): # pylint: disable=unused-argument
"""Korean single-speaker dataset from https://www.kaggle.com/datasets/bryanpark/korean-single-speaker-speech-dataset"""
txt_file = os.path.join(root_path, meta_file)
items = []
speaker_name = "kss"
with open(txt_file, "r", encoding="utf-8") as ttf:
for line in ttf:
cols = line.split("|")
wav_file = os.path.join(root_path, cols[0])
text = cols[2] # cols[1] => 6월, cols[2] => 유월
items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name})
return items

View File

@ -398,9 +398,9 @@ class AlignTTS(BaseTTS):
logger.eval_audios(steps, audios, self.ap.sample_rate) logger.eval_audios(steps, audios, self.ap.sample_rate)
def load_checkpoint( def load_checkpoint(
self, config, checkpoint_path, eval=False self, config, checkpoint_path, eval=False, cache=False
): # pylint: disable=unused-argument, redefined-builtin ): # pylint: disable=unused-argument, redefined-builtin
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu")) state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
self.load_state_dict(state["model"]) self.load_state_dict(state["model"])
if eval: if eval:
self.eval() self.eval()

View File

@ -92,16 +92,17 @@ class BaseTacotron(BaseTTS):
pass pass
def load_checkpoint( def load_checkpoint(
self, config, checkpoint_path, eval=False self, config, checkpoint_path, eval=False, cache=False
): # pylint: disable=unused-argument, redefined-builtin ): # pylint: disable=unused-argument, redefined-builtin
"""Load model checkpoint and set up internals. """Load model checkpoint and set up internals.
Args: Args:
config (Coqpit): model configuration. config (Coqpit): model configuration.
checkpoint_path (str): path to checkpoint file. checkpoint_path (str): path to checkpoint file.
eval (bool): whether to load model for evaluation. eval (bool, optional): whether to load model for evaluation.
cache (bool, optional): If True, cache the file locally for subsequent calls. It is cached under `get_user_data_dir()/tts_cache`. Defaults to False.
""" """
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu")) state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
self.load_state_dict(state["model"]) self.load_state_dict(state["model"])
# TODO: set r in run-time by taking it from the new config # TODO: set r in run-time by taking it from the new config
if "r" in state: if "r" in state:

View File

@ -144,11 +144,11 @@ class BaseTTS(BaseTrainerModel):
if speaker_name is None: if speaker_name is None:
speaker_id = self.speaker_manager.get_random_id() speaker_id = self.speaker_manager.get_random_id()
else: else:
speaker_id = self.speaker_manager.ids[speaker_name] speaker_id = self.speaker_manager.name_to_id[speaker_name]
# get language id # get language id
if hasattr(self, "language_manager") and config.use_language_embedding and language_name is not None: if hasattr(self, "language_manager") and config.use_language_embedding and language_name is not None:
language_id = self.language_manager.ids[language_name] language_id = self.language_manager.name_to_id[language_name]
return { return {
"text": text, "text": text,
@ -288,11 +288,13 @@ class BaseTTS(BaseTrainerModel):
# setup multi-speaker attributes # setup multi-speaker attributes
if hasattr(self, "speaker_manager") and self.speaker_manager is not None: if hasattr(self, "speaker_manager") and self.speaker_manager is not None:
if hasattr(config, "model_args"): if hasattr(config, "model_args"):
speaker_id_mapping = self.speaker_manager.ids if config.model_args.use_speaker_embedding else None speaker_id_mapping = (
self.speaker_manager.name_to_id if config.model_args.use_speaker_embedding else None
)
d_vector_mapping = self.speaker_manager.embeddings if config.model_args.use_d_vector_file else None d_vector_mapping = self.speaker_manager.embeddings if config.model_args.use_d_vector_file else None
config.use_d_vector_file = config.model_args.use_d_vector_file config.use_d_vector_file = config.model_args.use_d_vector_file
else: else:
speaker_id_mapping = self.speaker_manager.ids if config.use_speaker_embedding else None speaker_id_mapping = self.speaker_manager.name_to_id if config.use_speaker_embedding else None
d_vector_mapping = self.speaker_manager.embeddings if config.use_d_vector_file else None d_vector_mapping = self.speaker_manager.embeddings if config.use_d_vector_file else None
else: else:
speaker_id_mapping = None speaker_id_mapping = None
@ -300,7 +302,7 @@ class BaseTTS(BaseTrainerModel):
# setup multi-lingual attributes # setup multi-lingual attributes
if hasattr(self, "language_manager") and self.language_manager is not None: if hasattr(self, "language_manager") and self.language_manager is not None:
language_id_mapping = self.language_manager.ids if self.args.use_language_embedding else None language_id_mapping = self.language_manager.name_to_id if self.args.use_language_embedding else None
else: else:
language_id_mapping = None language_id_mapping = None
@ -342,7 +344,7 @@ class BaseTTS(BaseTrainerModel):
loader = DataLoader( loader = DataLoader(
dataset, dataset,
batch_size=config.eval_batch_size if is_eval else config.batch_size, batch_size=config.eval_batch_size if is_eval else config.batch_size,
shuffle=False, # shuffle is done in the dataset. shuffle=True, # if there is no other sampler
collate_fn=dataset.collate_fn, collate_fn=dataset.collate_fn,
drop_last=False, # setting this False might cause issues in AMP training. drop_last=False, # setting this False might cause issues in AMP training.
sampler=sampler, sampler=sampler,
@ -363,7 +365,7 @@ class BaseTTS(BaseTrainerModel):
aux_inputs = { aux_inputs = {
"speaker_id": None "speaker_id": None
if not self.config.use_speaker_embedding if not self.config.use_speaker_embedding
else random.sample(sorted(self.speaker_manager.ids.values()), 1), else random.sample(sorted(self.speaker_manager.name_to_id.values()), 1),
"d_vector": d_vector, "d_vector": d_vector,
"style_wav": None, # TODO: handle GST style input "style_wav": None, # TODO: handle GST style input
} }

View File

@ -16,6 +16,7 @@ from TTS.tts.utils.helpers import average_over_durations, generate_path, maximum
from TTS.tts.utils.speakers import SpeakerManager from TTS.tts.utils.speakers import SpeakerManager
from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.tts.utils.visual import plot_alignment, plot_avg_pitch, plot_spectrogram from TTS.tts.utils.visual import plot_alignment, plot_avg_pitch, plot_spectrogram
from TTS.utils.io import load_fsspec
@dataclass @dataclass
@ -707,9 +708,9 @@ class ForwardTTS(BaseTTS):
logger.eval_audios(steps, audios, self.ap.sample_rate) logger.eval_audios(steps, audios, self.ap.sample_rate)
def load_checkpoint( def load_checkpoint(
self, config, checkpoint_path, eval=False self, config, checkpoint_path, eval=False, cache=False
): # pylint: disable=unused-argument, redefined-builtin ): # pylint: disable=unused-argument, redefined-builtin
state = torch.load(checkpoint_path, map_location=torch.device("cpu")) state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
self.load_state_dict(state["model"]) self.load_state_dict(state["model"])
if eval: if eval:
self.eval() self.eval()

View File

@ -284,6 +284,7 @@ class VitsDataset(TTSDataset):
"wav_file": wav_filename, "wav_file": wav_filename,
"speaker_name": item["speaker_name"], "speaker_name": item["speaker_name"],
"language_name": item["language"], "language_name": item["language"],
"audio_unique_name": item["audio_unique_name"],
} }
@property @property
@ -308,6 +309,7 @@ class VitsDataset(TTSDataset):
- language_names: :math:`[B]` - language_names: :math:`[B]`
- audiofile_paths: :math:`[B]` - audiofile_paths: :math:`[B]`
- raw_texts: :math:`[B]` - raw_texts: :math:`[B]`
- audio_unique_names: :math:`[B]`
""" """
# convert list of dicts to dict of lists # convert list of dicts to dict of lists
B = len(batch) B = len(batch)
@ -348,6 +350,7 @@ class VitsDataset(TTSDataset):
"language_names": batch["language_name"], "language_names": batch["language_name"],
"audio_files": batch["wav_file"], "audio_files": batch["wav_file"],
"raw_text": batch["raw_text"], "raw_text": batch["raw_text"],
"audio_unique_names": batch["audio_unique_name"],
} }
@ -718,6 +721,10 @@ class Vits(BaseTTS):
use_spectral_norm=self.args.use_spectral_norm_disriminator, use_spectral_norm=self.args.use_spectral_norm_disriminator,
) )
@property
def device(self):
return next(self.parameters()).device
def init_multispeaker(self, config: Coqpit): def init_multispeaker(self, config: Coqpit):
"""Initialize multi-speaker modules of a model. A model can be trained either with a speaker embedding layer """Initialize multi-speaker modules of a model. A model can be trained either with a speaker embedding layer
or with external `d_vectors` computed from a speaker encoder model. or with external `d_vectors` computed from a speaker encoder model.
@ -755,17 +762,12 @@ class Vits(BaseTTS):
if ( if (
hasattr(self.speaker_manager.encoder, "audio_config") hasattr(self.speaker_manager.encoder, "audio_config")
and self.config.audio["sample_rate"] != self.speaker_manager.encoder.audio_config["sample_rate"] and self.config.audio.sample_rate != self.speaker_manager.encoder.audio_config["sample_rate"]
): ):
self.audio_transform = torchaudio.transforms.Resample( self.audio_transform = torchaudio.transforms.Resample(
orig_freq=self.audio_config["sample_rate"], orig_freq=self.config.audio.sample_rate,
new_freq=self.speaker_manager.encoder.audio_config["sample_rate"], new_freq=self.speaker_manager.encoder.audio_config["sample_rate"],
) )
# pylint: disable=W0101,W0105
self.audio_transform = torchaudio.transforms.Resample(
orig_freq=self.config.audio.sample_rate,
new_freq=self.speaker_manager.encoder.audio_config["sample_rate"],
)
def _init_speaker_embedding(self): def _init_speaker_embedding(self):
# pylint: disable=attribute-defined-outside-init # pylint: disable=attribute-defined-outside-init
@ -808,6 +810,13 @@ class Vits(BaseTTS):
orig_freq=self.config.audio["sample_rate"], new_freq=self.args.encoder_sample_rate orig_freq=self.config.audio["sample_rate"], new_freq=self.args.encoder_sample_rate
) # pylint: disable=W0201 ) # pylint: disable=W0201
def on_epoch_start(self, trainer): # pylint: disable=W0613
"""Freeze layers at the beginning of an epoch"""
self._freeze_layers()
# set the device of speaker encoder
if self.args.use_speaker_encoder_as_loss:
self.speaker_manager.encoder = self.speaker_manager.encoder.to(self.device)
def on_init_end(self, trainer): # pylint: disable=W0613 def on_init_end(self, trainer): # pylint: disable=W0613
"""Reinit layes if needed""" """Reinit layes if needed"""
if self.args.reinit_DP: if self.args.reinit_DP:
@ -1185,7 +1194,6 @@ class Vits(BaseTTS):
y_lengths = torch.tensor([y.size(-1)]).to(y.device) y_lengths = torch.tensor([y.size(-1)]).to(y.device)
speaker_cond_src = reference_speaker_id if reference_speaker_id is not None else reference_d_vector speaker_cond_src = reference_speaker_id if reference_speaker_id is not None else reference_d_vector
speaker_cond_tgt = speaker_id if speaker_id is not None else d_vector speaker_cond_tgt = speaker_id if speaker_id is not None else d_vector
# print(y.shape, y_lengths.shape)
wav, _, _ = self.voice_conversion(y, y_lengths, speaker_cond_src, speaker_cond_tgt) wav, _, _ = self.voice_conversion(y, y_lengths, speaker_cond_src, speaker_cond_tgt)
return wav return wav
@ -1229,8 +1237,6 @@ class Vits(BaseTTS):
Tuple[Dict, Dict]: Model outputs and computed losses. Tuple[Dict, Dict]: Model outputs and computed losses.
""" """
self._freeze_layers()
spec_lens = batch["spec_lens"] spec_lens = batch["spec_lens"]
if optimizer_idx == 0: if optimizer_idx == 0:
@ -1402,11 +1408,11 @@ class Vits(BaseTTS):
if speaker_name is None: if speaker_name is None:
speaker_id = self.speaker_manager.get_random_id() speaker_id = self.speaker_manager.get_random_id()
else: else:
speaker_id = self.speaker_manager.ids[speaker_name] speaker_id = self.speaker_manager.name_to_id[speaker_name]
# get language id # get language id
if hasattr(self, "language_manager") and config.use_language_embedding and language_name is not None: if hasattr(self, "language_manager") and config.use_language_embedding and language_name is not None:
language_id = self.language_manager.ids[language_name] language_id = self.language_manager.name_to_id[language_name]
return { return {
"text": text, "text": text,
@ -1461,8 +1467,8 @@ class Vits(BaseTTS):
d_vectors = None d_vectors = None
# get numerical speaker ids from speaker names # get numerical speaker ids from speaker names
if self.speaker_manager is not None and self.speaker_manager.ids and self.args.use_speaker_embedding: if self.speaker_manager is not None and self.speaker_manager.name_to_id and self.args.use_speaker_embedding:
speaker_ids = [self.speaker_manager.ids[sn] for sn in batch["speaker_names"]] speaker_ids = [self.speaker_manager.name_to_id[sn] for sn in batch["speaker_names"]]
if speaker_ids is not None: if speaker_ids is not None:
speaker_ids = torch.LongTensor(speaker_ids) speaker_ids = torch.LongTensor(speaker_ids)
@ -1471,12 +1477,12 @@ class Vits(BaseTTS):
# get d_vectors from audio file names # get d_vectors from audio file names
if self.speaker_manager is not None and self.speaker_manager.embeddings and self.args.use_d_vector_file: if self.speaker_manager is not None and self.speaker_manager.embeddings and self.args.use_d_vector_file:
d_vector_mapping = self.speaker_manager.embeddings d_vector_mapping = self.speaker_manager.embeddings
d_vectors = [d_vector_mapping[w]["embedding"] for w in batch["audio_files"]] d_vectors = [d_vector_mapping[w]["embedding"] for w in batch["audio_unique_names"]]
d_vectors = torch.FloatTensor(d_vectors) d_vectors = torch.FloatTensor(d_vectors)
# get language ids from language names # get language ids from language names
if self.language_manager is not None and self.language_manager.ids and self.args.use_language_embedding: if self.language_manager is not None and self.language_manager.name_to_id and self.args.use_language_embedding:
language_ids = [self.language_manager.ids[ln] for ln in batch["language_names"]] language_ids = [self.language_manager.name_to_id[ln] for ln in batch["language_names"]]
if language_ids is not None: if language_ids is not None:
language_ids = torch.LongTensor(language_ids) language_ids = torch.LongTensor(language_ids)
@ -1680,14 +1686,10 @@ class Vits(BaseTTS):
return [VitsDiscriminatorLoss(self.config), VitsGeneratorLoss(self.config)] return [VitsDiscriminatorLoss(self.config), VitsGeneratorLoss(self.config)]
def load_checkpoint( def load_checkpoint(
self, self, config, checkpoint_path, eval=False, strict=True, cache=False
config,
checkpoint_path,
eval=False,
strict=True,
): # pylint: disable=unused-argument, redefined-builtin ): # pylint: disable=unused-argument, redefined-builtin
"""Load the model checkpoint and setup for training or inference""" """Load the model checkpoint and setup for training or inference"""
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu")) state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
# compat band-aid for the pre-trained models to not use the encoder baked into the model # compat band-aid for the pre-trained models to not use the encoder baked into the model
# TODO: consider baking the speaker encoder into the model and call it from there. # TODO: consider baking the speaker encoder into the model and call it from there.
# as it is probably easier for model distribution. # as it is probably easier for model distribution.

View File

@ -37,11 +37,11 @@ class LanguageManager(BaseIDManager):
@property @property
def num_languages(self) -> int: def num_languages(self) -> int:
return len(list(self.ids.keys())) return len(list(self.name_to_id.keys()))
@property @property
def language_names(self) -> List: def language_names(self) -> List:
return list(self.ids.keys()) return list(self.name_to_id.keys())
@staticmethod @staticmethod
def parse_language_ids_from_config(c: Coqpit) -> Dict: def parse_language_ids_from_config(c: Coqpit) -> Dict:
@ -67,7 +67,7 @@ class LanguageManager(BaseIDManager):
Args: Args:
c (Coqpit): Config. c (Coqpit): Config.
""" """
self.ids = self.parse_language_ids_from_config(c) self.name_to_id = self.parse_language_ids_from_config(c)
@staticmethod @staticmethod
def parse_ids_from_data(items: List, parse_key: str) -> Any: def parse_ids_from_data(items: List, parse_key: str) -> Any:
@ -82,7 +82,7 @@ class LanguageManager(BaseIDManager):
Args: Args:
file_path (str): Path to the output file. file_path (str): Path to the output file.
""" """
self._save_json(file_path, self.ids) self._save_json(file_path, self.name_to_id)
@staticmethod @staticmethod
def init_from_config(config: Coqpit) -> "LanguageManager": def init_from_config(config: Coqpit) -> "LanguageManager":

View File

@ -39,7 +39,7 @@ class BaseIDManager:
""" """
def __init__(self, id_file_path: str = ""): def __init__(self, id_file_path: str = ""):
self.ids = {} self.name_to_id = {}
if id_file_path: if id_file_path:
self.load_ids_from_file(id_file_path) self.load_ids_from_file(id_file_path)
@ -60,7 +60,7 @@ class BaseIDManager:
Args: Args:
items (List): Data sampled returned by `load_tts_samples()`. items (List): Data sampled returned by `load_tts_samples()`.
""" """
self.ids = self.parse_ids_from_data(items, parse_key=parse_key) self.name_to_id = self.parse_ids_from_data(items, parse_key=parse_key)
def load_ids_from_file(self, file_path: str) -> None: def load_ids_from_file(self, file_path: str) -> None:
"""Set IDs from a file. """Set IDs from a file.
@ -68,7 +68,7 @@ class BaseIDManager:
Args: Args:
file_path (str): Path to the file. file_path (str): Path to the file.
""" """
self.ids = load_file(file_path) self.name_to_id = load_file(file_path)
def save_ids_to_file(self, file_path: str) -> None: def save_ids_to_file(self, file_path: str) -> None:
"""Save IDs to a json file. """Save IDs to a json file.
@ -76,7 +76,7 @@ class BaseIDManager:
Args: Args:
file_path (str): Path to the output file. file_path (str): Path to the output file.
""" """
save_file(self.ids, file_path) save_file(self.name_to_id, file_path)
def get_random_id(self) -> Any: def get_random_id(self) -> Any:
"""Get a random embedding. """Get a random embedding.
@ -86,8 +86,8 @@ class BaseIDManager:
Returns: Returns:
np.ndarray: embedding. np.ndarray: embedding.
""" """
if self.ids: if self.name_to_id:
return self.ids[random.choices(list(self.ids.keys()))[0]] return self.name_to_id[random.choices(list(self.name_to_id.keys()))[0]]
return None return None
@ -109,11 +109,27 @@ class BaseIDManager:
class EmbeddingManager(BaseIDManager): class EmbeddingManager(BaseIDManager):
"""Base `Embedding` Manager class. Every new `Embedding` manager must inherit this. """Base `Embedding` Manager class. Every new `Embedding` manager must inherit this.
It defines common `Embedding` manager specific functions. It defines common `Embedding` manager specific functions.
It expects embeddings files in the following format:
::
{
'audio_file_key':{
'name': 'category_name',
'embedding': [<embedding_values>]
},
...
}
`audio_file_key` is a unique key for the audio file in the dataset. It can be the file path or any other unique key.
`embedding` is the embedding vector of the audio file.
`name` can be the name of the speaker of the audio file.
""" """
def __init__( def __init__(
self, self,
embedding_file_path: str = "", embedding_file_path: Union[str, List[str]] = "",
id_file_path: str = "", id_file_path: str = "",
encoder_model_path: str = "", encoder_model_path: str = "",
encoder_config_path: str = "", encoder_config_path: str = "",
@ -129,11 +145,24 @@ class EmbeddingManager(BaseIDManager):
self.use_cuda = use_cuda self.use_cuda = use_cuda
if embedding_file_path: if embedding_file_path:
self.load_embeddings_from_file(embedding_file_path) if isinstance(embedding_file_path, list):
self.load_embeddings_from_list_of_files(embedding_file_path)
else:
self.load_embeddings_from_file(embedding_file_path)
if encoder_model_path and encoder_config_path: if encoder_model_path and encoder_config_path:
self.init_encoder(encoder_model_path, encoder_config_path, use_cuda) self.init_encoder(encoder_model_path, encoder_config_path, use_cuda)
@property
def num_embeddings(self):
"""Get number of embeddings."""
return len(self.embeddings)
@property
def num_names(self):
"""Get number of embeddings."""
return len(self.embeddings_by_names)
@property @property
def embedding_dim(self): def embedding_dim(self):
"""Dimensionality of embeddings. If embeddings are not loaded, returns zero.""" """Dimensionality of embeddings. If embeddings are not loaded, returns zero."""
@ -141,6 +170,11 @@ class EmbeddingManager(BaseIDManager):
return len(self.embeddings[list(self.embeddings.keys())[0]]["embedding"]) return len(self.embeddings[list(self.embeddings.keys())[0]]["embedding"])
return 0 return 0
@property
def embedding_names(self):
"""Get embedding names."""
return list(self.embeddings_by_names.keys())
def save_embeddings_to_file(self, file_path: str) -> None: def save_embeddings_to_file(self, file_path: str) -> None:
"""Save embeddings to a json file. """Save embeddings to a json file.
@ -149,20 +183,57 @@ class EmbeddingManager(BaseIDManager):
""" """
save_file(self.embeddings, file_path) save_file(self.embeddings, file_path)
@staticmethod
def read_embeddings_from_file(file_path: str):
"""Load embeddings from a json file.
Args:
file_path (str): Path to the file.
"""
embeddings = load_file(file_path)
speakers = sorted({x["name"] for x in embeddings.values()})
name_to_id = {name: i for i, name in enumerate(speakers)}
clip_ids = list(set(sorted(clip_name for clip_name in embeddings.keys())))
# cache embeddings_by_names for fast inference using a bigger speakers.json
embeddings_by_names = {}
for x in embeddings.values():
if x["name"] not in embeddings_by_names.keys():
embeddings_by_names[x["name"]] = [x["embedding"]]
else:
embeddings_by_names[x["name"]].append(x["embedding"])
return name_to_id, clip_ids, embeddings, embeddings_by_names
def load_embeddings_from_file(self, file_path: str) -> None: def load_embeddings_from_file(self, file_path: str) -> None:
"""Load embeddings from a json file. """Load embeddings from a json file.
Args: Args:
file_path (str): Path to the target json file. file_path (str): Path to the target json file.
""" """
self.embeddings = load_file(file_path) self.name_to_id, self.clip_ids, self.embeddings, self.embeddings_by_names = self.read_embeddings_from_file(
file_path
)
speakers = sorted({x["name"] for x in self.embeddings.values()}) def load_embeddings_from_list_of_files(self, file_paths: List[str]) -> None:
self.ids = {name: i for i, name in enumerate(speakers)} """Load embeddings from a list of json files and don't allow duplicate keys.
self.clip_ids = list(set(sorted(clip_name for clip_name in self.embeddings.keys()))) Args:
# cache embeddings_by_names for fast inference using a bigger speakers.json file_paths (List[str]): List of paths to the target json files.
self.embeddings_by_names = self.get_embeddings_by_names() """
self.name_to_id = {}
self.clip_ids = []
self.embeddings_by_names = {}
self.embeddings = {}
for file_path in file_paths:
ids, clip_ids, embeddings, embeddings_by_names = self.read_embeddings_from_file(file_path)
# check colliding keys
duplicates = set(self.embeddings.keys()) & set(embeddings.keys())
if duplicates:
raise ValueError(f" [!] Duplicate embedding names <{duplicates}> in {file_path}")
# store values
self.name_to_id.update(ids)
self.clip_ids.extend(clip_ids)
self.embeddings_by_names.update(embeddings_by_names)
self.embeddings.update(embeddings)
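A minimal usage sketch of the multi-file loading added above; the file names are placeholders and the import path is assumed from this module's location:

```python
from TTS.tts.utils.managers import EmbeddingManager  # assumed module path

# Each JSON maps an audio key to {"name": <speaker_name>, "embedding": [<floats>]}.
manager = EmbeddingManager(embedding_file_path=["speakers_train.json", "speakers_extra.json"])
print(manager.num_embeddings, manager.num_names, manager.embedding_dim)
# A key that appears in both files raises:
#   ValueError:  [!] Duplicate embedding names <...> in speakers_extra.json
```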
def get_embedding_by_clip(self, clip_idx: str) -> List: def get_embedding_by_clip(self, clip_idx: str) -> List:
"""Get embedding by clip ID. """Get embedding by clip ID.

View File

@ -73,14 +73,14 @@ class SpeakerManager(EmbeddingManager):
@property @property
def num_speakers(self): def num_speakers(self):
return len(self.ids) return len(self.name_to_id)
@property @property
def speaker_names(self): def speaker_names(self):
return list(self.ids.keys()) return list(self.name_to_id.keys())
def get_speakers(self) -> List: def get_speakers(self) -> List:
return self.ids return self.name_to_id
@staticmethod @staticmethod
def init_from_config(config: "Coqpit", samples: Union[List[List], List[Dict]] = None) -> "SpeakerManager": def init_from_config(config: "Coqpit", samples: Union[List[List], List[Dict]] = None) -> "SpeakerManager":
@ -182,10 +182,10 @@ def get_speaker_manager(c: Coqpit, data: List = None, restore_path: str = None,
speaker_manager.load_embeddings_from_file(c.d_vector_file) speaker_manager.load_embeddings_from_file(c.d_vector_file)
speaker_manager.load_embeddings_from_file(speakers_file) speaker_manager.load_embeddings_from_file(speakers_file)
elif not c.use_d_vector_file: # restore speaker manager with speaker ID file. elif not c.use_d_vector_file: # restore speaker manager with speaker ID file.
speaker_ids_from_data = speaker_manager.ids speaker_ids_from_data = speaker_manager.name_to_id
speaker_manager.load_ids_from_file(speakers_file) speaker_manager.load_ids_from_file(speakers_file)
assert all( assert all(
speaker in speaker_manager.ids for speaker in speaker_ids_from_data speaker in speaker_manager.name_to_id for speaker in speaker_ids_from_data
), " [!] You cannot introduce new speakers to a pre-trained model." ), " [!] You cannot introduce new speakers to a pre-trained model."
elif c.use_d_vector_file and c.d_vector_file: elif c.use_d_vector_file and c.d_vector_file:
# new speaker manager with external speaker embeddings. # new speaker manager with external speaker embeddings.
@ -199,7 +199,7 @@ def get_speaker_manager(c: Coqpit, data: List = None, restore_path: str = None,
if speaker_manager.num_speakers > 0: if speaker_manager.num_speakers > 0:
print( print(
" > Speaker manager is loaded with {} speakers: {}".format( " > Speaker manager is loaded with {} speakers: {}".format(
speaker_manager.num_speakers, ", ".join(speaker_manager.ids) speaker_manager.num_speakers, ", ".join(speaker_manager.name_to_id)
) )
) )

View File

@ -295,7 +295,12 @@ def transfer_voice(
reference_d_vector = embedding_to_torch(reference_d_vector, cuda=use_cuda) reference_d_vector = embedding_to_torch(reference_d_vector, cuda=use_cuda)
# load reference_wav audio # load reference_wav audio
reference_wav = embedding_to_torch(model.ap.load_wav(reference_wav, sr=model.ap.sample_rate), cuda=use_cuda) reference_wav = embedding_to_torch(
model.ap.load_wav(
reference_wav, sr=model.args.encoder_sample_rate if model.args.encoder_sample_rate else model.ap.sample_rate
),
cuda=use_cuda,
)
if hasattr(model, "module"): if hasattr(model, "module"):
_func = model.module.inference_voice_conversion _func = model.module.inference_voice_conversion

View File

View File

@ -0,0 +1,44 @@
# coding: utf-8
# Add the word you want to the dictionary.
etc_dictionary = {"1+1": "원플러스원", "2+1": "투플러스원"}
english_dictionary = {
"KOREA": "코리아",
"IDOL": "아이돌",
"IT": "아이티",
"IQ": "아이큐",
"UP": "",
"DOWN": "다운",
"PC": "피씨",
"CCTV": "씨씨티비",
"SNS": "에스엔에스",
"AI": "에이아이",
"CEO": "씨이오",
"A": "에이",
"B": "",
"C": "",
"D": "",
"E": "",
"F": "에프",
"G": "",
"H": "에이치",
"I": "아이",
"J": "제이",
"K": "케이",
"L": "",
"M": "",
"N": "",
"O": "",
"P": "",
"Q": "",
"R": "",
"S": "에스",
"T": "",
"U": "",
"V": "브이",
"W": "더블유",
"X": "엑스",
"Y": "와이",
"Z": "제트",
}

View File

@ -0,0 +1,32 @@
# coding: utf-8
# Code based on https://github.com/carpedm20/multi-speaker-tacotron-tensorflow/blob/master/text/korean.py
import re
from TTS.tts.utils.text.korean.ko_dictionary import english_dictionary, etc_dictionary
def normalize(text):
text = text.strip()
text = re.sub("[⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]", "", text)
text = normalize_with_dictionary(text, etc_dictionary)
text = normalize_english(text)
text = text.lower()
return text
def normalize_with_dictionary(text, dic):
if any(key in text for key in dic.keys()):
pattern = re.compile("|".join(re.escape(key) for key in dic.keys()))
return pattern.sub(lambda x: dic[x.group()], text)
return text
def normalize_english(text):
def fn(m):
word = m.group()
if word in english_dictionary:
return english_dictionary.get(word)
return word
text = re.sub("([A-Za-z]+)", fn, text)
return text
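A small usage sketch of the normalizer; the expected outputs follow directly from the dictionaries imported above:

```python
from TTS.tts.utils.text.korean.korean import normalize  # same import used by the phonemizer

print(normalize("1+1 이벤트"))  # -> "원플러스원 이벤트" via etc_dictionary
print(normalize("IT 강국"))     # -> "아이티 강국" via english_dictionary
```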

View File

@ -0,0 +1,36 @@
from jamo import hangul_to_jamo
from TTS.tts.utils.text.korean.korean import normalize
g2p = None
def korean_text_to_phonemes(text, character: str = "hangeul") -> str:
"""
The input and output values look the same, but they are different in Unicode.
example :
input = '하늘' (Unicode : \ud558\ub298), (하 + 늘)
output = '하늘' (Unicode :\u1112\u1161\u1102\u1173\u11af), (ᄒ + ᅡ + ᄂ + ᅳ + ᆯ)
"""
global g2p # pylint: disable=global-statement
if g2p is None:
from g2pkk import G2p
g2p = G2p()
if character == "english":
from anyascii import anyascii
text = normalize(text)
text = g2p(text)
text = anyascii(text)
return text
text = normalize(text)
text = g2p(text)
text = list(hangul_to_jamo(text)) # '하늘' --> ['ᄒ', 'ᅡ', 'ᄂ', 'ᅳ', 'ᆯ']
return "".join(text)

View File

@ -2,6 +2,7 @@ from TTS.tts.utils.text.phonemizers.base import BasePhonemizer
from TTS.tts.utils.text.phonemizers.espeak_wrapper import ESpeak from TTS.tts.utils.text.phonemizers.espeak_wrapper import ESpeak
from TTS.tts.utils.text.phonemizers.gruut_wrapper import Gruut from TTS.tts.utils.text.phonemizers.gruut_wrapper import Gruut
from TTS.tts.utils.text.phonemizers.ja_jp_phonemizer import JA_JP_Phonemizer from TTS.tts.utils.text.phonemizers.ja_jp_phonemizer import JA_JP_Phonemizer
from TTS.tts.utils.text.phonemizers.ko_kr_phonemizer import KO_KR_Phonemizer
from TTS.tts.utils.text.phonemizers.zh_cn_phonemizer import ZH_CN_Phonemizer from TTS.tts.utils.text.phonemizers.zh_cn_phonemizer import ZH_CN_Phonemizer
PHONEMIZERS = {b.name(): b for b in (ESpeak, Gruut, JA_JP_Phonemizer)} PHONEMIZERS = {b.name(): b for b in (ESpeak, Gruut, JA_JP_Phonemizer)}
@ -26,6 +27,7 @@ DEF_LANG_TO_PHONEMIZER.update(_new_dict)
DEF_LANG_TO_PHONEMIZER["en"] = DEF_LANG_TO_PHONEMIZER["en-us"] DEF_LANG_TO_PHONEMIZER["en"] = DEF_LANG_TO_PHONEMIZER["en-us"]
DEF_LANG_TO_PHONEMIZER["ja-jp"] = JA_JP_Phonemizer.name() DEF_LANG_TO_PHONEMIZER["ja-jp"] = JA_JP_Phonemizer.name()
DEF_LANG_TO_PHONEMIZER["zh-cn"] = ZH_CN_Phonemizer.name() DEF_LANG_TO_PHONEMIZER["zh-cn"] = ZH_CN_Phonemizer.name()
DEF_LANG_TO_PHONEMIZER["ko-kr"] = KO_KR_Phonemizer.name()
def get_phonemizer_by_name(name: str, **kwargs) -> BasePhonemizer: def get_phonemizer_by_name(name: str, **kwargs) -> BasePhonemizer:
@ -46,6 +48,8 @@ def get_phonemizer_by_name(name: str, **kwargs) -> BasePhonemizer:
return ZH_CN_Phonemizer(**kwargs) return ZH_CN_Phonemizer(**kwargs)
if name == "ja_jp_phonemizer": if name == "ja_jp_phonemizer":
return JA_JP_Phonemizer(**kwargs) return JA_JP_Phonemizer(**kwargs)
if name == "ko_kr_phonemizer":
return KO_KR_Phonemizer(**kwargs)
raise ValueError(f"Phonemizer {name} not found") raise ValueError(f"Phonemizer {name} not found")

View File

@ -94,6 +94,8 @@ class ESpeak(BasePhonemizer):
# band-aid for backwards compatibility # band-aid for backwards compatibility
if language == "en": if language == "en":
language = "en-us" language = "en-us"
if language == "zh-cn":
language = "cmn"
super().__init__(language, punctuations=punctuations, keep_puncs=keep_puncs) super().__init__(language, punctuations=punctuations, keep_puncs=keep_puncs)
if backend is not None: if backend is not None:

View File

@ -0,0 +1,65 @@
from typing import Dict
from TTS.tts.utils.text.korean.phonemizer import korean_text_to_phonemes
from TTS.tts.utils.text.phonemizers.base import BasePhonemizer
_DEF_KO_PUNCS = "、.,[]()?!〽~『』「」【】"
class KO_KR_Phonemizer(BasePhonemizer):
"""🐸TTS ko_kr_phonemizer using functions in `TTS.tts.utils.text.korean.phonemizer`
TODO: Add Korean to character (ᄀᄁᄂᄃᄄᄅᄆᄇᄈᄉᄊᄋᄌᄍᄎᄏᄐᄑ하ᅢᅣᅤᅥᅦᅧᅨᅩᅪᅫᅬᅭᅮᅯᅰᅱᅲᅳᅴᅵᆨᆩᆪᆫᆬᆭᆮᆯᆰᆱᆲᆳᆴᆵᆶᆷᆸᆹᆺᆻᆼᆽᆾᆿᇀᇁᇂ)
Example:
>>> from TTS.tts.utils.text.phonemizers import KO_KR_Phonemizer
>>> phonemizer = KO_KR_Phonemizer()
>>> phonemizer.phonemize("이 문장은 음성합성 테스트를 위한 문장입니다.", separator="|")
'ᄋ|ᅵ| |ᄆ|ᅮ|ᆫ|ᄌ|ᅡ|ᆼ|ᄋ|ᅳ| |ᄂ|ᅳ|ᆷ|ᄉ|ᅥ|ᆼ|ᄒ|ᅡ|ᆸ|ᄊ|ᅥ|ᆼ| |ᄐ|ᅦ|ᄉ|ᅳ|ᄐ|ᅳ|ᄅ|ᅳ| |ᄅ|ᅱ|ᄒ|ᅡ|ᆫ| |ᄆ|ᅮ|ᆫ|ᄌ|ᅡ|ᆼ|ᄋ|ᅵ|ᆷ|ᄂ|ᅵ|ᄃ|ᅡ|.'
>>> from TTS.tts.utils.text.phonemizers import KO_KR_Phonemizer
>>> phonemizer = KO_KR_Phonemizer()
>>> phonemizer.phonemize("이 문장은 음성합성 테스트를 위한 문장입니다.", separator="|", character='english')
'I| |M|u|n|J|a|n|g|E|u| |N|e|u|m|S|e|o|n|g|H|a|b|S|s|e|o|n|g| |T|e|S|e|u|T|e|u|L|e|u| |L|w|i|H|a|n| |M|u|n|J|a|n|g|I|m|N|i|D|a|.'
"""
language = "ko-kr"
def __init__(self, punctuations=_DEF_KO_PUNCS, keep_puncs=True, **kwargs): # pylint: disable=unused-argument
super().__init__(self.language, punctuations=punctuations, keep_puncs=keep_puncs)
@staticmethod
def name():
return "ko_kr_phonemizer"
def _phonemize(self, text: str, separator: str = "", character: str = "hangeul") -> str:
ph = korean_text_to_phonemes(text, character=character)
if separator is not None or separator != "":
return separator.join(ph)
return ph
def phonemize(self, text: str, separator: str = "", character: str = "hangeul") -> str:
return self._phonemize(text, separator, character)
@staticmethod
def supported_languages() -> Dict:
return {"ko-kr": "hangeul(korean)"}
def version(self) -> str:
return "0.0.2"
def is_available(self) -> bool:
return True
if __name__ == "__main__":
texts = "이 문장은 음성합성 테스트를 위한 문장입니다."
e = KO_KR_Phonemizer()
print(e.supported_languages())
print(e.version())
print(e.language)
print(e.name())
print(e.is_available())
print(e.phonemize(texts))

View File

@ -42,7 +42,7 @@ class ZH_CN_Phonemizer(BasePhonemizer):
@staticmethod @staticmethod
def supported_languages() -> Dict: def supported_languages() -> Dict:
return {"zh-cn": "Japanese (Japan)"} return {"zh-cn": "Chinese (China)"}
def version(self) -> str: def version(self) -> str:
return "0.0.1" return "0.0.1"

View File

@ -2,9 +2,9 @@ from typing import Tuple
import librosa import librosa
import numpy as np import numpy as np
import pyworld as pw
import scipy import scipy
import soundfile as sf import soundfile as sf
from librosa import pyin
# For using kwargs # For using kwargs
# pylint: disable=unused-argument # pylint: disable=unused-argument
@ -242,12 +242,28 @@ def compute_stft_paddings(
def compute_f0( def compute_f0(
*, x: np.ndarray = None, pitch_fmax: float = None, hop_length: int = None, sample_rate: int = None, **kwargs *,
x: np.ndarray = None,
pitch_fmax: float = None,
pitch_fmin: float = None,
hop_length: int = None,
win_length: int = None,
sample_rate: int = None,
stft_pad_mode: str = "reflect",
center: bool = True,
**kwargs,
) -> np.ndarray: ) -> np.ndarray:
"""Compute pitch (f0) of a waveform using the same parameters used for computing melspectrogram. """Compute pitch (f0) of a waveform using the same parameters used for computing melspectrogram.
Args: Args:
x (np.ndarray): Waveform. Shape :math:`[T_wav,]` x (np.ndarray): Waveform. Shape :math:`[T_wav,]`
pitch_fmax (float): Pitch max value.
pitch_fmin (float): Pitch min value.
hop_length (int): Number of frames between STFT columns.
win_length (int): STFT window length.
sample_rate (int): Audio sampling rate.
stft_pad_mode (str): Padding mode for STFT.
center (bool): Centered padding.
Returns: Returns:
np.ndarray: Pitch. Shape :math:`[T_pitch,]`. :math:`T_pitch == T_wav / hop_length` np.ndarray: Pitch. Shape :math:`[T_pitch,]`. :math:`T_pitch == T_wav / hop_length`
@ -255,20 +271,35 @@ def compute_f0(
Examples: Examples:
>>> WAV_FILE = filename = librosa.util.example_audio_file() >>> WAV_FILE = filename = librosa.util.example_audio_file()
>>> from TTS.config import BaseAudioConfig >>> from TTS.config import BaseAudioConfig
>>> from TTS.utils.audio.processor import AudioProcessor >>> conf = BaseAudioConfig(pitch_fmax=8000) >>> from TTS.utils.audio import AudioProcessor
>>> conf = BaseAudioConfig(pitch_fmax=640, pitch_fmin=1)
>>> ap = AudioProcessor(**conf) >>> ap = AudioProcessor(**conf)
>>> wav = ap.load_wav(WAV_FILE, sr=22050)[:5 * 22050] >>> wav = ap.load_wav(WAV_FILE, sr=ap.sample_rate)[:5 * ap.sample_rate]
>>> pitch = ap.compute_f0(wav) >>> pitch = ap.compute_f0(wav)
""" """
assert pitch_fmax is not None, " [!] Set `pitch_fmax` before calling `compute_f0`." assert pitch_fmax is not None, " [!] Set `pitch_fmax` before calling `compute_f0`."
assert pitch_fmin is not None, " [!] Set `pitch_fmin` before calling `compute_f0`."
f0, t = pw.dio( f0, voiced_mask, _ = pyin(
x.astype(np.double), y=x.astype(np.double),
fs=sample_rate, fmin=pitch_fmin,
f0_ceil=pitch_fmax, fmax=pitch_fmax,
frame_period=1000 * hop_length / sample_rate, sr=sample_rate,
frame_length=win_length,
win_length=win_length // 2,
hop_length=hop_length,
pad_mode=stft_pad_mode,
center=center,
n_thresholds=100,
beta_parameters=(2, 18),
boltzmann_parameter=2,
resolution=0.1,
max_transition_rate=35.92,
switch_prob=0.01,
no_trough_prob=0.01,
) )
f0 = pw.stonemask(x.astype(np.double), f0, t, sample_rate) f0[~voiced_mask] = 0.0
return f0 return f0
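A hedged usage sketch of the new keyword-only signature (module path matches the import added in the AudioProcessor diff below); the synthetic tone stands in for real speech and the exact frame count depends on padding:

```python
import numpy as np

from TTS.utils.audio.numpy_transforms import compute_f0

# One second of a synthetic 220 Hz tone as a stand-in for speech (illustrative only).
sr = 22050
t = np.linspace(0, 1, sr, endpoint=False)
wav = 0.5 * np.sin(2 * np.pi * 220 * t).astype(np.float32)

f0 = compute_f0(
    x=wav,
    pitch_fmin=65,   # now required alongside pitch_fmax
    pitch_fmax=600,
    hop_length=256,
    win_length=1024,
    sample_rate=sr,
)
print(f0.shape)  # roughly len(wav) // hop_length frames; unvoiced frames are zeroed
```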

View File

@ -2,12 +2,12 @@ from typing import Dict, Tuple
import librosa import librosa
import numpy as np import numpy as np
import pyworld as pw
import scipy.io.wavfile import scipy.io.wavfile
import scipy.signal import scipy.signal
import soundfile as sf import soundfile as sf
from TTS.tts.utils.helpers import StandardScaler from TTS.tts.utils.helpers import StandardScaler
from TTS.utils.audio.numpy_transforms import compute_f0
# pylint: disable=too-many-public-methods # pylint: disable=too-many-public-methods
@ -573,23 +573,28 @@ class AudioProcessor(object):
>>> WAV_FILE = filename = librosa.util.example_audio_file() >>> WAV_FILE = filename = librosa.util.example_audio_file()
>>> from TTS.config import BaseAudioConfig >>> from TTS.config import BaseAudioConfig
>>> from TTS.utils.audio import AudioProcessor >>> from TTS.utils.audio import AudioProcessor
>>> conf = BaseAudioConfig(pitch_fmax=8000) >>> conf = BaseAudioConfig(pitch_fmax=640, pitch_fmin=1)
>>> ap = AudioProcessor(**conf) >>> ap = AudioProcessor(**conf)
>>> wav = ap.load_wav(WAV_FILE, sr=22050)[:5 * 22050] >>> wav = ap.load_wav(WAV_FILE, sr=ap.sample_rate)[:5 * ap.sample_rate]
>>> pitch = ap.compute_f0(wav) >>> pitch = ap.compute_f0(wav)
""" """
assert self.pitch_fmax is not None, " [!] Set `pitch_fmax` before calling `compute_f0`." assert self.pitch_fmax is not None, " [!] Set `pitch_fmax` before calling `compute_f0`."
assert self.pitch_fmin is not None, " [!] Set `pitch_fmin` before calling `compute_f0`."
# align F0 length to the spectrogram length # align F0 length to the spectrogram length
if len(x) % self.hop_length == 0: if len(x) % self.hop_length == 0:
x = np.pad(x, (0, self.hop_length // 2), mode="reflect") x = np.pad(x, (0, self.hop_length // 2), mode=self.stft_pad_mode)
f0, t = pw.dio( f0 = compute_f0(
x.astype(np.double), x=x,
fs=self.sample_rate, pitch_fmax=self.pitch_fmax,
f0_ceil=self.pitch_fmax, pitch_fmin=self.pitch_fmin,
frame_period=1000 * self.hop_length / self.sample_rate, hop_length=self.hop_length,
win_length=self.win_length,
sample_rate=self.sample_rate,
stft_pad_mode=self.stft_pad_mode,
center=True,
) )
f0 = pw.stonemask(x.astype(np.double), f0, t, self.sample_rate)
return f0 return f0
### Audio Processing ### ### Audio Processing ###

View File

@ -38,9 +38,9 @@ class CapacitronOptimizer:
self.param_groups = self.primary_optimizer.param_groups self.param_groups = self.primary_optimizer.param_groups
self.primary_optimizer.step() self.primary_optimizer.step()
def zero_grad(self): def zero_grad(self, set_to_none=False):
self.primary_optimizer.zero_grad() self.primary_optimizer.zero_grad(set_to_none)
self.secondary_optimizer.zero_grad() self.secondary_optimizer.zero_grad(set_to_none)
def load_state_dict(self, state_dict): def load_state_dict(self, state_dict):
self.primary_optimizer.load_state_dict(state_dict[0]) self.primary_optimizer.load_state_dict(state_dict[0])

View File

@ -9,6 +9,8 @@ import fsspec
import torch import torch
from coqpit import Coqpit from coqpit import Coqpit
from TTS.utils.generic_utils import get_user_data_dir
class RenamingUnpickler(pickle_tts.Unpickler): class RenamingUnpickler(pickle_tts.Unpickler):
"""Overload default pickler to solve module renaming problem""" """Overload default pickler to solve module renaming problem"""
@ -57,6 +59,7 @@ def copy_model_files(config: Coqpit, out_path, new_fields=None):
def load_fsspec( def load_fsspec(
path: str, path: str,
map_location: Union[str, Callable, torch.device, Dict[Union[str, torch.device], Union[str, torch.device]]] = None, map_location: Union[str, Callable, torch.device, Dict[Union[str, torch.device], Union[str, torch.device]]] = None,
cache: bool = True,
**kwargs, **kwargs,
) -> Any: ) -> Any:
"""Like torch.load but can load from other locations (e.g. s3:// , gs://). """Like torch.load but can load from other locations (e.g. s3:// , gs://).
@ -64,21 +67,33 @@ def load_fsspec(
Args: Args:
path: Any path or url supported by fsspec. path: Any path or url supported by fsspec.
map_location: torch.device or str. map_location: torch.device or str.
cache: If True, cache a remote file locally for subsequent calls. It is cached under `get_user_data_dir()/tts_cache`. Defaults to True.
**kwargs: Keyword arguments forwarded to torch.load. **kwargs: Keyword arguments forwarded to torch.load.
Returns: Returns:
Object stored in path. Object stored in path.
""" """
with fsspec.open(path, "rb") as f: is_local = os.path.isdir(path) or os.path.isfile(path)
return torch.load(f, map_location=map_location, **kwargs) if cache and not is_local:
with fsspec.open(
f"filecache::{path}",
filecache={"cache_storage": str(get_user_data_dir("tts_cache"))},
mode="rb",
) as f:
return torch.load(f, map_location=map_location, **kwargs)
else:
with fsspec.open(path, "rb") as f:
return torch.load(f, map_location=map_location, **kwargs)
def load_checkpoint(model, checkpoint_path, use_cuda=False, eval=False): # pylint: disable=redefined-builtin def load_checkpoint(
model, checkpoint_path, use_cuda=False, eval=False, cache=False
): # pylint: disable=redefined-builtin
try: try:
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu")) state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
except ModuleNotFoundError: except ModuleNotFoundError:
pickle_tts.Unpickler = RenamingUnpickler pickle_tts.Unpickler = RenamingUnpickler
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), pickle_module=pickle_tts) state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), pickle_module=pickle_tts, cache=cache)
model.load_state_dict(state["model"]) model.load_state_dict(state["model"])
if use_cuda: if use_cuda:
model.cuda() model.cuda()
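A usage sketch of the new caching behaviour; the URL is a placeholder for any fsspec-supported remote path:

```python
import torch

from TTS.utils.io import load_fsspec  # module path taken from this file

# Hypothetical remote checkpoint; with cache=True it is stored under get_user_data_dir()/tts_cache
state = load_fsspec(
    "https://example.com/checkpoints/best_model.pth",
    map_location=torch.device("cpu"),
    cache=True,
)
model_state = state["model"]  # checkpoints loaded above expose their weights under "model"
```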

View File

@ -32,11 +32,14 @@ class ModelManager(object):
home path. home path.
Args: Args:
models_file (str): path to .model.json models_file (str): path to .model.json file. Defaults to None.
output_prefix (str): prefix to `tts` to download models. Defaults to None
progress_bar (bool): print a progress bar when downloading a file. Defaults to False.
""" """
def __init__(self, models_file=None, output_prefix=None): def __init__(self, models_file=None, output_prefix=None, progress_bar=False):
super().__init__() super().__init__()
self.progress_bar = progress_bar
if output_prefix is None: if output_prefix is None:
self.output_prefix = get_user_data_dir("tts") self.output_prefix = get_user_data_dir("tts")
else: else:
@ -236,7 +239,7 @@ class ModelManager(object):
os.makedirs(output_path, exist_ok=True) os.makedirs(output_path, exist_ok=True)
print(f" > Downloading model to {output_path}") print(f" > Downloading model to {output_path}")
# download from github release # download from github release
self._download_zip_file(model_item["github_rls_url"], output_path) self._download_zip_file(model_item["github_rls_url"], output_path, self.progress_bar)
self.print_model_license(model_item=model_item) self.print_model_license(model_item=model_item)
# find downloaded files # find downloaded files
output_model_path, output_config_path = self._find_files(output_path) output_model_path, output_config_path = self._find_files(output_path)
@ -334,7 +337,7 @@ class ModelManager(object):
config.save_json(config_path) config.save_json(config_path)
@staticmethod @staticmethod
def _download_zip_file(file_url, output_folder): def _download_zip_file(file_url, output_folder, progress_bar):
"""Download the github releases""" """Download the github releases"""
# download the file # download the file
r = requests.get(file_url, stream=True) r = requests.get(file_url, stream=True)
@ -342,11 +345,13 @@ class ModelManager(object):
try: try:
total_size_in_bytes = int(r.headers.get("content-length", 0)) total_size_in_bytes = int(r.headers.get("content-length", 0))
block_size = 1024 # 1 Kibibyte block_size = 1024 # 1 Kibibyte
progress_bar = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True) if progress_bar:
progress_bar = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True)
temp_zip_name = os.path.join(output_folder, file_url.split("/")[-1]) temp_zip_name = os.path.join(output_folder, file_url.split("/")[-1])
with open(temp_zip_name, "wb") as file: with open(temp_zip_name, "wb") as file:
for data in r.iter_content(block_size): for data in r.iter_content(block_size):
progress_bar.update(len(data)) if progress_bar:
progress_bar.update(len(data))
file.write(data) file.write(data)
with zipfile.ZipFile(temp_zip_name) as z: with zipfile.ZipFile(temp_zip_name) as z:
z.extractall(output_folder) z.extractall(output_folder)
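A usage sketch of the new `progress_bar` flag; the model name is illustrative and `download_model` is assumed to be the public entry point that reaches `_download_zip_file`:

```python
from TTS.utils.manage import ModelManager  # assumed module path

manager = ModelManager(progress_bar=True)  # progress_bar now defaults to False
manager.download_model("tts_models/en/ljspeech/tacotron2-DDC")  # illustrative model name
```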

View File

@ -212,8 +212,13 @@ class Synthesizer(object):
# handle multi-speaker # handle multi-speaker
speaker_embedding = None speaker_embedding = None
speaker_id = None speaker_id = None
if self.tts_speakers_file or hasattr(self.tts_model.speaker_manager, "ids"): if self.tts_speakers_file or hasattr(self.tts_model.speaker_manager, "name_to_id"):
if speaker_name and isinstance(speaker_name, str):
# handle Neon models with single speaker.
if len(self.tts_model.speaker_manager.name_to_id) == 1:
speaker_id = list(self.tts_model.speaker_manager.name_to_id.values())[0]
elif speaker_name and isinstance(speaker_name, str):
if self.tts_config.use_d_vector_file: if self.tts_config.use_d_vector_file:
# get the average speaker embedding from the saved d_vectors. # get the average speaker embedding from the saved d_vectors.
speaker_embedding = self.tts_model.speaker_manager.get_mean_embedding( speaker_embedding = self.tts_model.speaker_manager.get_mean_embedding(
@ -222,7 +227,7 @@ class Synthesizer(object):
speaker_embedding = np.array(speaker_embedding)[None, :] # [1 x embedding_dim] speaker_embedding = np.array(speaker_embedding)[None, :] # [1 x embedding_dim]
else: else:
# get speaker idx from the speaker name # get speaker idx from the speaker name
speaker_id = self.tts_model.speaker_manager.ids[speaker_name] speaker_id = self.tts_model.speaker_manager.name_to_id[speaker_name]
elif not speaker_name and not speaker_wav: elif not speaker_name and not speaker_wav:
raise ValueError( raise ValueError(
@ -243,8 +248,12 @@ class Synthesizer(object):
if self.tts_languages_file or ( if self.tts_languages_file or (
hasattr(self.tts_model, "language_manager") and self.tts_model.language_manager is not None hasattr(self.tts_model, "language_manager") and self.tts_model.language_manager is not None
): ):
if language_name and isinstance(language_name, str):
language_id = self.tts_model.language_manager.ids[language_name] if len(self.tts_model.language_manager.name_to_id) == 1:
language_id = list(self.tts_model.language_manager.name_to_id.values())[0]
elif language_name and isinstance(language_name, str):
language_id = self.tts_model.language_manager.name_to_id[language_name]
elif not language_name: elif not language_name:
raise ValueError( raise ValueError(
@ -316,7 +325,7 @@ class Synthesizer(object):
# get the speaker embedding or speaker id for the reference wav file # get the speaker embedding or speaker id for the reference wav file
reference_speaker_embedding = None reference_speaker_embedding = None
reference_speaker_id = None reference_speaker_id = None
if self.tts_speakers_file or hasattr(self.tts_model.speaker_manager, "ids"): if self.tts_speakers_file or hasattr(self.tts_model.speaker_manager, "name_to_id"):
if reference_speaker_name and isinstance(reference_speaker_name, str): if reference_speaker_name and isinstance(reference_speaker_name, str):
if self.tts_config.use_d_vector_file: if self.tts_config.use_d_vector_file:
# get the speaker embedding from the saved d_vectors. # get the speaker embedding from the saved d_vectors.
@ -328,12 +337,11 @@ class Synthesizer(object):
] # [1 x embedding_dim] ] # [1 x embedding_dim]
else: else:
# get speaker idx from the speaker name # get speaker idx from the speaker name
reference_speaker_id = self.tts_model.speaker_manager.ids[reference_speaker_name] reference_speaker_id = self.tts_model.speaker_manager.name_to_id[reference_speaker_name]
else: else:
reference_speaker_embedding = self.tts_model.speaker_manager.compute_embedding_from_clip( reference_speaker_embedding = self.tts_model.speaker_manager.compute_embedding_from_clip(
reference_wav reference_wav
) )
outputs = transfer_voice( outputs = transfer_voice(
model=self.tts_model, model=self.tts_model,
CONFIG=self.tts_config, CONFIG=self.tts_config,

View File

@ -1,3 +1,4 @@
import soundfile as sf
import torch import torch
import torchaudio import torchaudio
@ -48,7 +49,7 @@ def remove_silence(
): ):
# get the VAD model and utils functions # get the VAD model and utils functions
model, get_speech_timestamps, save_audio, collect_chunks = model_and_utils model, get_speech_timestamps, _, collect_chunks = model_and_utils
# read ground truth wav and resample the audio for the VAD # read ground truth wav and resample the audio for the VAD
wav, gt_sample_rate = read_audio(audio_path) wav, gt_sample_rate = read_audio(audio_path)
@ -73,9 +74,11 @@ def remove_silence(
# if have speech timestamps else save the wav # if have speech timestamps else save the wav
if new_speech_timestamps: if new_speech_timestamps:
wav = collect_chunks(new_speech_timestamps, wav) wav = collect_chunks(new_speech_timestamps, wav)
is_speech = True
else: else:
print(f"> The file {audio_path} probably does not have speech please check it !!") print(f"> The file {audio_path} probably does not have speech please check it !!")
is_speech = False
# save audio # save audio
save_audio(out_path, wav, sampling_rate=gt_sample_rate) sf.write(out_path, wav, gt_sample_rate, subtype="PCM_16")
return out_path return out_path, is_speech

View File

@ -22,14 +22,12 @@ class HifiganConfig(BaseGANVocoderConfig):
generator_model_params (dict): Parameters of the generator model. Defaults to generator_model_params (dict): Parameters of the generator model. Defaults to
` `
{ {
"use_mel": True, "upsample_factors": [8, 8, 2, 2],
"sample_rate": 22050, "upsample_kernel_sizes": [16, 16, 4, 4],
"n_fft": 1024, "upsample_initial_channel": 512,
"hop_length": 256, "resblock_kernel_sizes": [3, 7, 11],
"win_length": 1024, "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
"n_mels": 80, "resblock_type": "1",
"mel_fmin": 0.0,
"mel_fmax": None,
} }
` `
batch_size (int): batch_size (int):

View File

@ -231,6 +231,7 @@ class GAN(BaseVocoder):
config: Coqpit, config: Coqpit,
checkpoint_path: str, checkpoint_path: str,
eval: bool = False, # pylint: disable=unused-argument, redefined-builtin eval: bool = False, # pylint: disable=unused-argument, redefined-builtin
cache: bool = False,
) -> None: ) -> None:
"""Load a GAN checkpoint and initialize model parameters. """Load a GAN checkpoint and initialize model parameters.
@ -239,7 +240,7 @@ class GAN(BaseVocoder):
checkpoint_path (str): Checkpoint file path. checkpoint_path (str): Checkpoint file path.
eval (bool, optional): If true, load the model for inference. Defaults to False. eval (bool, optional): If true, load the model for inference. Defaults to False.
""" """
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu")) state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
# band-aid for older than v0.0.15 GAN models # band-aid for older than v0.0.15 GAN models
if "model_disc" in state: if "model_disc" in state:
self.model_g.load_checkpoint(config, checkpoint_path, eval) self.model_g.load_checkpoint(config, checkpoint_path, eval)

View File

@ -290,9 +290,9 @@ class HifiganGenerator(torch.nn.Module):
remove_weight_norm(self.conv_post) remove_weight_norm(self.conv_post)
def load_checkpoint( def load_checkpoint(
self, config, checkpoint_path, eval=False self, config, checkpoint_path, eval=False, cache=False
): # pylint: disable=unused-argument, redefined-builtin ): # pylint: disable=unused-argument, redefined-builtin
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu")) state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
self.load_state_dict(state["model"]) self.load_state_dict(state["model"])
if eval: if eval:
self.eval() self.eval()

View File

@ -85,9 +85,9 @@ class MelganGenerator(nn.Module):
layer.remove_weight_norm() layer.remove_weight_norm()
def load_checkpoint( def load_checkpoint(
self, config, checkpoint_path, eval=False self, config, checkpoint_path, eval=False, cache=False
): # pylint: disable=unused-argument, redefined-builtin ): # pylint: disable=unused-argument, redefined-builtin
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu")) state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
self.load_state_dict(state["model"]) self.load_state_dict(state["model"])
if eval: if eval:
self.eval() self.eval()

View File

@ -153,9 +153,9 @@ class ParallelWaveganGenerator(torch.nn.Module):
return self._get_receptive_field_size(self.layers, self.stacks, self.kernel_size) return self._get_receptive_field_size(self.layers, self.stacks, self.kernel_size)
def load_checkpoint( def load_checkpoint(
self, config, checkpoint_path, eval=False self, config, checkpoint_path, eval=False, cache=False
): # pylint: disable=unused-argument, redefined-builtin ): # pylint: disable=unused-argument, redefined-builtin
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu")) state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
self.load_state_dict(state["model"]) self.load_state_dict(state["model"])
if eval: if eval:
self.eval() self.eval()

View File

@ -218,9 +218,9 @@ class Wavegrad(BaseVocoder):
self.y_conv = weight_norm(self.y_conv) self.y_conv = weight_norm(self.y_conv)
def load_checkpoint( def load_checkpoint(
self, config, checkpoint_path, eval=False self, config, checkpoint_path, eval=False, cache=False
): # pylint: disable=unused-argument, redefined-builtin ): # pylint: disable=unused-argument, redefined-builtin
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu")) state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
self.load_state_dict(state["model"]) self.load_state_dict(state["model"])
if eval: if eval:
self.eval() self.eval()

View File

@ -542,9 +542,9 @@ class Wavernn(BaseVocoder):
return unfolded return unfolded
def load_checkpoint( def load_checkpoint(
self, config, checkpoint_path, eval=False self, config, checkpoint_path, eval=False, cache=False
): # pylint: disable=unused-argument, redefined-builtin ): # pylint: disable=unused-argument, redefined-builtin
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu")) state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
self.load_state_dict(state["model"]) self.load_state_dict(state["model"])
if eval: if eval:
self.eval() self.eval()

View File

@ -0,0 +1,56 @@
(docker_images)=
## Docker images
We provide Docker images so you can test TTS without having to set up your own environment.
### Using premade images
You can use premade images built automatically from the latest TTS version.
#### CPU version
```bash
docker pull ghcr.io/coqui-ai/tts-cpu
```
#### GPU version
```bash
docker pull ghcr.io/coqui-ai/tts
```
### Building your own image
```bash
docker build -t tts .
```
## Basic inference
Basic usage: generating an audio file from text passed as an argument.
You can pass any tts argument after the image name.
### CPU version
```bash
docker run --rm -v ~/tts-output:/root/tts-output ghcr.io/coqui-ai/tts-cpu --text "Hello." --out_path /root/tts-output/hello.wav
```
### GPU version
For the GPU version, you need to have the latest NVIDIA drivers installed.
You can check the supported CUDA version with `nvidia-smi`; it must be >= 11.8.
```bash
docker run --rm --gpus all -v ~/tts-output:/root/tts-output ghcr.io/coqui-ai/tts --text "Hello." --out_path /root/tts-output/hello.wav --use_cuda true
```
## Start a server
Starting a TTS server:
Start the container and get a shell inside it.
### CPU version
```bash
docker run --rm -it -p 5002:5002 --entrypoint /bin/bash ghcr.io/coqui-ai/tts-cpu
python3 TTS/server/server.py --list_models #To get the list of available models
python3 TTS/server/server.py --model_name tts_models/en/vctk/vits
```
### GPU version
```bash
docker run --rm -it -p 5002:5002 --gpus all --entrypoint /bin/bash ghcr.io/coqui-ai/tts
python3 TTS/server/server.py --list_models #To get the list of available models
python3 TTS/server/server.py --model_name tts_models/en/vctk/vits --use_cuda true
```
Click [here](http://[::1]:5002/) and have fun with the server!
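Once the server is up, you can also drive it programmatically; a minimal sketch, assuming the demo server exposes an `/api/tts` endpoint that accepts a `text` query parameter and returns WAV bytes (multi-speaker models may additionally require a speaker id):

```python
import requests

response = requests.get(
    "http://localhost:5002/api/tts",  # server started as shown above
    params={"text": "Hello from the Docker container."},
)
response.raise_for_status()
with open("hello.wav", "wb") as f:
    f.write(response.content)
```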

View File

@ -53,7 +53,7 @@ We tried to collect common issues and questions we receive about 🐸TTS. It is
"mixed_precision": false, "mixed_precision": false,
"output_path": "recipes/ljspeech/glow_tts/", "output_path": "recipes/ljspeech/glow_tts/",
"test_sentences": ["Test this sentence.", "This test sentence.", "Sentence this test."], "test_sentences": ["Test this sentence.", "This test sentence.", "Sentence this test."],
"datasets":[{"name": "ljspeech", "meta_file_train":"metadata.csv", "path": "recipes/ljspeech/LJSpeech-1.1/"}] "datasets":[{"formatter": "ljspeech", "meta_file_train":"metadata.csv", "path": "recipes/ljspeech/LJSpeech-1.1/"}]
} }
``` ```

View File

@ -88,7 +88,7 @@ from TTS.tts.datasets import load_tts_samples
# dataset config for one of the pre-defined datasets # dataset config for one of the pre-defined datasets
dataset_config = BaseDatasetConfig( dataset_config = BaseDatasetConfig(
name="vctk", meta_file_train="", language="en-us", path="dataset-path") formatter="vctk", meta_file_train="", language="en-us", path="dataset-path")
) )
# load training samples # load training samples
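The `name` → `formatter` rename in `BaseDatasetConfig` recurs throughout the recipes and tests below; a minimal before/after sketch based on the snippet above:

```python
from TTS.tts.configs.shared_configs import BaseDatasetConfig

# Before v0.9.0 the dataset formatter was selected via `name=...`:
# dataset_config = BaseDatasetConfig(name="vctk", meta_file_train="", language="en-us", path="dataset-path")

# From v0.9.0 onwards the argument is called `formatter`:
dataset_config = BaseDatasetConfig(
    formatter="vctk", meta_file_train="", language="en-us", path="dataset-path"
)
```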

View File

@ -20,6 +20,7 @@
:caption: Using 🐸TTS :caption: Using 🐸TTS
inference inference
docker_images
implementing_a_new_model implementing_a_new_model
training_a_model training_a_model
finetuning finetuning

View File

@ -12,7 +12,7 @@ Currently we provide the following pre-configured architectures:
- **FastPitch:** - **FastPitch:**
    It uses the same FastSpeech architecture that us conditioned on fundemental frequency (f0) contours with the     It uses the same FastSpeech architecture that is conditioned on fundamental frequency (f0) contours with the
promise of more expressive speech. promise of more expressive speech.
- **SpeedySpeech:** - **SpeedySpeech:**

View File

@ -84,7 +84,7 @@ We still support running training from CLI like in the old days. The same traini
"print_eval": true, "print_eval": true,
"mixed_precision": false, "mixed_precision": false,
"output_path": "recipes/ljspeech/glow_tts/", "output_path": "recipes/ljspeech/glow_tts/",
"datasets":[{"name": "ljspeech", "meta_file_train":"metadata.csv", "path": "recipes/ljspeech/LJSpeech-1.1/"}] "datasets":[{"formatter": "ljspeech", "meta_file_train":"metadata.csv", "path": "recipes/ljspeech/LJSpeech-1.1/"}]
} }
``` ```
@ -120,6 +120,3 @@ $ tts-server -h # see the help
$ tts-server --list_models # list the available models. $ tts-server --list_models # list the available models.
``` ```
![server.gif](https://github.com/coqui-ai/TTS/raw/main/images/demo_server.gif) ![server.gif](https://github.com/coqui-ai/TTS/raw/main/images/demo_server.gif)

View File

@ -74,7 +74,7 @@
"<span style=\"color:purple;font-size:15px\">\n", "<span style=\"color:purple;font-size:15px\">\n",
"/MyTTSDataset <br /> \n", "/MyTTSDataset <br /> \n",
"&emsp;| <br /> \n", "&emsp;| <br /> \n",
"&emsp;| -> metadata.txt<br /> \n", "&emsp;| -> metadata.csv<br /> \n",
"&emsp;| -> /wavs<br /> \n", "&emsp;| -> /wavs<br /> \n",
"&emsp;&emsp;| -> audio1.wav<br /> \n", "&emsp;&emsp;| -> audio1.wav<br /> \n",
"&emsp;&emsp;| -> audio2.wav<br /> \n", "&emsp;&emsp;| -> audio2.wav<br /> \n",

View File

@ -15,7 +15,7 @@ output_path = os.path.dirname(os.path.abspath(__file__))
data_path = "/srv/data/" data_path = "/srv/data/"
# Using LJSpeech like dataset processing for the blizzard dataset # Using LJSpeech like dataset processing for the blizzard dataset
dataset_config = BaseDatasetConfig(name="ljspeech", meta_file_train="metadata.csv", path=data_path) dataset_config = BaseDatasetConfig(formatter="ljspeech", meta_file_train="metadata.csv", path=data_path)
audio_config = BaseAudioConfig( audio_config = BaseAudioConfig(
sample_rate=24000, sample_rate=24000,

View File

@ -16,7 +16,7 @@ data_path = "/srv/data/blizzard2013/segmented"
# Using LJSpeech like dataset processing for the blizzard dataset # Using LJSpeech like dataset processing for the blizzard dataset
dataset_config = BaseDatasetConfig( dataset_config = BaseDatasetConfig(
name="ljspeech", formatter="ljspeech",
meta_file_train="metadata.csv", meta_file_train="metadata.csv",
path=data_path, path=data_path,
) )

View File

@ -1,7 +1,7 @@
{ {
"datasets": [ "datasets": [
{ {
"name": "kokoro", "formatter": "kokoro",
"path": "DEFINE THIS", "path": "DEFINE THIS",
"meta_file_train": "metadata.csv", "meta_file_train": "metadata.csv",
"meta_file_val": null "meta_file_val": null

View File

@ -13,7 +13,7 @@ output_path = os.path.dirname(os.path.abspath(__file__))
# init configs # init configs
dataset_config = BaseDatasetConfig( dataset_config = BaseDatasetConfig(
name="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/") formatter="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/")
) )
config = AlignTTSConfig( config = AlignTTSConfig(
batch_size=32, batch_size=32,

View File

@ -14,7 +14,7 @@ output_path = os.path.dirname(os.path.abspath(__file__))
# init configs # init configs
dataset_config = BaseDatasetConfig( dataset_config = BaseDatasetConfig(
name="ljspeech", formatter="ljspeech",
meta_file_train="metadata.csv", meta_file_train="metadata.csv",
# meta_file_attn_mask=os.path.join(output_path, "../LJSpeech-1.1/metadata_attn_mask.txt"), # meta_file_attn_mask=os.path.join(output_path, "../LJSpeech-1.1/metadata_attn_mask.txt"),
path=os.path.join(output_path, "../LJSpeech-1.1/"), path=os.path.join(output_path, "../LJSpeech-1.1/"),

View File

@ -14,7 +14,7 @@ output_path = os.path.dirname(os.path.abspath(__file__))
# init configs # init configs
dataset_config = BaseDatasetConfig( dataset_config = BaseDatasetConfig(
name="ljspeech", formatter="ljspeech",
meta_file_train="metadata.csv", meta_file_train="metadata.csv",
# meta_file_attn_mask=os.path.join(output_path, "../LJSpeech-1.1/metadata_attn_mask.txt"), # meta_file_attn_mask=os.path.join(output_path, "../LJSpeech-1.1/metadata_attn_mask.txt"),
path=os.path.join(output_path, "../LJSpeech-1.1/"), path=os.path.join(output_path, "../LJSpeech-1.1/"),

View File

@ -21,7 +21,7 @@ output_path = os.path.dirname(os.path.abspath(__file__))
# Set LJSpeech as our target dataset and define its path. # Set LJSpeech as our target dataset and define its path.
# You can also use a simple Dict to define the dataset and pass it to your custom formatter. # You can also use a simple Dict to define the dataset and pass it to your custom formatter.
dataset_config = BaseDatasetConfig( dataset_config = BaseDatasetConfig(
name="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/") formatter="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/")
) )
# INITIALIZE THE TRAINING CONFIGURATION # INITIALIZE THE TRAINING CONFIGURATION

View File

@ -11,7 +11,7 @@ from TTS.utils.audio import AudioProcessor
output_path = os.path.dirname(os.path.abspath(__file__)) output_path = os.path.dirname(os.path.abspath(__file__))
dataset_config = BaseDatasetConfig( dataset_config = BaseDatasetConfig(
name="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/") formatter="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/")
) )
audio_config = BaseAudioConfig( audio_config = BaseAudioConfig(

View File

@ -16,7 +16,7 @@ data_path = "/srv/data/"
# Using LJSpeech like dataset processing for the blizzard dataset # Using LJSpeech like dataset processing for the blizzard dataset
dataset_config = BaseDatasetConfig( dataset_config = BaseDatasetConfig(
name="ljspeech", formatter="ljspeech",
meta_file_train="metadata.csv", meta_file_train="metadata.csv",
path=data_path, path=data_path,
) )

View File

@ -16,7 +16,7 @@ output_path = os.path.dirname(os.path.abspath(__file__))
# init configs # init configs
dataset_config = BaseDatasetConfig( dataset_config = BaseDatasetConfig(
name="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/") formatter="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/")
) )
audio_config = BaseAudioConfig( audio_config = BaseAudioConfig(

View File

@ -16,7 +16,7 @@ output_path = os.path.dirname(os.path.abspath(__file__))
# init configs # init configs
dataset_config = BaseDatasetConfig( dataset_config = BaseDatasetConfig(
name="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/") formatter="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/")
) )
audio_config = BaseAudioConfig( audio_config = BaseAudioConfig(

View File

@ -11,7 +11,7 @@ from TTS.utils.audio import AudioProcessor
output_path = os.path.dirname(os.path.abspath(__file__)) output_path = os.path.dirname(os.path.abspath(__file__))
dataset_config = BaseDatasetConfig( dataset_config = BaseDatasetConfig(
name="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/") formatter="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/")
) )
audio_config = VitsAudioConfig( audio_config = VitsAudioConfig(
sample_rate=22050, win_length=1024, hop_length=256, num_mels=80, mel_fmin=0, mel_fmax=None sample_rate=22050, win_length=1024, hop_length=256, num_mels=80, mel_fmin=0, mel_fmax=None

View File

@ -17,7 +17,7 @@ output_path = os.path.dirname(os.path.abspath(__file__))
mailabs_path = "/home/julian/workspace/mailabs/**" mailabs_path = "/home/julian/workspace/mailabs/**"
dataset_paths = glob(mailabs_path) dataset_paths = glob(mailabs_path)
dataset_config = [ dataset_config = [
BaseDatasetConfig(name="mailabs", meta_file_train=None, path=path, language=path.split("/")[-1]) BaseDatasetConfig(formatter="mailabs", meta_file_train=None, path=path, language=path.split("/")[-1])
for path in dataset_paths for path in dataset_paths
] ]

View File

@ -14,7 +14,7 @@ output_path = os.path.dirname(os.path.abspath(__file__))
# init configs # init configs
dataset_config = BaseDatasetConfig( dataset_config = BaseDatasetConfig(
name="thorsten", meta_file_train="metadata.csv", path=os.path.join(output_path, "../thorsten-de/") formatter="thorsten", meta_file_train="metadata.csv", path=os.path.join(output_path, "../thorsten-de/")
) )
# download dataset if not already present # download dataset if not already present

View File

@ -22,7 +22,7 @@ output_path = os.path.dirname(os.path.abspath(__file__))
# Set LJSpeech as our target dataset and define its path. # Set LJSpeech as our target dataset and define its path.
# You can also use a simple Dict to define the dataset and pass it to your custom formatter. # You can also use a simple Dict to define the dataset and pass it to your custom formatter.
dataset_config = BaseDatasetConfig( dataset_config = BaseDatasetConfig(
name="thorsten", meta_file_train="metadata.csv", path=os.path.join(output_path, "../thorsten-de/") formatter="thorsten", meta_file_train="metadata.csv", path=os.path.join(output_path, "../thorsten-de/")
) )
# download dataset if not already present # download dataset if not already present

View File

@ -12,7 +12,7 @@ from TTS.utils.downloaders import download_thorsten_de
output_path = os.path.dirname(os.path.abspath(__file__)) output_path = os.path.dirname(os.path.abspath(__file__))
dataset_config = BaseDatasetConfig( dataset_config = BaseDatasetConfig(
name="thorsten", meta_file_train="metadata.csv", path=os.path.join(output_path, "../thorsten-de/") formatter="thorsten", meta_file_train="metadata.csv", path=os.path.join(output_path, "../thorsten-de/")
) )
# download dataset if not already present # download dataset if not already present

View File

@ -16,7 +16,7 @@ output_path = os.path.dirname(os.path.abspath(__file__))
# init configs # init configs
dataset_config = BaseDatasetConfig( dataset_config = BaseDatasetConfig(
name="thorsten", meta_file_train="metadata.csv", path=os.path.join(output_path, "../thorsten-de/") formatter="thorsten", meta_file_train="metadata.csv", path=os.path.join(output_path, "../thorsten-de/")
) )
# download dataset if not already present # download dataset if not already present

View File

@ -12,7 +12,7 @@ from TTS.utils.downloaders import download_thorsten_de
output_path = os.path.dirname(os.path.abspath(__file__)) output_path = os.path.dirname(os.path.abspath(__file__))
dataset_config = BaseDatasetConfig( dataset_config = BaseDatasetConfig(
name="thorsten", meta_file_train="metadata.csv", path=os.path.join(output_path, "../thorsten-de/") formatter="thorsten", meta_file_train="metadata.csv", path=os.path.join(output_path, "../thorsten-de/")
) )
# download dataset if not already present # download dataset if not already present

View File

@ -11,7 +11,7 @@ from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor from TTS.utils.audio import AudioProcessor
output_path = os.path.dirname(os.path.abspath(__file__)) output_path = os.path.dirname(os.path.abspath(__file__))
dataset_config = BaseDatasetConfig(name="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/")) dataset_config = BaseDatasetConfig(formatter="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/"))
audio_config = BaseAudioConfig( audio_config = BaseAudioConfig(
sample_rate=22050, sample_rate=22050,

View File

@ -11,7 +11,7 @@ from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor from TTS.utils.audio import AudioProcessor
output_path = os.path.dirname(os.path.abspath(__file__)) output_path = os.path.dirname(os.path.abspath(__file__))
dataset_config = BaseDatasetConfig(name="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/")) dataset_config = BaseDatasetConfig(formatter="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/"))
audio_config = BaseAudioConfig( audio_config = BaseAudioConfig(
sample_rate=22050, sample_rate=22050,

View File

@ -22,7 +22,7 @@ if not os.path.exists(dataset_path):
download_vctk(dataset_path) download_vctk(dataset_path)
# define dataset config # define dataset config
dataset_config = BaseDatasetConfig(name="vctk", meta_file_train="", path=dataset_path) dataset_config = BaseDatasetConfig(formatter="vctk", meta_file_train="", path=dataset_path)
# define audio config # define audio config
# ❗ resample the dataset externally using `TTS/bin/resample.py` and set `resample=False` for faster training # ❗ resample the dataset externally using `TTS/bin/resample.py` and set `resample=False` for faster training

View File

@ -0,0 +1,139 @@
import os
from TTS.encoder.configs.speaker_encoder_config import SpeakerEncoderConfig
# from TTS.encoder.configs.emotion_encoder_config import EmotionEncoderConfig
from TTS.tts.configs.shared_configs import BaseDatasetConfig
CURRENT_PATH = os.getcwd()
# change the root path to the TTS root path
os.chdir("../../../")
### Definitions ###
# dataset
VCTK_PATH = "/raid/datasets/VCTK_NEW_16khz_removed_silence_silero_vad/"  # download: https://datashare.ed.ac.uk/bitstream/handle/10283/3443/VCTK-Corpus-0.92.zip
RIR_SIMULATED_PATH = "/raid/datasets/DA/RIRS_NOISES/simulated_rirs/" # download: https://www.openslr.org/17/
MUSAN_PATH = "/raid/datasets/DA/musan/" # download: https://www.openslr.org/17/
# training
OUTPUT_PATH = os.path.join(
CURRENT_PATH, "resnet_speaker_encoder_training_output/"
) # path to save the train logs and checkpoint
CONFIG_OUT_PATH = os.path.join(OUTPUT_PATH, "config_se.json")
RESTORE_PATH = None  # Checkpoint to use for transfer learning; if None, it is ignored
# instance the config
# to speaker encoder
config = SpeakerEncoderConfig()
# to emotion encoder
# config = EmotionEncoderConfig()
#### DATASET CONFIG ####
# The formatter needs to return the key "speaker_name" for the speaker encoder and "emotion_name" for the emotion encoder
dataset_config = BaseDatasetConfig(formatter="vctk", meta_file_train="", language="en-us", path=VCTK_PATH)
# add the dataset to the config
config.datasets = [dataset_config]
#### TRAINING CONFIG ####
# The encoder data loader balances dataset items equally to guarantee better training and to meet the loss requirements
# It has two parameters that control the final batch size: the total number of speakers used in each batch and the number of samples per speaker
# total number of speakers per batch during training
config.num_classes_in_batch = 100
# number of utterances per class/speaker in the batch during training
config.num_utter_per_class = 4
# final batch size = config.num_classes_in_batch * config.num_utter_per_class
# total number of speakers per batch during evaluation
config.eval_num_classes_in_batch = 100
# number of utterances per class/speaker in the batch during evaluation
config.eval_num_utter_per_class = 4
# number of data loader workers
config.num_loader_workers = 8
config.num_val_loader_workers = 8
# number of epochs
config.epochs = 10000
# loss to be used in training
config.loss = "softmaxproto"
# run eval
config.run_eval = False
# output path for the checkpoints
config.output_path = OUTPUT_PATH
# Save local checkpoint every save_step steps
config.save_step = 2000
### Model Config ###
config.model_params = {
"model_name": "resnet", # supported "lstm" and "resnet"
"input_dim": 64,
"use_torch_spec": True,
"log_input": True,
"proj_dim": 512, # embedding dim
}
### Audio Config ###
# To speed up training, the model divides the audio into small parts. This parameter defines the length of these parts in seconds.
config.voice_len = 2.0
# all other configs
config.audio = {
"fft_size": 512,
"win_length": 400,
"hop_length": 160,
"frame_shift_ms": None,
"frame_length_ms": None,
"stft_pad_mode": "reflect",
"sample_rate": 16000,
"resample": False,
"preemphasis": 0.97,
"ref_level_db": 20,
"do_sound_norm": False,
"do_trim_silence": False,
"trim_db": 60,
"power": 1.5,
"griffin_lim_iters": 60,
"num_mels": 64,
"mel_fmin": 0.0,
"mel_fmax": 8000.0,
"spec_gain": 20,
"signal_norm": False,
"min_level_db": -100,
"symmetric_norm": False,
"max_norm": 4.0,
"clip_norm": False,
"stats_path": None,
"do_rms_norm": True,
"db_level": -27.0,
}
### Augmentation Config ###
config.audio_augmentation = {
# additive noise and room impulse response (RIR) simulation similar to: https://arxiv.org/pdf/2009.14153.pdf
"p": 0.5, # probability to the use of one of the augmentation - 0 means disabled
"rir": {"rir_path": RIR_SIMULATED_PATH, "conv_mode": "full"}, # download: https://www.openslr.org/17/
"additive": {
"sounds_path": MUSAN_PATH,
"speech": {"min_snr_in_db": 13, "max_snr_in_db": 20, "min_num_noises": 1, "max_num_noises": 1},
"noise": {"min_snr_in_db": 0, "max_snr_in_db": 15, "min_num_noises": 1, "max_num_noises": 1},
"music": {"min_snr_in_db": 5, "max_snr_in_db": 15, "min_num_noises": 1, "max_num_noises": 1},
},
"gaussian": {"p": 0.7, "min_amplitude": 0.0, "max_amplitude": 1e-05},
}
config.save_json(CONFIG_OUT_PATH)
print(CONFIG_OUT_PATH)
if RESTORE_PATH is not None:
command = f"python TTS/bin/train_encoder.py --config_path {CONFIG_OUT_PATH} --restore_path {RESTORE_PATH}"
else:
command = f"python TTS/bin/train_encoder.py --config_path {CONFIG_OUT_PATH}"
os.system(command)
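For reference, the two balancer knobs in this recipe multiply into the effective batch size noted in the comments above; a quick sanity check:

```python
num_classes_in_batch = 100  # speakers per training batch
num_utter_per_class = 4     # utterances per speaker
print("effective train batch size:", num_classes_in_batch * num_utter_per_class)  # 400
```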

View File

@ -11,7 +11,7 @@ from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor from TTS.utils.audio import AudioProcessor
output_path = os.path.dirname(os.path.abspath(__file__)) output_path = os.path.dirname(os.path.abspath(__file__))
dataset_config = BaseDatasetConfig(name="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/")) dataset_config = BaseDatasetConfig(formatter="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/"))
audio_config = BaseAudioConfig( audio_config = BaseAudioConfig(
sample_rate=22050, sample_rate=22050,

View File

@ -12,7 +12,7 @@ from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor from TTS.utils.audio import AudioProcessor
output_path = os.path.dirname(os.path.abspath(__file__)) output_path = os.path.dirname(os.path.abspath(__file__))
dataset_config = BaseDatasetConfig(name="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/")) dataset_config = BaseDatasetConfig(formatter="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/"))
audio_config = BaseAudioConfig( audio_config = BaseAudioConfig(
sample_rate=22050, sample_rate=22050,

View File

@ -12,7 +12,7 @@ from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor from TTS.utils.audio import AudioProcessor
output_path = os.path.dirname(os.path.abspath(__file__)) output_path = os.path.dirname(os.path.abspath(__file__))
dataset_config = BaseDatasetConfig(name="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/")) dataset_config = BaseDatasetConfig(formatter="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/"))
audio_config = BaseAudioConfig( audio_config = BaseAudioConfig(
sample_rate=22050, sample_rate=22050,

View File

@ -12,7 +12,7 @@ from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor from TTS.utils.audio import AudioProcessor
output_path = os.path.dirname(os.path.abspath(__file__)) output_path = os.path.dirname(os.path.abspath(__file__))
dataset_config = BaseDatasetConfig(name="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/")) dataset_config = BaseDatasetConfig(formatter="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/"))
audio_config = BaseAudioConfig( audio_config = BaseAudioConfig(
sample_rate=22050, sample_rate=22050,

View File

@ -12,7 +12,7 @@ from TTS.utils.audio import AudioProcessor
output_path = os.path.dirname(os.path.abspath(__file__)) output_path = os.path.dirname(os.path.abspath(__file__))
dataset_config = BaseDatasetConfig( dataset_config = BaseDatasetConfig(
name="vctk", meta_file_train="", language="en-us", path=os.path.join(output_path, "../VCTK/") formatter="vctk", meta_file_train="", language="en-us", path=os.path.join(output_path, "../VCTK/")
) )

View File

@ -23,7 +23,6 @@ umap-learn==0.5.1
pandas pandas
# deps for training # deps for training
matplotlib matplotlib
pyworld==0.2.10 # > 0.2.10 is not p3.10.x compatible
# coqui stack # coqui stack
trainer trainer
# config management # config management
@ -35,4 +34,8 @@ pypinyin
mecab-python3==1.0.5 mecab-python3==1.0.5
unidic-lite==1.0.8 unidic-lite==1.0.8
# gruut+supported langs # gruut+supported langs
gruut[cs,de,es,fr,it,nl,pt,ru,sv]==2.2.3 gruut[de]==2.2.3
# deps for korean
jamo
nltk
g2pkk>=0.1.1

View File

@ -33,7 +33,9 @@ def get_tests_data_path():
def get_tests_output_path(): def get_tests_output_path():
"""Returns the path to the directory for test outputs.""" """Returns the path to the directory for test outputs."""
return os.path.join(get_tests_path(), "outputs") path = os.path.join(get_tests_path(), "outputs")
os.makedirs(path, exist_ok=True)
return path
def run_cli(command): def run_cli(command):
@ -42,7 +44,7 @@ def run_cli(command):
def get_test_data_config(): def get_test_data_config():
return BaseDatasetConfig(name="ljspeech", path="tests/data/ljspeech/", meta_file_train="metadata.csv") return BaseDatasetConfig(formatter="ljspeech", path="tests/data/ljspeech/", meta_file_train="metadata.csv")
def assertHasAttr(test_obj, obj, intendedAttr): def assertHasAttr(test_obj, obj, intendedAttr):

View File

@ -10,7 +10,7 @@ OUT_PATH = os.path.join(get_tests_output_path(), "audio_tests")
WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav") WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav")
os.makedirs(OUT_PATH, exist_ok=True) os.makedirs(OUT_PATH, exist_ok=True)
conf = BaseAudioConfig(mel_fmax=8000) conf = BaseAudioConfig(mel_fmax=8000, pitch_fmax=640, pitch_fmin=1)
# pylint: disable=protected-access # pylint: disable=protected-access

View File

@ -0,0 +1,92 @@
import os
import unittest
import numpy as np
import torch
from tests import get_tests_input_path
from TTS.config import load_config
from TTS.encoder.utils.generic_utils import setup_encoder_model
from TTS.encoder.utils.io import save_checkpoint
from TTS.tts.utils.managers import EmbeddingManager
from TTS.utils.audio import AudioProcessor
encoder_config_path = os.path.join(get_tests_input_path(), "test_speaker_encoder_config.json")
encoder_model_path = os.path.join(get_tests_input_path(), "checkpoint_0.pth")
sample_wav_path = os.path.join(get_tests_input_path(), "../data/ljspeech/wavs/LJ001-0001.wav")
sample_wav_path2 = os.path.join(get_tests_input_path(), "../data/ljspeech/wavs/LJ001-0002.wav")
embedding_file_path = os.path.join(get_tests_input_path(), "../data/dummy_speakers.json")
embeddings_file_path2 = os.path.join(get_tests_input_path(), "../data/dummy_speakers2.json")
embeddings_file_pth_path = os.path.join(get_tests_input_path(), "../data/dummy_speakers.pth")
class EmbeddingManagerTest(unittest.TestCase):
"""Test emEeddingManager for loading embedding files and computing embeddings from waveforms"""
@staticmethod
def test_speaker_embedding():
# load config
config = load_config(encoder_config_path)
config.audio.resample = True
# create a dummy speaker encoder
model = setup_encoder_model(config)
save_checkpoint(model, None, None, get_tests_input_path(), 0)
# load audio processor and speaker encoder
manager = EmbeddingManager(encoder_model_path=encoder_model_path, encoder_config_path=encoder_config_path)
# load a sample audio and compute embedding
ap = AudioProcessor(**config.audio)
waveform = ap.load_wav(sample_wav_path)
mel = ap.melspectrogram(waveform)
embedding = manager.compute_embeddings(mel)
assert embedding.shape[1] == 256
# compute embedding directly from an input file
embedding = manager.compute_embedding_from_clip(sample_wav_path)
embedding2 = manager.compute_embedding_from_clip(sample_wav_path)
embedding = torch.FloatTensor(embedding)
embedding2 = torch.FloatTensor(embedding2)
assert embedding.shape[0] == 256
assert (embedding - embedding2).sum() == 0.0
# compute embedding from a list of wav files.
embedding3 = manager.compute_embedding_from_clip([sample_wav_path, sample_wav_path2])
embedding3 = torch.FloatTensor(embedding3)
assert embedding3.shape[0] == 256
assert (embedding - embedding3).sum() != 0.0
# remove dummy model
os.remove(encoder_model_path)
def test_embedding_file_processing(self): # pylint: disable=no-self-use
manager = EmbeddingManager(embedding_file_path=embeddings_file_pth_path)
# test embedding querying
embedding = manager.get_embedding_by_clip(manager.clip_ids[0])
assert len(embedding) == 256
embeddings = manager.get_embeddings_by_name(manager.embedding_names[0])
assert len(embeddings[0]) == 256
embedding1 = manager.get_mean_embedding(manager.embedding_names[0], num_samples=2, randomize=True)
assert len(embedding1) == 256
embedding2 = manager.get_mean_embedding(manager.embedding_names[0], num_samples=2, randomize=False)
assert len(embedding2) == 256
assert np.sum(np.array(embedding1) - np.array(embedding2)) != 0
def test_embedding_file_loading(self):
# test loading a json file
manager = EmbeddingManager(embedding_file_path=embedding_file_path)
self.assertEqual(manager.num_embeddings, 384)
self.assertEqual(manager.embedding_dim, 256)
# test loading a pth file
manager = EmbeddingManager(embedding_file_path=embeddings_file_pth_path)
self.assertEqual(manager.num_embeddings, 384)
self.assertEqual(manager.embedding_dim, 256)
# test loading a pth files with duplicate embedding keys
with self.assertRaises(Exception) as context:
manager = EmbeddingManager(embedding_file_path=[embeddings_file_pth_path, embeddings_file_pth_path])
self.assertTrue("Duplicate embedding names" in str(context.exception))
# test loading embedding files with different embedding keys
manager = EmbeddingManager(embedding_file_path=[embeddings_file_pth_path, embeddings_file_path2])
self.assertEqual(manager.embedding_dim, 256)
self.assertEqual(manager.num_embeddings, 384 * 2)

View File

@ -15,7 +15,7 @@ class TestExtractTTSSpectrograms(unittest.TestCase):
def test_GlowTTS(): def test_GlowTTS():
# set paths # set paths
config_path = os.path.join(get_tests_input_path(), "test_glow_tts.json") config_path = os.path.join(get_tests_input_path(), "test_glow_tts.json")
checkpoint_path = os.path.join(get_tests_output_path(), "checkpoint_test.pth") checkpoint_path = os.path.join(get_tests_output_path(), "glowtts.pth")
output_path = os.path.join(get_tests_output_path(), "output_extract_tts_spectrograms/") output_path = os.path.join(get_tests_output_path(), "output_extract_tts_spectrograms/")
# load config # load config
c = load_config(config_path) c = load_config(config_path)
@ -33,7 +33,7 @@ class TestExtractTTSSpectrograms(unittest.TestCase):
def test_Tacotron2(): def test_Tacotron2():
# set paths # set paths
config_path = os.path.join(get_tests_input_path(), "test_tacotron2_config.json") config_path = os.path.join(get_tests_input_path(), "test_tacotron2_config.json")
checkpoint_path = os.path.join(get_tests_output_path(), "checkpoint_test.pth") checkpoint_path = os.path.join(get_tests_output_path(), "tacotron2.pth")
output_path = os.path.join(get_tests_output_path(), "output_extract_tts_spectrograms/") output_path = os.path.join(get_tests_output_path(), "output_extract_tts_spectrograms/")
# load config # load config
c = load_config(config_path) c = load_config(config_path)
@ -51,7 +51,7 @@ class TestExtractTTSSpectrograms(unittest.TestCase):
def test_Tacotron(): def test_Tacotron():
# set paths # set paths
config_path = os.path.join(get_tests_input_path(), "test_tacotron_config.json") config_path = os.path.join(get_tests_input_path(), "test_tacotron_config.json")
checkpoint_path = os.path.join(get_tests_output_path(), "checkpoint_test.pth") checkpoint_path = os.path.join(get_tests_output_path(), "tacotron.pth")
output_path = os.path.join(get_tests_output_path(), "output_extract_tts_spectrograms/") output_path = os.path.join(get_tests_output_path(), "output_extract_tts_spectrograms/")
# load config # load config
c = load_config(config_path) c = load_config(config_path)

View File

@ -12,20 +12,22 @@ torch.manual_seed(1)
config_path = os.path.join(get_tests_output_path(), "test_model_config.json") config_path = os.path.join(get_tests_output_path(), "test_model_config.json")
dataset_config_en = BaseDatasetConfig( dataset_config_en = BaseDatasetConfig(
name="ljspeech", formatter="ljspeech",
meta_file_train="metadata.csv", meta_file_train="metadata.csv",
meta_file_val="metadata.csv", meta_file_val="metadata.csv",
path="tests/data/ljspeech", path="tests/data/ljspeech",
language="en", language="en",
) )
"""
dataset_config_pt = BaseDatasetConfig( dataset_config_pt = BaseDatasetConfig(
name="ljspeech", formatter="ljspeech",
meta_file_train="metadata.csv", meta_file_train="metadata.csv",
meta_file_val="metadata.csv", meta_file_val="metadata.csv",
path="tests/data/ljspeech", path="tests/data/ljspeech",
language="pt-br", language="pt-br",
) )
"""
# pylint: disable=protected-access # pylint: disable=protected-access
class TestFindUniquePhonemes(unittest.TestCase): class TestFindUniquePhonemes(unittest.TestCase):
@ -46,7 +48,7 @@ class TestFindUniquePhonemes(unittest.TestCase):
epochs=1, epochs=1,
print_step=1, print_step=1,
print_eval=True, print_eval=True,
datasets=[dataset_config_en, dataset_config_pt], datasets=[dataset_config_en],
) )
config.save_json(config_path) config.save_json(config_path)
@ -70,7 +72,7 @@ class TestFindUniquePhonemes(unittest.TestCase):
epochs=1, epochs=1,
print_step=1, print_step=1,
print_eval=True, print_eval=True,
datasets=[dataset_config_en, dataset_config_pt], datasets=[dataset_config_en],
) )
config.save_json(config_path) config.save_json(config_path)

View File

@ -31,7 +31,8 @@ class TestNumpyTransforms(unittest.TestCase):
mel_fmin: int = 0 mel_fmin: int = 0
hop_length: int = 256 hop_length: int = 256
win_length: int = 1024 win_length: int = 1024
pitch_fmax: int = 450 pitch_fmax: int = 640
pitch_fmin: int = 1
trim_db: int = -1 trim_db: int = -1
min_silence_sec: float = 0.01 min_silence_sec: float = 0.01
gain: float = 1.0 gain: float = 1.0

Some files were not shown because too many files have changed in this diff.