diff --git a/.dockerignore b/.dockerignore index 4032ec6b..2833d344 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1 +1,2 @@ -.git/ \ No newline at end of file +.git/ +Dockerfile diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md deleted file mode 100644 index 133346f6..00000000 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ /dev/null @@ -1,58 +0,0 @@ ---- -name: 🐛 Bug report -about: Create a bug report to help 🐸 improve -title: '[Bug] ' -labels: bug -assignees: '' - ---- - -## 🐛 Description - - - -### To Reproduce - - - -### Expected behavior - - - -### Environment - - - -- 🐸TTS Version (e.g., 1.3.0): -- PyTorch Version (e.g., 1.8) -- Python version: -- OS (e.g., Linux): -- CUDA/cuDNN version: -- GPU models and configuration: -- How you installed PyTorch (`conda`, `pip`, source): -- Any other relevant information: - -### Additional context - - diff --git a/.github/ISSUE_TEMPLATE/bug_report.yaml b/.github/ISSUE_TEMPLATE/bug_report.yaml new file mode 100644 index 00000000..34cde7e8 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.yaml @@ -0,0 +1,85 @@ +name: "🐛 Bug report" +description: Create a bug report to help 🐸 improve +title: '[Bug] ' +labels: [ "bug" ] +body: + - type: markdown + attributes: + value: | + Welcome to the 🐸TTS! Thanks for taking the time to fill out this bug report! + + - type: textarea + id: bug-description + attributes: + label: Describe the bug + description: A clear and concise description of what the bug is. If you intend to submit a PR for this issue, tell us in the description. Thanks! + placeholder: Bug description + validations: + required: true + + - type: textarea + id: reproduction + attributes: + label: To Reproduce + description: | + Please share your code to reproduce the error. + + Issues are fixed faster if you can provide a working example. + + The best place for sharing code is colab. https://colab.research.google.com/ + So we can directly run your code and reproduce the issue. + + In the worse case, provide steps to reproduce the behavior. + + 1. Run the following command '...' + 2. ... + 3. See error + placeholder: Reproduction + validations: + required: true + + - type: textarea + id: expected-behavior + attributes: + label: Expected behavior + description: "Write down what the expected behaviour" + + - type: textarea + id: logs + attributes: + label: Logs + description: "Please include the relevant logs if you can." + render: shell + + - type: textarea + id: system-info + attributes: + label: Environment + description: | + You can either run `TTS/bin/collect_env_info.py` + + ```bash + wget https://raw.githubusercontent.com/coqui-ai/TTS/main/TTS/bin/collect_env_info.py + python collect_env_info.py + ``` + + or fill in the fields below manually. + render: shell + placeholder: | + - 🐸TTS Version (e.g., 1.3.0): + - PyTorch Version (e.g., 1.8) + - Python version: + - OS (e.g., Linux): + - CUDA/cuDNN version: + - GPU models and configuration: + - How you installed PyTorch (`conda`, `pip`, source): + - Any other relevant information: + validations: + required: true + - type: textarea + id: context + attributes: + label: Additional context + description: Add any other context about the problem here. 
+ validations: + required: false diff --git a/.github/workflows/docker.yaml b/.github/workflows/docker.yaml new file mode 100644 index 00000000..457649a2 --- /dev/null +++ b/.github/workflows/docker.yaml @@ -0,0 +1,56 @@ +name: "Docker build and push" +on: + pull_request: + push: + branches: + - main + - dev + tags: + - v* +jobs: + docker-build: + name: "Build and push Docker image" + runs-on: ubuntu-20.04 + strategy: + matrix: + arch: ["amd64"] + steps: + - uses: actions/checkout@v2 + - name: Log in to the Container registry + uses: docker/login-action@v1 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + - name: Compute Docker tags, check VERSION file matches tag + id: compute-tag + run: | + set -ex + base="ghcr.io/coqui-ai/tts" + tags="" # PR build + if [[ "${{ startsWith(github.ref, 'refs/heads/') }}" = "true" ]]; then + # Push to branch + github_ref="${{ github.ref }}" + branch=${github_ref#*refs/heads/} # strip prefix to get branch name + tags="${base}:${branch},${base}:${{ github.sha }}," + elif [[ "${{ startsWith(github.ref, 'refs/tags/') }}" = "true" ]]; then + VERSION="v$(cat TTS/VERSION)" + if [[ "${{ github.ref }}" != "refs/tags/${VERSION}" ]]; then + echo "Pushed tag does not match VERSION file. Aborting push." + exit 1 + fi + tags="${base}:${VERSION},${base}:latest,${base}:${{ github.sha }}" + fi + echo "::set-output name=tags::${tags}" + - name: Set up QEMU + uses: docker/setup-qemu-action@v1 + - name: Set up Docker Buildx + id: buildx + uses: docker/setup-buildx-action@v1 + - name: Build and push + uses: docker/build-push-action@v2 + with: + context: . + platforms: linux/${{ matrix.arch }} + push: ${{ github.event_name == 'push' }} + tags: ${{ steps.compute-tag.outputs.tags }} diff --git a/.github/workflows/text_tests.yml b/.github/workflows/text_tests.yml index e06a25ad..66197e0b 100644 --- a/.github/workflows/text_tests.yml +++ b/.github/workflows/text_tests.yml @@ -1,4 +1,4 @@ -name: tts-tests +name: text-tests on: push: diff --git a/.gitignore b/.gitignore index f8d6e644..2a3cbad4 100644 --- a/.gitignore +++ b/.gitignore @@ -115,6 +115,7 @@ venv.bak/ *.swo # pytorch models +*.pth *.pth.tar result/ diff --git a/CITATION.cff b/CITATION.cff new file mode 100644 index 00000000..6b0c8f19 --- /dev/null +++ b/CITATION.cff @@ -0,0 +1,20 @@ +cff-version: 1.2.0 +message: "If you want to cite 🐸💬, feel free to use this (but only if you loved it 😊)" +title: "Coqui TTS" +abstract: "A deep learning toolkit for Text-to-Speech, battle-tested in research and production" +date-released: 2021-01-01 +authors: + - family-names: "Eren" + given-names: "Gölge" + - name: "The Coqui TTS Team" +version: 1.4 +doi: 10.5281/zenodo.6334862 +license: "MPL-2.0" +url: "https://www.coqui.ai" +repository-code: "https://github.com/coqui-ai/TTS" +keywords: + - machine learning + - deep learning + - artificial intelligence + - text to speech + - TTS \ No newline at end of file diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 7175cf34..81a426e8 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -26,7 +26,8 @@ If you like to contribute code, squash a bug but if you don't know where to star We list all the target improvements for the next version. You can pick one of them and start contributing. - Also feel free to suggest new features, ideas and models. We're always open for new things. 
-#####Call for sharing language models + +## Call for sharing language models If possible, please consider sharing your pre-trained models in any language (if the licences allow for you to do so). We will include them in our model catalogue for public use and give the proper attribution, whether it be your name, company, website or any other source specified. This model can be shared in two ways: @@ -36,6 +37,7 @@ This model can be shared in two ways: Models are served under `.models.json` file and any model is available under TTS CLI or Server end points. Either way you choose, please make sure you send the models [here](https://github.com/coqui-ai/TTS/issues/380). + ## Sending a ✨**PR**✨ If you have a new feature, a model to implement, or a bug to squash, go ahead and send a ✨**PR**✨. diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 00000000..8dab3b30 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,11 @@ +FROM nvcr.io/nvidia/pytorch:22.03-py3 +RUN apt-get update && apt-get install -y --no-install-recommends espeak && rm -rf /var/lib/apt/lists/* +WORKDIR /root +COPY requirements.txt /root +COPY requirements.dev.txt /root +COPY requirements.notebooks.txt /root +RUN pip install -r <(cat requirements.txt requirements.dev.txt requirements.notebooks.txt) +COPY . /root +RUN make install +ENTRYPOINT ["tts"] +CMD ["--help"] diff --git a/MANIFEST.in b/MANIFEST.in index 0d8b4b4c..82ecadcb 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,6 +1,7 @@ include README.md include LICENSE.txt include requirements.*.txt +include *.cff include requirements.txt include TTS/VERSION recursive-include TTS *.json diff --git a/Makefile b/Makefile index d04cd976..69f34c79 100644 --- a/Makefile +++ b/Makefile @@ -44,6 +44,8 @@ style: ## update code style. lint: ## run pylint linter. 
pylint ${target_dirs} + black ${target_dirs} --check + isort ${target_dirs} --check-only system-deps: ## install linux system deps sudo apt-get install -y libsndfile1-dev diff --git a/README.md b/README.md index 80fa5dea..97a7cc66 100644 --- a/README.md +++ b/README.md @@ -159,13 +159,13 @@ If you are on Windows, 👑@GuyPaddock wrote installation instructions [here](ht - Run your own TTS model (Using Griffin-Lim Vocoder): ``` - $ tts --text "Text for TTS" --model_path path/to/model.pth.tar --config_path path/to/config.json --out_path output/path/speech.wav + $ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav ``` - Run your own TTS and Vocoder models: ``` - $ tts --text "Text for TTS" --model_path path/to/config.json --config_path path/to/model.pth.tar --out_path output/path/speech.wav - --vocoder_path path/to/vocoder.pth.tar --vocoder_config_path path/to/vocoder_config.json + $ tts --text "Text for TTS" --model_path path/to/config.json --config_path path/to/model.pth --out_path output/path/speech.wav + --vocoder_path path/to/vocoder.pth --vocoder_config_path path/to/vocoder_config.json ``` ### Multi-speaker Models @@ -185,7 +185,7 @@ If you are on Windows, 👑@GuyPaddock wrote installation instructions [here](ht - Run your own multi-speaker TTS model: ``` - $ tts --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/config.json --config_path path/to/model.pth.tar --speakers_file_path path/to/speaker.json --speaker_idx + $ tts --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/config.json --config_path path/to/model.pth --speakers_file_path path/to/speaker.json --speaker_idx ``` ## Directory Structure diff --git a/TTS/.models.json b/TTS/.models.json index 801b8468..4870bc1f 100644 --- a/TTS/.models.json +++ b/TTS/.models.json @@ -4,7 +4,7 @@ "multi-dataset":{ "your_tts":{ "description": "Your TTS model accompanying the paper https://arxiv.org/abs/2112.02418", - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--multilingual--multi-dataset--your_tts.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--multilingual--multi-dataset--your_tts.zip", "default_vocoder": null, "commit": "e9a1953e", "license": "CC BY-NC-ND 4.0", @@ -16,33 +16,34 @@ "ek1": { "tacotron2": { "description": "EK1 en-rp tacotron2 by NMStoker", - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.1.0/tts_models--en--ek1--tacotron2.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ek1--tacotron2.zip", "default_vocoder": "vocoder_models/en/ek1/wavegrad", - "commit": "c802255" + "commit": "c802255", + "license": "apache 2.0" } }, "ljspeech": { "tacotron2-DDC": { "description": "Tacotron2 with Double Decoder Consistency.", - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.12/tts_models--en--ljspeech--tacotron2-DDC.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--tacotron2-DDC.zip", "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2", "commit": "bae2ad0f", "author": "Eren Gölge @erogol", - "license": "", + "license": "apache 2.0", "contact": "egolge@coqui.com" }, "tacotron2-DDC_ph": { "description": "Tacotron2 with Double Decoder Consistency with phonemes.", - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--en--ljspeech--tacotron2-DDC_ph.zip", + "github_rls_url": 
"https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--tacotron2-DDC_ph.zip", "default_vocoder": "vocoder_models/en/ljspeech/univnet", "commit": "3900448", "author": "Eren Gölge @erogol", - "license": "", + "license": "apache 2.0", "contact": "egolge@coqui.com" }, "glow-tts": { "description": "", - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.9/tts_models--en--ljspeech--glow-tts.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--glow-tts.zip", "stats_file": null, "default_vocoder": "vocoder_models/en/ljspeech/multiband-melgan", "commit": "", @@ -52,17 +53,17 @@ }, "speedy-speech": { "description": "Speedy Speech model trained on LJSpeech dataset using the Alignment Network for learning the durations.", - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.3.0/tts_models--en--ljspeech--speedy_speech.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--speedy-speech.zip", "stats_file": null, "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2", "commit": "4581e3d", "author": "Eren Gölge @erogol", - "license": "TBD", + "license": "apache 2.0", "contact": "egolge@coqui.com" }, "tacotron2-DCA": { "description": "", - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.9/tts_models--en--ljspeech--tacotron2-DCA.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--tacotron2-DCA.zip", "default_vocoder": "vocoder_models/en/ljspeech/multiband-melgan", "commit": "", "author": "Eren Gölge @erogol", @@ -71,36 +72,36 @@ }, "vits": { "description": "VITS is an End2End TTS model trained on LJSpeech dataset with phonemes.", - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--en--ljspeech--vits.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--vits.zip", "default_vocoder": null, "commit": "3900448", "author": "Eren Gölge @erogol", - "license": "TBD", + "license": "apache 2.0", "contact": "egolge@coqui.com" }, "fast_pitch": { "description": "FastPitch model trained on LJSpeech using the Aligner Network", - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.2.2/tts_models--en--ljspeech--fast_pitch.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--fast_pitch.zip", "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2", "commit": "b27b3ba", "author": "Eren Gölge @erogol", - "license": "TBD", + "license": "apache 2.0", "contact": "egolge@coqui.com" } }, "vctk": { "vits": { "description": "VITS End2End TTS model trained on VCTK dataset with 109 different speakers with EN accent.", - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--en--vctk--vits.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--vctk--vits.zip", "default_vocoder": null, "commit": "3900448", "author": "Eren @erogol", - "license": "", + "license": "apache 2.0", "contact": "egolge@coqui.ai" }, "fast_pitch":{ "description": "FastPitch model trained on VCTK dataseset.", - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--en--vctk--fast_pitch.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--vctk--fast_pitch.zip", "default_vocoder": null, "commit": "bdab788d", "author": "Eren @erogol", @@ -111,11 +112,11 @@ "sam": { "tacotron-DDC": { "description": "Tacotron2 with Double Decoder Consistency trained with Aceenture's Sam dataset.", - 
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.13/tts_models--en--sam--tacotron_DDC.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--sam--tacotron-DDC.zip", "default_vocoder": "vocoder_models/en/sam/hifigan_v2", "commit": "bae2ad0f", "author": "Eren Gölge @erogol", - "license": "", + "license": "apache 2.0", "contact": "egolge@coqui.com" } } @@ -123,7 +124,7 @@ "es": { "mai": { "tacotron2-DDC": { - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.9/tts_models--es--mai--tacotron2-DDC.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--es--mai--tacotron2-DDC.zip", "default_vocoder": "vocoder_models/universal/libri-tts/fullband-melgan", "commit": "", "author": "Eren Gölge @erogol", @@ -135,7 +136,7 @@ "fr": { "mai": { "tacotron2-DDC": { - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.9/tts_models--fr--mai--tacotron2-DDC.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--fr--mai--tacotron2-DDC.zip", "default_vocoder": "vocoder_models/universal/libri-tts/fullband-melgan", "commit": "", "author": "Eren Gölge @erogol", @@ -147,7 +148,7 @@ "uk":{ "mai": { "glow-tts": { - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--uk--mai--glow-tts.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--uk--mai--glow-tts.zip", "author":"@robinhad", "commit": "bdab788d", "license": "MIT", @@ -159,9 +160,10 @@ "zh-CN": { "baker": { "tacotron2-DDC-GST": { - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--zh-CN--baker--tacotron2-DDC-GST.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--zh-CN--baker--tacotron2-DDC-GST.zip", "commit": "unknown", "author": "@kirianguiller", + "license": "apache 2.0", "default_vocoder": null } } @@ -169,8 +171,9 @@ "nl": { "mai": { "tacotron2-DDC": { - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.10/tts_models--nl--mai--tacotron2-DDC.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--nl--mai--tacotron2-DDC.zip", "author": "@r-dh", + "license": "apache 2.0", "default_vocoder": "vocoder_models/nl/mai/parallel-wavegan", "stats_file": null, "commit": "540d811" @@ -180,9 +183,10 @@ "de": { "thorsten": { "tacotron2-DCA": { - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.11/tts_models--de--thorsten--tacotron2-DCA.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--de--thorsten--tacotron2-DCA.zip", "default_vocoder": "vocoder_models/de/thorsten/fullband-melgan", "author": "@thorstenMueller", + "license": "apache 2.0", "commit": "unknown" } } @@ -190,10 +194,11 @@ "ja": { "kokoro": { "tacotron2-DDC": { - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.15/tts_models--jp--kokoro--tacotron2-DDC.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--ja--kokoro--tacotron2-DDC.zip", "default_vocoder": "vocoder_models/ja/kokoro/hifigan_v1", "description": "Tacotron2 with Double Decoder Consistency trained with Kokoro Speech Dataset.", "author": "@kaiidams", + "license": "apache 2.0", "commit": "401fbd89" } } @@ -201,7 +206,7 @@ "tr":{ "common-voice": { "glow-tts":{ - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--tr--common-voice--glow-tts.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--tr--common-voice--glow-tts.zip", "default_vocoder": "vocoder_models/tr/common-voice/hifigan", 
"license": "MIT", "description": "Turkish GlowTTS model using an unknown speaker from the Common-Voice dataset.", @@ -213,50 +218,126 @@ "it": { "mai_female": { "glow-tts":{ - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--it--mai_female--glow-tts.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_female--glow-tts.zip", "default_vocoder": null, "description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.", "author": "@nicolalandro", + "license": "apache 2.0", "commit": null }, "vits":{ - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--it--mai_female--vits.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_female--vits.zip", "default_vocoder": null, "description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.", "author": "@nicolalandro", + "license": "apache 2.0", "commit": null } }, "mai_male": { "glow-tts":{ - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--it--mai_male--glow-tts.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_male--glow-tts.zip", "default_vocoder": null, "description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.", "author": "@nicolalandro", + "license": "apache 2.0", "commit": null }, "vits":{ - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--it--mai_male--vits.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_male--vits.zip", "default_vocoder": null, "description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.", "author": "@nicolalandro", + "license": "apache 2.0", "commit": null } } + }, + "ewe": { + "openbible": { + "vits":{ + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--ewe--openbible--vits.zip", + "default_vocoder": null, + "license": "CC-BY-SA 4.0", + "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.", + "author": "@coqui_ai", + "commit": "1b22f03" + } + } + }, + "hau": { + "openbible": { + "vits":{ + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--hau--openbible--vits.zip", + "default_vocoder": null, + "license": "CC-BY-SA 4.0", + "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.", + "author": "@coqui_ai", + "commit": "1b22f03" + } + } + }, + "lin": { + "openbible": { + "vits":{ + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--lin--openbible--vits.zip", + "default_vocoder": null, + "license": "CC-BY-SA 4.0", + "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.", + "author": "@coqui_ai", + "commit": "1b22f03" + } + } + }, + "tw_akuapem": { + "openbible": { + "vits":{ + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--tw_akuapem--openbible--vits.zip", + "default_vocoder": null, + "license": "CC-BY-SA 4.0", + "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.", + "author": "@coqui_ai", + "commit": "1b22f03" + } + } + }, + "tw_asante": { + "openbible": { + "vits":{ + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--tw_asante--openbible--vits.zip", + "default_vocoder": null, + "license": 
"CC-BY-SA 4.0", + "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.", + "author": "@coqui_ai", + "commit": "1b22f03" + } + } + }, + "yor": { + "openbible": { + "vits":{ + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--yor--openbible--vits.zip", + "default_vocoder": null, + "license": "CC-BY-SA 4.0", + "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.", + "author": "@coqui_ai", + "commit": "1b22f03" + } + } } }, "vocoder_models": { "universal": { "libri-tts": { "wavegrad": { - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.9/vocoder_models--universal--libri-tts--wavegrad.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--universal--libri-tts--wavegrad.zip", "commit": "ea976b0", "author": "Eren Gölge @erogol", "license": "MPL", "contact": "egolge@coqui.com" }, "fullband-melgan": { - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.9/vocoder_models--universal--libri-tts--fullband-melgan.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--universal--libri-tts--fullband-melgan.zip", "commit": "4132240", "author": "Eren Gölge @erogol", "license": "MPL", @@ -268,13 +349,14 @@ "ek1": { "wavegrad": { "description": "EK1 en-rp wavegrad by NMStoker", - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.10/vocoder_models--en--ek1--wavegrad.zip", - "commit": "c802255" + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ek1--wavegrad.zip", + "commit": "c802255", + "license": "apache 2.0" } }, "ljspeech": { "multiband-melgan": { - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.9/vocoder_models--en--ljspeech--mulitband-melgan.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ljspeech--multiband-melgan.zip", "commit": "ea976b0", "author": "Eren Gölge @erogol", "license": "MPL", @@ -282,38 +364,38 @@ }, "hifigan_v2": { "description": "HiFiGAN_v2 LJSpeech vocoder from https://arxiv.org/abs/2010.05646.", - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.12/vocoder_model--en--ljspeech-hifigan_v2.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ljspeech--hifigan_v2.zip", "commit": "bae2ad0f", "author": "@erogol", - "license": "", + "license": "apache 2.0", "contact": "egolge@coqui.ai" }, "univnet": { "description": "UnivNet model finetuned on TacotronDDC_ph spectrograms for better compatibility.", - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.3.0/vocoder_models--en--ljspeech--univnet_v2.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ljspeech--univnet_v2.zip", "commit": "4581e3d", "author": "Eren @erogol", - "license": "TBD", + "license": "apache 2.0", "contact": "egolge@coqui.ai" } }, "vctk": { "hifigan_v2": { "description": "Finetuned and intended to be used with tts_models/en/vctk/sc-glow-tts", - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.12/vocoder_model--en--vctk--hifigan_v2.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--vctk--hifigan_v2.zip", "commit": "2f07160", "author": "Edresson Casanova", - "license": "", + "license": "apache 2.0", "contact": "" } }, "sam": { "hifigan_v2": { "description": "Finetuned and intended to be used with tts_models/en/sam/tacotron_DDC", - "github_rls_url": 
"https://coqui.gateway.scarf.sh/v0.0.13/vocoder_models--en--sam--hifigan_v2.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--sam--hifigan_v2.zip", "commit": "2f07160", "author": "Eren Gölge @erogol", - "license": "", + "license": "apache 2.0", "contact": "egolge@coqui.ai" } } @@ -321,8 +403,9 @@ "nl": { "mai": { "parallel-wavegan": { - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.10/vocoder_models--nl--mai--parallel-wavegan.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--nl--mai--parallel-wavegan.zip", "author": "@r-dh", + "license": "apache 2.0", "commit": "unknown" } } @@ -330,13 +413,15 @@ "de": { "thorsten": { "wavegrad": { - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.11/vocoder_models--de--thorsten--wavegrad.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--de--thorsten--wavegrad.zip", "author": "@thorstenMueller", + "license": "apache 2.0", "commit": "unknown" }, "fullband-melgan": { - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.1.3/vocoder_models--de--thorsten--fullband-melgan.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--de--thorsten--fullband-melgan.zip", "author": "@thorstenMueller", + "license": "apache 2.0", "commit": "unknown" } } @@ -344,9 +429,10 @@ "ja": { "kokoro": { "hifigan_v1": { - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.2.0/vocoder_models--ja--kokoro--hifigan_v1.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--ja--kokoro--hifigan_v1.zip", "description": "HifiGAN model trained for kokoro dataset by @kaiidams", "author": "@kaiidams", + "license": "apache 2.0", "commit": "3900448" } } @@ -354,7 +440,7 @@ "uk": { "mai": { "multiband-melgan": { - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.5.0_models/vocoder_models--uk--mai--multiband-melgan.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--uk--mai--multiband-melgan.zip", "author":"@robinhad", "commit": "bdab788d", "license": "MIT", @@ -365,7 +451,7 @@ "tr":{ "common-voice": { "hifigan":{ - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/vocoder_models--tr--common-voice--hifigan.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--tr--common-voice--hifigan.zip", "description": "HifiGAN model using an unknown speaker from the Common-Voice dataset.", "author": "Fatih Akademi", "license": "MIT", diff --git a/TTS/VERSION b/TTS/VERSION index 7ceb0404..b1d7abc0 100644 --- a/TTS/VERSION +++ b/TTS/VERSION @@ -1 +1 @@ -0.6.1 \ No newline at end of file +0.6.2 \ No newline at end of file diff --git a/TTS/bin/compute_attention_masks.py b/TTS/bin/compute_attention_masks.py index e58259a6..9ab520be 100644 --- a/TTS/bin/compute_attention_masks.py +++ b/TTS/bin/compute_attention_masks.py @@ -25,7 +25,7 @@ These masks can be used for different purposes including training a TTS model wi """ Example run: CUDA_VISIBLE_DEVICE="0" python TTS/bin/compute_attention_masks.py - --model_path /data/rw/home/Models/ljspeech-dcattn-December-14-2020_11+10AM-9d0e8c7/checkpoint_200000.pth.tar + --model_path /data/rw/home/Models/ljspeech-dcattn-December-14-2020_11+10AM-9d0e8c7/checkpoint_200000.pth --config_path /data/rw/home/Models/ljspeech-dcattn-December-14-2020_11+10AM-9d0e8c7/config.json --dataset_metafile metadata.csv --data_path /root/LJSpeech-1.1/ diff --git a/TTS/bin/compute_embeddings.py 
b/TTS/bin/compute_embeddings.py index 50817154..b62d603a 100644 --- a/TTS/bin/compute_embeddings.py +++ b/TTS/bin/compute_embeddings.py @@ -12,7 +12,7 @@ parser = argparse.ArgumentParser( description="""Compute embedding vectors for each wav file in a dataset.\n\n""" """ Example runs: - python TTS/bin/compute_embeddings.py speaker_encoder_model.pth.tar speaker_encoder_config.json dataset_config.json embeddings_output_path/ + python TTS/bin/compute_embeddings.py speaker_encoder_model.pth speaker_encoder_config.json dataset_config.json embeddings_output_path/ """, formatter_class=RawTextHelpFormatter, ) @@ -42,33 +42,35 @@ c_dataset = load_config(args.config_dataset_path) meta_data_train, meta_data_eval = load_tts_samples(c_dataset.datasets, eval_split=args.eval) wav_files = meta_data_train + meta_data_eval -speaker_manager = SpeakerManager( +encoder_manager = SpeakerManager( encoder_model_path=args.model_path, encoder_config_path=args.config_path, d_vectors_file_path=args.old_file, use_cuda=args.use_cuda, ) +class_name_key = encoder_manager.encoder_config.class_name_key + # compute speaker embeddings speaker_mapping = {} for idx, wav_file in enumerate(tqdm(wav_files)): - if isinstance(wav_file, list): - speaker_name = wav_file[2] - wav_file = wav_file[1] + if isinstance(wav_file, dict): + class_name = wav_file[class_name_key] + wav_file = wav_file["audio_file"] else: - speaker_name = None + class_name = None wav_file_name = os.path.basename(wav_file) - if args.old_file is not None and wav_file_name in speaker_manager.clip_ids: + if args.old_file is not None and wav_file_name in encoder_manager.clip_ids: # get the embedding from the old file - embedd = speaker_manager.get_d_vector_by_clip(wav_file_name) + embedd = encoder_manager.get_embedding_by_clip(wav_file_name) else: # extract the embedding - embedd = speaker_manager.compute_d_vector_from_clip(wav_file) + embedd = encoder_manager.compute_embedding_from_clip(wav_file) # create speaker_mapping if target dataset is defined speaker_mapping[wav_file_name] = {} - speaker_mapping[wav_file_name]["name"] = speaker_name + speaker_mapping[wav_file_name]["name"] = class_name speaker_mapping[wav_file_name]["embedding"] = embedd if speaker_mapping: @@ -81,5 +83,5 @@ if speaker_mapping: os.makedirs(os.path.dirname(mapping_file_path), exist_ok=True) # pylint: disable=W0212 - speaker_manager._save_json(mapping_file_path, speaker_mapping) + encoder_manager._save_json(mapping_file_path, speaker_mapping) print("Speaker embeddings saved at:", mapping_file_path) diff --git a/TTS/bin/distribute.py b/TTS/bin/distribute.py deleted file mode 100644 index 97e2f0e3..00000000 --- a/TTS/bin/distribute.py +++ /dev/null @@ -1,55 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -import os -import pathlib -import subprocess -import time - -import torch -from trainer import TrainerArgs - - -def main(): - """ - Call train.py as a new process and pass command arguments - """ - parser = TrainerArgs().init_argparse(arg_prefix="") - parser.add_argument("--script", type=str, help="Target training script to distibute.") - args, unargs = parser.parse_known_args() - - num_gpus = torch.cuda.device_count() - group_id = time.strftime("%Y_%m_%d-%H%M%S") - - # set arguments for train.py - folder_path = pathlib.Path(__file__).parent.absolute() - if os.path.exists(os.path.join(folder_path, args.script)): - command = [os.path.join(folder_path, args.script)] - else: - command = [args.script] - command.append("--continue_path={}".format(args.continue_path)) - 
command.append("--restore_path={}".format(args.restore_path)) - command.append("--config_path={}".format(args.config_path)) - command.append("--group_id=group_{}".format(group_id)) - command.append("--use_ddp=true") - command += unargs - command.append("") - - # run processes - processes = [] - for i in range(num_gpus): - my_env = os.environ.copy() - my_env["PYTHON_EGG_CACHE"] = "/tmp/tmp{}".format(i) - command[-1] = "--rank={}".format(i) - # prevent stdout for processes with rank != 0 - stdout = None - p = subprocess.Popen(["python3"] + command, stdout=stdout, env=my_env) # pylint: disable=consider-using-with - processes.append(p) - print(command) - - for p in processes: - p.wait() - - -if __name__ == "__main__": - main() diff --git a/TTS/bin/eval_encoder.py b/TTS/bin/eval_encoder.py new file mode 100644 index 00000000..7f9fdf93 --- /dev/null +++ b/TTS/bin/eval_encoder.py @@ -0,0 +1,89 @@ +import argparse +from argparse import RawTextHelpFormatter + +import torch +from tqdm import tqdm + +from TTS.config import load_config +from TTS.tts.datasets import load_tts_samples +from TTS.tts.utils.speakers import SpeakerManager + + +def compute_encoder_accuracy(dataset_items, encoder_manager): + + class_name_key = encoder_manager.encoder_config.class_name_key + map_classid_to_classname = getattr(encoder_manager.encoder_config, "map_classid_to_classname", None) + + class_acc_dict = {} + + # compute embeddings for all wav_files + for item in tqdm(dataset_items): + class_name = item[class_name_key] + wav_file = item["audio_file"] + + # extract the embedding + embedd = encoder_manager.compute_embedding_from_clip(wav_file) + if encoder_manager.encoder_criterion is not None and map_classid_to_classname is not None: + embedding = torch.FloatTensor(embedd).unsqueeze(0) + if encoder_manager.use_cuda: + embedding = embedding.cuda() + + class_id = encoder_manager.encoder_criterion.softmax.inference(embedding).item() + predicted_label = map_classid_to_classname[str(class_id)] + else: + predicted_label = None + + if class_name is not None and predicted_label is not None: + is_equal = int(class_name == predicted_label) + if class_name not in class_acc_dict: + class_acc_dict[class_name] = [is_equal] + else: + class_acc_dict[class_name].append(is_equal) + else: + raise RuntimeError("Error: class_name or/and predicted_label are None") + + acc_avg = 0 + for key, values in class_acc_dict.items(): + acc = sum(values) / len(values) + print("Class", key, "Accuracy:", acc) + acc_avg += acc + + print("Average Accuracy:", acc_avg / len(class_acc_dict)) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="""Compute the accuracy of the encoder.\n\n""" + """ + Example runs: + python TTS/bin/eval_encoder.py emotion_encoder_model.pth emotion_encoder_config.json dataset_config.json + """, + formatter_class=RawTextHelpFormatter, + ) + parser.add_argument("model_path", type=str, help="Path to model checkpoint file.") + parser.add_argument( + "config_path", + type=str, + help="Path to model config file.", + ) + + parser.add_argument( + "config_dataset_path", + type=str, + help="Path to dataset config file.", + ) + parser.add_argument("--use_cuda", type=bool, help="flag to set cuda.", default=True) + parser.add_argument("--eval", type=bool, help="compute eval.", default=True) + + args = parser.parse_args() + + c_dataset = load_config(args.config_dataset_path) + + meta_data_train, meta_data_eval = load_tts_samples(c_dataset.datasets, eval_split=args.eval) + items = meta_data_train + meta_data_eval + + 
enc_manager = SpeakerManager( + encoder_model_path=args.model_path, encoder_config_path=args.config_path, use_cuda=args.use_cuda + ) + + compute_encoder_accuracy(items, enc_manager) diff --git a/TTS/bin/extract_tts_spectrograms.py b/TTS/bin/extract_tts_spectrograms.py index fa63c46a..a0dd0549 100755 --- a/TTS/bin/extract_tts_spectrograms.py +++ b/TTS/bin/extract_tts_spectrograms.py @@ -37,8 +37,8 @@ def setup_loader(ap, r, verbose=False): precompute_num_workers=0, use_noise_augment=False, verbose=verbose, - speaker_id_mapping=speaker_manager.speaker_ids if c.use_speaker_embedding else None, - d_vector_mapping=speaker_manager.d_vectors if c.use_d_vector_file else None, + speaker_id_mapping=speaker_manager.ids if c.use_speaker_embedding else None, + d_vector_mapping=speaker_manager.embeddings if c.use_d_vector_file else None, ) if c.use_phonemes and c.compute_input_seq_cache: diff --git a/TTS/bin/find_unique_chars.py b/TTS/bin/find_unique_chars.py index 4689dcad..ea169748 100644 --- a/TTS/bin/find_unique_chars.py +++ b/TTS/bin/find_unique_chars.py @@ -29,7 +29,7 @@ def main(): items = train_items + eval_items - texts = "".join(item[0] for item in items) + texts = "".join(item["text"] for item in items) chars = set(texts) lower_chars = filter(lambda c: c.islower(), chars) chars_force_lower = [c.lower() for c in chars] diff --git a/TTS/bin/remove_silence_using_vad.py b/TTS/bin/remove_silence_using_vad.py index 9070f2da..7d88ae91 100755 --- a/TTS/bin/remove_silence_using_vad.py +++ b/TTS/bin/remove_silence_using_vad.py @@ -1,51 +1,31 @@ import argparse import glob -import multiprocessing import os import pathlib -from tqdm.contrib.concurrent import process_map +from tqdm import tqdm -from TTS.utils.vad import get_vad_speech_segments, read_wave, write_wave +from TTS.utils.vad import get_vad_model_and_utils, remove_silence -def remove_silence(filepath): - output_path = filepath.replace(os.path.join(args.input_dir, ""), os.path.join(args.output_dir, "")) +def adjust_path_and_remove_silence(audio_path): + output_path = audio_path.replace(os.path.join(args.input_dir, ""), os.path.join(args.output_dir, "")) # ignore if the file exists if os.path.exists(output_path) and not args.force: - return + return output_path # create all directory structure pathlib.Path(output_path).parent.mkdir(parents=True, exist_ok=True) - # load wave - audio, sample_rate = read_wave(filepath) + # remove the silence and save the audio + output_path = remove_silence( + model_and_utils, + audio_path, + output_path, + trim_just_beginning_and_end=args.trim_just_beginning_and_end, + use_cuda=args.use_cuda, + ) - # get speech segments - segments = get_vad_speech_segments(audio, sample_rate, aggressiveness=args.aggressiveness) - - segments = list(segments) - num_segments = len(segments) - flag = False - # create the output wave - if num_segments != 0: - for i, segment in reversed(list(enumerate(segments))): - if i >= 1: - if not flag: - concat_segment = segment - flag = True - else: - concat_segment = segment + concat_segment - else: - if flag: - segment = segment + concat_segment - # print("Saving: ", output_path) - write_wave(output_path, segment, sample_rate) - return - else: - print("> Just Copying the file to:", output_path) - # if fail to remove silence just write the file - write_wave(output_path, audio, sample_rate) - return + return output_path def preprocess_audios(): @@ -54,17 +34,24 @@ def preprocess_audios(): if not args.force: print("> Ignoring files that already exist in the output directory.") + if 
args.trim_just_beginning_and_end: + print("> Trimming just the beginning and the end with nonspeech parts.") + else: + print("> Trimming all nonspeech parts.") + if files: # create threads - num_threads = multiprocessing.cpu_count() - process_map(remove_silence, files, max_workers=num_threads, chunksize=15) + # num_threads = multiprocessing.cpu_count() + # process_map(adjust_path_and_remove_silence, files, max_workers=num_threads, chunksize=15) + for f in tqdm(files): + adjust_path_and_remove_silence(f) else: print("> No files Found !") if __name__ == "__main__": parser = argparse.ArgumentParser( - description="python remove_silence.py -i=VCTK-Corpus-bk/ -o=../VCTK-Corpus-removed-silence -g=wav48/*/*.wav -a=2" + description="python TTS/bin/remove_silence_using_vad.py -i=VCTK-Corpus/ -o=VCTK-Corpus-removed-silence/ -g=wav48_silence_trimmed/*/*_mic1.flac --trim_just_beginning_and_end True" ) parser.add_argument("-i", "--input_dir", type=str, default="../VCTK-Corpus", help="Dataset root dir") parser.add_argument( @@ -79,11 +66,20 @@ if __name__ == "__main__": help="path in glob format for acess wavs from input_dir. ex: wav48/*/*.wav", ) parser.add_argument( - "-a", - "--aggressiveness", - type=int, - default=2, - help="set its aggressiveness mode, which is an integer between 0 and 3. 0 is the least aggressive about filtering out non-speech, 3 is the most aggressive.", + "-t", + "--trim_just_beginning_and_end", + type=bool, + default=True, + help="If True this script will trim just the beginning and end nonspeech parts. If False all nonspeech parts will be trim. Default True", + ) + parser.add_argument( + "-c", + "--use_cuda", + type=bool, + default=False, + help="If True use cuda", ) args = parser.parse_args() + # load the model and utils + model_and_utils = get_vad_model_and_utils(use_cuda=args.use_cuda) preprocess_audios() diff --git a/TTS/bin/synthesize.py b/TTS/bin/synthesize.py index 509b3da6..6247b2a4 100755 --- a/TTS/bin/synthesize.py +++ b/TTS/bin/synthesize.py @@ -60,13 +60,13 @@ If you don't specify any models, then it uses LJSpeech based English model. - Run your own TTS model (Using Griffin-Lim Vocoder): ``` - $ tts --text "Text for TTS" --model_path path/to/model.pth.tar --config_path path/to/config.json --out_path output/path/speech.wav + $ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav ``` - Run your own TTS and Vocoder models: ``` - $ tts --text "Text for TTS" --model_path path/to/config.json --config_path path/to/model.pth.tar --out_path output/path/speech.wav - --vocoder_path path/to/vocoder.pth.tar --vocoder_config_path path/to/vocoder_config.json + $ tts --text "Text for TTS" --model_path path/to/config.json --config_path path/to/model.pth --out_path output/path/speech.wav + --vocoder_path path/to/vocoder.pth --vocoder_config_path path/to/vocoder_config.json ``` ### Multi-speaker Models @@ -86,7 +86,7 @@ If you don't specify any models, then it uses LJSpeech based English model. 
- Run your own multi-speaker TTS model: ``` - $ tts --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/config.json --config_path path/to/model.pth.tar --speakers_file_path path/to/speaker.json --speaker_idx + $ tts --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/config.json --config_path path/to/model.pth --speakers_file_path path/to/speaker.json --speaker_idx ``` """ # We remove Markdown code formatting programmatically here to allow us to copy-and-paste from main README to keep @@ -195,11 +195,28 @@ If you don't specify any models, then it uses LJSpeech based English model. help="If true save raw spectogram for further (vocoder) processing in out_path.", default=False, ) - + parser.add_argument( + "--reference_wav", + type=str, + help="Reference wav file to convert in the voice of the speaker_idx or speaker_wav", + default=None, + ) + parser.add_argument( + "--reference_speaker_idx", + type=str, + help="speaker ID of the reference_wav speaker (If not provided the embedding will be computed using the Speaker Encoder).", + default=None, + ) args = parser.parse_args() # print the description if either text or list_models is not set - if args.text is None and not args.list_models and not args.list_speaker_idxs and not args.list_language_idxs: + if ( + not args.text + and not args.list_models + and not args.list_speaker_idxs + and not args.list_language_idxs + and not args.reference_wav + ): parser.parse_args(["-h"]) # load model manager @@ -261,7 +278,7 @@ If you don't specify any models, then it uses LJSpeech based English model. print( " > Available speaker ids: (Set --speaker_idx flag to one of these values to use the multi-speaker model." ) - print(synthesizer.tts_model.speaker_manager.speaker_ids) + print(synthesizer.tts_model.speaker_manager.ids) return # query langauge ids of a multi-lingual model. @@ -269,7 +286,7 @@ If you don't specify any models, then it uses LJSpeech based English model. print( " > Available language ids: (Set --language_idx flag to one of these values to use the multi-lingual model." ) - print(synthesizer.tts_model.language_manager.language_id_mapping) + print(synthesizer.tts_model.language_manager.ids) return # check the arguments against a multi-speaker model. @@ -281,10 +298,18 @@ If you don't specify any models, then it uses LJSpeech based English model. 
return # RUN THE SYNTHESIS - print(" > Text: {}".format(args.text)) + if args.text: + print(" > Text: {}".format(args.text)) # kick it - wav = synthesizer.tts(args.text, args.speaker_idx, args.language_idx, args.speaker_wav) + wav = synthesizer.tts( + args.text, + args.speaker_idx, + args.language_idx, + args.speaker_wav, + reference_wav=args.reference_wav, + reference_speaker_name=args.reference_speaker_idx, + ) # save the results print(" > Saving output to {}".format(args.out_path)) diff --git a/TTS/bin/train_encoder.py b/TTS/bin/train_encoder.py index 5828411c..d28f188e 100644 --- a/TTS/bin/train_encoder.py +++ b/TTS/bin/train_encoder.py @@ -9,17 +9,17 @@ import traceback import torch from torch.utils.data import DataLoader from trainer.torch import NoamLR +from trainer.trainer_utils import get_optimizer -from TTS.speaker_encoder.dataset import SpeakerEncoderDataset -from TTS.speaker_encoder.losses import AngleProtoLoss, GE2ELoss, SoftmaxAngleProtoLoss -from TTS.speaker_encoder.utils.generic_utils import save_best_model, setup_speaker_encoder_model -from TTS.speaker_encoder.utils.training import init_training -from TTS.speaker_encoder.utils.visual import plot_embeddings +from TTS.encoder.dataset import EncoderDataset +from TTS.encoder.utils.generic_utils import save_best_model, save_checkpoint, setup_encoder_model +from TTS.encoder.utils.samplers import PerfectBatchSampler +from TTS.encoder.utils.training import init_training +from TTS.encoder.utils.visual import plot_embeddings from TTS.tts.datasets import load_tts_samples from TTS.utils.audio import AudioProcessor -from TTS.utils.generic_utils import count_parameters, remove_experiment_folder, set_init_dict -from TTS.utils.io import load_fsspec -from TTS.utils.radam import RAdam +from TTS.utils.generic_utils import count_parameters, remove_experiment_folder +from TTS.utils.io import copy_model_files from TTS.utils.training import check_update torch.backends.cudnn.enabled = True @@ -32,163 +32,257 @@ print(" > Number of GPUs: ", num_gpus) def setup_loader(ap: AudioProcessor, is_val: bool = False, verbose: bool = False): + num_utter_per_class = c.num_utter_per_class if not is_val else c.eval_num_utter_per_class + num_classes_in_batch = c.num_classes_in_batch if not is_val else c.eval_num_classes_in_batch + + dataset = EncoderDataset( + c, + ap, + meta_data_eval if is_val else meta_data_train, + voice_len=c.voice_len, + num_utter_per_class=num_utter_per_class, + num_classes_in_batch=num_classes_in_batch, + verbose=verbose, + augmentation_config=c.audio_augmentation if not is_val else None, + use_torch_spec=c.model_params.get("use_torch_spec", False), + ) + # get classes list + classes = dataset.get_class_list() + + sampler = PerfectBatchSampler( + dataset.items, + classes, + batch_size=num_classes_in_batch * num_utter_per_class, # total batch size + num_classes_in_batch=num_classes_in_batch, + num_gpus=1, + shuffle=not is_val, + drop_last=True, + ) + + if len(classes) < num_classes_in_batch: + if is_val: + raise RuntimeError( + f"config.eval_num_classes_in_batch ({num_classes_in_batch}) need to be <= {len(classes)} (Number total of Classes in the Eval dataset) !" + ) + raise RuntimeError( + f"config.num_classes_in_batch ({num_classes_in_batch}) need to be <= {len(classes)} (Number total of Classes in the Train dataset) !" 
+ ) + + # set the classes to avoid get wrong class_id when the number of training and eval classes are not equal if is_val: - loader = None - else: - dataset = SpeakerEncoderDataset( - ap, - meta_data_eval if is_val else meta_data_train, - voice_len=c.voice_len, - num_utter_per_speaker=c.num_utters_per_speaker, - num_speakers_in_batch=c.num_speakers_in_batch, - skip_speakers=c.skip_speakers, - storage_size=c.storage["storage_size"], - sample_from_storage_p=c.storage["sample_from_storage_p"], - verbose=verbose, - augmentation_config=c.audio_augmentation, - ) + dataset.set_classes(train_classes) - # sampler = DistributedSampler(dataset) if num_gpus > 1 else None - loader = DataLoader( - dataset, - batch_size=c.num_speakers_in_batch, - shuffle=False, - num_workers=c.num_loader_workers, - collate_fn=dataset.collate_fn, - ) - return loader, dataset.get_num_speakers() + loader = DataLoader( + dataset, + num_workers=c.num_loader_workers, + batch_sampler=sampler, + collate_fn=dataset.collate_fn, + ) + + return loader, classes, dataset.get_map_classid_to_classname() -def train(model, optimizer, scheduler, criterion, data_loader, global_step): +def evaluation(model, criterion, data_loader, global_step): + eval_loss = 0 + for _, data in enumerate(data_loader): + with torch.no_grad(): + # setup input data + inputs, labels = data + + # agroup samples of each class in the batch. perfect sampler produces [3,2,1,3,2,1] we need [3,3,2,2,1,1] + labels = torch.transpose( + labels.view(c.eval_num_utter_per_class, c.eval_num_classes_in_batch), 0, 1 + ).reshape(labels.shape) + inputs = torch.transpose( + inputs.view(c.eval_num_utter_per_class, c.eval_num_classes_in_batch, -1), 0, 1 + ).reshape(inputs.shape) + + # dispatch data to GPU + if use_cuda: + inputs = inputs.cuda(non_blocking=True) + labels = labels.cuda(non_blocking=True) + + # forward pass model + outputs = model(inputs) + + # loss computation + loss = criterion( + outputs.view(c.eval_num_classes_in_batch, outputs.shape[0] // c.eval_num_classes_in_batch, -1), labels + ) + + eval_loss += loss.item() + + eval_avg_loss = eval_loss / len(data_loader) + # save stats + dashboard_logger.eval_stats(global_step, {"loss": eval_avg_loss}) + # plot the last batch in the evaluation + figures = { + "UMAP Plot": plot_embeddings(outputs.detach().cpu().numpy(), c.num_classes_in_batch), + } + dashboard_logger.eval_figures(global_step, figures) + return eval_avg_loss + + +def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader, global_step): model.train() - epoch_time = 0 best_loss = float("inf") - avg_loss = 0 - avg_loss_all = 0 avg_loader_time = 0 end_time = time.time() + for epoch in range(c.epochs): + tot_loss = 0 + epoch_time = 0 + for _, data in enumerate(data_loader): + start_time = time.time() - for _, data in enumerate(data_loader): - start_time = time.time() + # setup input data + inputs, labels = data + # agroup samples of each class in the batch. 
perfect sampler produces [3,2,1,3,2,1] we need [3,3,2,2,1,1] + labels = torch.transpose(labels.view(c.num_utter_per_class, c.num_classes_in_batch), 0, 1).reshape( + labels.shape + ) + inputs = torch.transpose(inputs.view(c.num_utter_per_class, c.num_classes_in_batch, -1), 0, 1).reshape( + inputs.shape + ) + # ToDo: move it to a unit test + # labels_converted = torch.transpose(labels.view(c.num_utter_per_class, c.num_classes_in_batch), 0, 1).reshape(labels.shape) + # inputs_converted = torch.transpose(inputs.view(c.num_utter_per_class, c.num_classes_in_batch, -1), 0, 1).reshape(inputs.shape) + # idx = 0 + # for j in range(0, c.num_classes_in_batch, 1): + # for i in range(j, len(labels), c.num_classes_in_batch): + # if not torch.all(labels[i].eq(labels_converted[idx])) or not torch.all(inputs[i].eq(inputs_converted[idx])): + # print("Invalid") + # print(labels) + # exit() + # idx += 1 + # labels = labels_converted + # inputs = inputs_converted - # setup input data - inputs, labels = data - loader_time = time.time() - end_time - global_step += 1 + loader_time = time.time() - end_time + global_step += 1 - # setup lr - if c.lr_decay: - scheduler.step() - optimizer.zero_grad() + # setup lr + if c.lr_decay: + scheduler.step() + optimizer.zero_grad() - # dispatch data to GPU - if use_cuda: - inputs = inputs.cuda(non_blocking=True) - labels = labels.cuda(non_blocking=True) + # dispatch data to GPU + if use_cuda: + inputs = inputs.cuda(non_blocking=True) + labels = labels.cuda(non_blocking=True) - # forward pass model - outputs = model(inputs) + # forward pass model + outputs = model(inputs) - # loss computation - loss = criterion(outputs.view(c.num_speakers_in_batch, outputs.shape[0] // c.num_speakers_in_batch, -1), labels) - loss.backward() - grad_norm, _ = check_update(model, c.grad_clip) - optimizer.step() + # loss computation + loss = criterion( + outputs.view(c.num_classes_in_batch, outputs.shape[0] // c.num_classes_in_batch, -1), labels + ) + loss.backward() + grad_norm, _ = check_update(model, c.grad_clip) + optimizer.step() - step_time = time.time() - start_time - epoch_time += step_time + step_time = time.time() - start_time + epoch_time += step_time - # Averaged Loss and Averaged Loader Time - avg_loss = 0.01 * loss.item() + 0.99 * avg_loss if avg_loss != 0 else loss.item() - num_loader_workers = c.num_loader_workers if c.num_loader_workers > 0 else 1 - avg_loader_time = ( - 1 / num_loader_workers * loader_time + (num_loader_workers - 1) / num_loader_workers * avg_loader_time - if avg_loader_time != 0 - else loader_time + # acumulate the total epoch loss + tot_loss += loss.item() + + # Averaged Loader Time + num_loader_workers = c.num_loader_workers if c.num_loader_workers > 0 else 1 + avg_loader_time = ( + 1 / num_loader_workers * loader_time + (num_loader_workers - 1) / num_loader_workers * avg_loader_time + if avg_loader_time != 0 + else loader_time + ) + current_lr = optimizer.param_groups[0]["lr"] + + if global_step % c.steps_plot_stats == 0: + # Plot Training Epoch Stats + train_stats = { + "loss": loss.item(), + "lr": current_lr, + "grad_norm": grad_norm, + "step_time": step_time, + "avg_loader_time": avg_loader_time, + } + dashboard_logger.train_epoch_stats(global_step, train_stats) + figures = { + "UMAP Plot": plot_embeddings(outputs.detach().cpu().numpy(), c.num_classes_in_batch), + } + dashboard_logger.train_figures(global_step, figures) + + if global_step % c.print_step == 0: + print( + " | > Step:{} Loss:{:.5f} GradNorm:{:.5f} " + "StepTime:{:.2f} LoaderTime:{:.2f} 
AvGLoaderTime:{:.2f} LR:{:.6f}".format( + global_step, loss.item(), grad_norm, step_time, loader_time, avg_loader_time, current_lr + ), + flush=True, + ) + + if global_step % c.save_step == 0: + # save model + save_checkpoint(model, optimizer, criterion, loss.item(), OUT_PATH, global_step, epoch) + + end_time = time.time() + + print("") + print( + ">>> Epoch:{} AvgLoss: {:.5f} GradNorm:{:.5f} " + "EpochTime:{:.2f} AvGLoaderTime:{:.2f} ".format( + epoch, tot_loss / len(data_loader), grad_norm, epoch_time, avg_loader_time + ), + flush=True, ) - current_lr = optimizer.param_groups[0]["lr"] - - if global_step % c.steps_plot_stats == 0: - # Plot Training Epoch Stats - train_stats = { - "loss": avg_loss, - "lr": current_lr, - "grad_norm": grad_norm, - "step_time": step_time, - "avg_loader_time": avg_loader_time, - } - dashboard_logger.train_epoch_stats(global_step, train_stats) - figures = { - # FIXME: not constant - "UMAP Plot": plot_embeddings(outputs.detach().cpu().numpy(), 10), - } - dashboard_logger.train_figures(global_step, figures) - - if global_step % c.print_step == 0: + # evaluation + if c.run_eval: + model.eval() + eval_loss = evaluation(model, criterion, eval_data_loader, global_step) + print("\n\n") + print("--> EVAL PERFORMANCE") print( - " | > Step:{} Loss:{:.5f} AvgLoss:{:.5f} GradNorm:{:.5f} " - "StepTime:{:.2f} LoaderTime:{:.2f} AvGLoaderTime:{:.2f} LR:{:.6f}".format( - global_step, loss.item(), avg_loss, grad_norm, step_time, loader_time, avg_loader_time, current_lr - ), + " | > Epoch:{} AvgLoss: {:.5f} ".format(epoch, eval_loss), flush=True, ) - avg_loss_all += avg_loss + # save the best checkpoint + best_loss = save_best_model(model, optimizer, criterion, eval_loss, best_loss, OUT_PATH, global_step, epoch) + model.train() - if global_step >= c.max_train_step or global_step % c.save_step == 0: - # save best model only - best_loss = save_best_model(model, optimizer, criterion, avg_loss, best_loss, OUT_PATH, global_step) - avg_loss_all = 0 - if global_step >= c.max_train_step: - break - - end_time = time.time() - - return avg_loss, global_step + return best_loss, global_step def main(args): # pylint: disable=redefined-outer-name # pylint: disable=global-variable-undefined global meta_data_train global meta_data_eval + global train_classes ap = AudioProcessor(**c.audio) - model = setup_speaker_encoder_model(c) + model = setup_encoder_model(c) - optimizer = RAdam(model.parameters(), lr=c.lr) + optimizer = get_optimizer(c.optimizer, c.optimizer_params, c.lr, model) # pylint: disable=redefined-outer-name - meta_data_train, meta_data_eval = load_tts_samples(c.datasets, eval_split=False) + meta_data_train, meta_data_eval = load_tts_samples(c.datasets, eval_split=True) - data_loader, num_speakers = setup_loader(ap, is_val=False, verbose=True) - - if c.loss == "ge2e": - criterion = GE2ELoss(loss_method="softmax") - elif c.loss == "angleproto": - criterion = AngleProtoLoss() - elif c.loss == "softmaxproto": - criterion = SoftmaxAngleProtoLoss(c.model_params["proj_dim"], num_speakers) + train_data_loader, train_classes, map_classid_to_classname = setup_loader(ap, is_val=False, verbose=True) + if c.run_eval: + eval_data_loader, _, _ = setup_loader(ap, is_val=True, verbose=True) else: - raise Exception("The %s not is a loss supported" % c.loss) + eval_data_loader = None + + num_classes = len(train_classes) + criterion = model.get_criterion(c, num_classes) + + if c.loss == "softmaxproto" and c.model != "speaker_encoder": + c.map_classid_to_classname = map_classid_to_classname + 
copy_model_files(c, OUT_PATH) if args.restore_path: - checkpoint = load_fsspec(args.restore_path) - try: - model.load_state_dict(checkpoint["model"]) - - if "criterion" in checkpoint: - criterion.load_state_dict(checkpoint["criterion"]) - - except (KeyError, RuntimeError): - print(" > Partial model initialization.") - model_dict = model.state_dict() - model_dict = set_init_dict(model_dict, checkpoint["model"], c) - model.load_state_dict(model_dict) - del model_dict - for group in optimizer.param_groups: - group["lr"] = c.lr - - print(" > Model restored from step %d" % checkpoint["step"], flush=True) - args.restore_step = checkpoint["step"] + criterion, args.restore_step = model.load_checkpoint( + c, args.restore_path, eval=False, use_cuda=use_cuda, criterion=criterion + ) + print(" > Model restored from step %d" % args.restore_step, flush=True) else: args.restore_step = 0 @@ -205,7 +299,7 @@ def main(args): # pylint: disable=redefined-outer-name criterion.cuda() global_step = args.restore_step - _, global_step = train(model, optimizer, scheduler, criterion, data_loader, global_step) + _, global_step = train(model, optimizer, scheduler, criterion, train_data_loader, eval_data_loader, global_step) if __name__ == "__main__": diff --git a/TTS/bin/train_tts.py b/TTS/bin/train_tts.py index 976b74af..bdb4f6f6 100644 --- a/TTS/bin/train_tts.py +++ b/TTS/bin/train_tts.py @@ -57,7 +57,7 @@ def main(): # init the trainer and 🚀 trainer = Trainer( train_args, - config, + model.config, config.output_path, model=model, train_samples=train_samples, diff --git a/TTS/config/__init__.py b/TTS/config/__init__.py index 5c905295..6b0778c5 100644 --- a/TTS/config/__init__.py +++ b/TTS/config/__init__.py @@ -37,7 +37,7 @@ def register_config(model_name: str) -> Coqpit: """ config_class = None config_name = model_name + "_config" - paths = ["TTS.tts.configs", "TTS.vocoder.configs", "TTS.speaker_encoder"] + paths = ["TTS.tts.configs", "TTS.vocoder.configs", "TTS.encoder.configs"] for path in paths: try: config_class = find_module(path, config_name) diff --git a/TTS/config/shared_configs.py b/TTS/config/shared_configs.py index 6394b264..3ea49796 100644 --- a/TTS/config/shared_configs.py +++ b/TTS/config/shared_configs.py @@ -258,4 +258,3 @@ class BaseTrainingConfig(TrainerConfig): num_loader_workers: int = 0 num_eval_loader_workers: int = 0 use_noise_augment: bool = False - use_language_weighted_sampler: bool = False diff --git a/TTS/speaker_encoder/README.md b/TTS/encoder/README.md similarity index 79% rename from TTS/speaker_encoder/README.md rename to TTS/encoder/README.md index b6f541f8..b38b2005 100644 --- a/TTS/speaker_encoder/README.md +++ b/TTS/encoder/README.md @@ -14,5 +14,5 @@ To run the code, you need to follow the same flow as in TTS. - Define 'config.json' for your needs. Note that, audio parameters should match your TTS model. - Example training call ```python speaker_encoder/train.py --config_path speaker_encoder/config.json --data_path ~/Data/Libri-TTS/train-clean-360``` -- Generate embedding vectors ```python speaker_encoder/compute_embeddings.py --use_cuda true /model/path/best_model.pth.tar model/config/path/config.json dataset/path/ output_path``` . This code parses all .wav files at the given dataset path and generates the same folder structure under the output path with the generated embedding files. +- Generate embedding vectors ```python speaker_encoder/compute_embeddings.py --use_cuda true /model/path/best_model.pth model/config/path/config.json dataset/path/ output_path``` . 
This code parses all .wav files at the given dataset path and generates the same folder structure under the output path with the generated embedding files. - Watch training on Tensorboard as in TTS diff --git a/TTS/speaker_encoder/__init__.py b/TTS/encoder/__init__.py similarity index 100% rename from TTS/speaker_encoder/__init__.py rename to TTS/encoder/__init__.py diff --git a/TTS/speaker_encoder/speaker_encoder_config.py b/TTS/encoder/configs/base_encoder_config.py similarity index 66% rename from TTS/speaker_encoder/speaker_encoder_config.py rename to TTS/encoder/configs/base_encoder_config.py index 8212acc7..ebbaa045 100644 --- a/TTS/speaker_encoder/speaker_encoder_config.py +++ b/TTS/encoder/configs/base_encoder_config.py @@ -7,10 +7,10 @@ from TTS.config.shared_configs import BaseAudioConfig, BaseDatasetConfig, BaseTr @dataclass -class SpeakerEncoderConfig(BaseTrainingConfig): - """Defines parameters for Speaker Encoder model.""" +class BaseEncoderConfig(BaseTrainingConfig): + """Defines parameters for a Generic Encoder model.""" - model: str = "speaker_encoder" + model: str = None audio: BaseAudioConfig = field(default_factory=BaseAudioConfig) datasets: List[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()]) # model params @@ -27,34 +27,30 @@ class SpeakerEncoderConfig(BaseTrainingConfig): audio_augmentation: Dict = field(default_factory=lambda: {}) - storage: Dict = field( - default_factory=lambda: { - "sample_from_storage_p": 0.66, # the probability with which we'll sample from the DataSet in-memory storage - "storage_size": 15, # the size of the in-memory storage with respect to a single batch - } - ) - # training params - max_train_step: int = 1000000 # end training when number of training steps reaches this value. 
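Worth noting for reviewers: training is now scheduled by `epochs` instead of the global `max_train_step` cap, and the optimizer is taken from `optimizer`/`optimizer_params` rather than being hard-coded (previously RAdam plus a separate `wd` field). A hedged sketch of the new-style config, assuming the modules added in this PR are importable; the field values are illustrative only, and the class-based batching fields are introduced just below:

```python
from TTS.encoder.configs.speaker_encoder_config import SpeakerEncoderConfig

config = SpeakerEncoderConfig(
    epochs=100,                   # replaces the old max_train_step cap
    optimizer="radam",            # previously hard-coded in the training script
    optimizer_params={"betas": [0.9, 0.999], "weight_decay": 0},
    lr=1e-4,
    num_classes_in_batch=32,      # was num_speakers_in_batch
    num_utter_per_class=4,        # was num_utters_per_speaker
    num_loader_workers=4,
    run_eval=True,
)
```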
+ epochs: int = 10000 loss: str = "angleproto" grad_clip: float = 3.0 lr: float = 0.0001 + optimizer: str = "radam" + optimizer_params: Dict = field(default_factory=lambda: {"betas": [0.9, 0.999], "weight_decay": 0}) lr_decay: bool = False warmup_steps: int = 4000 - wd: float = 1e-6 # logging params tb_model_param_stats: bool = False steps_plot_stats: int = 10 - checkpoint: bool = True save_step: int = 1000 print_step: int = 20 + run_eval: bool = False # data loader - num_speakers_in_batch: int = MISSING - num_utters_per_speaker: int = MISSING + num_classes_in_batch: int = MISSING + num_utter_per_class: int = MISSING + eval_num_classes_in_batch: int = None + eval_num_utter_per_class: int = None + num_loader_workers: int = MISSING - skip_speakers: bool = False voice_len: float = 1.6 def check_values(self): diff --git a/TTS/encoder/configs/emotion_encoder_config.py b/TTS/encoder/configs/emotion_encoder_config.py new file mode 100644 index 00000000..5eda2671 --- /dev/null +++ b/TTS/encoder/configs/emotion_encoder_config.py @@ -0,0 +1,12 @@ +from dataclasses import asdict, dataclass + +from TTS.encoder.configs.base_encoder_config import BaseEncoderConfig + + +@dataclass +class EmotionEncoderConfig(BaseEncoderConfig): + """Defines parameters for Emotion Encoder model.""" + + model: str = "emotion_encoder" + map_classid_to_classname: dict = None + class_name_key: str = "emotion_name" diff --git a/TTS/encoder/configs/speaker_encoder_config.py b/TTS/encoder/configs/speaker_encoder_config.py new file mode 100644 index 00000000..6dceb002 --- /dev/null +++ b/TTS/encoder/configs/speaker_encoder_config.py @@ -0,0 +1,11 @@ +from dataclasses import asdict, dataclass + +from TTS.encoder.configs.base_encoder_config import BaseEncoderConfig + + +@dataclass +class SpeakerEncoderConfig(BaseEncoderConfig): + """Defines parameters for Speaker Encoder model.""" + + model: str = "speaker_encoder" + class_name_key: str = "speaker_name" diff --git a/TTS/encoder/dataset.py b/TTS/encoder/dataset.py new file mode 100644 index 00000000..582b1fe9 --- /dev/null +++ b/TTS/encoder/dataset.py @@ -0,0 +1,147 @@ +import random + +import torch +from torch.utils.data import Dataset + +from TTS.encoder.utils.generic_utils import AugmentWAV + + +class EncoderDataset(Dataset): + def __init__( + self, + config, + ap, + meta_data, + voice_len=1.6, + num_classes_in_batch=64, + num_utter_per_class=10, + verbose=False, + augmentation_config=None, + use_torch_spec=None, + ): + """ + Args: + ap (TTS.tts.utils.AudioProcessor): audio processor object. + meta_data (list): list of dataset instances. + seq_len (int): voice segment length in seconds. + verbose (bool): print diagnostic information. 
+ """ + super().__init__() + self.config = config + self.items = meta_data + self.sample_rate = ap.sample_rate + self.seq_len = int(voice_len * self.sample_rate) + self.num_utter_per_class = num_utter_per_class + self.ap = ap + self.verbose = verbose + self.use_torch_spec = use_torch_spec + self.classes, self.items = self.__parse_items() + + self.classname_to_classid = {key: i for i, key in enumerate(self.classes)} + + # Data Augmentation + self.augmentator = None + self.gaussian_augmentation_config = None + if augmentation_config: + self.data_augmentation_p = augmentation_config["p"] + if self.data_augmentation_p and ("additive" in augmentation_config or "rir" in augmentation_config): + self.augmentator = AugmentWAV(ap, augmentation_config) + + if "gaussian" in augmentation_config.keys(): + self.gaussian_augmentation_config = augmentation_config["gaussian"] + + if self.verbose: + print("\n > DataLoader initialization") + print(f" | > Classes per Batch: {num_classes_in_batch}") + print(f" | > Number of instances : {len(self.items)}") + print(f" | > Sequence length: {self.seq_len}") + print(f" | > Num Classes: {len(self.classes)}") + print(f" | > Classes: {self.classes}") + + def load_wav(self, filename): + audio = self.ap.load_wav(filename, sr=self.ap.sample_rate) + return audio + + def __parse_items(self): + class_to_utters = {} + for item in self.items: + path_ = item["audio_file"] + class_name = item[self.config.class_name_key] + if class_name in class_to_utters.keys(): + class_to_utters[class_name].append(path_) + else: + class_to_utters[class_name] = [ + path_, + ] + + # skip classes with number of samples >= self.num_utter_per_class + class_to_utters = {k: v for (k, v) in class_to_utters.items() if len(v) >= self.num_utter_per_class} + + classes = list(class_to_utters.keys()) + classes.sort() + + new_items = [] + for item in self.items: + path_ = item["audio_file"] + class_name = item["emotion_name"] if self.config.model == "emotion_encoder" else item["speaker_name"] + # ignore filtered classes + if class_name not in classes: + continue + # ignore small audios + if self.load_wav(path_).shape[0] - self.seq_len <= 0: + continue + + new_items.append({"wav_file_path": path_, "class_name": class_name}) + + return classes, new_items + + def __len__(self): + return len(self.items) + + def get_num_classes(self): + return len(self.classes) + + def get_class_list(self): + return self.classes + + def set_classes(self, classes): + self.classes = classes + self.classname_to_classid = {key: i for i, key in enumerate(self.classes)} + + def get_map_classid_to_classname(self): + return dict((c_id, c_n) for c_n, c_id in self.classname_to_classid.items()) + + def __getitem__(self, idx): + return self.items[idx] + + def collate_fn(self, batch): + # get the batch class_ids + labels = [] + feats = [] + for item in batch: + utter_path = item["wav_file_path"] + class_name = item["class_name"] + + # get classid + class_id = self.classname_to_classid[class_name] + # load wav file + wav = self.load_wav(utter_path) + offset = random.randint(0, wav.shape[0] - self.seq_len) + wav = wav[offset : offset + self.seq_len] + + if self.augmentator is not None and self.data_augmentation_p: + if random.random() < self.data_augmentation_p: + wav = self.augmentator.apply_one(wav) + + if not self.use_torch_spec: + mel = self.ap.melspectrogram(wav) + feats.append(torch.FloatTensor(mel)) + else: + feats.append(torch.FloatTensor(wav)) + + labels.append(class_id) + + feats = torch.stack(feats) + labels = torch.LongTensor(labels) 
+ + return feats, labels diff --git a/TTS/speaker_encoder/losses.py b/TTS/encoder/losses.py similarity index 97% rename from TTS/speaker_encoder/losses.py rename to TTS/encoder/losses.py index 8ba917b7..5b5aa0fc 100644 --- a/TTS/speaker_encoder/losses.py +++ b/TTS/encoder/losses.py @@ -189,6 +189,12 @@ class SoftmaxLoss(nn.Module): return L + def inference(self, embedding): + x = self.fc(embedding) + activations = torch.nn.functional.softmax(x, dim=1).squeeze(0) + class_id = torch.argmax(activations) + return class_id + class SoftmaxAngleProtoLoss(nn.Module): """ diff --git a/TTS/encoder/models/base_encoder.py b/TTS/encoder/models/base_encoder.py new file mode 100644 index 00000000..ac7d7dd5 --- /dev/null +++ b/TTS/encoder/models/base_encoder.py @@ -0,0 +1,154 @@ +import numpy as np +import torch +import torchaudio +from coqpit import Coqpit +from torch import nn + +from TTS.encoder.losses import AngleProtoLoss, GE2ELoss, SoftmaxAngleProtoLoss +from TTS.utils.generic_utils import set_init_dict +from TTS.utils.io import load_fsspec + + +class PreEmphasis(nn.Module): + def __init__(self, coefficient=0.97): + super().__init__() + self.coefficient = coefficient + self.register_buffer("filter", torch.FloatTensor([-self.coefficient, 1.0]).unsqueeze(0).unsqueeze(0)) + + def forward(self, x): + assert len(x.size()) == 2 + + x = torch.nn.functional.pad(x.unsqueeze(1), (1, 0), "reflect") + return torch.nn.functional.conv1d(x, self.filter).squeeze(1) + + +class BaseEncoder(nn.Module): + """Base `encoder` class. Every new `encoder` model must inherit this. + + It defines common `encoder` specific functions. + """ + + # pylint: disable=W0102 + def __init__(self): + super(BaseEncoder, self).__init__() + + def get_torch_mel_spectrogram_class(self, audio_config): + return torch.nn.Sequential( + PreEmphasis(audio_config["preemphasis"]), + # TorchSTFT( + # n_fft=audio_config["fft_size"], + # hop_length=audio_config["hop_length"], + # win_length=audio_config["win_length"], + # sample_rate=audio_config["sample_rate"], + # window="hamming_window", + # mel_fmin=0.0, + # mel_fmax=None, + # use_htk=True, + # do_amp_to_db=False, + # n_mels=audio_config["num_mels"], + # power=2.0, + # use_mel=True, + # mel_norm=None, + # ) + torchaudio.transforms.MelSpectrogram( + sample_rate=audio_config["sample_rate"], + n_fft=audio_config["fft_size"], + win_length=audio_config["win_length"], + hop_length=audio_config["hop_length"], + window_fn=torch.hamming_window, + n_mels=audio_config["num_mels"], + ), + ) + + @torch.no_grad() + def inference(self, x, l2_norm=True): + return self.forward(x, l2_norm) + + @torch.no_grad() + def compute_embedding(self, x, num_frames=250, num_eval=10, return_mean=True, l2_norm=True): + """ + Generate embeddings for a batch of utterances + x: 1xTxD + """ + # map to the waveform size + if self.use_torch_spec: + num_frames = num_frames * self.audio_config["hop_length"] + + max_len = x.shape[1] + + if max_len < num_frames: + num_frames = max_len + + offsets = np.linspace(0, max_len - num_frames, num=num_eval) + + frames_batch = [] + for offset in offsets: + offset = int(offset) + end_offset = int(offset + num_frames) + frames = x[:, offset:end_offset] + frames_batch.append(frames) + + frames_batch = torch.cat(frames_batch, dim=0) + embeddings = self.inference(frames_batch, l2_norm=l2_norm) + + if return_mean: + embeddings = torch.mean(embeddings, dim=0, keepdim=True) + return embeddings + + def get_criterion(self, c: Coqpit, num_classes=None): + if c.loss == "ge2e": + criterion = 
GE2ELoss(loss_method="softmax") + elif c.loss == "angleproto": + criterion = AngleProtoLoss() + elif c.loss == "softmaxproto": + criterion = SoftmaxAngleProtoLoss(c.model_params["proj_dim"], num_classes) + else: + raise Exception("The %s not is a loss supported" % c.loss) + return criterion + + def load_checkpoint( + self, config: Coqpit, checkpoint_path: str, eval: bool = False, use_cuda: bool = False, criterion=None + ): + state = load_fsspec(checkpoint_path, map_location=torch.device("cpu")) + try: + self.load_state_dict(state["model"]) + except (KeyError, RuntimeError) as error: + # If eval raise the error + if eval: + raise error + + print(" > Partial model initialization.") + model_dict = self.state_dict() + model_dict = set_init_dict(model_dict, state["model"], c) + self.load_state_dict(model_dict) + del model_dict + + # load the criterion for restore_path + if criterion is not None and "criterion" in state: + try: + criterion.load_state_dict(state["criterion"]) + except (KeyError, RuntimeError) as error: + print(" > Criterion load ignored because of:", error) + + # instance and load the criterion for the encoder classifier in inference time + if ( + eval + and criterion is None + and "criterion" in state + and getattr(config, "map_classid_to_classname", None) is not None + ): + criterion = self.get_criterion(config, len(config.map_classid_to_classname)) + criterion.load_state_dict(state["criterion"]) + + if use_cuda: + self.cuda() + if criterion is not None: + criterion = criterion.cuda() + + if eval: + self.eval() + assert not self.training + + if not eval: + return criterion, state["step"] + return criterion diff --git a/TTS/encoder/models/lstm.py b/TTS/encoder/models/lstm.py new file mode 100644 index 00000000..51852b5b --- /dev/null +++ b/TTS/encoder/models/lstm.py @@ -0,0 +1,99 @@ +import torch +from torch import nn + +from TTS.encoder.models.base_encoder import BaseEncoder + + +class LSTMWithProjection(nn.Module): + def __init__(self, input_size, hidden_size, proj_size): + super().__init__() + self.input_size = input_size + self.hidden_size = hidden_size + self.proj_size = proj_size + self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True) + self.linear = nn.Linear(hidden_size, proj_size, bias=False) + + def forward(self, x): + self.lstm.flatten_parameters() + o, (_, _) = self.lstm(x) + return self.linear(o) + + +class LSTMWithoutProjection(nn.Module): + def __init__(self, input_dim, lstm_dim, proj_dim, num_lstm_layers): + super().__init__() + self.lstm = nn.LSTM(input_size=input_dim, hidden_size=lstm_dim, num_layers=num_lstm_layers, batch_first=True) + self.linear = nn.Linear(lstm_dim, proj_dim, bias=True) + self.relu = nn.ReLU() + + def forward(self, x): + _, (hidden, _) = self.lstm(x) + return self.relu(self.linear(hidden[-1])) + + +class LSTMSpeakerEncoder(BaseEncoder): + def __init__( + self, + input_dim, + proj_dim=256, + lstm_dim=768, + num_lstm_layers=3, + use_lstm_with_projection=True, + use_torch_spec=False, + audio_config=None, + ): + super().__init__() + self.use_lstm_with_projection = use_lstm_with_projection + self.use_torch_spec = use_torch_spec + self.audio_config = audio_config + self.proj_dim = proj_dim + + layers = [] + # choise LSTM layer + if use_lstm_with_projection: + layers.append(LSTMWithProjection(input_dim, lstm_dim, proj_dim)) + for _ in range(num_lstm_layers - 1): + layers.append(LSTMWithProjection(proj_dim, lstm_dim, proj_dim)) + self.layers = nn.Sequential(*layers) + else: + self.layers = LSTMWithoutProjection(input_dim, lstm_dim, 
proj_dim, num_lstm_layers) + + self.instancenorm = nn.InstanceNorm1d(input_dim) + + if self.use_torch_spec: + self.torch_spec = self.get_torch_mel_spectrogram_class(audio_config) + else: + self.torch_spec = None + + self._init_layers() + + def _init_layers(self): + for name, param in self.layers.named_parameters(): + if "bias" in name: + nn.init.constant_(param, 0.0) + elif "weight" in name: + nn.init.xavier_normal_(param) + + def forward(self, x, l2_norm=True): + """Forward pass of the model. + + Args: + x (Tensor): Raw waveform signal or spectrogram frames. If input is a waveform, `torch_spec` must be `True` + to compute the spectrogram on-the-fly. + l2_norm (bool): Whether to L2-normalize the outputs. + + Shapes: + - x: :math:`(N, 1, T_{in})` or :math:`(N, D_{spec}, T_{in})` + """ + with torch.no_grad(): + with torch.cuda.amp.autocast(enabled=False): + if self.use_torch_spec: + x.squeeze_(1) + x = self.torch_spec(x) + x = self.instancenorm(x).transpose(1, 2) + d = self.layers(x) + if self.use_lstm_with_projection: + d = d[:, -1] + if l2_norm: + d = torch.nn.functional.normalize(d, p=2, dim=1) + return d diff --git a/TTS/speaker_encoder/models/resnet.py b/TTS/encoder/models/resnet.py similarity index 67% rename from TTS/speaker_encoder/models/resnet.py rename to TTS/encoder/models/resnet.py index a799fc52..84e9967f 100644 --- a/TTS/speaker_encoder/models/resnet.py +++ b/TTS/encoder/models/resnet.py @@ -1,23 +1,8 @@ -import numpy as np import torch -import torchaudio from torch import nn # from TTS.utils.audio import TorchSTFT -from TTS.utils.io import load_fsspec - - -class PreEmphasis(nn.Module): - def __init__(self, coefficient=0.97): - super().__init__() - self.coefficient = coefficient - self.register_buffer("filter", torch.FloatTensor([-self.coefficient, 1.0]).unsqueeze(0).unsqueeze(0)) - - def forward(self, x): - assert len(x.size()) == 2 - - x = torch.nn.functional.pad(x.unsqueeze(1), (1, 0), "reflect") - return torch.nn.functional.conv1d(x, self.filter).squeeze(1) +from TTS.encoder.models.base_encoder import BaseEncoder class SELayer(nn.Module): @@ -71,7 +56,7 @@ class SEBasicBlock(nn.Module): return out -class ResNetSpeakerEncoder(nn.Module): +class ResNetSpeakerEncoder(BaseEncoder): """Implementation of the model H/ASP without batch normalization in speaker embedding. 
This model was proposed in: https://arxiv.org/abs/2009.14153 Adapted from: https://github.com/clovaai/voxceleb_trainer """ @@ -110,32 +95,7 @@ class ResNetSpeakerEncoder(nn.Module): self.instancenorm = nn.InstanceNorm1d(input_dim) if self.use_torch_spec: - self.torch_spec = torch.nn.Sequential( - PreEmphasis(audio_config["preemphasis"]), - # TorchSTFT( - # n_fft=audio_config["fft_size"], - # hop_length=audio_config["hop_length"], - # win_length=audio_config["win_length"], - # sample_rate=audio_config["sample_rate"], - # window="hamming_window", - # mel_fmin=0.0, - # mel_fmax=None, - # use_htk=True, - # do_amp_to_db=False, - # n_mels=audio_config["num_mels"], - # power=2.0, - # use_mel=True, - # mel_norm=None, - # ) - torchaudio.transforms.MelSpectrogram( - sample_rate=audio_config["sample_rate"], - n_fft=audio_config["fft_size"], - win_length=audio_config["win_length"], - hop_length=audio_config["hop_length"], - window_fn=torch.hamming_window, - n_mels=audio_config["num_mels"], - ), - ) + self.torch_spec = self.get_torch_mel_spectrogram_class(audio_config) else: self.torch_spec = None @@ -238,47 +198,3 @@ class ResNetSpeakerEncoder(nn.Module): if l2_norm: x = torch.nn.functional.normalize(x, p=2, dim=1) return x - - @torch.no_grad() - def inference(self, x, l2_norm=False): - return self.forward(x, l2_norm) - - @torch.no_grad() - def compute_embedding(self, x, num_frames=250, num_eval=10, return_mean=True, l2_norm=True): - """ - Generate embeddings for a batch of utterances - x: 1xTxD - """ - # map to the waveform size - if self.use_torch_spec: - num_frames = num_frames * self.audio_config["hop_length"] - - max_len = x.shape[1] - - if max_len < num_frames: - num_frames = max_len - - offsets = np.linspace(0, max_len - num_frames, num=num_eval) - - frames_batch = [] - for offset in offsets: - offset = int(offset) - end_offset = int(offset + num_frames) - frames = x[:, offset:end_offset] - frames_batch.append(frames) - - frames_batch = torch.cat(frames_batch, dim=0) - embeddings = self.inference(frames_batch, l2_norm=l2_norm) - - if return_mean: - embeddings = torch.mean(embeddings, dim=0, keepdim=True) - return embeddings - - def load_checkpoint(self, config: dict, checkpoint_path: str, eval: bool = False, use_cuda: bool = False): - state = load_fsspec(checkpoint_path, map_location=torch.device("cpu")) - self.load_state_dict(state["model"]) - if use_cuda: - self.cuda() - if eval: - self.eval() - assert not self.training diff --git a/TTS/speaker_encoder/requirements.txt b/TTS/encoder/requirements.txt similarity index 100% rename from TTS/speaker_encoder/requirements.txt rename to TTS/encoder/requirements.txt diff --git a/TTS/speaker_encoder/utils/__init__.py b/TTS/encoder/utils/__init__.py similarity index 100% rename from TTS/speaker_encoder/utils/__init__.py rename to TTS/encoder/utils/__init__.py diff --git a/TTS/speaker_encoder/utils/generic_utils.py b/TTS/encoder/utils/generic_utils.py similarity index 78% rename from TTS/speaker_encoder/utils/generic_utils.py rename to TTS/encoder/utils/generic_utils.py index 4ab4e923..91a896f6 100644 --- a/TTS/speaker_encoder/utils/generic_utils.py +++ b/TTS/encoder/utils/generic_utils.py @@ -3,60 +3,15 @@ import glob import os import random import re -from multiprocessing import Manager import numpy as np from scipy import signal -from TTS.speaker_encoder.models.lstm import LSTMSpeakerEncoder -from TTS.speaker_encoder.models.resnet import ResNetSpeakerEncoder +from TTS.encoder.models.lstm import LSTMSpeakerEncoder +from TTS.encoder.models.resnet import 
ResNetSpeakerEncoder from TTS.utils.io import save_fsspec -class Storage(object): - def __init__(self, maxsize, storage_batchs, num_speakers_in_batch, num_threads=8): - # use multiprocessing for threading safe - self.storage = Manager().list() - self.maxsize = maxsize - self.num_speakers_in_batch = num_speakers_in_batch - self.num_threads = num_threads - self.ignore_last_batch = False - - if storage_batchs >= 3: - self.ignore_last_batch = True - - # used for fast random sample - self.safe_storage_size = self.maxsize - self.num_threads - if self.ignore_last_batch: - self.safe_storage_size -= self.num_speakers_in_batch - - def __len__(self): - return len(self.storage) - - def full(self): - return len(self.storage) >= self.maxsize - - def append(self, item): - # if storage is full, remove an item - if self.full(): - self.storage.pop(0) - - self.storage.append(item) - - def get_random_sample(self): - # safe storage size considering all threads remove one item from storage in same time - storage_size = len(self.storage) - self.num_threads - - if self.ignore_last_batch: - storage_size -= self.num_speakers_in_batch - - return self.storage[random.randint(0, storage_size)] - - def get_random_sample_fast(self): - """Call this method only when storage is full""" - return self.storage[random.randint(0, self.safe_storage_size)] - - class AugmentWAV(object): def __init__(self, ap, augmentation_config): @@ -170,7 +125,7 @@ def to_camel(text): return re.sub(r"(?!^)_([a-zA-Z])", lambda m: m.group(1).upper(), text) -def setup_speaker_encoder_model(config: "Coqpit"): +def setup_encoder_model(config: "Coqpit"): if config.model_params["model_name"].lower() == "lstm": model = LSTMSpeakerEncoder( config.model_params["input_dim"], @@ -192,7 +147,7 @@ def setup_speaker_encoder_model(config: "Coqpit"): def save_checkpoint(model, optimizer, criterion, model_loss, out_path, current_step, epoch): - checkpoint_path = "checkpoint_{}.pth.tar".format(current_step) + checkpoint_path = "checkpoint_{}.pth".format(current_step) checkpoint_path = os.path.join(out_path, checkpoint_path) print(" | | > Checkpoint saving : {}".format(checkpoint_path)) @@ -209,7 +164,7 @@ def save_checkpoint(model, optimizer, criterion, model_loss, out_path, current_s save_fsspec(state, checkpoint_path) -def save_best_model(model, optimizer, criterion, model_loss, best_loss, out_path, current_step): +def save_best_model(model, optimizer, criterion, model_loss, best_loss, out_path, current_step, epoch): if model_loss < best_loss: new_state_dict = model.state_dict() state = { @@ -217,11 +172,12 @@ def save_best_model(model, optimizer, criterion, model_loss, best_loss, out_path "optimizer": optimizer.state_dict(), "criterion": criterion.state_dict(), "step": current_step, + "epoch": epoch, "loss": model_loss, "date": datetime.date.today().strftime("%B %d, %Y"), } best_loss = model_loss - bestmodel_path = "best_model.pth.tar" + bestmodel_path = "best_model.pth" bestmodel_path = os.path.join(out_path, bestmodel_path) print("\n > BEST MODEL ({0:.5f}) : {1:}".format(model_loss, bestmodel_path)) save_fsspec(state, bestmodel_path) diff --git a/TTS/speaker_encoder/utils/io.py b/TTS/encoder/utils/io.py similarity index 91% rename from TTS/speaker_encoder/utils/io.py rename to TTS/encoder/utils/io.py index 7a3aadc9..d1dad3e2 100644 --- a/TTS/speaker_encoder/utils/io.py +++ b/TTS/encoder/utils/io.py @@ -5,7 +5,7 @@ from TTS.utils.io import save_fsspec def save_checkpoint(model, optimizer, model_loss, out_path, current_step): - checkpoint_path = 
"checkpoint_{}.pth.tar".format(current_step) + checkpoint_path = "checkpoint_{}.pth".format(current_step) checkpoint_path = os.path.join(out_path, checkpoint_path) print(" | | > Checkpoint saving : {}".format(checkpoint_path)) @@ -31,7 +31,7 @@ def save_best_model(model, optimizer, model_loss, best_loss, out_path, current_s "date": datetime.date.today().strftime("%B %d, %Y"), } best_loss = model_loss - bestmodel_path = "best_model.pth.tar" + bestmodel_path = "best_model.pth" bestmodel_path = os.path.join(out_path, bestmodel_path) print("\n > BEST MODEL ({0:.5f}) : {1:}".format(model_loss, bestmodel_path)) save_fsspec(state, bestmodel_path) diff --git a/TTS/speaker_encoder/utils/prepare_voxceleb.py b/TTS/encoder/utils/prepare_voxceleb.py similarity index 100% rename from TTS/speaker_encoder/utils/prepare_voxceleb.py rename to TTS/encoder/utils/prepare_voxceleb.py diff --git a/TTS/encoder/utils/samplers.py b/TTS/encoder/utils/samplers.py new file mode 100644 index 00000000..08256b34 --- /dev/null +++ b/TTS/encoder/utils/samplers.py @@ -0,0 +1,114 @@ +import random + +from torch.utils.data.sampler import Sampler, SubsetRandomSampler + + +class SubsetSampler(Sampler): + """ + Samples elements sequentially from a given list of indices. + + Args: + indices (list): a sequence of indices + """ + + def __init__(self, indices): + super().__init__(indices) + self.indices = indices + + def __iter__(self): + return (self.indices[i] for i in range(len(self.indices))) + + def __len__(self): + return len(self.indices) + + +class PerfectBatchSampler(Sampler): + """ + Samples a mini-batch of indices for a balanced class batching + + Args: + dataset_items(list): dataset items to sample from. + classes (list): list of classes of dataset_items to sample from. + batch_size (int): total number of samples to be sampled in a mini-batch. + num_gpus (int): number of GPU in the data parallel mode. + shuffle (bool): if True, samples randomly, otherwise samples sequentially. + drop_last (bool): if True, drops last incomplete batch. + """ + + def __init__( + self, + dataset_items, + classes, + batch_size, + num_classes_in_batch, + num_gpus=1, + shuffle=True, + drop_last=False, + label_key="class_name", + ): + super().__init__(dataset_items) + assert ( + batch_size % (num_classes_in_batch * num_gpus) == 0 + ), "Batch size must be divisible by number of classes times the number of data parallel devices (if enabled)." 
+ + label_indices = {} + for idx, item in enumerate(dataset_items): + label = item[label_key] + if label not in label_indices.keys(): + label_indices[label] = [idx] + else: + label_indices[label].append(idx) + + if shuffle: + self._samplers = [SubsetRandomSampler(label_indices[key]) for key in classes] + else: + self._samplers = [SubsetSampler(label_indices[key]) for key in classes] + + self._batch_size = batch_size + self._drop_last = drop_last + self._dp_devices = num_gpus + self._num_classes_in_batch = num_classes_in_batch + + def __iter__(self): + + batch = [] + if self._num_classes_in_batch != len(self._samplers): + valid_samplers_idx = random.sample(range(len(self._samplers)), self._num_classes_in_batch) + else: + valid_samplers_idx = None + + iters = [iter(s) for s in self._samplers] + done = False + + while True: + b = [] + for i, it in enumerate(iters): + if valid_samplers_idx is not None and i not in valid_samplers_idx: + continue + idx = next(it, None) + if idx is None: + done = True + break + b.append(idx) + if done: + break + batch += b + if len(batch) == self._batch_size: + yield batch + batch = [] + if valid_samplers_idx is not None: + valid_samplers_idx = random.sample(range(len(self._samplers)), self._num_classes_in_batch) + + if not self._drop_last: + if len(batch) > 0: + groups = len(batch) // self._num_classes_in_batch + if groups % self._dp_devices == 0: + yield batch + else: + batch = batch[: (groups // self._dp_devices) * self._dp_devices * self._num_classes_in_batch] + if len(batch) > 0: + yield batch + + def __len__(self): + class_batch_size = self._batch_size // self._num_classes_in_batch + return min(((len(s) + class_batch_size - 1) // class_batch_size) for s in self._samplers) diff --git a/TTS/speaker_encoder/utils/training.py b/TTS/encoder/utils/training.py similarity index 100% rename from TTS/speaker_encoder/utils/training.py rename to TTS/encoder/utils/training.py diff --git a/TTS/speaker_encoder/utils/visual.py b/TTS/encoder/utils/visual.py similarity index 69% rename from TTS/speaker_encoder/utils/visual.py rename to TTS/encoder/utils/visual.py index 4f40f68c..f2db2f3f 100644 --- a/TTS/speaker_encoder/utils/visual.py +++ b/TTS/encoder/utils/visual.py @@ -29,14 +29,18 @@ colormap = ( ) -def plot_embeddings(embeddings, num_utter_per_speaker): - embeddings = embeddings[: 10 * num_utter_per_speaker] +def plot_embeddings(embeddings, num_classes_in_batch): + num_utter_per_class = embeddings.shape[0] // num_classes_in_batch + + # if necessary get just the first 10 classes + if num_classes_in_batch > 10: + num_classes_in_batch = 10 + embeddings = embeddings[: num_classes_in_batch * num_utter_per_class] + model = umap.UMAP() projection = model.fit_transform(embeddings) - num_speakers = embeddings.shape[0] // num_utter_per_speaker - ground_truth = np.repeat(np.arange(num_speakers), num_utter_per_speaker) + ground_truth = np.repeat(np.arange(num_classes_in_batch), num_utter_per_class) colors = [colormap[i] for i in ground_truth] - fig, ax = plt.subplots(figsize=(16, 10)) _ = ax.scatter(projection[:, 0], projection[:, 1], c=colors) plt.gca().set_aspect("equal", "datalim") diff --git a/TTS/model.py b/TTS/model.py index 39cbeabc..a53b916a 100644 --- a/TTS/model.py +++ b/TTS/model.py @@ -1,46 +1,34 @@ -from abc import ABC, abstractmethod -from typing import Dict, List, Tuple +from abc import abstractmethod +from typing import Dict import torch from coqpit import Coqpit -from torch import nn +from trainer import TrainerModel # pylint: skip-file -class 
BaseTrainerModel(ABC, nn.Module): - """Abstract 🐸TTS class. Every new 🐸TTS model must inherit this.""" +class BaseTrainerModel(TrainerModel): + """BaseTrainerModel model expanding TrainerModel with required functions by 🐸TTS. + + Every new 🐸TTS model must inherit it. + """ @staticmethod @abstractmethod def init_from_config(config: Coqpit): - """Init the model from given config. + """Init the model and all its attributes from the given config. Override this depending on your model. """ ... - @abstractmethod - def forward(self, input: torch.Tensor, *args, aux_input={}, **kwargs) -> Dict: - """Forward ... for the model mainly used in training. - - You can be flexible here and use different number of arguments and argument names since it is intended to be - used by `train_step()` without exposing it out of the model. - - Args: - input (torch.Tensor): Input tensor. - aux_input (Dict): Auxiliary model inputs like embeddings, durations or any other sorts of inputs. - - Returns: - Dict: Model outputs. Main model output must be named as "model_outputs". - """ - outputs_dict = {"model_outputs": None} - ... - return outputs_dict - @abstractmethod def inference(self, input: torch.Tensor, aux_input={}) -> Dict: - """Forward ... for inference. + """Forward pass for inference. + + It must return a dictionary with the main model output and all the auxiliary outputs. The key ```model_outputs``` + is considered to be the main output and you can add any other auxiliary outputs as you want. We don't use `*kwargs` since it is problematic with the TorchScript API. @@ -55,78 +43,9 @@ class BaseTrainerModel(ABC, nn.Module): ... return outputs_dict - def format_batch(self, batch: Dict) -> Dict: - """Format batch returned by the data loader before sending it to the model. - - If not implemented, model uses the batch as is. - Can be used for data augmentation, feature ectraction, etc. - """ - return batch - - def format_batch_on_device(self, batch: Dict) -> Dict: - """Format batch on device before sending it to the model. - - If not implemented, model uses the batch as is. - Can be used for data augmentation, feature ectraction, etc. - """ - return batch - - @abstractmethod - def train_step(self, batch: Dict, criterion: nn.Module) -> Tuple[Dict, Dict]: - """Perform a single training step. Run the model forward ... and compute losses. - - Args: - batch (Dict): Input tensors. - criterion (nn.Module): Loss layer designed for the model. - - Returns: - Tuple[Dict, Dict]: Model ouputs and computed losses. - """ - outputs_dict = {} - loss_dict = {} # this returns from the criterion - ... - return outputs_dict, loss_dict - - def train_log(self, batch: Dict, outputs: Dict, logger: "Logger", assets: Dict, steps: int) -> None: - """Create visualizations and waveform examples for training. - - For example, here you can plot spectrograms and generate sample sample waveforms from these spectrograms to - be projected onto Tensorboard. - - Args: - ap (AudioProcessor): audio processor used at training. - batch (Dict): Model inputs used at the previous training step. - outputs (Dict): Model outputs generated at the previoud training step. - - Returns: - Tuple[Dict, np.ndarray]: training plots and output waveform. - """ - ... - - @abstractmethod - def eval_step(self, batch: Dict, criterion: nn.Module) -> Tuple[Dict, Dict]: - """Perform a single evaluation step. Run the model forward ... and compute losses. In most cases, you can - call `train_step()` with no changes. - - Args: - batch (Dict): Input tensors. 
- criterion (nn.Module): Loss layer designed for the model. - - Returns: - Tuple[Dict, Dict]: Model ouputs and computed losses. - """ - outputs_dict = {} - loss_dict = {} # this returns from the criterion - ... - return outputs_dict, loss_dict - - def eval_log(self, batch: Dict, outputs: Dict, logger: "Logger", assets: Dict, steps: int) -> None: - """The same as `train_log()`""" - ... - @abstractmethod def load_checkpoint(self, config: Coqpit, checkpoint_path: str, eval: bool = False, strict: bool = True) -> None: - """Load a checkpoint and get ready for training or inference. + """Load a model checkpoint gile and get ready for training or inference. Args: config (Coqpit): Model configuration. @@ -135,36 +54,3 @@ class BaseTrainerModel(ABC, nn.Module): strcit (bool, optional): Match all checkpoint keys to model's keys. Defaults to True. """ ... - - @staticmethod - @abstractmethod - def init_from_config(config: Coqpit, samples: List[Dict] = None, verbose=False) -> "BaseTrainerModel": - """Init the model from given config. - - Override this depending on your model. - """ - ... - - @abstractmethod - def get_data_loader( - self, config: Coqpit, assets: Dict, is_eval: True, data_items: List, verbose: bool, num_gpus: int - ): - ... - - # def get_optimizer(self) -> Union["Optimizer", List["Optimizer"]]: - # """Setup an return optimizer or optimizers.""" - # ... - - # def get_lr(self) -> Union[float, List[float]]: - # """Return learning rate(s). - - # Returns: - # Union[float, List[float]]: Model's initial learning rates. - # """ - # ... - - # def get_scheduler(self, optimizer: torch.optim.Optimizer): - # ... - - # def get_criterion(self): - # ... diff --git a/TTS/server/README.md b/TTS/server/README.md index 89ee21eb..5458e398 100644 --- a/TTS/server/README.md +++ b/TTS/server/README.md @@ -21,4 +21,4 @@ Run the server with the official models on a GPU. ```CUDA_VISIBLE_DEVICES="0" python TTS/server/server.py --model_name tts_models/en/ljspeech/tacotron2-DCA --vocoder_name vocoder_models/en/ljspeech/multiband-melgan --use_cuda True``` Run the server with a custom models. -```python TTS/server/server.py --tts_checkpoint /path/to/tts/model.pth.tar --tts_config /path/to/tts/config.json --vocoder_checkpoint /path/to/vocoder/model.pth.tar --vocoder_config /path/to/vocoder/config.json``` +```python TTS/server/server.py --tts_checkpoint /path/to/tts/model.pth --tts_config /path/to/tts/config.json --vocoder_checkpoint /path/to/vocoder/model.pth --vocoder_config /path/to/vocoder/config.json``` diff --git a/TTS/server/conf.json b/TTS/server/conf.json index 32e475cf..49b6c09c 100644 --- a/TTS/server/conf.json +++ b/TTS/server/conf.json @@ -1,6 +1,6 @@ { "tts_path":"/media/erogol/data_ssd/Models/libri_tts/5049/", // tts model root folder - "tts_file":"best_model.pth.tar", // tts checkpoint file + "tts_file":"best_model.pth", // tts checkpoint file "tts_config":"config.json", // tts config.json file "tts_speakers": null, // json file listing speaker ids. null if no speaker embedding. 
"vocoder_config":null, diff --git a/TTS/server/server.py b/TTS/server/server.py index aef507fd..fd53e76d 100644 --- a/TTS/server/server.py +++ b/TTS/server/server.py @@ -143,7 +143,7 @@ def index(): "index.html", show_details=args.show_details, use_multi_speaker=use_multi_speaker, - speaker_ids=speaker_manager.speaker_ids if speaker_manager is not None else None, + speaker_ids=speaker_manager.ids if speaker_manager is not None else None, use_gst=use_gst, ) diff --git a/TTS/speaker_encoder/configs/config.json b/TTS/speaker_encoder/configs/config.json deleted file mode 100644 index 30d83e51..00000000 --- a/TTS/speaker_encoder/configs/config.json +++ /dev/null @@ -1,118 +0,0 @@ - -{ - "model_name": "lstm", - "run_name": "mueller91", - "run_description": "train speaker encoder with voxceleb1, voxceleb2 and libriSpeech ", - "audio":{ - // Audio processing parameters - "num_mels": 40, // size of the mel spec frame. - "fft_size": 400, // number of stft frequency levels. Size of the linear spectogram frame. - "sample_rate": 16000, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled. - "win_length": 400, // stft window length in ms. - "hop_length": 160, // stft window hop-lengh in ms. - "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used. - "frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used. - "preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. - "min_level_db": -100, // normalization range - "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air. - "power": 1.5, // value to sharpen wav signals after GL algorithm. - "griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation. - // Normalization parameters - "signal_norm": true, // normalize the spec values in range [0, 1] - "symmetric_norm": true, // move normalization to range [-1, 1] - "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] - "clip_norm": true, // clip normalized values into the range. - "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! - "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! - "do_trim_silence": true, // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) - "trim_db": 60, // threshold for timming silence. Set this according to your dataset. - "stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored - }, - "reinit_layers": [], - "loss": "angleproto", // "ge2e" to use Generalized End-to-End loss and "angleproto" to use Angular Prototypical loss (new SOTA) - "grad_clip": 3.0, // upper limit for gradients for clipping. - "epochs": 1000, // total number of epochs to train. - "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate. - "lr_decay": false, // if true, Noam learning rate decaying is applied through training. - "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr" - "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. - "steps_plot_stats": 10, // number of steps to plot embeddings. 
- "num_speakers_in_batch": 64, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. - "num_utters_per_speaker": 10, // - "skip_speakers": false, // skip speakers with samples less than "num_utters_per_speaker" - - "voice_len": 1.6, // number of seconds for each training instance - "num_loader_workers": 8, // number of training data loader processes. Don't set it too big. 4-8 are good values. - "wd": 0.000001, // Weight decay weight. - "checkpoint": true, // If true, it saves checkpoints per "save_step" - "save_step": 1000, // Number of training steps expected to save traning stats and checkpoints. - "print_step": 20, // Number of steps to log traning on console. - "output_path": "../../MozillaTTSOutput/checkpoints/voxceleb_librispeech/speaker_encoder/", // DATASET-RELATED: output path for all training outputs. - "model": { - "input_dim": 40, - "proj_dim": 256, - "lstm_dim": 768, - "num_lstm_layers": 3, - "use_lstm_with_projection": true - }, - - "audio_augmentation": { - "p": 0, - //add a gaussian noise to the data in order to increase robustness - "gaussian":{ // as the insertion of Gaussian noise is quick to be calculated, we added it after loading the wav file, this way, even audios that were reused with the cache can receive this noise - "p": 1, // propability of apply this method, 0 is disable - "min_amplitude": 0.0, - "max_amplitude": 1e-5 - } - }, - "storage": { - "sample_from_storage_p": 0.66, // the probability with which we'll sample from the DataSet in-memory storage - "storage_size": 15, // the size of the in-memory storage with respect to a single batch - "additive_noise": 1e-5 // add very small gaussian noise to the data in order to increase robustness - }, - "datasets": - [ - { - "name": "vctk_slim", - "path": "../../../audio-datasets/en/VCTK-Corpus/", - "meta_file_train": null, - "meta_file_val": null - }, - { - "name": "libri_tts", - "path": "../../../audio-datasets/en/LibriTTS/train-clean-100", - "meta_file_train": null, - "meta_file_val": null - }, - { - "name": "libri_tts", - "path": "../../../audio-datasets/en/LibriTTS/train-clean-360", - "meta_file_train": null, - "meta_file_val": null - }, - { - "name": "libri_tts", - "path": "../../../audio-datasets/en/LibriTTS/train-other-500", - "meta_file_train": null, - "meta_file_val": null - }, - { - "name": "voxceleb1", - "path": "../../../audio-datasets/en/voxceleb1/", - "meta_file_train": null, - "meta_file_val": null - }, - { - "name": "voxceleb2", - "path": "../../../audio-datasets/en/voxceleb2/", - "meta_file_train": null, - "meta_file_val": null - }, - { - "name": "common_voice", - "path": "../../../audio-datasets/en/MozillaCommonVoice", - "meta_file_train": "train.tsv", - "meta_file_val": "test.tsv" - } - ] -} \ No newline at end of file diff --git a/TTS/speaker_encoder/configs/config_resnet_angleproto.json b/TTS/speaker_encoder/configs/config_resnet_angleproto.json deleted file mode 100644 index c26d29ce..00000000 --- a/TTS/speaker_encoder/configs/config_resnet_angleproto.json +++ /dev/null @@ -1,956 +0,0 @@ -{ - "model": "speaker_encoder", - "run_name": "speaker_encoder", - "run_description": "resnet speaker encoder trained with commonvoice all languages dev and train, Voxceleb 1 dev and Voxceleb 2 dev", - // AUDIO PARAMETERS - "audio":{ - // Audio processing parameters - "num_mels": 80, // size of the mel spec frame. - "fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame. 
- "sample_rate": 16000, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled. - "win_length": 1024, // stft window length in ms. - "hop_length": 256, // stft window hop-lengh in ms. - "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used. - "frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used. - "preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. - "min_level_db": -100, // normalization range - "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air. - "power": 1.5, // value to sharpen wav signals after GL algorithm. - "griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation. - "stft_pad_mode": "reflect", - // Normalization parameters - "signal_norm": true, // normalize the spec values in range [0, 1] - "symmetric_norm": true, // move normalization to range [-1, 1] - "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] - "clip_norm": true, // clip normalized values into the range. - "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! - "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! - "spec_gain": 20.0, - "do_trim_silence": false, // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) - "trim_db": 60, // threshold for timming silence. Set this according to your dataset. - "stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored - }, - "reinit_layers": [], - - "loss": "angleproto", // "ge2e" to use Generalized End-to-End loss, "angleproto" to use Angular Prototypical loss and "softmaxproto" to use Softmax with Angular Prototypical loss - "grad_clip": 3.0, // upper limit for gradients for clipping. - "max_train_step": 1000000, // total number of steps to train. - "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate. - "lr_decay": false, // if true, Noam learning rate decaying is applied through training. - "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr" - "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. - "steps_plot_stats": 100, // number of steps to plot embeddings. - - // Speakers config - "num_speakers_in_batch": 200, // Batch size for training. - "num_utters_per_speaker": 2, // - "skip_speakers": true, // skip speakers with samples less than "num_utters_per_speaker" - "voice_len": 2, // number of seconds for each training instance - - "num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values. - "wd": 0.000001, // Weight decay weight. - "checkpoint": true, // If true, it saves checkpoints per "save_step" - "save_step": 1000, // Number of training steps expected to save the best checkpoints in training. - "print_step": 50, // Number of steps to log traning on console. - "output_path": "../checkpoints/speaker_encoder/angleproto/resnet_voxceleb1_and_voxceleb2-and-common-voice-all-using-angleproto/", // DATASET-RELATED: output path for all training outputs. 
- - "audio_augmentation": { - "p": 0.5, // propability of apply this method, 0 is disable rir and additive noise augmentation - "rir":{ - "rir_path": "/workspace/store/ecasanova/ComParE/RIRS_NOISES/simulated_rirs/", - "conv_mode": "full" - }, - "additive":{ - "sounds_path": "/workspace/store/ecasanova/ComParE/musan/", - // list of each of the directories in your data augmentation, if a directory is in "sounds_path" but is not listed here it will be ignored - "speech":{ - "min_snr_in_db": 13, - "max_snr_in_db": 20, - "min_num_noises": 2, - "max_num_noises": 3 - }, - "noise":{ - "min_snr_in_db": 0, - "max_snr_in_db": 15, - "min_num_noises": 1, - "max_num_noises": 1 - }, - "music":{ - "min_snr_in_db": 5, - "max_snr_in_db": 15, - "min_num_noises": 1, - "max_num_noises": 1 - } - }, - //add a gaussian noise to the data in order to increase robustness - "gaussian":{ // as the insertion of Gaussian noise is quick to be calculated, we added it after loading the wav file, this way, even audios that were reused with the cache can receive this noise - "p": 0.5, // propability of apply this method, 0 is disable - "min_amplitude": 0.0, - "max_amplitude": 1e-5 - } - }, - "model_params": { - "model_name": "resnet", - "input_dim": 80, - "proj_dim": 512 - }, - "storage": { - "sample_from_storage_p": 0.5, // the probability with which we'll sample from the DataSet in-memory storage - "storage_size": 35 // the size of the in-memory storage with respect to a single batch - }, - "datasets": - [ - { - "name": "voxceleb2", - "path": "/workspace/scratch/ecasanova/datasets/VoxCeleb/vox2_dev_aac/", - "meta_file_train": null, - "meta_file_val": null - }, - { - "name": "voxceleb1", - "path": "/workspace/scratch/ecasanova/datasets/VoxCeleb/vox1_dev_wav/", - "meta_file_train": null, - "meta_file_val": null - }, - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fi", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fi", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/zh-CN", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/zh-CN", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/rm-sursilv", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/rm-sursilv", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/lt", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/lt", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ka", - "meta_file_train": "train.tsv", - "meta_file_val": 
null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ka", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/sv-SE", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/sv-SE", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/pl", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/pl", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ru", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ru", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/mn", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/mn", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/nl", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/nl", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/sl", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/sl", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/es", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/es", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/pt", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/pt", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/hi", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": 
"/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/hi", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ja", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ja", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ia", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ia", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/br", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/br", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/id", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/id", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/dv", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/dv", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ta", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ta", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/or", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/or", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/zh-HK", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/zh-HK", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/de", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/de", - 
"meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/uk", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/uk", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/en", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/en", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fa", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fa", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/vi", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/vi", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ab", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ab", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/sah", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/sah", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/vot", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/vot", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fr", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fr", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/tr", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/tr", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - 
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/lg", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/lg", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/mt", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/mt", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/rw", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/rw", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/hu", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/hu", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/rm-vallader", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/rm-vallader", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/el", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/el", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/tt", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/tt", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/zh-TW", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/zh-TW", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/et", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/et", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": 
"/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fy-NL", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fy-NL", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cs", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cs", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/as", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/as", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ro", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ro", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/eo", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/eo", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/pa-IN", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/pa-IN", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/th", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/th", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/it", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/it", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ga-IE", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ga-IE", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": 
"/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cnh", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cnh", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ky", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ky", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ar", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ar", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/eu", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/eu", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ca", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ca", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/kab", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/kab", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cy", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cy", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cv", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cv", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/hsb", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/hsb", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/lv", - 
"meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/lv", - "meta_file_train": "dev.tsv", - "meta_file_val": null - } - - ] -} \ No newline at end of file diff --git a/TTS/speaker_encoder/configs/config_resnet_softmax_angleproto.json b/TTS/speaker_encoder/configs/config_resnet_softmax_angleproto.json deleted file mode 100644 index ccbd751a..00000000 --- a/TTS/speaker_encoder/configs/config_resnet_softmax_angleproto.json +++ /dev/null @@ -1,957 +0,0 @@ - -{ - "model": "speaker_encoder", - "run_name": "speaker_encoder", - "run_description": "resnet speaker encoder trained with commonvoice all languages dev and train, Voxceleb 1 dev and Voxceleb 2 dev", - // AUDIO PARAMETERS - "audio":{ - // Audio processing parameters - "num_mels": 80, // size of the mel spec frame. - "fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame. - "sample_rate": 16000, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled. - "win_length": 1024, // stft window length in ms. - "hop_length": 256, // stft window hop-lengh in ms. - "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used. - "frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used. - "preemphasis": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. - "min_level_db": -100, // normalization range - "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air. - "power": 1.5, // value to sharpen wav signals after GL algorithm. - "griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation. - "stft_pad_mode": "reflect", - // Normalization parameters - "signal_norm": true, // normalize the spec values in range [0, 1] - "symmetric_norm": true, // move normalization to range [-1, 1] - "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] - "clip_norm": true, // clip normalized values into the range. - "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! - "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!! - "spec_gain": 20.0, - "do_trim_silence": false, // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) - "trim_db": 60, // threshold for timming silence. Set this according to your dataset. - "stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored - }, - "reinit_layers": [], - - "loss": "softmaxproto", // "ge2e" to use Generalized End-to-End loss, "angleproto" to use Angular Prototypical loss and "softmaxproto" to use Softmax with Angular Prototypical loss - "grad_clip": 3.0, // upper limit for gradients for clipping. - "max_train_step": 1000000, // total number of steps to train. - "lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate. - "lr_decay": false, // if true, Noam learning rate decaying is applied through training. - "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr" - "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. 
Might be memory consuming, but good for debugging. - "steps_plot_stats": 100, // number of steps to plot embeddings. - - // Speakers config - "num_speakers_in_batch": 200, // Batch size for training. - "num_utters_per_speaker": 2, // - "skip_speakers": true, // skip speakers with samples less than "num_utters_per_speaker" - "voice_len": 2, // number of seconds for each training instance - - "num_loader_workers": 8, // number of training data loader processes. Don't set it too big. 4-8 are good values. - "wd": 0.000001, // Weight decay weight. - "checkpoint": true, // If true, it saves checkpoints per "save_step" - "save_step": 1000, // Number of training steps expected to save the best checkpoints in training. - "print_step": 50, // Number of steps to log traning on console. - "output_path": "../../../checkpoints/speaker_encoder/resnet_voxceleb1_and_voxceleb2-and-common-voice-all/", // DATASET-RELATED: output path for all training outputs. - - "audio_augmentation": { - "p": 0.5, // propability of apply this method, 0 is disable rir and additive noise augmentation - "rir":{ - "rir_path": "/workspace/store/ecasanova/ComParE/RIRS_NOISES/simulated_rirs/", - "conv_mode": "full" - }, - "additive":{ - "sounds_path": "/workspace/store/ecasanova/ComParE/musan/", - // list of each of the directories in your data augmentation, if a directory is in "sounds_path" but is not listed here it will be ignored - "speech":{ - "min_snr_in_db": 13, - "max_snr_in_db": 20, - "min_num_noises": 2, - "max_num_noises": 3 - }, - "noise":{ - "min_snr_in_db": 0, - "max_snr_in_db": 15, - "min_num_noises": 1, - "max_num_noises": 1 - }, - "music":{ - "min_snr_in_db": 5, - "max_snr_in_db": 15, - "min_num_noises": 1, - "max_num_noises": 1 - } - }, - //add a gaussian noise to the data in order to increase robustness - "gaussian":{ // as the insertion of Gaussian noise is quick to be calculated, we added it after loading the wav file, this way, even audios that were reused with the cache can receive this noise - "p": 0.5, // propability of apply this method, 0 is disable - "min_amplitude": 0.0, - "max_amplitude": 1e-5 - } - }, - "model_params": { - "model_name": "resnet", - "input_dim": 80, - "proj_dim": 512 - }, - "storage": { - "sample_from_storage_p": 0.66, // the probability with which we'll sample from the DataSet in-memory storage - "storage_size": 35 // the size of the in-memory storage with respect to a single batch - }, - "datasets": - [ - { - "name": "voxceleb2", - "path": "/workspace/scratch/ecasanova/datasets/VoxCeleb/vox2_dev_aac/", - "meta_file_train": null, - "meta_file_val": null - }, - { - "name": "voxceleb1", - "path": "/workspace/scratch/ecasanova/datasets/VoxCeleb/vox1_dev_wav/", - "meta_file_train": null, - "meta_file_val": null - }, - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fi", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fi", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/zh-CN", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/zh-CN", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": 
"common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/rm-sursilv", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/rm-sursilv", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/lt", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/lt", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ka", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ka", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/sv-SE", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/sv-SE", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/pl", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/pl", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ru", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ru", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/mn", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/mn", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/nl", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/nl", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/sl", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/sl", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": 
"/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/es", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/es", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/pt", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/pt", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/hi", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/hi", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ja", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ja", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ia", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ia", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/br", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/br", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/id", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/id", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/dv", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/dv", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ta", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ta", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/or", - 
"meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/or", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/zh-HK", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/zh-HK", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/de", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/de", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/uk", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/uk", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/en", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/en", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fa", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fa", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/vi", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/vi", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ab", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ab", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/sah", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/sah", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/vot", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": 
"common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/vot", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fr", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fr", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/tr", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/tr", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/lg", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/lg", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/mt", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/mt", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/rw", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/rw", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/hu", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/hu", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/rm-vallader", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/rm-vallader", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/el", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/el", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/tt", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": 
"/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/tt", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/zh-TW", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/zh-TW", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/et", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/et", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fy-NL", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/fy-NL", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cs", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cs", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/as", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/as", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ro", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ro", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/eo", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/eo", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/pa-IN", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/pa-IN", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/th", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": 
"/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/th", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/it", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/it", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ga-IE", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ga-IE", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cnh", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cnh", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ky", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ky", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ar", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ar", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/eu", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/eu", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ca", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ca", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/kab", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/kab", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cy", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cy", 
- "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cv", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/cv", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/hsb", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/hsb", - "meta_file_train": "dev.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/lv", - "meta_file_train": "train.tsv", - "meta_file_val": null - }, - - { - "name": "common_voice", - "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/lv", - "meta_file_train": "dev.tsv", - "meta_file_val": null - } - - ] -} \ No newline at end of file diff --git a/TTS/speaker_encoder/dataset.py b/TTS/speaker_encoder/dataset.py deleted file mode 100644 index 28a23e2f..00000000 --- a/TTS/speaker_encoder/dataset.py +++ /dev/null @@ -1,253 +0,0 @@ -import random - -import numpy as np -import torch -from torch.utils.data import Dataset - -from TTS.speaker_encoder.utils.generic_utils import AugmentWAV, Storage - - -class SpeakerEncoderDataset(Dataset): - def __init__( - self, - ap, - meta_data, - voice_len=1.6, - num_speakers_in_batch=64, - storage_size=1, - sample_from_storage_p=0.5, - num_utter_per_speaker=10, - skip_speakers=False, - verbose=False, - augmentation_config=None, - ): - """ - Args: - ap (TTS.tts.utils.AudioProcessor): audio processor object. - meta_data (list): list of dataset instances. - seq_len (int): voice segment length in seconds. - verbose (bool): print diagnostic information. 
- """ - super().__init__() - self.items = meta_data - self.sample_rate = ap.sample_rate - self.seq_len = int(voice_len * self.sample_rate) - self.num_speakers_in_batch = num_speakers_in_batch - self.num_utter_per_speaker = num_utter_per_speaker - self.skip_speakers = skip_speakers - self.ap = ap - self.verbose = verbose - self.__parse_items() - storage_max_size = storage_size * num_speakers_in_batch - self.storage = Storage( - maxsize=storage_max_size, storage_batchs=storage_size, num_speakers_in_batch=num_speakers_in_batch - ) - self.sample_from_storage_p = float(sample_from_storage_p) - - speakers_aux = list(self.speakers) - speakers_aux.sort() - self.speakerid_to_classid = {key: i for i, key in enumerate(speakers_aux)} - - # Augmentation - self.augmentator = None - self.gaussian_augmentation_config = None - if augmentation_config: - self.data_augmentation_p = augmentation_config["p"] - if self.data_augmentation_p and ("additive" in augmentation_config or "rir" in augmentation_config): - self.augmentator = AugmentWAV(ap, augmentation_config) - - if "gaussian" in augmentation_config.keys(): - self.gaussian_augmentation_config = augmentation_config["gaussian"] - - if self.verbose: - print("\n > DataLoader initialization") - print(f" | > Speakers per Batch: {num_speakers_in_batch}") - print(f" | > Storage Size: {storage_max_size} instances, each with {num_utter_per_speaker} utters") - print(f" | > Sample_from_storage_p : {self.sample_from_storage_p}") - print(f" | > Number of instances : {len(self.items)}") - print(f" | > Sequence length: {self.seq_len}") - print(f" | > Num speakers: {len(self.speakers)}") - - def load_wav(self, filename): - audio = self.ap.load_wav(filename, sr=self.ap.sample_rate) - return audio - - def load_data(self, idx): - text, wav_file, speaker_name = self.items[idx] - wav = np.asarray(self.load_wav(wav_file), dtype=np.float32) - mel = self.ap.melspectrogram(wav).astype("float32") - # sample seq_len - - assert text.size > 0, self.items[idx]["audio_file"] - assert wav.size > 0, self.items[idx]["audio_file"] - - sample = { - "mel": mel, - "item_idx": self.items[idx]["audio_file"], - "speaker_name": speaker_name, - } - return sample - - def __parse_items(self): - self.speaker_to_utters = {} - for i in self.items: - path_ = i["audio_file"] - speaker_ = i["speaker_name"] - if speaker_ in self.speaker_to_utters.keys(): - self.speaker_to_utters[speaker_].append(path_) - else: - self.speaker_to_utters[speaker_] = [ - path_, - ] - - if self.skip_speakers: - self.speaker_to_utters = { - k: v for (k, v) in self.speaker_to_utters.items() if len(v) >= self.num_utter_per_speaker - } - - self.speakers = [k for (k, v) in self.speaker_to_utters.items()] - - def __len__(self): - return int(1e10) - - def get_num_speakers(self): - return len(self.speakers) - - def __sample_speaker(self, ignore_speakers=None): - speaker = random.sample(self.speakers, 1)[0] - # if list of speakers_id is provide make sure that it's will be ignored - if ignore_speakers and self.speakerid_to_classid[speaker] in ignore_speakers: - while True: - speaker = random.sample(self.speakers, 1)[0] - if self.speakerid_to_classid[speaker] not in ignore_speakers: - break - - if self.num_utter_per_speaker > len(self.speaker_to_utters[speaker]): - utters = random.choices(self.speaker_to_utters[speaker], k=self.num_utter_per_speaker) - else: - utters = random.sample(self.speaker_to_utters[speaker], self.num_utter_per_speaker) - return speaker, utters - - def __sample_speaker_utterances(self, speaker): - """ - Sample all M 
utterances for the given speaker. - """ - wavs = [] - labels = [] - for _ in range(self.num_utter_per_speaker): - # TODO:dummy but works - while True: - # remove speakers that have num_utter less than 2 - if len(self.speaker_to_utters[speaker]) > 1: - utter = random.sample(self.speaker_to_utters[speaker], 1)[0] - else: - if speaker in self.speakers: - self.speakers.remove(speaker) - - speaker, _ = self.__sample_speaker() - continue - - wav = self.load_wav(utter) - if wav.shape[0] - self.seq_len > 0: - break - - if utter in self.speaker_to_utters[speaker]: - self.speaker_to_utters[speaker].remove(utter) - - if self.augmentator is not None and self.data_augmentation_p: - if random.random() < self.data_augmentation_p: - wav = self.augmentator.apply_one(wav) - - wavs.append(wav) - labels.append(self.speakerid_to_classid[speaker]) - return wavs, labels - - def __getitem__(self, idx): - speaker, _ = self.__sample_speaker() - speaker_id = self.speakerid_to_classid[speaker] - return speaker, speaker_id - - def __load_from_disk_and_storage(self, speaker): - # don't sample from storage, but from HDD - wavs_, labels_ = self.__sample_speaker_utterances(speaker) - # put the newly loaded item into storage - self.storage.append((wavs_, labels_)) - return wavs_, labels_ - - def collate_fn(self, batch): - # get the batch speaker_ids - batch = np.array(batch) - speakers_id_in_batch = set(batch[:, 1].astype(np.int32)) - - labels = [] - feats = [] - speakers = set() - - for speaker, speaker_id in batch: - speaker_id = int(speaker_id) - - # ensure that an speaker appears only once in the batch - if speaker_id in speakers: - - # remove current speaker - if speaker_id in speakers_id_in_batch: - speakers_id_in_batch.remove(speaker_id) - - speaker, _ = self.__sample_speaker(ignore_speakers=speakers_id_in_batch) - speaker_id = self.speakerid_to_classid[speaker] - speakers_id_in_batch.add(speaker_id) - - if random.random() < self.sample_from_storage_p and self.storage.full(): - # sample from storage (if full) - wavs_, labels_ = self.storage.get_random_sample_fast() - - # force choose the current speaker or other not in batch - # It's necessary for ideal training with AngleProto and GE2E losses - if labels_[0] in speakers_id_in_batch and labels_[0] != speaker_id: - attempts = 0 - while True: - wavs_, labels_ = self.storage.get_random_sample_fast() - if labels_[0] == speaker_id or labels_[0] not in speakers_id_in_batch: - break - - attempts += 1 - # Try 5 times after that load from disk - if attempts >= 5: - wavs_, labels_ = self.__load_from_disk_and_storage(speaker) - break - else: - # don't sample from storage, but from HDD - wavs_, labels_ = self.__load_from_disk_and_storage(speaker) - - # append speaker for control - speakers.add(labels_[0]) - - # remove current speaker and append other - if speaker_id in speakers_id_in_batch: - speakers_id_in_batch.remove(speaker_id) - - speakers_id_in_batch.add(labels_[0]) - - # get a random subset of each of the wavs and extract mel spectrograms. 
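Editorial aside: with the removed config values, the storage above holds 35 × 200 = 7000 cached (wavs, labels) entries of 2 utterances each, and with `sample_from_storage_p` at 0.5 (0.66 in the softmaxproto variant) roughly half of the speaker draws are served from it once it is full, before the collision fall-back shown above. The removed lines continuing below then crop each selected waveform to `seq_len` samples and optionally add the configured Gaussian noise; the same two steps in isolation, as a hedged sketch using the removed config's values:

```python
import random
import numpy as np

# Sketch of the random fixed-length crop plus probability-gated Gaussian noise
# from the removed config ("gaussian": p=0.5, min_amplitude=0.0,
# max_amplitude=1e-5). As in the removed collate_fn, min/max_amplitude are
# passed to np.random.normal as the mean and the standard deviation.
SEQ_LEN = 2 * 16000          # voice_len (s) * sample_rate
GAUSSIAN = {"p": 0.5, "min_amplitude": 0.0, "max_amplitude": 1e-5}

def crop_and_maybe_add_noise(wav: np.ndarray) -> np.ndarray:
    offset = random.randint(0, wav.shape[0] - SEQ_LEN)
    wav = wav[offset : offset + SEQ_LEN]
    if GAUSSIAN["p"] and random.random() < GAUSSIAN["p"]:
        wav = wav + np.random.normal(
            GAUSSIAN["min_amplitude"], GAUSSIAN["max_amplitude"], size=len(wav)
        )
    return wav

wav = np.random.randn(5 * 16000).astype(np.float32)   # 5 s of dummy 16 kHz audio
segment = crop_and_maybe_add_noise(wav)
print(segment.shape)                                   # (32000,)
```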
- feats_ = [] - for wav in wavs_: - offset = random.randint(0, wav.shape[0] - self.seq_len) - wav = wav[offset : offset + self.seq_len] - # add random gaussian noise - if self.gaussian_augmentation_config and self.gaussian_augmentation_config["p"]: - if random.random() < self.gaussian_augmentation_config["p"]: - wav += np.random.normal( - self.gaussian_augmentation_config["min_amplitude"], - self.gaussian_augmentation_config["max_amplitude"], - size=len(wav), - ) - mel = self.ap.melspectrogram(wav) - feats_.append(torch.FloatTensor(mel)) - - labels.append(torch.LongTensor(labels_)) - feats.extend(feats_) - - feats = torch.stack(feats) - labels = torch.stack(labels) - - return feats, labels diff --git a/TTS/speaker_encoder/models/lstm.py b/TTS/speaker_encoder/models/lstm.py deleted file mode 100644 index ec394cdb..00000000 --- a/TTS/speaker_encoder/models/lstm.py +++ /dev/null @@ -1,189 +0,0 @@ -import numpy as np -import torch -import torchaudio -from torch import nn - -from TTS.speaker_encoder.models.resnet import PreEmphasis -from TTS.utils.io import load_fsspec - - -class LSTMWithProjection(nn.Module): - def __init__(self, input_size, hidden_size, proj_size): - super().__init__() - self.input_size = input_size - self.hidden_size = hidden_size - self.proj_size = proj_size - self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True) - self.linear = nn.Linear(hidden_size, proj_size, bias=False) - - def forward(self, x): - self.lstm.flatten_parameters() - o, (_, _) = self.lstm(x) - return self.linear(o) - - -class LSTMWithoutProjection(nn.Module): - def __init__(self, input_dim, lstm_dim, proj_dim, num_lstm_layers): - super().__init__() - self.lstm = nn.LSTM(input_size=input_dim, hidden_size=lstm_dim, num_layers=num_lstm_layers, batch_first=True) - self.linear = nn.Linear(lstm_dim, proj_dim, bias=True) - self.relu = nn.ReLU() - - def forward(self, x): - _, (hidden, _) = self.lstm(x) - return self.relu(self.linear(hidden[-1])) - - -class LSTMSpeakerEncoder(nn.Module): - def __init__( - self, - input_dim, - proj_dim=256, - lstm_dim=768, - num_lstm_layers=3, - use_lstm_with_projection=True, - use_torch_spec=False, - audio_config=None, - ): - super().__init__() - self.use_lstm_with_projection = use_lstm_with_projection - self.use_torch_spec = use_torch_spec - self.audio_config = audio_config - self.proj_dim = proj_dim - - layers = [] - # choise LSTM layer - if use_lstm_with_projection: - layers.append(LSTMWithProjection(input_dim, lstm_dim, proj_dim)) - for _ in range(num_lstm_layers - 1): - layers.append(LSTMWithProjection(proj_dim, lstm_dim, proj_dim)) - self.layers = nn.Sequential(*layers) - else: - self.layers = LSTMWithoutProjection(input_dim, lstm_dim, proj_dim, num_lstm_layers) - - self.instancenorm = nn.InstanceNorm1d(input_dim) - - if self.use_torch_spec: - self.torch_spec = torch.nn.Sequential( - PreEmphasis(audio_config["preemphasis"]), - # TorchSTFT( - # n_fft=audio_config["fft_size"], - # hop_length=audio_config["hop_length"], - # win_length=audio_config["win_length"], - # sample_rate=audio_config["sample_rate"], - # window="hamming_window", - # mel_fmin=0.0, - # mel_fmax=None, - # use_htk=True, - # do_amp_to_db=False, - # n_mels=audio_config["num_mels"], - # power=2.0, - # use_mel=True, - # mel_norm=None, - # ) - torchaudio.transforms.MelSpectrogram( - sample_rate=audio_config["sample_rate"], - n_fft=audio_config["fft_size"], - win_length=audio_config["win_length"], - hop_length=audio_config["hop_length"], - window_fn=torch.hamming_window, - n_mels=audio_config["num_mels"], - 
), - ) - else: - self.torch_spec = None - - self._init_layers() - - def _init_layers(self): - for name, param in self.layers.named_parameters(): - if "bias" in name: - nn.init.constant_(param, 0.0) - elif "weight" in name: - nn.init.xavier_normal_(param) - - def forward(self, x, l2_norm=True): - """Forward pass of the model. - - Args: - x (Tensor): Raw waveform signal or spectrogram frames. If input is a waveform, `torch_spec` must be `True` - to compute the spectrogram on-the-fly. - l2_norm (bool): Whether to L2-normalize the outputs. - - Shapes: - - x: :math:`(N, 1, T_{in})` or :math:`(N, D_{spec}, T_{in})` - """ - with torch.no_grad(): - with torch.cuda.amp.autocast(enabled=False): - if self.use_torch_spec: - x.squeeze_(1) - x = self.torch_spec(x) - x = self.instancenorm(x).transpose(1, 2) - d = self.layers(x) - if self.use_lstm_with_projection: - d = d[:, -1] - if l2_norm: - d = torch.nn.functional.normalize(d, p=2, dim=1) - return d - - @torch.no_grad() - def inference(self, x, l2_norm=True): - d = self.forward(x, l2_norm=l2_norm) - return d - - def compute_embedding(self, x, num_frames=250, num_eval=10, return_mean=True): - """ - Generate embeddings for a batch of utterances - x: 1xTxD - """ - max_len = x.shape[1] - - if max_len < num_frames: - num_frames = max_len - - offsets = np.linspace(0, max_len - num_frames, num=num_eval) - - frames_batch = [] - for offset in offsets: - offset = int(offset) - end_offset = int(offset + num_frames) - frames = x[:, offset:end_offset] - frames_batch.append(frames) - - frames_batch = torch.cat(frames_batch, dim=0) - embeddings = self.inference(frames_batch) - - if return_mean: - embeddings = torch.mean(embeddings, dim=0, keepdim=True) - - return embeddings - - def batch_compute_embedding(self, x, seq_lens, num_frames=160, overlap=0.5): - """ - Generate embeddings for a batch of utterances - x: BxTxD - """ - num_overlap = num_frames * overlap - max_len = x.shape[1] - embed = None - num_iters = seq_lens / (num_frames - num_overlap) - cur_iter = 0 - for offset in range(0, max_len, num_frames - num_overlap): - cur_iter += 1 - end_offset = min(x.shape[1], offset + num_frames) - frames = x[:, offset:end_offset] - if embed is None: - embed = self.inference(frames) - else: - embed[cur_iter <= num_iters, :] += self.inference(frames[cur_iter <= num_iters, :, :]) - return embed / num_iters - - # pylint: disable=unused-argument, redefined-builtin - def load_checkpoint(self, config: dict, checkpoint_path: str, eval: bool = False, use_cuda: bool = False): - state = load_fsspec(checkpoint_path, map_location=torch.device("cpu")) - self.load_state_dict(state["model"]) - if use_cuda: - self.cuda() - if eval: - self.eval() - assert not self.training diff --git a/TTS/speaker_encoder/umap.png b/TTS/speaker_encoder/umap.png deleted file mode 100644 index ca8aefea..00000000 Binary files a/TTS/speaker_encoder/umap.png and /dev/null differ diff --git a/TTS/tts/configs/shared_configs.py b/TTS/tts/configs/shared_configs.py index f43c6464..dcc862e8 100644 --- a/TTS/tts/configs/shared_configs.py +++ b/TTS/tts/configs/shared_configs.py @@ -220,6 +220,18 @@ class BaseTTSConfig(BaseTrainingConfig): eval_split_size (float): If between 0.0 and 1.0 represents the proportion of the dataset to include in the evaluation set. If > 1, represents the absolute number of evaluation samples. Defaults to 0.01 (1%). + + use_speaker_weighted_sampler (bool): + Enable / Disable the batch balancer by speaker. Defaults to ```False```. 
+ + speaker_weighted_sampler_alpha (float): + Number that control the influence of the speaker sampler weights. Defaults to ```1.0```. + + use_language_weighted_sampler (bool): + Enable / Disable the batch balancer by language. Defaults to ```False```. + + language_weighted_sampler_alpha (float): + Number that control the influence of the language sampler weights. Defaults to ```1.0```. """ audio: BaseAudioConfig = field(default_factory=BaseAudioConfig) @@ -252,7 +264,7 @@ class BaseTTSConfig(BaseTrainingConfig): # dataset datasets: List[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()]) # optimizer - optimizer: str = None + optimizer: str = "radam" optimizer_params: dict = None # scheduler lr_scheduler: str = "" @@ -262,3 +274,8 @@ class BaseTTSConfig(BaseTrainingConfig): # evaluation eval_split_max_size: int = None eval_split_size: float = 0.01 + # weighted samplers + use_speaker_weighted_sampler: bool = False + speaker_weighted_sampler_alpha: float = 1.0 + use_language_weighted_sampler: bool = False + language_weighted_sampler_alpha: float = 1.0 diff --git a/TTS/tts/datasets/formatters.py b/TTS/tts/datasets/formatters.py index aacfc647..c13fcdb8 100644 --- a/TTS/tts/datasets/formatters.py +++ b/TTS/tts/datasets/formatters.py @@ -246,7 +246,7 @@ def libri_tts(root_path, meta_files=None, ignored_speakers=None): continue items.append({"text": text, "audio_file": wav_file, "speaker_name": f"LTTS_{speaker_name}"}) for item in items: - assert os.path.exists(item[1]), f" [!] wav files don't exist - {item[1]}" + assert os.path.exists(item["audio_file"]), f" [!] wav files don't exist - {item['audio_file']}" return items @@ -328,27 +328,49 @@ def vctk(root_path, meta_files=None, wavs_path="wav48_silence_trimmed", mic="mic else: wav_file = os.path.join(root_path, wavs_path, speaker_id, file_id + f"_{mic}.{file_ext}") if os.path.exists(wav_file): - items.append([text, wav_file, "VCTK_" + speaker_id]) + items.append({"text": text, "audio_file": wav_file, "speaker_name": "VCTK_" + speaker_id}) else: print(f" [!] 
wav files don't exist - {wav_file}") return items -def vctk_old(root_path, meta_files=None, wavs_path="wav48"): +def vctk_old(root_path, meta_files=None, wavs_path="wav48", ignored_speakers=None): """homepages.inf.ed.ac.uk/jyamagis/release/VCTK-Corpus.tar.gz""" - test_speakers = meta_files items = [] meta_files = glob(f"{os.path.join(root_path,'txt')}/**/*.txt", recursive=True) for meta_file in meta_files: _, speaker_id, txt_file = os.path.relpath(meta_file, root_path).split(os.sep) file_id = txt_file.split(".")[0] - if isinstance(test_speakers, list): # if is list ignore this speakers ids - if speaker_id in test_speakers: + # ignore speakers + if isinstance(ignored_speakers, list): + if speaker_id in ignored_speakers: continue with open(meta_file, "r", encoding="utf-8") as file_text: text = file_text.readlines()[0] wav_file = os.path.join(root_path, wavs_path, speaker_id, file_id + ".wav") - items.append([text, wav_file, "VCTK_old_" + speaker_id]) + items.append({"text": text, "audio_file": wav_file, "speaker_name": "VCTK_old_" + speaker_id}) + return items + + +def open_bible(root_path, meta_files="train", ignore_digits_sentences=True, ignored_speakers=None): + """ToDo: Refer the paper when available""" + items = [] + split_dir = meta_files + meta_files = glob(f"{os.path.join(root_path, split_dir)}/**/*.txt", recursive=True) + for meta_file in meta_files: + _, speaker_id, txt_file = os.path.relpath(meta_file, root_path).split(os.sep) + file_id = txt_file.split(".")[0] + # ignore speakers + if isinstance(ignored_speakers, list): + if speaker_id in ignored_speakers: + continue + with open(meta_file, "r", encoding="utf-8") as file_text: + text = file_text.readline().replace("\n", "") + # ignore sentences that contains digits + if ignore_digits_sentences and any(map(str.isdigit, text)): + continue + wav_file = os.path.join(root_path, split_dir, speaker_id, file_id + ".flac") + items.append({"text": text, "audio_file": wav_file, "speaker_name": "OB_" + speaker_id}) return items @@ -419,6 +441,26 @@ def _voxcel_x(root_path, meta_file, voxcel_idx): return [x.strip().split("|") for x in f.readlines()] +def emotion(root_path, meta_file, ignored_speakers=None): + """Generic emotion dataset""" + txt_file = os.path.join(root_path, meta_file) + items = [] + with open(txt_file, "r", encoding="utf-8") as ttf: + for line in ttf: + if line.startswith("file_path"): + continue + cols = line.split(",") + wav_file = os.path.join(root_path, cols[0]) + speaker_id = cols[1] + emotion_id = cols[2].replace("\n", "") + # ignore speakers + if isinstance(ignored_speakers, list): + if speaker_id in ignored_speakers: + continue + items.append({"audio_file": wav_file, "speaker_name": speaker_id, "emotion_name": emotion_id}) + return items + + def baker(root_path: str, meta_file: str, **kwargs) -> List[List[str]]: # pylint: disable=unused-argument """Normalizes the Baker meta data file to TTS format diff --git a/TTS/tts/models/base_tts.py b/TTS/tts/models/base_tts.py index 4e54b947..652b77dd 100644 --- a/TTS/tts/models/base_tts.py +++ b/TTS/tts/models/base_tts.py @@ -7,12 +7,13 @@ import torch.distributed as dist from coqpit import Coqpit from torch import nn from torch.utils.data import DataLoader -from torch.utils.data.distributed import DistributedSampler +from torch.utils.data.sampler import WeightedRandomSampler +from trainer.torch import DistributedSampler, DistributedSamplerWrapper from TTS.model import BaseTrainerModel from TTS.tts.datasets.dataset import TTSDataset -from TTS.tts.utils.languages import 
LanguageManager, get_language_weighted_sampler -from TTS.tts.utils.speakers import SpeakerManager, get_speaker_weighted_sampler +from TTS.tts.utils.languages import LanguageManager, get_language_balancer_weights +from TTS.tts.utils.speakers import SpeakerManager, get_speaker_balancer_weights, get_speaker_manager from TTS.tts.utils.synthesis import synthesis from TTS.tts.utils.visual import plot_alignment, plot_spectrogram @@ -135,18 +136,18 @@ class BaseTTS(BaseTrainerModel): if hasattr(self, "speaker_manager"): if config.use_d_vector_file: if speaker_name is None: - d_vector = self.speaker_manager.get_random_d_vector() + d_vector = self.speaker_manager.get_random_embeddings() else: - d_vector = self.speaker_manager.get_d_vector_by_speaker(speaker_name) + d_vector = self.speaker_manager.get_d_vector_by_name(speaker_name) elif config.use_speaker_embedding: if speaker_name is None: - speaker_id = self.speaker_manager.get_random_speaker_id() + speaker_id = self.speaker_manager.get_random_id() else: - speaker_id = self.speaker_manager.speaker_ids[speaker_name] + speaker_id = self.speaker_manager.ids[speaker_name] # get language id if hasattr(self, "language_manager") and config.use_language_embedding and language_name is not None: - language_id = self.language_manager.language_id_mapping[language_name] + language_id = self.language_manager.ids[language_name] return { "text": text, @@ -232,6 +233,36 @@ class BaseTTS(BaseTrainerModel): "language_ids": language_ids, } + def get_sampler(self, config: Coqpit, dataset: TTSDataset, num_gpus=1): + weights = None + data_items = dataset.samples + + if getattr(config, "use_language_weighted_sampler", False): + alpha = getattr(config, "language_weighted_sampler_alpha", 1.0) + print(" > Using Language weighted sampler with alpha:", alpha) + weights = get_language_balancer_weights(data_items) * alpha + + if getattr(config, "use_speaker_weighted_sampler", False): + alpha = getattr(config, "speaker_weighted_sampler_alpha", 1.0) + print(" > Using Speaker weighted sampler with alpha:", alpha) + if weights is not None: + weights += get_speaker_balancer_weights(data_items) * alpha + else: + weights = get_speaker_balancer_weights(data_items) * alpha + + if weights is not None: + sampler = WeightedRandomSampler(weights, len(weights)) + else: + sampler = None + + # sampler for DDP + if sampler is None: + sampler = DistributedSampler(dataset) if num_gpus > 1 else None + else: # If a sampler is already defined use this sampler and DDP sampler together + sampler = DistributedSamplerWrapper(sampler) if num_gpus > 1 else sampler + + return sampler + def get_data_loader( self, config: Coqpit, @@ -248,23 +279,19 @@ class BaseTTS(BaseTrainerModel): # setup multi-speaker attributes if hasattr(self, "speaker_manager") and self.speaker_manager is not None: if hasattr(config, "model_args"): - speaker_id_mapping = ( - self.speaker_manager.speaker_ids if config.model_args.use_speaker_embedding else None - ) - d_vector_mapping = self.speaker_manager.d_vectors if config.model_args.use_d_vector_file else None + speaker_id_mapping = self.speaker_manager.ids if config.model_args.use_speaker_embedding else None + d_vector_mapping = self.speaker_manager.embeddings if config.model_args.use_d_vector_file else None config.use_d_vector_file = config.model_args.use_d_vector_file else: - speaker_id_mapping = self.speaker_manager.speaker_ids if config.use_speaker_embedding else None - d_vector_mapping = self.speaker_manager.d_vectors if config.use_d_vector_file else None + speaker_id_mapping = 
self.speaker_manager.ids if config.use_speaker_embedding else None + d_vector_mapping = self.speaker_manager.embeddings if config.use_d_vector_file else None else: speaker_id_mapping = None d_vector_mapping = None # setup multi-lingual attributes if hasattr(self, "language_manager") and self.language_manager is not None: - language_id_mapping = ( - self.language_manager.language_id_mapping if self.args.use_language_embedding else None - ) + language_id_mapping = self.language_manager.ids if self.args.use_language_embedding else None else: language_id_mapping = None @@ -300,25 +327,8 @@ class BaseTTS(BaseTrainerModel): # sort input sequences from short to long dataset.preprocess_samples() - # sampler for DDP - sampler = DistributedSampler(dataset) if num_gpus > 1 else None - - # Weighted samplers - # TODO: make this DDP amenable - assert not ( - num_gpus > 1 and getattr(config, "use_language_weighted_sampler", False) - ), "language_weighted_sampler is not supported with DistributedSampler" - assert not ( - num_gpus > 1 and getattr(config, "use_speaker_weighted_sampler", False) - ), "speaker_weighted_sampler is not supported with DistributedSampler" - - if sampler is None: - if getattr(config, "use_language_weighted_sampler", False): - print(" > Using Language weighted sampler") - sampler = get_language_weighted_sampler(dataset.samples) - elif getattr(config, "use_speaker_weighted_sampler", False): - print(" > Using Language weighted sampler") - sampler = get_speaker_weighted_sampler(dataset.samples) + # get samplers + sampler = self.get_sampler(config, dataset, num_gpus) loader = DataLoader( dataset, @@ -338,13 +348,13 @@ class BaseTTS(BaseTrainerModel): d_vector = None if self.config.use_d_vector_file: - d_vector = [self.speaker_manager.d_vectors[name]["embedding"] for name in self.speaker_manager.d_vectors] + d_vector = [self.speaker_manager.embeddings[name]["embedding"] for name in self.speaker_manager.embeddings] d_vector = (random.sample(sorted(d_vector), 1),) aux_inputs = { "speaker_id": None if not self.config.use_speaker_embedding - else random.sample(sorted(self.speaker_manager.speaker_ids.values()), 1), + else random.sample(sorted(self.speaker_manager.ids.values()), 1), "d_vector": d_vector, "style_wav": None, # TODO: handle GST style input } @@ -391,7 +401,7 @@ class BaseTTS(BaseTrainerModel): """Save the speaker.json and language_ids.json at the beginning of the training. 
Also update both paths.""" if self.speaker_manager is not None: output_path = os.path.join(trainer.output_path, "speakers.json") - self.speaker_manager.save_speaker_ids_to_file(output_path) + self.speaker_manager.save_ids_to_file(output_path) trainer.config.speakers_file = output_path # some models don't have `model_args` set if hasattr(trainer.config, "model_args"): @@ -402,7 +412,7 @@ class BaseTTS(BaseTrainerModel): if hasattr(self, "language_manager") and self.language_manager is not None: output_path = os.path.join(trainer.output_path, "language_ids.json") - self.language_manager.save_language_ids_to_file(output_path) + self.language_manager.save_ids_to_file(output_path) trainer.config.language_ids_file = output_path if hasattr(trainer.config, "model_args"): trainer.config.model_args.language_ids_file = output_path diff --git a/TTS/tts/models/glow_tts.py b/TTS/tts/models/glow_tts.py index fea570a6..7c0f95e1 100644 --- a/TTS/tts/models/glow_tts.py +++ b/TTS/tts/models/glow_tts.py @@ -124,7 +124,7 @@ class GlowTTS(BaseTTS): ) if self.speaker_manager is not None: assert ( - config.d_vector_dim == self.speaker_manager.d_vector_dim + config.d_vector_dim == self.speaker_manager.embedding_dim ), " [!] d-vector dimension mismatch b/w config and speaker manager." # init speaker embedding layer if config.use_speaker_embedding and not config.use_d_vector_file: diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index a43e081c..943b9eae 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -13,7 +13,6 @@ from torch import nn from torch.cuda.amp.autocast_mode import autocast from torch.nn import functional as F from torch.utils.data import DataLoader -from torch.utils.data.distributed import DistributedSampler from trainer.trainer_utils import get_optimizer, get_scheduler from TTS.tts.configs.shared_configs import CharactersConfig @@ -24,8 +23,8 @@ from TTS.tts.layers.vits.networks import PosteriorEncoder, ResidualCouplingBlock from TTS.tts.layers.vits.stochastic_duration_predictor import StochasticDurationPredictor from TTS.tts.models.base_tts import BaseTTS from TTS.tts.utils.helpers import generate_path, maximum_path, rand_segments, segment, sequence_mask -from TTS.tts.utils.languages import LanguageManager, get_language_weighted_sampler -from TTS.tts.utils.speakers import SpeakerManager, get_speaker_weighted_sampler +from TTS.tts.utils.languages import LanguageManager +from TTS.tts.utils.speakers import SpeakerManager from TTS.tts.utils.synthesis import synthesis from TTS.tts.utils.text.characters import BaseCharacters, _characters, _pad, _phonemes, _punctuations from TTS.tts.utils.text.tokenizer import TTSTokenizer @@ -653,28 +652,28 @@ class Vits(BaseTTS): # TODO: make this a function if self.args.use_speaker_encoder_as_loss: - if self.speaker_manager.speaker_encoder is None and ( + if self.speaker_manager.encoder is None and ( not self.args.speaker_encoder_model_path or not self.args.speaker_encoder_config_path ): raise RuntimeError( " [!] To use the speaker consistency loss (SCL) you need to specify speaker_encoder_model_path and speaker_encoder_config_path !!" 
) - self.speaker_manager.speaker_encoder.eval() + self.speaker_manager.encoder.eval() print(" > External Speaker Encoder Loaded !!") if ( - hasattr(self.speaker_manager.speaker_encoder, "audio_config") - and self.config.audio["sample_rate"] != self.speaker_manager.speaker_encoder.audio_config["sample_rate"] + hasattr(self.speaker_manager.encoder, "audio_config") + and self.config.audio["sample_rate"] != self.speaker_manager.encoder.audio_config["sample_rate"] ): self.audio_transform = torchaudio.transforms.Resample( orig_freq=self.audio_config["sample_rate"], - new_freq=self.speaker_manager.speaker_encoder.audio_config["sample_rate"], + new_freq=self.speaker_manager.encoder.audio_config["sample_rate"], ) # pylint: disable=W0101,W0105 self.audio_transform = torchaudio.transforms.Resample( orig_freq=self.config.audio.sample_rate, - new_freq=self.speaker_manager.speaker_encoder.audio_config["sample_rate"], + new_freq=self.speaker_manager.encoder.audio_config["sample_rate"], ) def _init_speaker_embedding(self): @@ -707,7 +706,6 @@ class Vits(BaseTTS): torch.nn.init.xavier_uniform_(self.emb_l.weight) else: self.embedded_language_dim = 0 - self.emb_l = None def get_aux_input(self, aux_input: Dict): sid, g, lid = self._set_cond_input(aux_input) @@ -889,7 +887,7 @@ class Vits(BaseTTS): pad_short=True, ) - if self.args.use_speaker_encoder_as_loss and self.speaker_manager.speaker_encoder is not None: + if self.args.use_speaker_encoder_as_loss and self.speaker_manager.encoder is not None: # concate generated and GT waveforms wavs_batch = torch.cat((wav_seg, o), dim=0) @@ -898,7 +896,7 @@ class Vits(BaseTTS): if self.audio_transform is not None: wavs_batch = self.audio_transform(wavs_batch) - pred_embs = self.speaker_manager.speaker_encoder.forward(wavs_batch, l2_norm=True) + pred_embs = self.speaker_manager.encoder.forward(wavs_batch, l2_norm=True) # split generated and GT speaker embeddings gt_spk_emb, syn_spk_emb = torch.chunk(pred_embs, 2, dim=0) @@ -996,6 +994,34 @@ class Vits(BaseTTS): outputs = {"model_outputs": o, "alignments": attn.squeeze(1), "z": z, "z_p": z_p, "m_p": m_p, "logs_p": logs_p} return outputs + @torch.no_grad() + def inference_voice_conversion( + self, reference_wav, speaker_id=None, d_vector=None, reference_speaker_id=None, reference_d_vector=None + ): + """Inference for voice conversion + + Args: + reference_wav (Tensor): Reference wavform. Tensor of shape [B, T] + speaker_id (Tensor): speaker_id of the target speaker. Tensor of shape [B] + d_vector (Tensor): d_vector embedding of target speaker. Tensor of shape `[B, C]` + reference_speaker_id (Tensor): speaker_id of the reference_wav speaker. Tensor of shape [B] + reference_d_vector (Tensor): d_vector embedding of the reference_wav speaker. Tensor of shape `[B, C]` + """ + # compute spectrograms + y = wav_to_spec( + reference_wav, + self.config.audio.fft_size, + self.config.audio.hop_length, + self.config.audio.win_length, + center=False, + ).transpose(1, 2) + y_lengths = torch.tensor([y.size(-1)]).to(y.device) + speaker_cond_src = reference_speaker_id if reference_speaker_id is not None else reference_d_vector + speaker_cond_tgt = speaker_id if speaker_id is not None else d_vector + # print(y.shape, y_lengths.shape) + wav, _, _ = self.voice_conversion(y, y_lengths, speaker_cond_src, speaker_cond_tgt) + return wav + def voice_conversion(self, y, y_lengths, speaker_cond_src, speaker_cond_tgt): """Forward pass for voice conversion @@ -1008,12 +1034,11 @@ class Vits(BaseTTS): speaker_cond_tgt (Tensor): Target speaker ID. 
Tensor of shape [B,] """ assert self.num_speakers > 0, "num_speakers have to be larger than 0." - # speaker embedding if self.args.use_speaker_embedding and not self.args.use_d_vector_file: g_src = self.emb_g(speaker_cond_src).unsqueeze(-1) g_tgt = self.emb_g(speaker_cond_tgt).unsqueeze(-1) - elif self.args.use_speaker_embedding and self.args.use_d_vector_file: + elif not self.args.use_speaker_embedding and self.args.use_d_vector_file: g_src = F.normalize(speaker_cond_src).unsqueeze(-1) g_tgt = F.normalize(speaker_cond_tgt).unsqueeze(-1) else: @@ -1198,18 +1223,18 @@ class Vits(BaseTTS): if hasattr(self, "speaker_manager"): if config.use_d_vector_file: if speaker_name is None: - d_vector = self.speaker_manager.get_random_d_vector() + d_vector = self.speaker_manager.get_random_embeddings() else: - d_vector = self.speaker_manager.get_mean_d_vector(speaker_name, num_samples=1, randomize=False) + d_vector = self.speaker_manager.get_mean_embedding(speaker_name, num_samples=None, randomize=False) elif config.use_speaker_embedding: if speaker_name is None: - speaker_id = self.speaker_manager.get_random_speaker_id() + speaker_id = self.speaker_manager.get_random_id() else: - speaker_id = self.speaker_manager.speaker_ids[speaker_name] + speaker_id = self.speaker_manager.ids[speaker_name] # get language id if hasattr(self, "language_manager") and config.use_language_embedding and language_name is not None: - language_id = self.language_manager.language_id_mapping[language_name] + language_id = self.language_manager.ids[language_name] return { "text": text, @@ -1264,26 +1289,22 @@ class Vits(BaseTTS): d_vectors = None # get numerical speaker ids from speaker names - if self.speaker_manager is not None and self.speaker_manager.speaker_ids and self.args.use_speaker_embedding: - speaker_ids = [self.speaker_manager.speaker_ids[sn] for sn in batch["speaker_names"]] + if self.speaker_manager is not None and self.speaker_manager.ids and self.args.use_speaker_embedding: + speaker_ids = [self.speaker_manager.ids[sn] for sn in batch["speaker_names"]] if speaker_ids is not None: speaker_ids = torch.LongTensor(speaker_ids) batch["speaker_ids"] = speaker_ids # get d_vectors from audio file names - if self.speaker_manager is not None and self.speaker_manager.d_vectors and self.args.use_d_vector_file: - d_vector_mapping = self.speaker_manager.d_vectors + if self.speaker_manager is not None and self.speaker_manager.embeddings and self.args.use_d_vector_file: + d_vector_mapping = self.speaker_manager.embeddings d_vectors = [d_vector_mapping[w]["embedding"] for w in batch["audio_files"]] d_vectors = torch.FloatTensor(d_vectors) # get language ids from language names - if ( - self.language_manager is not None - and self.language_manager.language_id_mapping - and self.args.use_language_embedding - ): - language_ids = [self.language_manager.language_id_mapping[ln] for ln in batch["language_names"]] + if self.language_manager is not None and self.language_manager.ids and self.args.use_language_embedding: + language_ids = [self.language_manager.ids[ln] for ln in batch["language_names"]] if language_ids is not None: language_ids = torch.LongTensor(language_ids) @@ -1354,31 +1375,15 @@ class Vits(BaseTTS): # sort input sequences from short to long dataset.preprocess_samples() - # sampler for DDP - sampler = DistributedSampler(dataset) if num_gpus > 1 else None - - # Weighted samplers - # TODO: make this DDP amenable - assert not ( - num_gpus > 1 and getattr(config, "use_language_weighted_sampler", False) - ), 
"language_weighted_sampler is not supported with DistributedSampler" - assert not ( - num_gpus > 1 and getattr(config, "use_speaker_weighted_sampler", False) - ), "speaker_weighted_sampler is not supported with DistributedSampler" - - if sampler is None: - if getattr(config, "use_language_weighted_sampler", False): - print(" > Using Language weighted sampler") - sampler = get_language_weighted_sampler(dataset.samples) - elif getattr(config, "use_speaker_weighted_sampler", False): - print(" > Using Language weighted sampler") - sampler = get_speaker_weighted_sampler(dataset.samples) + # get samplers + sampler = self.get_sampler(config, dataset, num_gpus) loader = DataLoader( dataset, batch_size=config.eval_batch_size if is_eval else config.batch_size, shuffle=False, # shuffle is done in the dataset. drop_last=False, # setting this False might cause issues in AMP training. + sampler=sampler, collate_fn=dataset.collate_fn, num_workers=config.num_eval_loader_workers if is_eval else config.num_loader_workers, pin_memory=False, @@ -1481,7 +1486,7 @@ class Vits(BaseTTS): language_manager = LanguageManager.init_from_config(config) if config.model_args.speaker_encoder_model_path: - speaker_manager.init_speaker_encoder( + speaker_manager.init_encoder( config.model_args.speaker_encoder_model_path, config.model_args.speaker_encoder_config_path ) return Vits(new_config, ap, tokenizer, speaker_manager, language_manager) diff --git a/TTS/tts/utils/languages.py b/TTS/tts/utils/languages.py index 19708c13..9b5e2007 100644 --- a/TTS/tts/utils/languages.py +++ b/TTS/tts/utils/languages.py @@ -1,17 +1,16 @@ -import json import os -from typing import Dict, List +from typing import Any, Dict, List import fsspec import numpy as np import torch from coqpit import Coqpit -from torch.utils.data.sampler import WeightedRandomSampler from TTS.config import check_config_and_model_args +from TTS.tts.utils.managers import BaseIDManager -class LanguageManager: +class LanguageManager(BaseIDManager): """Manage the languages for multi-lingual 🐸TTS models. Load a datafile and parse the information in a way that can be queried by language. @@ -26,37 +25,23 @@ class LanguageManager: >>> language_id_mapper = manager.language_ids """ - language_id_mapping: Dict = {} - def __init__( self, language_ids_file_path: str = "", config: Coqpit = None, ): - self.language_id_mapping = {} - if language_ids_file_path: - self.set_language_ids_from_file(language_ids_file_path) + super().__init__(id_file_path=language_ids_file_path) if config: self.set_language_ids_from_config(config) - @staticmethod - def _load_json(json_file_path: str) -> Dict: - with fsspec.open(json_file_path, "r") as f: - return json.load(f) - - @staticmethod - def _save_json(json_file_path: str, data: dict) -> None: - with fsspec.open(json_file_path, "w") as f: - json.dump(data, f, indent=4) - @property def num_languages(self) -> int: - return len(list(self.language_id_mapping.keys())) + return len(list(self.ids.keys())) @property def language_names(self) -> List: - return list(self.language_id_mapping.keys()) + return list(self.ids.keys()) @staticmethod def parse_language_ids_from_config(c: Coqpit) -> Dict: @@ -80,25 +65,24 @@ class LanguageManager: """Set language IDs from config samples. Args: - items (List): Data sampled returned by `load_meta_data()`. + c (Coqpit): Config. 
""" - self.language_id_mapping = self.parse_language_ids_from_config(c) + self.ids = self.parse_language_ids_from_config(c) - def set_language_ids_from_file(self, file_path: str) -> None: - """Load language ids from a json file. + @staticmethod + def parse_ids_from_data(items: List, parse_key: str) -> Any: + raise NotImplementedError - Args: - file_path (str): Path to the target json file. - """ - self.language_id_mapping = self._load_json(file_path) + def set_ids_from_data(self, items: List, parse_key: str) -> Any: + raise NotImplementedError - def save_language_ids_to_file(self, file_path: str) -> None: + def save_ids_to_file(self, file_path: str) -> None: """Save language IDs to a json file. Args: file_path (str): Path to the output file. """ - self._save_json(file_path, self.language_id_mapping) + self._save_json(file_path, self.ids) @staticmethod def init_from_config(config: Coqpit) -> "LanguageManager": @@ -128,11 +112,14 @@ def _set_file_path(path): return None -def get_language_weighted_sampler(items: list): +def get_language_balancer_weights(items: list): language_names = np.array([item["language"] for item in items]) unique_language_names = np.unique(language_names).tolist() language_ids = [unique_language_names.index(l) for l in language_names] language_count = np.array([len(np.where(language_names == l)[0]) for l in unique_language_names]) weight_language = 1.0 / language_count - dataset_samples_weight = torch.from_numpy(np.array([weight_language[l] for l in language_ids])).double() - return WeightedRandomSampler(dataset_samples_weight, len(dataset_samples_weight)) + # get weight for each sample + dataset_samples_weight = np.array([weight_language[l] for l in language_ids]) + # normalize + dataset_samples_weight = dataset_samples_weight / np.linalg.norm(dataset_samples_weight) + return torch.from_numpy(dataset_samples_weight).float() diff --git a/TTS/tts/utils/managers.py b/TTS/tts/utils/managers.py new file mode 100644 index 00000000..85ed53cc --- /dev/null +++ b/TTS/tts/utils/managers.py @@ -0,0 +1,285 @@ +import json +import random +from typing import Any, Dict, List, Tuple, Union + +import fsspec +import numpy as np +import torch + +from TTS.config import load_config +from TTS.encoder.utils.generic_utils import setup_encoder_model +from TTS.utils.audio import AudioProcessor + + +class BaseIDManager: + """Base `ID` Manager class. Every new `ID` manager must inherit this. + It defines common `ID` manager specific functions. + """ + + def __init__(self, id_file_path: str = ""): + self.ids = {} + + if id_file_path: + self.load_ids_from_file(id_file_path) + + @staticmethod + def _load_json(json_file_path: str) -> Dict: + with fsspec.open(json_file_path, "r") as f: + return json.load(f) + + @staticmethod + def _save_json(json_file_path: str, data: dict) -> None: + with fsspec.open(json_file_path, "w") as f: + json.dump(data, f, indent=4) + + def set_ids_from_data(self, items: List, parse_key: str) -> None: + """Set IDs from data samples. + + Args: + items (List): Data sampled returned by `load_tts_samples()`. + """ + self.ids = self.parse_ids_from_data(items, parse_key=parse_key) + + def load_ids_from_file(self, file_path: str) -> None: + """Set IDs from a file. + + Args: + file_path (str): Path to the file. + """ + self.ids = self._load_json(file_path) + + def save_ids_to_file(self, file_path: str) -> None: + """Save IDs to a json file. + + Args: + file_path (str): Path to the output file. 
+ """ + self._save_json(file_path, self.ids) + + def get_random_id(self) -> Any: + """Get a random embedding. + + Args: + + Returns: + np.ndarray: embedding. + """ + if self.ids: + return self.ids[random.choices(list(self.ids.keys()))[0]] + + return None + + @staticmethod + def parse_ids_from_data(items: List, parse_key: str) -> Tuple[Dict]: + """Parse IDs from data samples retured by `load_tts_samples()`. + + Args: + items (list): Data sampled returned by `load_tts_samples()`. + parse_key (str): The key to being used to parse the data. + Returns: + Tuple[Dict]: speaker IDs. + """ + classes = sorted({item[parse_key] for item in items}) + ids = {name: i for i, name in enumerate(classes)} + return ids + + +class EmbeddingManager(BaseIDManager): + """Base `Embedding` Manager class. Every new `Embedding` manager must inherit this. + It defines common `Embedding` manager specific functions. + """ + + def __init__( + self, + embedding_file_path: str = "", + id_file_path: str = "", + encoder_model_path: str = "", + encoder_config_path: str = "", + use_cuda: bool = False, + ): + super().__init__(id_file_path=id_file_path) + + self.embeddings = {} + self.embeddings_by_names = {} + self.clip_ids = [] + self.encoder = None + self.encoder_ap = None + self.use_cuda = use_cuda + + if embedding_file_path: + self.load_embeddings_from_file(embedding_file_path) + + if encoder_model_path and encoder_config_path: + self.init_encoder(encoder_model_path, encoder_config_path) + + @property + def embedding_dim(self): + """Dimensionality of embeddings. If embeddings are not loaded, returns zero.""" + if self.embeddings: + return len(self.embeddings[list(self.embeddings.keys())[0]]["embedding"]) + return 0 + + def save_embeddings_to_file(self, file_path: str) -> None: + """Save embeddings to a json file. + + Args: + file_path (str): Path to the output file. + """ + self._save_json(file_path, self.embeddings) + + def load_embeddings_from_file(self, file_path: str) -> None: + """Load embeddings from a json file. + + Args: + file_path (str): Path to the target json file. + """ + self.embeddings = self._load_json(file_path) + + speakers = sorted({x["name"] for x in self.embeddings.values()}) + self.ids = {name: i for i, name in enumerate(speakers)} + + self.clip_ids = list(set(sorted(clip_name for clip_name in self.embeddings.keys()))) + # cache embeddings_by_names for fast inference using a bigger speakers.json + self.embeddings_by_names = self.get_embeddings_by_names() + + def get_embedding_by_clip(self, clip_idx: str) -> List: + """Get embedding by clip ID. + + Args: + clip_idx (str): Target clip ID. + + Returns: + List: embedding as a list. + """ + return self.embeddings[clip_idx]["embedding"] + + def get_embeddings_by_name(self, idx: str) -> List[List]: + """Get all embeddings of a speaker. + + Args: + idx (str): Target name. + + Returns: + List[List]: all the embeddings of the given speaker. + """ + return self.embeddings_by_names[idx] + + def get_embeddings_by_names(self) -> Dict: + """Get all embeddings by names. + + Returns: + Dict: all the embeddings of each speaker. + """ + embeddings_by_names = {} + for x in self.embeddings.values(): + if x["name"] not in embeddings_by_names.keys(): + embeddings_by_names[x["name"]] = [x["embedding"]] + else: + embeddings_by_names[x["name"]].append(x["embedding"]) + return embeddings_by_names + + def get_mean_embedding(self, idx: str, num_samples: int = None, randomize: bool = False) -> np.ndarray: + """Get mean embedding of a idx. + + Args: + idx (str): Target name. 
+ num_samples (int, optional): Number of samples to be averaged. Defaults to None. + randomize (bool, optional): Pick random `num_samples` of embeddings. Defaults to False. + + Returns: + np.ndarray: Mean embedding. + """ + embeddings = self.get_embeddings_by_name(idx) + if num_samples is None: + embeddings = np.stack(embeddings).mean(0) + else: + assert len(embeddings) >= num_samples, f" [!] {idx} has number of samples < {num_samples}" + if randomize: + embeddings = np.stack(random.choices(embeddings, k=num_samples)).mean(0) + else: + embeddings = np.stack(embeddings[:num_samples]).mean(0) + return embeddings + + def get_random_embedding(self) -> Any: + """Get a random embedding. + + Args: + + Returns: + np.ndarray: embedding. + """ + if self.embeddings: + return self.embeddings[random.choices(list(self.embeddings.keys()))[0]]["embedding"] + + return None + + def get_clips(self) -> List: + return sorted(self.embeddings.keys()) + + def init_encoder(self, model_path: str, config_path: str) -> None: + """Initialize a speaker encoder model. + + Args: + model_path (str): Model file path. + config_path (str): Model config file path. + """ + self.encoder_config = load_config(config_path) + self.encoder = setup_encoder_model(self.encoder_config) + self.encoder_criterion = self.encoder.load_checkpoint( + self.encoder_config, model_path, eval=True, use_cuda=self.use_cuda + ) + self.encoder_ap = AudioProcessor(**self.encoder_config.audio) + + def compute_embedding_from_clip(self, wav_file: Union[str, List[str]]) -> list: + """Compute a embedding from a given audio file. + + Args: + wav_file (Union[str, List[str]]): Target file path. + + Returns: + list: Computed embedding. + """ + + def _compute(wav_file: str): + waveform = self.encoder_ap.load_wav(wav_file, sr=self.encoder_ap.sample_rate) + if not self.encoder_config.model_params.get("use_torch_spec", False): + m_input = self.encoder_ap.melspectrogram(waveform) + m_input = torch.from_numpy(m_input) + else: + m_input = torch.from_numpy(waveform) + + if self.use_cuda: + m_input = m_input.cuda() + m_input = m_input.unsqueeze(0) + embedding = self.encoder.compute_embedding(m_input) + return embedding + + if isinstance(wav_file, list): + # compute the mean embedding + embeddings = None + for wf in wav_file: + embedding = _compute(wf) + if embeddings is None: + embeddings = embedding + else: + embeddings += embedding + return (embeddings / len(wav_file))[0].tolist() + embedding = _compute(wav_file) + return embedding[0].tolist() + + def compute_embeddings(self, feats: Union[torch.Tensor, np.ndarray]) -> List: + """Compute embedding from features. + + Args: + feats (Union[torch.Tensor, np.ndarray]): Input features. + + Returns: + List: computed embedding. 
+ """ + if isinstance(feats, np.ndarray): + feats = torch.from_numpy(feats) + if feats.ndim == 2: + feats = feats.unsqueeze(0) + if self.use_cuda: + feats = feats.cuda() + return self.encoder.compute_embedding(feats) diff --git a/TTS/tts/utils/speakers.py b/TTS/tts/utils/speakers.py index 99d653e6..284d0179 100644 --- a/TTS/tts/utils/speakers.py +++ b/TTS/tts/utils/speakers.py @@ -1,20 +1,17 @@ import json import os -import random -from typing import Any, Dict, List, Tuple, Union +from typing import Any, Dict, List, Union import fsspec import numpy as np import torch from coqpit import Coqpit -from torch.utils.data.sampler import WeightedRandomSampler -from TTS.config import get_from_config_or_model_args_with_default, load_config -from TTS.speaker_encoder.utils.generic_utils import setup_speaker_encoder_model -from TTS.utils.audio import AudioProcessor +from TTS.config import get_from_config_or_model_args_with_default +from TTS.tts.utils.managers import EmbeddingManager -class SpeakerManager: +class SpeakerManager(EmbeddingManager): """Manage the speakers for multi-speaker 🐸TTS models. Load a datafile and parse the information in a way that can be queried by speaker or clip. @@ -51,7 +48,7 @@ class SpeakerManager: >>> # load a sample audio and compute embedding >>> waveform = ap.load_wav(sample_wav_path) >>> mel = ap.melspectrogram(waveform) - >>> d_vector = manager.compute_d_vector(mel.T) + >>> d_vector = manager.compute_embeddings(mel.T) """ def __init__( @@ -63,260 +60,27 @@ class SpeakerManager: encoder_config_path: str = "", use_cuda: bool = False, ): - - self.d_vectors = {} - self.speaker_ids = {} - self.clip_ids = [] - self.speaker_encoder = None - self.speaker_encoder_ap = None - self.use_cuda = use_cuda + super().__init__( + embedding_file_path=d_vectors_file_path, + id_file_path=speaker_id_file_path, + encoder_model_path=encoder_model_path, + encoder_config_path=encoder_config_path, + use_cuda=use_cuda, + ) if data_items: - self.speaker_ids, _ = self.parse_speakers_from_data(data_items) - - if d_vectors_file_path: - self.set_d_vectors_from_file(d_vectors_file_path) - - if speaker_id_file_path: - self.set_speaker_ids_from_file(speaker_id_file_path) - - if encoder_model_path and encoder_config_path: - self.init_speaker_encoder(encoder_model_path, encoder_config_path) - - @staticmethod - def _load_json(json_file_path: str) -> Dict: - with fsspec.open(json_file_path, "r") as f: - return json.load(f) - - @staticmethod - def _save_json(json_file_path: str, data: dict) -> None: - with fsspec.open(json_file_path, "w") as f: - json.dump(data, f, indent=4) + self.set_ids_from_data(data_items, parse_key="speaker_name") @property def num_speakers(self): - return len(self.speaker_ids) + return len(self.ids) @property def speaker_names(self): - return list(self.speaker_ids.keys()) - - @property - def d_vector_dim(self): - """Dimensionality of d_vectors. If d_vectors are not loaded, returns zero.""" - if self.d_vectors: - return len(self.d_vectors[list(self.d_vectors.keys())[0]]["embedding"]) - return 0 - - @staticmethod - def parse_speakers_from_data(items: list) -> Tuple[Dict, int]: - """Parse speaker IDs from data samples retured by `load_tts_samples()`. - - Args: - items (list): Data sampled returned by `load_tts_samples()`. - - Returns: - Tuple[Dict, int]: speaker IDs and number of speakers. 
- """ - speakers = sorted({item["speaker_name"] for item in items}) - speaker_ids = {name: i for i, name in enumerate(speakers)} - num_speakers = len(speaker_ids) - return speaker_ids, num_speakers - - def set_speaker_ids_from_data(self, items: List) -> None: - """Set speaker IDs from data samples. - - Args: - items (List): Data sampled returned by `load_tts_samples()`. - """ - self.speaker_ids, _ = self.parse_speakers_from_data(items) - - def set_speaker_ids_from_file(self, file_path: str) -> None: - """Set speaker IDs from a file. - - Args: - file_path (str): Path to the file. - """ - self.speaker_ids = self._load_json(file_path) - - def save_speaker_ids_to_file(self, file_path: str) -> None: - """Save speaker IDs to a json file. - - Args: - file_path (str): Path to the output file. - """ - self._save_json(file_path, self.speaker_ids) - - def save_d_vectors_to_file(self, file_path: str) -> None: - """Save d_vectors to a json file. - - Args: - file_path (str): Path to the output file. - """ - self._save_json(file_path, self.d_vectors) - - def set_d_vectors_from_file(self, file_path: str) -> None: - """Load d_vectors from a json file. - - Args: - file_path (str): Path to the target json file. - """ - self.d_vectors = self._load_json(file_path) - - speakers = sorted({x["name"] for x in self.d_vectors.values()}) - self.speaker_ids = {name: i for i, name in enumerate(speakers)} - - self.clip_ids = list(set(sorted(clip_name for clip_name in self.d_vectors.keys()))) - - def get_d_vector_by_clip(self, clip_idx: str) -> List: - """Get d_vector by clip ID. - - Args: - clip_idx (str): Target clip ID. - - Returns: - List: d_vector as a list. - """ - return self.d_vectors[clip_idx]["embedding"] - - def get_d_vectors_by_speaker(self, speaker_idx: str) -> List[List]: - """Get all d_vectors of a speaker. - - Args: - speaker_idx (str): Target speaker ID. - - Returns: - List[List]: all the d_vectors of the given speaker. - """ - return [x["embedding"] for x in self.d_vectors.values() if x["name"] == speaker_idx] - - def get_mean_d_vector(self, speaker_idx: str, num_samples: int = None, randomize: bool = False) -> np.ndarray: - """Get mean d_vector of a speaker ID. - - Args: - speaker_idx (str): Target speaker ID. - num_samples (int, optional): Number of samples to be averaged. Defaults to None. - randomize (bool, optional): Pick random `num_samples` of d_vectors. Defaults to False. - - Returns: - np.ndarray: Mean d_vector. - """ - d_vectors = self.get_d_vectors_by_speaker(speaker_idx) - if num_samples is None: - d_vectors = np.stack(d_vectors).mean(0) - else: - assert len(d_vectors) >= num_samples, f" [!] speaker {speaker_idx} has number of samples < {num_samples}" - if randomize: - d_vectors = np.stack(random.choices(d_vectors, k=num_samples)).mean(0) - else: - d_vectors = np.stack(d_vectors[:num_samples]).mean(0) - return d_vectors - - def get_random_speaker_id(self) -> Any: - """Get a random d_vector. - - Args: - - Returns: - np.ndarray: d_vector. - """ - if self.speaker_ids: - return self.speaker_ids[random.choices(list(self.speaker_ids.keys()))[0]] - - return None - - def get_random_d_vector(self) -> Any: - """Get a random D ID. - - Args: - - Returns: - np.ndarray: d_vector. 
- """ - if self.d_vectors: - return self.d_vectors[random.choices(list(self.d_vectors.keys()))[0]]["embedding"] - - return None + return list(self.ids.keys()) def get_speakers(self) -> List: - return self.speaker_ids - - def get_clips(self) -> List: - return sorted(self.d_vectors.keys()) - - def init_speaker_encoder(self, model_path: str, config_path: str) -> None: - """Initialize a speaker encoder model. - - Args: - model_path (str): Model file path. - config_path (str): Model config file path. - """ - self.speaker_encoder_config = load_config(config_path) - self.speaker_encoder = setup_speaker_encoder_model(self.speaker_encoder_config) - self.speaker_encoder.load_checkpoint(config_path, model_path, eval=True, use_cuda=self.use_cuda) - self.speaker_encoder_ap = AudioProcessor(**self.speaker_encoder_config.audio) - - def compute_d_vector_from_clip(self, wav_file: Union[str, List[str]]) -> list: - """Compute a d_vector from a given audio file. - - Args: - wav_file (Union[str, List[str]]): Target file path. - - Returns: - list: Computed d_vector. - """ - - def _compute(wav_file: str): - waveform = self.speaker_encoder_ap.load_wav(wav_file, sr=self.speaker_encoder_ap.sample_rate) - if not self.speaker_encoder_config.model_params.get("use_torch_spec", False): - m_input = self.speaker_encoder_ap.melspectrogram(waveform) - m_input = torch.from_numpy(m_input) - else: - m_input = torch.from_numpy(waveform) - - if self.use_cuda: - m_input = m_input.cuda() - m_input = m_input.unsqueeze(0) - d_vector = self.speaker_encoder.compute_embedding(m_input) - return d_vector - - if isinstance(wav_file, list): - # compute the mean d_vector - d_vectors = None - for wf in wav_file: - d_vector = _compute(wf) - if d_vectors is None: - d_vectors = d_vector - else: - d_vectors += d_vector - return (d_vectors / len(wav_file))[0].tolist() - d_vector = _compute(wav_file) - return d_vector[0].tolist() - - def compute_d_vector(self, feats: Union[torch.Tensor, np.ndarray]) -> List: - """Compute d_vector from features. - - Args: - feats (Union[torch.Tensor, np.ndarray]): Input features. - - Returns: - List: computed d_vector. - """ - if isinstance(feats, np.ndarray): - feats = torch.from_numpy(feats) - if feats.ndim == 2: - feats = feats.unsqueeze(0) - if self.use_cuda: - feats = feats.cuda() - return self.speaker_encoder.compute_embedding(feats) - - def run_umap(self): - # TODO: implement speaker encoder - raise NotImplementedError - - def plot_embeddings(self): - # TODO: implement speaker encoder - raise NotImplementedError + return self.ids @staticmethod def init_from_config(config: "Coqpit", samples: Union[List[List], List[Dict]] = None) -> "SpeakerManager": @@ -402,7 +166,7 @@ def get_speaker_manager(c: Coqpit, data: List = None, restore_path: str = None, speaker_manager = SpeakerManager() if c.use_speaker_embedding: if data is not None: - speaker_manager.set_speaker_ids_from_data(data) + speaker_manager.set_ids_from_data(data, parse_key="speaker_name") if restore_path: speakers_file = _set_file_path(restore_path) # restoring speaker manager from a previous run. 
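The hunks above and the new TTS/tts/utils/managers.py fold the old speaker_ids / d_vectors attributes of SpeakerManager into the generic ids / embeddings API inherited from EmbeddingManager. A minimal usage sketch of the renamed methods, assuming a d-vector file is available; the speakers.json path and the speaker name are placeholders, not values taken from this patch:

```python
# Usage sketch only, not part of the patch. Path and speaker name are placeholders.
from TTS.tts.utils.speakers import SpeakerManager

# d-vector files are now loaded through the generic embedding API
# (EmbeddingManager) instead of the removed d_vectors helpers.
manager = SpeakerManager(d_vectors_file_path="speakers.json")  # hypothetical file

print(manager.num_speakers)   # len(manager.ids); `ids` was formerly `speaker_ids`
print(manager.embedding_dim)  # formerly `d_vector_dim`

# formerly get_mean_d_vector() / get_random_d_vector()
mean_emb = manager.get_mean_embedding("VCTK_p225", num_samples=None, randomize=False)
rand_emb = manager.get_random_embedding()
```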
@@ -414,27 +178,27 @@ def get_speaker_manager(c: Coqpit, data: List = None, restore_path: str = None, raise RuntimeError( "You must copy the file speakers.json to restore_path, or set a valid file in CONFIG.d_vector_file" ) - speaker_manager.load_d_vectors_file(c.d_vector_file) - speaker_manager.set_d_vectors_from_file(speakers_file) + speaker_manager.load_embeddings_from_file(c.d_vector_file) + speaker_manager.load_embeddings_from_file(speakers_file) elif not c.use_d_vector_file: # restor speaker manager with speaker ID file. - speaker_ids_from_data = speaker_manager.speaker_ids - speaker_manager.set_speaker_ids_from_file(speakers_file) + speaker_ids_from_data = speaker_manager.ids + speaker_manager.load_ids_from_file(speakers_file) assert all( - speaker in speaker_manager.speaker_ids for speaker in speaker_ids_from_data + speaker in speaker_manager.ids for speaker in speaker_ids_from_data ), " [!] You cannot introduce new speakers to a pre-trained model." elif c.use_d_vector_file and c.d_vector_file: # new speaker manager with external speaker embeddings. - speaker_manager.set_d_vectors_from_file(c.d_vector_file) + speaker_manager.load_embeddings_from_file(c.d_vector_file) elif c.use_d_vector_file and not c.d_vector_file: raise "use_d_vector_file is True, so you need pass a external speaker embedding file." elif c.use_speaker_embedding and "speakers_file" in c and c.speakers_file: # new speaker manager with speaker IDs file. - speaker_manager.set_speaker_ids_from_file(c.speakers_file) + speaker_manager.load_ids_from_file(c.speakers_file) if speaker_manager.num_speakers > 0: print( " > Speaker manager is loaded with {} speakers: {}".format( - speaker_manager.num_speakers, ", ".join(speaker_manager.speaker_ids) + speaker_manager.num_speakers, ", ".join(speaker_manager.ids) ) ) @@ -443,17 +207,19 @@ def get_speaker_manager(c: Coqpit, data: List = None, restore_path: str = None, out_file_path = os.path.join(out_path, "speakers.json") print(f" > Saving `speakers.json` to {out_file_path}.") if c.use_d_vector_file and c.d_vector_file: - speaker_manager.save_d_vectors_to_file(out_file_path) + speaker_manager.save_embeddings_to_file(out_file_path) else: - speaker_manager.save_speaker_ids_to_file(out_file_path) + speaker_manager.save_ids_to_file(out_file_path) return speaker_manager -def get_speaker_weighted_sampler(items: list): +def get_speaker_balancer_weights(items: list): speaker_names = np.array([item["speaker_name"] for item in items]) unique_speaker_names = np.unique(speaker_names).tolist() speaker_ids = [unique_speaker_names.index(l) for l in speaker_names] speaker_count = np.array([len(np.where(speaker_names == l)[0]) for l in unique_speaker_names]) weight_speaker = 1.0 / speaker_count - dataset_samples_weight = torch.from_numpy(np.array([weight_speaker[l] for l in speaker_ids])).double() - return WeightedRandomSampler(dataset_samples_weight, len(dataset_samples_weight)) + dataset_samples_weight = np.array([weight_speaker[l] for l in speaker_ids]) + # normalize + dataset_samples_weight = dataset_samples_weight / np.linalg.norm(dataset_samples_weight) + return torch.from_numpy(dataset_samples_weight).float() diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py index b6e19ab4..f9e13251 100644 --- a/TTS/tts/utils/synthesis.py +++ b/TTS/tts/utils/synthesis.py @@ -205,3 +205,84 @@ def synthesis( "outputs": outputs, } return return_dict + + +def transfer_voice( + model, + CONFIG, + use_cuda, + reference_wav, + speaker_id=None, + d_vector=None, + reference_speaker_id=None, + 
reference_d_vector=None, + do_trim_silence=False, + use_griffin_lim=False, +): + """Synthesize voice for the given text using Griffin-Lim vocoder or just compute output features to be passed to + the vocoder model. + + Args: + model (TTS.tts.models): + The TTS model to synthesize audio with. + + CONFIG (Coqpit): + Model configuration. + + use_cuda (bool): + Enable/disable CUDA. + + reference_wav (str): + Path of reference_wav to be used to voice conversion. + + speaker_id (int): + Speaker ID passed to the speaker embedding layer in multi-speaker model. Defaults to None. + + d_vector (torch.Tensor): + d-vector for multi-speaker models in share :math:`[1, D]`. Defaults to None. + + reference_speaker_id (int): + Reference Speaker ID passed to the speaker embedding layer in multi-speaker model. Defaults to None. + + reference_d_vector (torch.Tensor): + Reference d-vector for multi-speaker models in share :math:`[1, D]`. Defaults to None. + + enable_eos_bos_chars (bool): + enable special chars for end of sentence and start of sentence. Defaults to False. + + do_trim_silence (bool): + trim silence after synthesis. Defaults to False. + """ + # pass tensors to backend + if speaker_id is not None: + speaker_id = id_to_torch(speaker_id, cuda=use_cuda) + + if d_vector is not None: + d_vector = embedding_to_torch(d_vector, cuda=use_cuda) + + if reference_d_vector is not None: + reference_d_vector = embedding_to_torch(reference_d_vector, cuda=use_cuda) + + # load reference_wav audio + reference_wav = embedding_to_torch(model.ap.load_wav(reference_wav, sr=model.ap.sample_rate), cuda=use_cuda) + + if hasattr(model, "module"): + _func = model.module.inference_voice_conversion + else: + _func = model.inference_voice_conversion + model_outputs = _func(reference_wav, speaker_id, d_vector, reference_speaker_id, reference_d_vector) + + # convert outputs to numpy + # plot results + wav = None + model_outputs = model_outputs.squeeze() + if model_outputs.ndim == 2: # [T, C_spec] + if use_griffin_lim: + wav = inv_spectrogram(model_outputs, model.ap, CONFIG) + # trim silence + if do_trim_silence: + wav = trim_silence(wav, model.ap) + else: # [T,] + wav = model_outputs + + return wav diff --git a/TTS/tts/utils/text/phonemizers/__init__.py b/TTS/tts/utils/text/phonemizers/__init__.py index 5dc117c4..374d0c8a 100644 --- a/TTS/tts/utils/text/phonemizers/__init__.py +++ b/TTS/tts/utils/text/phonemizers/__init__.py @@ -12,16 +12,9 @@ GRUUT_LANGS = list(Gruut.supported_languages()) # Dict setting default phonemizers for each language -DEF_LANG_TO_PHONEMIZER = { - "ja-jp": JA_JP_Phonemizer.name(), - "zh-cn": ZH_CN_Phonemizer.name(), -} - - # Add Gruut languages _ = [Gruut.name()] * len(GRUUT_LANGS) -_new_dict = dict(list(zip(GRUUT_LANGS, _))) -DEF_LANG_TO_PHONEMIZER.update(_new_dict) +DEF_LANG_TO_PHONEMIZER = dict(list(zip(GRUUT_LANGS, _))) # Add ESpeak languages and override any existing ones @@ -29,7 +22,10 @@ _ = [ESpeak.name()] * len(ESPEAK_LANGS) _new_dict = dict(list(zip(list(ESPEAK_LANGS), _))) DEF_LANG_TO_PHONEMIZER.update(_new_dict) +# Force default for some languages DEF_LANG_TO_PHONEMIZER["en"] = DEF_LANG_TO_PHONEMIZER["en-us"] +DEF_LANG_TO_PHONEMIZER["ja-jp"] = JA_JP_Phonemizer.name() +DEF_LANG_TO_PHONEMIZER["zh-cn"] = ZH_CN_Phonemizer.name() def get_phonemizer_by_name(name: str, **kwargs) -> BasePhonemizer: diff --git a/TTS/tts/utils/text/tokenizer.py b/TTS/tts/utils/text/tokenizer.py index f0d85a44..1569c634 100644 --- a/TTS/tts/utils/text/tokenizer.py +++ b/TTS/tts/utils/text/tokenizer.py @@ -191,6 +191,7 @@ 
class TTSTokenizer: phonemizer = get_phonemizer_by_name( DEF_LANG_TO_PHONEMIZER[config.phoneme_language], **phonemizer_kwargs ) + new_config.phonemizer = phonemizer.name() except KeyError as e: raise ValueError( f"""No phonemizer found for language {config.phoneme_language}. diff --git a/TTS/utils/audio.py b/TTS/utils/audio.py index d0777c11..4d435162 100644 --- a/TTS/utils/audio.py +++ b/TTS/utils/audio.py @@ -371,7 +371,9 @@ class AudioProcessor(object): self.hop_length = hop_length self.win_length = win_length assert min_level_db != 0.0, " [!] min_level_db is 0" - assert self.win_length <= self.fft_size, " [!] win_length cannot be larger than fft_size" + assert ( + self.win_length <= self.fft_size + ), f" [!] win_length cannot be larger than fft_size - {self.win_length} vs {self.fft_size}" members = vars(self) if verbose: print(" > Setting up Audio Processor...") diff --git a/TTS/utils/generic_utils.py b/TTS/utils/generic_utils.py index 69609bcb..b685210c 100644 --- a/TTS/utils/generic_utils.py +++ b/TTS/utils/generic_utils.py @@ -67,7 +67,7 @@ def get_experiment_folder_path(root_path, model_name): def remove_experiment_folder(experiment_path): """Check folder if there is a checkpoint, otherwise remove the folder""" fs = fsspec.get_mapper(experiment_path).fs - checkpoint_files = fs.glob(experiment_path + "/*.pth.tar") + checkpoint_files = fs.glob(experiment_path + "/*.pth") if not checkpoint_files: if fs.exists(experiment_path): fs.rm(experiment_path, recursive=True) diff --git a/TTS/utils/io.py b/TTS/utils/io.py index 54818ce9..304df5ed 100644 --- a/TTS/utils/io.py +++ b/TTS/utils/io.py @@ -140,7 +140,7 @@ def save_checkpoint( output_folder, **kwargs, ): - file_name = "checkpoint_{}.pth.tar".format(current_step) + file_name = "checkpoint_{}.pth".format(current_step) checkpoint_path = os.path.join(output_folder, file_name) print("\n > CHECKPOINT : {}".format(checkpoint_path)) save_model( @@ -170,7 +170,7 @@ def save_best_model( **kwargs, ): if current_loss < best_loss: - best_model_name = f"best_model_{current_step}.pth.tar" + best_model_name = f"best_model_{current_step}.pth" checkpoint_path = os.path.join(out_path, best_model_name) print(" > BEST MODEL : {}".format(checkpoint_path)) save_model( @@ -187,12 +187,12 @@ def save_best_model( fs = fsspec.get_mapper(out_path).fs # only delete previous if current is saved successfully if not keep_all_best or (current_step < keep_after): - model_names = fs.glob(os.path.join(out_path, "best_model*.pth.tar")) + model_names = fs.glob(os.path.join(out_path, "best_model*.pth")) for model_name in model_names: if os.path.basename(model_name) != best_model_name: fs.rm(model_name) # create a shortcut which always points to the currently best model - shortcut_name = "best_model.pth.tar" + shortcut_name = "best_model.pth" shortcut_path = os.path.join(out_path, shortcut_name) fs.copy(checkpoint_path, shortcut_path) best_loss = current_loss diff --git a/TTS/utils/manage.py b/TTS/utils/manage.py index 01d54ad6..87cb5592 100644 --- a/TTS/utils/manage.py +++ b/TTS/utils/manage.py @@ -4,12 +4,24 @@ import os import zipfile from pathlib import Path from shutil import copyfile, rmtree +from typing import Dict, Tuple import requests from TTS.config import load_config from TTS.utils.generic_utils import get_user_data_dir +LICENSE_URLS = { + "cc by-nc-nd 4.0": "https://creativecommons.org/licenses/by-nc-nd/4.0/", + "mpl": "https://www.mozilla.org/en-US/MPL/2.0/", + "mpl2": "https://www.mozilla.org/en-US/MPL/2.0/", + "mpl 2.0": 
"https://www.mozilla.org/en-US/MPL/2.0/", + "mit": "https://choosealicense.com/licenses/mit/", + "apache 2.0": "https://choosealicense.com/licenses/apache-2.0/", + "apache2": "https://choosealicense.com/licenses/apache-2.0/", + "cc-by-sa 4.0": "https://creativecommons.org/licenses/by-sa/4.0/", +} + class ModelManager(object): """Manage TTS models defined in .models.json. @@ -107,6 +119,22 @@ class ModelManager(object): for dataset in self.models_dict[model_type][lang]: print(f" >: {model_type}/{lang}/{dataset}") + @staticmethod + def print_model_license(model_item: Dict): + """Print the license of a model + + Args: + model_item (dict): model item in the models.json + """ + if "license" in model_item and model_item["license"].strip() != "": + print(f" > Model's license - {model_item['license']}") + if model_item["license"].lower() in LICENSE_URLS: + print(f" > Check {LICENSE_URLS[model_item['license'].lower()]} for more info.") + else: + print(" > Check https://opensource.org/licenses for more info.") + else: + print(" > Model's license - No license information available") + def download_model(self, model_name): """Download model files given the full model name. Model name is in the format @@ -114,7 +142,7 @@ class ModelManager(object): e.g. 'tts_model/en/ljspeech/tacotron' Every model must have the following files: - - *.pth.tar : pytorch model checkpoint file. + - *.pth : pytorch model checkpoint file. - config.json : model config file. - scale_stats.npy (if exist): scale values for preprocessing. @@ -127,9 +155,6 @@ class ModelManager(object): model_item = self.models_dict[model_type][lang][dataset][model] # set the model specific output path output_path = os.path.join(self.output_prefix, model_full_name) - output_model_path = os.path.join(output_path, "model_file.pth.tar") - output_config_path = os.path.join(output_path, "config.json") - if os.path.exists(output_path): print(f" > {model_name} is already downloaded.") else: @@ -137,10 +162,52 @@ class ModelManager(object): print(f" > Downloading model to {output_path}") # download from github release self._download_zip_file(model_item["github_rls_url"], output_path) + self.print_model_license(model_item=model_item) + # find downloaded files + output_model_path, output_config_path = self._find_files(output_path) # update paths in the config.json self._update_paths(output_path, output_config_path) return output_model_path, output_config_path, model_item + @staticmethod + def _find_files(output_path: str) -> Tuple[str, str]: + """Find the model and config files in the output path + + Args: + output_path (str): path to the model files + + Returns: + Tuple[str, str]: path to the model file and config file + """ + model_file = None + config_file = None + for file_name in os.listdir(output_path): + if file_name in ["model_file.pth", "model_file.pth.tar", "model.pth"]: + model_file = os.path.join(output_path, file_name) + elif file_name == "config.json": + config_file = os.path.join(output_path, file_name) + if model_file is None: + raise ValueError(" [!] Model file not found in the output path") + if config_file is None: + raise ValueError(" [!] 
Config file not found in the output path") + return model_file, config_file + + @staticmethod + def _find_speaker_encoder(output_path: str) -> str: + """Find the speaker encoder file in the output path + + Args: + output_path (str): path to the model files + + Returns: + str: path to the speaker encoder file + """ + speaker_encoder_file = None + for file_name in os.listdir(output_path): + if file_name in ["model_se.pth", "model_se.pth.tar"]: + speaker_encoder_file = os.path.join(output_path, file_name) + return speaker_encoder_file + def _update_paths(self, output_path: str, config_path: str) -> None: """Update paths for certain files in config.json after download. @@ -152,7 +219,7 @@ class ModelManager(object): output_d_vector_file_path = os.path.join(output_path, "speakers.json") output_speaker_ids_file_path = os.path.join(output_path, "speaker_ids.json") speaker_encoder_config_path = os.path.join(output_path, "config_se.json") - speaker_encoder_model_path = os.path.join(output_path, "model_se.pth.tar") + speaker_encoder_model_path = self._find_speaker_encoder(output_path) # update the scale_path.npy file path in the model config.json self._update_path("audio.stats_path", output_stats_path, config_path) @@ -174,7 +241,7 @@ class ModelManager(object): @staticmethod def _update_path(field_name, new_path, config_path): """Update the path in the model config.json for the current environment after download""" - if os.path.exists(new_path): + if new_path and os.path.exists(new_path): config = load_config(config_path) field_names = field_name.split(".") if len(field_names) > 1: diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index 4007931b..f9572add 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -10,7 +10,7 @@ from TTS.tts.models import setup_model as setup_tts_model # pylint: disable=unused-wildcard-import # pylint: disable=wildcard-import -from TTS.tts.utils.synthesis import synthesis, trim_silence +from TTS.tts.utils.synthesis import synthesis, transfer_voice, trim_silence from TTS.utils.audio import AudioProcessor from TTS.vocoder.models import setup_model as setup_vocoder_model from TTS.vocoder.utils.generic_utils import interpolate_vocoder_input @@ -109,15 +109,21 @@ class Synthesizer(object): """ # pylint: disable=global-statement self.tts_config = load_config(tts_config_path) - self.use_phonemes = self.tts_config.use_phonemes + if self.tts_config["use_phonemes"] and self.tts_config["phonemizer"] is None: + raise ValueError("Phonemizer is not defined in the TTS config.") + self.tts_model = setup_tts_model(config=self.tts_config) if not self.encoder_checkpoint: self._set_speaker_encoder_paths_from_tts_config() + self.tts_model.load_checkpoint(self.tts_config, tts_checkpoint, eval=True) if use_cuda: self.tts_model.cuda() + if self.encoder_checkpoint and hasattr(self.tts_model, "speaker_manager"): + self.tts_model.speaker_manager.init_encoder(self.encoder_checkpoint, self.encoder_config) + def _set_speaker_encoder_paths_from_tts_config(self): """Set the encoder paths from the tts model config for models with speaker encoders.""" if hasattr(self.tts_config, "model_args") and hasattr( @@ -183,11 +189,13 @@ class Synthesizer(object): def tts( self, - text: str, + text: str = "", speaker_name: str = "", language_name: str = "", speaker_wav: Union[str, List[str]] = None, style_wav=None, + reference_wav=None, + reference_speaker_name=None, ) -> List[int]: """🐸 TTS magic. Run all the models and generate speech. 
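A minimal usage sketch of the extended `Synthesizer.tts()` API above, assuming a multi-speaker model; the checkpoint, config, wav paths, and speaker names are placeholders, and the constructor keywords follow `Synthesizer` as used elsewhere in this patch:

```python
from TTS.utils.synthesizer import Synthesizer

# Placeholder paths; use a real multi-speaker checkpoint/config pair.
synthesizer = Synthesizer(
    tts_checkpoint="model.pth",
    tts_config_path="config.json",
    use_cuda=False,
)

# Regular text-to-speech, unchanged.
wav = synthesizer.tts(text="Hello world.", speaker_name="speaker_a")

# New: voice conversion. No text is given; only a reference wav whose voice
# is converted to the target `speaker_name`.
wav_vc = synthesizer.tts(
    reference_wav="source_utterance.wav",
    speaker_name="speaker_b",
    reference_speaker_name="speaker_a",  # optional, used to look up a stored d-vector
)
```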
@@ -197,33 +205,43 @@ class Synthesizer(object): language_name (str, optional): language id for multi-language models. Defaults to "". speaker_wav (Union[str, List[str]], optional): path to the speaker wav. Defaults to None. style_wav ([type], optional): style waveform for GST. Defaults to None. - + reference_wav ([type], optional): reference waveform for voice conversion. Defaults to None. + reference_speaker_name ([type], optional): speaker id of the reference waveform. Defaults to None. Returns: List[int]: [description] """ start_time = time.time() wavs = [] - sens = self.split_into_sentences(text) - print(" > Text splitted to sentences.") - print(sens) + + if not text and not reference_wav: + raise ValueError( + "You need to define either `text` (for synthesis) or a `reference_wav` (for voice conversion) to use the Coqui TTS API." + ) + + if text: + sens = self.split_into_sentences(text) + print(" > Text split into sentences.") + print(sens) # handle multi-speaker speaker_embedding = None speaker_id = None - if self.tts_speakers_file or hasattr(self.tts_model.speaker_manager, "speaker_ids"): + if self.tts_speakers_file or hasattr(self.tts_model.speaker_manager, "ids"): if speaker_name and isinstance(speaker_name, str): if self.tts_config.use_d_vector_file: - # get the speaker embedding from the saved d_vectors. - speaker_embedding = self.tts_model.speaker_manager.get_d_vectors_by_speaker(speaker_name)[0] + # get the average speaker embedding from the saved d_vectors. + speaker_embedding = self.tts_model.speaker_manager.get_mean_embedding( + speaker_name, num_samples=None, randomize=False + ) speaker_embedding = np.array(speaker_embedding)[None, :] # [1 x embedding_dim] else: # get speaker idx from the speaker name - speaker_id = self.tts_model.speaker_manager.speaker_ids[speaker_name] + speaker_id = self.tts_model.speaker_manager.ids[speaker_name] elif not speaker_name and not speaker_wav: raise ValueError( " [!] Look like you use a multi-speaker model. " - "You need to define either a `speaker_name` or a `style_wav` to use a multi-speaker model." + "You need to define either a `speaker_name` or a `speaker_wav` to use a multi-speaker model." ) else: speaker_embedding = None @@ -240,7 +258,7 @@ class Synthesizer(object): hasattr(self.tts_model, "language_manager") and self.tts_model.language_manager is not None ): if language_name and isinstance(language_name, str): - language_id = self.tts_model.language_manager.language_id_mapping[language_name] + language_id = self.tts_model.language_manager.ids[language_name] elif not language_name: raise ValueError( @@ -256,26 +274,93 @@ class Synthesizer(object): # compute a new d_vector from the given clip. 
if speaker_wav is not None: - speaker_embedding = self.tts_model.speaker_manager.compute_d_vector_from_clip(speaker_wav) + speaker_embedding = self.tts_model.speaker_manager.compute_embedding_from_clip(speaker_wav) use_gl = self.vocoder_model is None - for sen in sens: - # synthesize voice - outputs = synthesis( + if not reference_wav: + for sen in sens: + # synthesize voice + outputs = synthesis( + model=self.tts_model, + text=sen, + CONFIG=self.tts_config, + use_cuda=self.use_cuda, + speaker_id=speaker_id, + language_id=language_id, + style_wav=style_wav, + use_griffin_lim=use_gl, + d_vector=speaker_embedding, + ) + waveform = outputs["wav"] + mel_postnet_spec = outputs["outputs"]["model_outputs"][0].detach().cpu().numpy() + if not use_gl: + # denormalize tts output based on tts audio config + mel_postnet_spec = self.tts_model.ap.denormalize(mel_postnet_spec.T).T + device_type = "cuda" if self.use_cuda else "cpu" + # renormalize spectrogram based on vocoder config + vocoder_input = self.vocoder_ap.normalize(mel_postnet_spec.T) + # compute scale factor for possible sample rate mismatch + scale_factor = [ + 1, + self.vocoder_config["audio"]["sample_rate"] / self.tts_model.ap.sample_rate, + ] + if scale_factor[1] != 1: + print(" > interpolating tts model output.") + vocoder_input = interpolate_vocoder_input(scale_factor, vocoder_input) + else: + vocoder_input = torch.tensor(vocoder_input).unsqueeze(0) # pylint: disable=not-callable + # run vocoder model + # [1, T, C] + waveform = self.vocoder_model.inference(vocoder_input.to(device_type)) + if self.use_cuda and not use_gl: + waveform = waveform.cpu() + if not use_gl: + waveform = waveform.numpy() + waveform = waveform.squeeze() + + # trim silence + if self.tts_config.audio["do_trim_silence"] is True: + waveform = trim_silence(waveform, self.tts_model.ap) + + wavs += list(waveform) + wavs += [0] * 10000 + else: + # get the speaker embedding or speaker id for the reference wav file + reference_speaker_embedding = None + reference_speaker_id = None + if self.tts_speakers_file or hasattr(self.tts_model.speaker_manager, "speaker_ids"): + if reference_speaker_name and isinstance(reference_speaker_name, str): + if self.tts_config.use_d_vector_file: + # get the speaker embedding from the saved d_vectors. 
+ reference_speaker_embedding = self.tts_model.speaker_manager.get_embeddings_by_name( + reference_speaker_name + )[0] + reference_speaker_embedding = np.array(reference_speaker_embedding)[ + None, : + ] # [1 x embedding_dim] + else: + # get speaker idx from the speaker name + reference_speaker_id = self.tts_model.speaker_manager.ids[reference_speaker_name] + else: + reference_speaker_embedding = self.tts_model.speaker_manager.compute_embedding_from_clip( + reference_wav + ) + + outputs = transfer_voice( model=self.tts_model, - text=sen, CONFIG=self.tts_config, use_cuda=self.use_cuda, + reference_wav=reference_wav, speaker_id=speaker_id, - language_id=language_id, - style_wav=style_wav, - use_griffin_lim=use_gl, d_vector=speaker_embedding, + use_griffin_lim=use_gl, + reference_speaker_id=reference_speaker_id, + reference_d_vector=reference_speaker_embedding, ) - waveform = outputs["wav"] - mel_postnet_spec = outputs["outputs"]["model_outputs"][0].detach().cpu().numpy() + waveform = outputs if not use_gl: + mel_postnet_spec = outputs[0].detach().cpu().numpy() # denormalize tts output based on tts audio config mel_postnet_spec = self.tts_model.ap.denormalize(mel_postnet_spec.T).T device_type = "cuda" if self.use_cuda else "cpu" @@ -294,18 +379,11 @@ class Synthesizer(object): # run vocoder model # [1, T, C] waveform = self.vocoder_model.inference(vocoder_input.to(device_type)) - if self.use_cuda and not use_gl: + if self.use_cuda: waveform = waveform.cpu() if not use_gl: waveform = waveform.numpy() - waveform = waveform.squeeze() - - # trim silence - if self.tts_config.audio["do_trim_silence"] is True: - waveform = trim_silence(waveform, self.tts_model.ap) - - wavs += list(waveform) - wavs += [0] * 10000 + wavs = waveform.squeeze() # compute stats process_time = time.time() - start_time diff --git a/TTS/utils/vad.py b/TTS/utils/vad.py index 923544d0..033b911a 100644 --- a/TTS/utils/vad.py +++ b/TTS/utils/vad.py @@ -1,144 +1,81 @@ -# This code is adpated from: https://github.com/wiseman/py-webrtcvad/blob/master/example.py -import collections -import contextlib -import wave - -import webrtcvad +import torch +import torchaudio -def read_wave(path): - """Reads a .wav file. +def read_audio(path): + wav, sr = torchaudio.load(path) - Takes the path, and returns (PCM audio data, sample rate). - """ - with contextlib.closing(wave.open(path, "rb")) as wf: - num_channels = wf.getnchannels() - assert num_channels == 1 - sample_width = wf.getsampwidth() - assert sample_width == 2 - sample_rate = wf.getframerate() - assert sample_rate in (8000, 16000, 32000, 48000) - pcm_data = wf.readframes(wf.getnframes()) - return pcm_data, sample_rate + if wav.size(0) > 1: + wav = wav.mean(dim=0, keepdim=True) + + return wav.squeeze(0), sr -def write_wave(path, audio, sample_rate): - """Writes a .wav file. - - Takes path, PCM audio data, and sample rate. 
- """ - with contextlib.closing(wave.open(path, "wb")) as wf: - wf.setnchannels(1) - wf.setsampwidth(2) - wf.setframerate(sample_rate) - wf.writeframes(audio) +def resample_wav(wav, sr, new_sr): + wav = wav.unsqueeze(0) + transform = torchaudio.transforms.Resample(orig_freq=sr, new_freq=new_sr) + wav = transform(wav) + return wav.squeeze(0) -class Frame(object): - """Represents a "frame" of audio data.""" +def map_timestamps_to_new_sr(vad_sr, new_sr, timestamps, just_begging_end=False): + factor = new_sr / vad_sr + new_timestamps = [] + if just_begging_end and timestamps: + # get just the start and end timestamps + new_dict = {"start": int(timestamps[0]["start"] * factor), "end": int(timestamps[-1]["end"] * factor)} + new_timestamps.append(new_dict) + else: + for ts in timestamps: + # map to the new SR + new_dict = {"start": int(ts["start"] * factor), "end": int(ts["end"] * factor)} + new_timestamps.append(new_dict) - def __init__(self, _bytes, timestamp, duration): - self.bytes = _bytes - self.timestamp = timestamp - self.duration = duration + return new_timestamps -def frame_generator(frame_duration_ms, audio, sample_rate): - """Generates audio frames from PCM audio data. +def get_vad_model_and_utils(use_cuda=False): + model, utils = torch.hub.load(repo_or_dir="snakers4/silero-vad", model="silero_vad", force_reload=True, onnx=False) + if use_cuda: + model = model.cuda() - Takes the desired frame duration in milliseconds, the PCM data, and - the sample rate. - - Yields Frames of the requested duration. - """ - n = int(sample_rate * (frame_duration_ms / 1000.0) * 2) - offset = 0 - timestamp = 0.0 - duration = (float(n) / sample_rate) / 2.0 - while offset + n < len(audio): - yield Frame(audio[offset : offset + n], timestamp, duration) - timestamp += duration - offset += n + get_speech_timestamps, save_audio, _, _, collect_chunks = utils + return model, get_speech_timestamps, save_audio, collect_chunks -def vad_collector(sample_rate, frame_duration_ms, padding_duration_ms, vad, frames): - """Filters out non-voiced audio frames. +def remove_silence( + model_and_utils, audio_path, out_path, vad_sample_rate=8000, trim_just_beginning_and_end=True, use_cuda=False +): - Given a webrtcvad.Vad and a source of audio frames, yields only - the voiced audio. + # get the VAD model and utils functions + model, get_speech_timestamps, save_audio, collect_chunks = model_and_utils - Uses a padded, sliding window algorithm over the audio frames. - When more than 90% of the frames in the window are voiced (as - reported by the VAD), the collector triggers and begins yielding - audio frames. Then the collector waits until 90% of the frames in - the window are unvoiced to detrigger. + # read ground truth wav and resample the audio for the VAD + wav, gt_sample_rate = read_audio(audio_path) - The window is padded at the front and back to provide a small - amount of silence or the beginnings/endings of speech around the - voiced frames. + # if needed, resample the audio for the VAD model + if gt_sample_rate != vad_sample_rate: + wav_vad = resample_wav(wav, gt_sample_rate, vad_sample_rate) + else: + wav_vad = wav - Arguments: + if use_cuda: + wav_vad = wav_vad.cuda() - sample_rate - The audio sample rate, in Hz. - frame_duration_ms - The frame duration in milliseconds. - padding_duration_ms - The amount to pad the window, in milliseconds. - vad - An instance of webrtcvad.Vad. - frames - a source of audio frames (sequence or generator). 
+ # get speech timestamps from full audio file + speech_timestamps = get_speech_timestamps(wav_vad, model, sampling_rate=vad_sample_rate, window_size_samples=768) - Returns: A generator that yields PCM audio data. - """ - num_padding_frames = int(padding_duration_ms / frame_duration_ms) - # We use a deque for our sliding window/ring buffer. - ring_buffer = collections.deque(maxlen=num_padding_frames) - # We have two states: TRIGGERED and NOTTRIGGERED. We start in the - # NOTTRIGGERED state. - triggered = False + # map the current speech_timestamps to the sample rate of the ground truth audio + new_speech_timestamps = map_timestamps_to_new_sr( + vad_sample_rate, gt_sample_rate, speech_timestamps, trim_just_beginning_and_end + ) - voiced_frames = [] - for frame in frames: - is_speech = vad.is_speech(frame.bytes, sample_rate) + # if speech timestamps were found, keep only the speech chunks; otherwise keep the original wav + if new_speech_timestamps: + wav = collect_chunks(new_speech_timestamps, wav) + else: + print(f"> The file {audio_path} probably does not contain speech. Please check it!") - # sys.stdout.write('1' if is_speech else '0') - if not triggered: - ring_buffer.append((frame, is_speech)) - num_voiced = len([f for f, speech in ring_buffer if speech]) - # If we're NOTTRIGGERED and more than 90% of the frames in - # the ring buffer are voiced frames, then enter the - # TRIGGERED state. - if num_voiced > 0.9 * ring_buffer.maxlen: - triggered = True - # sys.stdout.write('+(%s)' % (ring_buffer[0][0].timestamp,)) - # We want to yield all the audio we see from now until - # we are NOTTRIGGERED, but we have to start with the - # audio that's already in the ring buffer. - for f, _ in ring_buffer: - voiced_frames.append(f) - ring_buffer.clear() - else: - # We're in the TRIGGERED state, so collect the audio data - # and add it to the ring buffer. - voiced_frames.append(frame) - ring_buffer.append((frame, is_speech)) - num_unvoiced = len([f for f, speech in ring_buffer if not speech]) - # If more than 90% of the frames in the ring buffer are - # unvoiced, then enter NOTTRIGGERED and yield whatever - # audio we've collected. - if num_unvoiced > 0.9 * ring_buffer.maxlen: - # sys.stdout.write('-(%s)' % (frame.timestamp + frame.duration)) - triggered = False - yield b"".join([f.bytes for f in voiced_frames]) - ring_buffer.clear() - voiced_frames = [] - # If we have any leftover voiced audio when we run out of input, - # yield it. - if voiced_frames: - yield b"".join([f.bytes for f in voiced_frames]) - - -def get_vad_speech_segments(audio, sample_rate, aggressiveness=2, padding_duration_ms=300): - - vad = webrtcvad.Vad(int(aggressiveness)) - frames = list(frame_generator(30, audio, sample_rate)) - segments = vad_collector(sample_rate, 30, padding_duration_ms, vad, frames) - - return segments + # save audio + save_audio(out_path, wav, sampling_rate=gt_sample_rate) + return out_path diff --git a/TTS/vocoder/README.md b/TTS/vocoder/README.md index e0ae8f21..b9fb17c8 100644 --- a/TTS/vocoder/README.md +++ b/TTS/vocoder/README.md @@ -29,7 +29,7 @@ You can continue a previous training run by the following command. You can fine-tune a pre-trained model by the following command. -```CUDA_VISIBLE_DEVICES='0' python tts/bin/train_vocoder.py --restore_path path/to/your/model.pth.tar``` +```CUDA_VISIBLE_DEVICES='0' python tts/bin/train_vocoder.py --restore_path path/to/your/model.pth``` Restoring a model starts a new training in a different folder. It only restores model weights with the given checkpoint file. 
However, continuing a training starts from the same directory where the previous training run left off. diff --git a/docs/source/_templates/page.html b/docs/source/_templates/page.html new file mode 100644 index 00000000..2c6ef4ee --- /dev/null +++ b/docs/source/_templates/page.html @@ -0,0 +1,23 @@ +{% extends "!page.html" %} +{% block scripts %} + {{ super() }} + + + + + + + +{% endblock %} diff --git a/docs/source/finetuning.md b/docs/source/finetuning.md index 7d7ef1cb..fd97daa5 100644 --- a/docs/source/finetuning.md +++ b/docs/source/finetuning.md @@ -93,13 +93,13 @@ them and fine-tune it for your own dataset. This will help you in two main ways: ```bash CUDA_VISIBLE_DEVICES="0" python recipes/ljspeech/glow_tts/train_glowtts.py \ - --restore_path /home/ubuntu/.local/share/tts/tts_models--en--ljspeech--glow-tts/model_file.pth.tar + --restore_path /home/ubuntu/.local/share/tts/tts_models--en--ljspeech--glow-tts/model_file.pth ``` ```bash CUDA_VISIBLE_DEVICES="0" python TTS/bin/train_tts.py \ --config_path /home/ubuntu/.local/share/tts/tts_models--en--ljspeech--glow-tts/config.json \ - --restore_path /home/ubuntu/.local/share/tts/tts_models--en--ljspeech--glow-tts/model_file.pth.tar + --restore_path /home/ubuntu/.local/share/tts/tts_models--en--ljspeech--glow-tts/model_file.pth ``` As stated above, you can also use command-line arguments to change the model configuration. @@ -107,7 +107,7 @@ them and fine-tune it for your own dataset. This will help you in two main ways: ```bash CUDA_VISIBLE_DEVICES="0" python recipes/ljspeech/glow_tts/train_glowtts.py \ - --restore_path /home/ubuntu/.local/share/tts/tts_models--en--ljspeech--glow-tts/model_file.pth.tar + --restore_path /home/ubuntu/.local/share/tts/tts_models--en--ljspeech--glow-tts/model_file.pth --coqpit.run_name "glow-tts-finetune" \ --coqpit.lr 0.00001 ``` diff --git a/docs/source/inference.md b/docs/source/inference.md index 544473bf..1057d04d 100644 --- a/docs/source/inference.md +++ b/docs/source/inference.md @@ -44,7 +44,7 @@ Run your own TTS model (Using Griffin-Lim Vocoder) ```bash tts --text "Text for TTS" \ - --model_path path/to/model.pth.tar \ + --model_path path/to/model.pth \ --config_path path/to/config.json \ --out_path folder/to/save/output.wav ``` @@ -54,9 +54,9 @@ Run your own TTS and Vocoder models ```bash tts --text "Text for TTS" \ --config_path path/to/config.json \ - --model_path path/to/model.pth.tar \ + --model_path path/to/model.pth \ --out_path folder/to/save/output.wav \ - --vocoder_path path/to/vocoder.pth.tar \ + --vocoder_path path/to/vocoder.pth \ --vocoder_config_path path/to/vocoder_config.json ``` diff --git a/docs/source/main_classes/trainer_api.md b/docs/source/main_classes/trainer_api.md index a5c3cfb7..f765fff7 100644 --- a/docs/source/main_classes/trainer_api.md +++ b/docs/source/main_classes/trainer_api.md @@ -1,17 +1,3 @@ # Trainer API -The {class}`TTS.trainer.Trainer` provides a lightweight, extensible, and feature-complete training run-time. We optimized it for 🐸 but -can also be used for any DL training in different domains. It supports distributed multi-gpu, mixed-precision (apex or torch.amp) training. - - -## Trainer -```{eval-rst} -.. autoclass:: TTS.trainer.Trainer - :members: -``` - -## TrainingArgs -```{eval-rst} -.. 
autoclass:: TTS.trainer.TrainingArgs - :members: -``` \ No newline at end of file +We made the trainer a separate project at https://github.com/coqui-ai/Trainer diff --git a/docs/source/training_a_model.md b/docs/source/training_a_model.md index a28710d0..22090f6e 100644 --- a/docs/source/training_a_model.md +++ b/docs/source/training_a_model.md @@ -33,7 +33,7 @@ If you like to run a multi-gpu training using DDP back-end, ```bash - $ CUDA_VISIBLE_DEVICES="0, 1, 2" python TTS/bin/distribute.py --script /train_glowtts.py + $ CUDA_VISIBLE_DEVICES="0, 1, 2" python -m trainer.distribute --script /train_glowtts.py ``` The example above runs a multi-gpu training using GPUs `0, 1, 2`. @@ -122,7 +122,7 @@ ```bash $ tts --text "Text for TTS" \ - --model_path path/to/checkpoint_x.pth.tar \ + --model_path path/to/checkpoint_x.pth \ --config_path path/to/config.json \ --out_path folder/to/save/output.wav ``` diff --git a/docs/source/tutorial_for_nervous_beginners.md b/docs/source/tutorial_for_nervous_beginners.md index fa09cb7d..d2d3c4bb 100644 --- a/docs/source/tutorial_for_nervous_beginners.md +++ b/docs/source/tutorial_for_nervous_beginners.md @@ -50,13 +50,13 @@ A breakdown of a simple script that trains a GlowTTS model on the LJspeech datas - Fine-tune a model. ```bash - CUDA_VISIBLE_DEVICES=0 python train.py --restore_path path/to/model/checkpoint.pth.tar + CUDA_VISIBLE_DEVICES=0 python train.py --restore_path path/to/model/checkpoint.pth ``` - Run multi-gpu training. ```bash - CUDA_VISIBLE_DEVICES=0,1,2 python TTS/bin/distribute.py --script train.py + CUDA_VISIBLE_DEVICES=0,1,2 python -m trainer.distribute --script train.py ``` ### CLI Way diff --git a/notebooks/ExtractTTSpectrogram.ipynb b/notebooks/ExtractTTSpectrogram.ipynb index 50b60ff0..a257b6bf 100644 --- a/notebooks/ExtractTTSpectrogram.ipynb +++ b/notebooks/ExtractTTSpectrogram.ipynb @@ -66,7 +66,7 @@ "DATASET = \"ljspeech\"\n", "METADATA_FILE = \"metadata.csv\"\n", "CONFIG_PATH = \"/home/ubuntu/.local/share/tts/tts_models--en--ljspeech--tacotron2-DDC_ph/config.json\"\n", - "MODEL_FILE = \"/home/ubuntu/.local/share/tts/tts_models--en--ljspeech--tacotron2-DDC_ph/model_file.pth.tar\"\n", + "MODEL_FILE = \"/home/ubuntu/.local/share/tts/tts_models--en--ljspeech--tacotron2-DDC_ph/model_file.pth\"\n", "BATCH_SIZE = 32\n", "\n", "QUANTIZED_WAV = False\n", diff --git a/notebooks/PlotUmapLibriTTS.ipynb b/notebooks/PlotUmapLibriTTS.ipynb index c809a5c4..1e29790b 100644 --- a/notebooks/PlotUmapLibriTTS.ipynb +++ b/notebooks/PlotUmapLibriTTS.ipynb @@ -66,7 +66,7 @@ "outputs": [], "source": [ "MODEL_RUN_PATH = \"/media/erogol/data_ssd/Models/libri_tts/speaker_encoder/libritts_360-half-October-31-2019_04+54PM-19d2f5f/\"\n", - "MODEL_PATH = MODEL_RUN_PATH + \"best_model.pth.tar\"\n", + "MODEL_PATH = MODEL_RUN_PATH + \"best_model.pth\"\n", "CONFIG_PATH = MODEL_RUN_PATH + \"config.json\"\n", "\n", "# My single speaker locations\n", diff --git a/notebooks/TestAttention.ipynb b/notebooks/TestAttention.ipynb index 5d8eed85..b257ff70 100644 --- a/notebooks/TestAttention.ipynb +++ b/notebooks/TestAttention.ipynb @@ -73,7 +73,7 @@ "\n", "# Set constants\n", "ROOT_PATH = '/home/erogol/Models/LJSpeech/ljspeech-May-20-2020_12+29PM-1835628/'\n", - "MODEL_PATH = ROOT_PATH + '/best_model.pth.tar'\n", + "MODEL_PATH = ROOT_PATH + '/best_model.pth'\n", "CONFIG_PATH = ROOT_PATH + '/config.json'\n", "OUT_FOLDER = './hard_sentences/'\n", "CONFIG = load_config(CONFIG_PATH)\n", diff --git a/notebooks/dataset_analysis/AnalyzeDataset.ipynb 
b/notebooks/dataset_analysis/AnalyzeDataset.ipynb index e08f3ab3..51963847 100644 --- a/notebooks/dataset_analysis/AnalyzeDataset.ipynb +++ b/notebooks/dataset_analysis/AnalyzeDataset.ipynb @@ -416,7 +416,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.1" + "version": "3.9.5" } }, "nbformat": 4, diff --git a/notebooks/dataset_analysis/CheckSpectrograms.ipynb b/notebooks/dataset_analysis/CheckSpectrograms.ipynb index 74ca51ab..47e5c4cf 100644 --- a/notebooks/dataset_analysis/CheckSpectrograms.ipynb +++ b/notebooks/dataset_analysis/CheckSpectrograms.ipynb @@ -3,6 +3,10 @@ { "cell_type": "code", "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], "source": [ "%matplotlib inline\n", "\n", @@ -12,21 +16,51 @@ "\n", "import IPython.display as ipd\n", "import glob" - ], - "outputs": [], - "metadata": { - "Collapsed": "false" - } + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], "source": [ - "config_path = \"/home/erogol/gdrive/Projects/TTS/recipes/ljspeech/align_tts/config_transformer2.json\"\n", - "data_path = \"/home/erogol/gdrive/Datasets/LJSpeech-1.1/\"\n", - "\n", - "file_paths = glob.glob(data_path + \"/**/*.wav\", recursive=True)\n", - "CONFIG = load_config(config_path)\n", + "from TTS.config.shared_configs import BaseAudioConfig\n", + "CONFIG = BaseAudioConfig()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## ✍️ Set these values " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data_path = \"/root/wav48_silence_trimmed/\"\n", + "file_ext = \".flac\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Read audio files" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "file_paths = glob.glob(data_path + f\"/**/*{file_ext}\", recursive=True)\n", "\n", "# Change this to the index of the desired file listed below\n", "sample_file_index = 10\n", @@ -35,44 +69,45 @@ "\n", "print(\"File list, by index:\")\n", "dict(enumerate(file_paths))" - ], - "outputs": [], - "metadata": { - "Collapsed": "false" - } + ] }, { "cell_type": "markdown", + "metadata": { + "Collapsed": "false" + }, "source": [ - "### Setup Audio Processor\n", + "## ✍️ Set Audio Processor\n", "Play with the AP parameters until you find a good fit with the synthesis speech below.\n", "\n", "The default values are loaded from your config.json file, so you only need to\n", "uncomment and modify values below that you'd like to tune." - ], - "metadata": { - "Collapsed": "false" - } + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], "source": [ "tune_params={\n", - "# 'audio_processor': 'audio',\n", - "# 'num_mels': 80, # In general, you don't need to change this. \n", - "# 'fft_size': 1024, # In general, you don't need to change this.\n", - "# 'sample_rate': 22050, # This must match the sample rate of the dataset.\n", - "# 'hop_length': 256, # In general, you don't need to change this.\n", - "# 'win_length': 1024, # In general, you don't need to change this.\n", - "# 'preemphasis': 0.98, # In general, 0 gives better voice recovery but makes training harder. 
If your model does not train, try 0.97 - 0.99.\n", - "# 'min_level_db': -100,\n", - "# 'ref_level_db': 0, # The base DB; increase until all background noise is removed in the spectrogram, then lower until you hear better speech below.\n", - "# 'power': 1.5, # Change this value and listen to the synthesized voice. 1.2 - 1.5 are resonable values.\n", - "# 'griffin_lim_iters': 60, # Quality does not improve for values > 60\n", - "# 'mel_fmin': 0.0, # Adjust this and check mel-spectrogram-based voice synthesis below.\n", - "# 'mel_fmax': 8000.0, # Adjust this and check mel-spectrogram-based voice synthesis below.\n", - "# 'do_trim_silence': True # If you dataset has some silience at the beginning or end, this trims it. Check the AP.load_wav() below,if it causes any difference for the loaded audio file.\n", + " 'num_mels': 80, # In general, you don't need to change this. \n", + " 'fft_size': 2400, # In general, you don't need to change this.\n", + " 'frame_length_ms': 50, \n", + " 'frame_shift_ms': 12.5,\n", + " 'sample_rate': 48000, # This must match the sample rate of the dataset.\n", + " 'hop_length': None, # In general, you don't need to change this.\n", + " 'win_length': 1024, # In general, you don't need to change this.\n", + " 'preemphasis': 0.98, # In general, 0 gives better voice recovery but makes training harder. If your model does not train, try 0.97 - 0.99.\n", + " 'min_level_db': -100,\n", + " 'ref_level_db': 0, # The base DB; increase until all background noise is removed in the spectrogram, then lower until you hear better speech below.\n", + " 'power': 1.5, # Change this value and listen to the synthesized voice. 1.2 - 1.5 are resonable values.\n", + " 'griffin_lim_iters': 60, # Quality does not improve for values > 60\n", + " 'mel_fmin': 0.0, # Adjust this and check mel-spectrogram-based voice synthesis below.\n", + " 'mel_fmax': 8000.0, # Adjust this and check mel-spectrogram-based voice synthesis below.\n", + " 'do_trim_silence': True # If you dataset has some silience at the beginning or end, this trims it. 
Check the AP.load_wav() below,if it causes any difference for the loaded audio file.\n", "}\n", "\n", "# These options have to be forced off in order to avoid errors about the \n", @@ -86,59 +121,57 @@ "}\n", "\n", "# Override select parts of loaded config with parameters above\n", - "tuned_config = CONFIG.audio.copy()\n", + "tuned_config = CONFIG.copy()\n", "tuned_config.update(reset)\n", "tuned_config.update(tune_params)\n", "\n", "AP = AudioProcessor(**tuned_config);" - ], - "outputs": [], - "metadata": { - "Collapsed": "false" - } + ] }, { "cell_type": "markdown", - "source": [ - "### Check audio loading " - ], "metadata": { "Collapsed": "false" - } + }, + "source": [ + "### Check audio loading " + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], "source": [ "wav = AP.load_wav(SAMPLE_FILE_PATH)\n", "ipd.Audio(data=wav, rate=AP.sample_rate) " - ], - "outputs": [], - "metadata": { - "Collapsed": "false" - } + ] }, { "cell_type": "markdown", - "source": [ - "### Generate Mel-Spectrogram and Re-synthesis with GL" - ], "metadata": { "Collapsed": "false" - } + }, + "source": [ + "### Generate Mel-Spectrogram and Re-synthesis with GL" + ] }, { "cell_type": "code", "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "AP.power = 1.5" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "mel = AP.melspectrogram(wav)\n", "print(\"Max:\", mel.max())\n", @@ -148,24 +181,24 @@ "\n", "wav_gen = AP.inv_melspectrogram(mel)\n", "ipd.Audio(wav_gen, rate=AP.sample_rate)" - ], - "outputs": [], - "metadata": { - "Collapsed": "false" - } + ] }, { "cell_type": "markdown", - "source": [ - "### Generate Linear-Spectrogram and Re-synthesis with GL" - ], "metadata": { "Collapsed": "false" - } + }, + "source": [ + "### Generate Linear-Spectrogram and Re-synthesis with GL" + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], "source": [ "spec = AP.spectrogram(wav)\n", "print(\"Max:\", spec.max())\n", @@ -175,26 +208,26 @@ "\n", "wav_gen = AP.inv_spectrogram(spec)\n", "ipd.Audio(wav_gen, rate=AP.sample_rate)" - ], - "outputs": [], - "metadata": { - "Collapsed": "false" - } + ] }, { "cell_type": "markdown", + "metadata": { + "Collapsed": "false" + }, "source": [ "### Compare values for a certain parameter\n", "\n", "Optimize your parameters by comparing different values per parameter at a time." 
- ], - "metadata": { - "Collapsed": "false" - } + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], "source": [ "from librosa import display\n", "from matplotlib import pylab as plt\n", @@ -234,39 +267,39 @@ " val = values[idx]\n", " print(\" > {} = {}\".format(attribute, val))\n", " IPython.display.display(IPython.display.Audio(wav_gen, rate=AP.sample_rate))" - ], - "outputs": [], - "metadata": { - "Collapsed": "false" - } + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "Collapsed": "false" + }, + "outputs": [], "source": [ "compare_values(\"preemphasis\", [0, 0.5, 0.97, 0.98, 0.99])" - ], - "outputs": [], - "metadata": { - "Collapsed": "false" - } + ] }, { "cell_type": "code", "execution_count": null, - "source": [ - "compare_values(\"ref_level_db\", [2, 5, 10, 15, 20, 25, 30, 35, 40, 1000])" - ], - "outputs": [], "metadata": { "Collapsed": "false" - } + }, + "outputs": [], + "source": [ + "compare_values(\"ref_level_db\", [2, 5, 10, 15, 20, 25, 30, 35, 40, 1000])" + ] } ], "metadata": { + "interpreter": { + "hash": "27648abe09795c3a768a281b31f7524fcf66a207e733f8ecda3a4e1fd1059fb0" + }, "kernelspec": { - "name": "python3", - "display_name": "Python 3.8.5 64-bit ('torch': conda)" + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -278,12 +311,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" - }, - "interpreter": { - "hash": "27648abe09795c3a768a281b31f7524fcf66a207e733f8ecda3a4e1fd1059fb0" + "version": "3.9.5" } }, "nbformat": 4, "nbformat_minor": 4 -} \ No newline at end of file +} diff --git a/pyproject.toml b/pyproject.toml index 0941a906..b775f12a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ max-line-length=120 [tool.black] line-length = 120 -target-version = ['py38'] +target-version = ['py39'] exclude = ''' ( diff --git a/recipes/ljspeech/align_tts/train_aligntts.py b/recipes/ljspeech/align_tts/train_aligntts.py index f1b29025..591b1509 100644 --- a/recipes/ljspeech/align_tts/train_aligntts.py +++ b/recipes/ljspeech/align_tts/train_aligntts.py @@ -49,7 +49,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config) # You can define your custom sample loader returning the list of samples. # Or define your custom formatter and pass it to the `load_tts_samples`. # Check `TTS.tts.datasets.load_tts_samples` for more details. -train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) +train_samples, eval_samples = load_tts_samples( + dataset_config, + eval_split=True, + eval_split_max_size=config.eval_split_max_size, + eval_split_size=config.eval_split_size, +) # init model model = AlignTTS(config, ap, tokenizer) diff --git a/recipes/ljspeech/fast_pitch/train_fast_pitch.py b/recipes/ljspeech/fast_pitch/train_fast_pitch.py index a3fc35c9..a84658f3 100644 --- a/recipes/ljspeech/fast_pitch/train_fast_pitch.py +++ b/recipes/ljspeech/fast_pitch/train_fast_pitch.py @@ -84,7 +84,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config) # You can define your custom sample loader returning the list of samples. # Or define your custom formatter and pass it to the `load_tts_samples`. # Check `TTS.tts.datasets.load_tts_samples` for more details. 
-train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) +train_samples, eval_samples = load_tts_samples( + dataset_config, + eval_split=True, + eval_split_max_size=config.eval_split_max_size, + eval_split_size=config.eval_split_size, +) # init the model model = ForwardTTS(config, ap, tokenizer, speaker_manager=None) diff --git a/recipes/ljspeech/fast_speech/train_fast_speech.py b/recipes/ljspeech/fast_speech/train_fast_speech.py index 560d3de2..0245dd93 100644 --- a/recipes/ljspeech/fast_speech/train_fast_speech.py +++ b/recipes/ljspeech/fast_speech/train_fast_speech.py @@ -83,7 +83,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config) # You can define your custom sample loader returning the list of samples. # Or define your custom formatter and pass it to the `load_tts_samples`. # Check `TTS.tts.datasets.load_tts_samples` for more details. -train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) +train_samples, eval_samples = load_tts_samples( + dataset_config, + eval_split=True, + eval_split_max_size=config.eval_split_max_size, + eval_split_size=config.eval_split_size, +) # init the model model = ForwardTTS(config, ap, tokenizer) diff --git a/recipes/ljspeech/glow_tts/train_glowtts.py b/recipes/ljspeech/glow_tts/train_glowtts.py index c47cd00a..a0b4ac48 100644 --- a/recipes/ljspeech/glow_tts/train_glowtts.py +++ b/recipes/ljspeech/glow_tts/train_glowtts.py @@ -60,7 +60,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config) # You can define your custom sample loader returning the list of samples. # Or define your custom formatter and pass it to the `load_tts_samples`. # Check `TTS.tts.datasets.load_tts_samples` for more details. -train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) +train_samples, eval_samples = load_tts_samples( + dataset_config, + eval_split=True, + eval_split_max_size=config.eval_split_max_size, + eval_split_size=config.eval_split_size, +) # INITIALIZE THE MODEL # Models take a config object and a speaker manager as input diff --git a/recipes/ljspeech/hifigan/train_hifigan.py b/recipes/ljspeech/hifigan/train_hifigan.py index 1e5bbf30..b4cbae63 100644 --- a/recipes/ljspeech/hifigan/train_hifigan.py +++ b/recipes/ljspeech/hifigan/train_hifigan.py @@ -37,16 +37,10 @@ ap = AudioProcessor(**config.audio.to_dict()) eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size) # init model -model = GAN(config) +model = GAN(config, ap) # init the trainer and 🚀 trainer = Trainer( - TrainerArgs(), - config, - output_path, - model=model, - train_samples=train_samples, - eval_samples=eval_samples, - training_assets={"audio_processor": ap}, + TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples ) trainer.fit() diff --git a/recipes/ljspeech/multiband_melgan/train_multiband_melgan.py b/recipes/ljspeech/multiband_melgan/train_multiband_melgan.py index 40ff5a00..225f5a30 100644 --- a/recipes/ljspeech/multiband_melgan/train_multiband_melgan.py +++ b/recipes/ljspeech/multiband_melgan/train_multiband_melgan.py @@ -37,16 +37,10 @@ ap = AudioProcessor(**config.audio.to_dict()) eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size) # init model -model = GAN(config) +model = GAN(config, ap) # init the trainer and 🚀 trainer = Trainer( - TrainerArgs(), - config, - output_path, - model=model, - train_samples=train_samples, - eval_samples=eval_samples, - training_assets={"audio_processor": ap}, + 
TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples ) trainer.fit() diff --git a/recipes/ljspeech/speedy_speech/train_speedy_speech.py b/recipes/ljspeech/speedy_speech/train_speedy_speech.py index 7ad132b2..1ab3db1c 100644 --- a/recipes/ljspeech/speedy_speech/train_speedy_speech.py +++ b/recipes/ljspeech/speedy_speech/train_speedy_speech.py @@ -67,7 +67,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config) # You can define your custom sample loader returning the list of samples. # Or define your custom formatter and pass it to the `load_tts_samples`. # Check `TTS.tts.datasets.load_tts_samples` for more details. -train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) +train_samples, eval_samples = load_tts_samples( + dataset_config, + eval_split=True, + eval_split_max_size=config.eval_split_max_size, + eval_split_size=config.eval_split_size, +) # init model model = ForwardTTS(config, ap, tokenizer) diff --git a/recipes/ljspeech/tacotron2-DCA/train_tacotron_dca.py b/recipes/ljspeech/tacotron2-DCA/train_tacotron_dca.py index ea1b0874..a9f253ea 100644 --- a/recipes/ljspeech/tacotron2-DCA/train_tacotron_dca.py +++ b/recipes/ljspeech/tacotron2-DCA/train_tacotron_dca.py @@ -77,7 +77,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config) # You can define your custom sample loader returning the list of samples. # Or define your custom formatter and pass it to the `load_tts_samples`. # Check `TTS.tts.datasets.load_tts_samples` for more details. -train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) +train_samples, eval_samples = load_tts_samples( + dataset_config, + eval_split=True, + eval_split_max_size=config.eval_split_max_size, + eval_split_size=config.eval_split_size, +) # INITIALIZE THE MODEL # Models take a config object and a speaker manager as input diff --git a/recipes/ljspeech/tacotron2-DDC/train_tacotron_ddc.py b/recipes/ljspeech/tacotron2-DDC/train_tacotron_ddc.py index d00f8ed7..99089db8 100644 --- a/recipes/ljspeech/tacotron2-DDC/train_tacotron_ddc.py +++ b/recipes/ljspeech/tacotron2-DDC/train_tacotron_ddc.py @@ -74,7 +74,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config) # You can define your custom sample loader returning the list of samples. # Or define your custom formatter and pass it to the `load_tts_samples`. # Check `TTS.tts.datasets.load_tts_samples` for more details. 
-train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) +train_samples, eval_samples = load_tts_samples( + dataset_config, + eval_split=True, + eval_split_max_size=config.eval_split_max_size, + eval_split_size=config.eval_split_size, +) # INITIALIZE THE MODEL # Models take a config object and a speaker manager as input @@ -84,12 +89,6 @@ model = Tacotron2(config, ap, tokenizer, speaker_manager=None) # init the trainer and 🚀 trainer = Trainer( - TrainerArgs(), - config, - output_path, - model=model, - train_samples=train_samples, - eval_samples=eval_samples, - training_assets={"audio_processor": ap}, + TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples ) trainer.fit() diff --git a/recipes/ljspeech/univnet/train.py b/recipes/ljspeech/univnet/train.py index 19c91925..81d2b889 100644 --- a/recipes/ljspeech/univnet/train.py +++ b/recipes/ljspeech/univnet/train.py @@ -36,16 +36,10 @@ ap = AudioProcessor(**config.audio.to_dict()) eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size) # init model -model = GAN(config) +model = GAN(config, ap) # init the trainer and 🚀 trainer = Trainer( - TrainerArgs(), - config, - output_path, - model=model, - train_samples=train_samples, - eval_samples=eval_samples, - training_assets={"audio_processor": ap}, + TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples ) trainer.fit() diff --git a/recipes/ljspeech/vits_tts/train_vits.py b/recipes/ljspeech/vits_tts/train_vits.py index cfb3351d..c070b3f1 100644 --- a/recipes/ljspeech/vits_tts/train_vits.py +++ b/recipes/ljspeech/vits_tts/train_vits.py @@ -69,7 +69,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config) # You can define your custom sample loader returning the list of samples. # Or define your custom formatter and pass it to the `load_tts_samples`. # Check `TTS.tts.datasets.load_tts_samples` for more details. -train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) +train_samples, eval_samples = load_tts_samples( + dataset_config, + eval_split=True, + eval_split_max_size=config.eval_split_max_size, + eval_split_size=config.eval_split_size, +) # init model model = Vits(config, ap, tokenizer, speaker_manager=None) diff --git a/recipes/multilingual/vits_tts/train_vits_tts.py b/recipes/multilingual/vits_tts/train_vits_tts.py index ac2c21a2..0e650ade 100644 --- a/recipes/multilingual/vits_tts/train_vits_tts.py +++ b/recipes/multilingual/vits_tts/train_vits_tts.py @@ -7,9 +7,10 @@ from TTS.config.shared_configs import BaseAudioConfig from TTS.tts.configs.shared_configs import BaseDatasetConfig from TTS.tts.configs.vits_config import VitsConfig from TTS.tts.datasets import load_tts_samples -from TTS.tts.models.vits import Vits, VitsArgs +from TTS.tts.models.vits import CharactersConfig, Vits, VitsArgs from TTS.tts.utils.languages import LanguageManager from TTS.tts.utils.speakers import SpeakerManager +from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.utils.audio import AudioProcessor output_path = os.path.dirname(os.path.abspath(__file__)) @@ -73,15 +74,16 @@ config = VitsConfig( max_audio_len=160000, output_path=output_path, datasets=dataset_config, - characters={ - "pad": "_", - "eos": "&", - "bos": "*", - "characters": "!¡'(),-.:;¿?abcdefghijklmnopqrstuvwxyzµßàáâäåæçèéêëìíîïñòóôöùúûüąćęłńœśşźżƒабвгдежзийклмнопрстуфхцчшщъыьэюяёєіїґӧ «°±µ»$%&‘’‚“`”„", - "punctuations": "!¡'(),-.:;¿? 
", - "phonemes": None, - "unique": True, - }, + characters=CharactersConfig( + characters_class="TTS.tts.models.vits.VitsCharacters", + pad="", + eos="", + bos="", + blank="", + characters="!¡'(),-.:;¿?abcdefghijklmnopqrstuvwxyzµßàáâäåæçèéêëìíîïñòóôöùúûüąćęłńœśşźżƒабвгдежзийклмнопрстуфхцчшщъыьэюяёєіїґӧ «°±µ»$%&‘’‚“`”„", + punctuations="!¡'(),-.:;¿? ", + phonemes=None, + ), test_sentences=[ [ "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", @@ -100,32 +102,39 @@ config = VitsConfig( ], ) +# force the convertion of the custom characters to a config attribute +config.from_dict(config.to_dict()) + # init audio processor ap = AudioProcessor(**config.audio.to_dict()) # load training samples -train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) +train_samples, eval_samples = load_tts_samples( + dataset_config, + eval_split=True, + eval_split_max_size=config.eval_split_max_size, + eval_split_size=config.eval_split_size, +) # init speaker manager for multi-speaker training # it maps speaker-id to speaker-name in the model and data-loader speaker_manager = SpeakerManager() -speaker_manager.set_speaker_ids_from_data(train_samples + eval_samples) +speaker_manager.set_ids_from_data(train_samples + eval_samples, parse_key="speaker_name") config.model_args.num_speakers = speaker_manager.num_speakers language_manager = LanguageManager(config=config) config.model_args.num_languages = language_manager.num_languages +# INITIALIZE THE TOKENIZER +# Tokenizer is used to convert text to sequences of token IDs. +# config is updated with the default characters if not defined in the config. +tokenizer, config = TTSTokenizer.init_from_config(config) + # init model -model = Vits(config, speaker_manager, language_manager) +model = Vits(config, ap, tokenizer, speaker_manager, language_manager) # init the trainer and 🚀 trainer = Trainer( - TrainerArgs(), - config, - output_path, - model=model, - train_samples=train_samples, - eval_samples=eval_samples, - training_assets={"audio_processor": ap}, + TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples ) trainer.fit() diff --git a/recipes/vctk/fast_pitch/train_fast_pitch.py b/recipes/vctk/fast_pitch/train_fast_pitch.py index 986202c5..c39932da 100644 --- a/recipes/vctk/fast_pitch/train_fast_pitch.py +++ b/recipes/vctk/fast_pitch/train_fast_pitch.py @@ -71,12 +71,17 @@ tokenizer, config = TTSTokenizer.init_from_config(config) # You can define your custom sample loader returning the list of samples. # Or define your custom formatter and pass it to the `load_tts_samples`. # Check `TTS.tts.datasets.load_tts_samples` for more details. 
-train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) +train_samples, eval_samples = load_tts_samples( + dataset_config, + eval_split=True, + eval_split_max_size=config.eval_split_max_size, + eval_split_size=config.eval_split_size, +) # init speaker manager for multi-speaker training # it maps speaker-id to speaker-name in the model and data-loader speaker_manager = SpeakerManager() -speaker_manager.set_speaker_ids_from_data(train_samples + eval_samples) +speaker_manager.set_ids_from_data(train_samples + eval_samples, parse_key="speaker_name") config.model_args.num_speakers = speaker_manager.num_speakers # init model diff --git a/recipes/vctk/fast_speech/train_fast_speech.py b/recipes/vctk/fast_speech/train_fast_speech.py index fe785a41..a3249de1 100644 --- a/recipes/vctk/fast_speech/train_fast_speech.py +++ b/recipes/vctk/fast_speech/train_fast_speech.py @@ -69,12 +69,17 @@ tokenizer, config = TTSTokenizer.init_from_config(config) # You can define your custom sample loader returning the list of samples. # Or define your custom formatter and pass it to the `load_tts_samples`. # Check `TTS.tts.datasets.load_tts_samples` for more details. -train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) +train_samples, eval_samples = load_tts_samples( + dataset_config, + eval_split=True, + eval_split_max_size=config.eval_split_max_size, + eval_split_size=config.eval_split_size, +) # init speaker manager for multi-speaker training # it maps speaker-id to speaker-name in the model and data-loader speaker_manager = SpeakerManager() -speaker_manager.set_speaker_ids_from_data(train_samples + eval_samples) +speaker_manager.set_ids_from_data(train_samples + eval_samples, parse_key="speaker_name") config.model_args.num_speakers = speaker_manager.num_speakers # init model diff --git a/recipes/vctk/glow_tts/train_glow_tts.py b/recipes/vctk/glow_tts/train_glow_tts.py index ebdbfb37..23c02efc 100644 --- a/recipes/vctk/glow_tts/train_glow_tts.py +++ b/recipes/vctk/glow_tts/train_glow_tts.py @@ -69,12 +69,17 @@ tokenizer, config = TTSTokenizer.init_from_config(config) # You can define your custom sample loader returning the list of samples. # Or define your custom formatter and pass it to the `load_tts_samples`. # Check `TTS.tts.datasets.load_tts_samples` for more details. -train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) +train_samples, eval_samples = load_tts_samples( + dataset_config, + eval_split=True, + eval_split_max_size=config.eval_split_max_size, + eval_split_size=config.eval_split_size, +) # init speaker manager for multi-speaker training # it maps speaker-id to speaker-name in the model and data-loader speaker_manager = SpeakerManager() -speaker_manager.set_speaker_ids_from_data(train_samples + eval_samples) +speaker_manager.set_ids_from_data(train_samples + eval_samples, parse_key="speaker_name") config.num_speakers = speaker_manager.num_speakers # init model diff --git a/recipes/vctk/speedy_speech/train_speedy_speech.py b/recipes/vctk/speedy_speech/train_speedy_speech.py index 80d21ca2..bcd0105a 100644 --- a/recipes/vctk/speedy_speech/train_speedy_speech.py +++ b/recipes/vctk/speedy_speech/train_speedy_speech.py @@ -69,12 +69,17 @@ tokenizer, config = TTSTokenizer.init_from_config(config) # You can define your custom sample loader returning the list of samples. # Or define your custom formatter and pass it to the `load_tts_samples`. # Check `TTS.tts.datasets.load_tts_samples` for more details. 
-train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) +train_samples, eval_samples = load_tts_samples( + dataset_config, + eval_split=True, + eval_split_max_size=config.eval_split_max_size, + eval_split_size=config.eval_split_size, +) # init speaker manager for multi-speaker training # it maps speaker-id to speaker-name in the model and data-loader speaker_manager = SpeakerManager() -speaker_manager.set_speaker_ids_from_data(train_samples + eval_samples) +speaker_manager.set_ids_from_data(train_samples + eval_samples, parse_key="speaker_name") config.model_args.num_speakers = speaker_manager.num_speakers # init model diff --git a/recipes/vctk/tacotron-DDC/train_tacotron-DDC.py b/recipes/vctk/tacotron-DDC/train_tacotron-DDC.py index bed21ad9..36e28ed7 100644 --- a/recipes/vctk/tacotron-DDC/train_tacotron-DDC.py +++ b/recipes/vctk/tacotron-DDC/train_tacotron-DDC.py @@ -72,12 +72,17 @@ tokenizer, config = TTSTokenizer.init_from_config(config) # You can define your custom sample loader returning the list of samples. # Or define your custom formatter and pass it to the `load_tts_samples`. # Check `TTS.tts.datasets.load_tts_samples` for more details. -train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) +train_samples, eval_samples = load_tts_samples( + dataset_config, + eval_split=True, + eval_split_max_size=config.eval_split_max_size, + eval_split_size=config.eval_split_size, +) # init speaker manager for multi-speaker training # it mainly handles speaker-id to speaker-name for the model and the data-loader speaker_manager = SpeakerManager() -speaker_manager.set_speaker_ids_from_data(train_samples + eval_samples) +speaker_manager.set_ids_from_data(train_samples + eval_samples, parse_key="speaker_name") # init model model = Tacotron(config, ap, tokenizer, speaker_manager) diff --git a/recipes/vctk/tacotron2-DDC/train_tacotron2-ddc.py b/recipes/vctk/tacotron2-DDC/train_tacotron2-ddc.py index caa745b3..d04d91c0 100644 --- a/recipes/vctk/tacotron2-DDC/train_tacotron2-ddc.py +++ b/recipes/vctk/tacotron2-DDC/train_tacotron2-ddc.py @@ -78,12 +78,17 @@ tokenizer, config = TTSTokenizer.init_from_config(config) # You can define your custom sample loader returning the list of samples. # Or define your custom formatter and pass it to the `load_tts_samples`. # Check `TTS.tts.datasets.load_tts_samples` for more details. -train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) +train_samples, eval_samples = load_tts_samples( + dataset_config, + eval_split=True, + eval_split_max_size=config.eval_split_max_size, + eval_split_size=config.eval_split_size, +) # init speaker manager for multi-speaker training # it mainly handles speaker-id to speaker-name for the model and the data-loader speaker_manager = SpeakerManager() -speaker_manager.set_speaker_ids_from_data(train_samples + eval_samples) +speaker_manager.set_ids_from_data(train_samples + eval_samples, parse_key="speaker_name") # init model model = Tacotron2(config, ap, tokenizer, speaker_manager) diff --git a/recipes/vctk/tacotron2/train_tacotron2.py b/recipes/vctk/tacotron2/train_tacotron2.py index 43f5d4e6..5a0e157a 100644 --- a/recipes/vctk/tacotron2/train_tacotron2.py +++ b/recipes/vctk/tacotron2/train_tacotron2.py @@ -78,12 +78,17 @@ tokenizer, config = TTSTokenizer.init_from_config(config) # You can define your custom sample loader returning the list of samples. # Or define your custom formatter and pass it to the `load_tts_samples`. 
# Check `TTS.tts.datasets.load_tts_samples` for more details. -train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) +train_samples, eval_samples = load_tts_samples( + dataset_config, + eval_split=True, + eval_split_max_size=config.eval_split_max_size, + eval_split_size=config.eval_split_size, +) # init speaker manager for multi-speaker training # it mainly handles speaker-id to speaker-name for the model and the data-loader speaker_manager = SpeakerManager() -speaker_manager.set_speaker_ids_from_data(train_samples + eval_samples) +speaker_manager.set_ids_from_data(train_samples + eval_samples, parse_key="speaker_name") # init model model = Tacotron2(config, ap, tokenizer, speaker_manager) diff --git a/recipes/vctk/vits/train_vits.py b/recipes/vctk/vits/train_vits.py index dff4eefc..88fd7de9 100644 --- a/recipes/vctk/vits/train_vits.py +++ b/recipes/vctk/vits/train_vits.py @@ -53,6 +53,7 @@ config = VitsConfig( epochs=1000, text_cleaner="english_cleaners", use_phonemes=True, + phoneme_language="en", phoneme_cache_path=os.path.join(output_path, "phoneme_cache"), compute_input_seq_cache=True, print_step=25, @@ -78,12 +79,17 @@ tokenizer, config = TTSTokenizer.init_from_config(config) # You can define your custom sample loader returning the list of samples. # Or define your custom formatter and pass it to the `load_tts_samples`. # Check `TTS.tts.datasets.load_tts_samples` for more details. -train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True) +train_samples, eval_samples = load_tts_samples( + dataset_config, + eval_split=True, + eval_split_max_size=config.eval_split_max_size, + eval_split_size=config.eval_split_size, +) # init speaker manager for multi-speaker training # it maps speaker-id to speaker-name in the model and data-loader speaker_manager = SpeakerManager() -speaker_manager.set_speaker_ids_from_data(train_samples + eval_samples) +speaker_manager.set_ids_from_data(train_samples + eval_samples, parse_key="speaker_name") config.model_args.num_speakers = speaker_manager.num_speakers # init model diff --git a/requirements.txt b/requirements.txt index e3871874..50c0d2ac 100644 --- a/requirements.txt +++ b/requirements.txt @@ -24,8 +24,8 @@ matplotlib tensorboardX pyworld # coqui stack -coqui-trainer -coqpit # config managemenr +trainer +coqpit # config management # chinese g2p deps jieba pypinyin @@ -33,6 +33,4 @@ pypinyin mecab-python3==1.0.3 unidic-lite==1.0.8 # gruut+supported langs -gruut[cs,de,es,fr,it,nl,pt,ru,sv]~=2.0.0 -# others -webrtcvad # for VAD +gruut[cs,de,es,fr,it,nl,pt,ru,sv]==2.2.3 diff --git a/tests/aux_tests/test_extract_tts_spectrograms.py b/tests/aux_tests/test_extract_tts_spectrograms.py index 8c795d58..ef751846 100644 --- a/tests/aux_tests/test_extract_tts_spectrograms.py +++ b/tests/aux_tests/test_extract_tts_spectrograms.py @@ -15,7 +15,7 @@ class TestExtractTTSSpectrograms(unittest.TestCase): def test_GlowTTS(): # set paths config_path = os.path.join(get_tests_input_path(), "test_glow_tts.json") - checkpoint_path = os.path.join(get_tests_output_path(), "checkpoint_test.pth.tar") + checkpoint_path = os.path.join(get_tests_output_path(), "checkpoint_test.pth") output_path = os.path.join(get_tests_output_path(), "output_extract_tts_spectrograms/") # load config c = load_config(config_path) @@ -33,7 +33,7 @@ class TestExtractTTSSpectrograms(unittest.TestCase): def test_Tacotron2(): # set paths config_path = os.path.join(get_tests_input_path(), "test_tacotron2_config.json") - checkpoint_path = 
os.path.join(get_tests_output_path(), "checkpoint_test.pth.tar") + checkpoint_path = os.path.join(get_tests_output_path(), "checkpoint_test.pth") output_path = os.path.join(get_tests_output_path(), "output_extract_tts_spectrograms/") # load config c = load_config(config_path) @@ -51,7 +51,7 @@ class TestExtractTTSSpectrograms(unittest.TestCase): def test_Tacotron(): # set paths config_path = os.path.join(get_tests_input_path(), "test_tacotron_config.json") - checkpoint_path = os.path.join(get_tests_output_path(), "checkpoint_test.pth.tar") + checkpoint_path = os.path.join(get_tests_output_path(), "checkpoint_test.pth") output_path = os.path.join(get_tests_output_path(), "output_extract_tts_spectrograms/") # load config c = load_config(config_path) diff --git a/tests/aux_tests/test_speaker_encoder.py b/tests/aux_tests/test_speaker_encoder.py index 97b3b92f..f2875cc1 100644 --- a/tests/aux_tests/test_speaker_encoder.py +++ b/tests/aux_tests/test_speaker_encoder.py @@ -3,9 +3,9 @@ import unittest import torch as T from tests import get_tests_input_path -from TTS.speaker_encoder.losses import AngleProtoLoss, GE2ELoss, SoftmaxAngleProtoLoss -from TTS.speaker_encoder.models.lstm import LSTMSpeakerEncoder -from TTS.speaker_encoder.models.resnet import ResNetSpeakerEncoder +from TTS.encoder.losses import AngleProtoLoss, GE2ELoss, SoftmaxAngleProtoLoss +from TTS.encoder.models.lstm import LSTMSpeakerEncoder +from TTS.encoder.models.resnet import ResNetSpeakerEncoder file_path = get_tests_input_path() diff --git a/tests/aux_tests/test_speaker_encoder_train.py b/tests/aux_tests/test_speaker_encoder_train.py index 7901fe5a..d9d6d71e 100644 --- a/tests/aux_tests/test_speaker_encoder_train.py +++ b/tests/aux_tests/test_speaker_encoder_train.py @@ -4,14 +4,14 @@ import shutil from tests import get_device_id, get_tests_output_path, run_cli from TTS.config.shared_configs import BaseAudioConfig -from TTS.speaker_encoder.speaker_encoder_config import SpeakerEncoderConfig +from TTS.encoder.configs.speaker_encoder_config import SpeakerEncoderConfig def run_test_train(): command = ( f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_encoder.py --config_path {config_path} " f"--coqpit.output_path {output_path} " - "--coqpit.datasets.0.name ljspeech " + "--coqpit.datasets.0.name ljspeech_test " "--coqpit.datasets.0.meta_file_train metadata.csv " "--coqpit.datasets.0.meta_file_val metadata.csv " "--coqpit.datasets.0.path tests/data/ljspeech " @@ -24,17 +24,21 @@ output_path = os.path.join(get_tests_output_path(), "train_outputs") config = SpeakerEncoderConfig( batch_size=4, - num_speakers_in_batch=1, - num_utters_per_speaker=10, - num_loader_workers=0, - max_train_step=2, + num_classes_in_batch=4, + num_utter_per_class=2, + eval_num_classes_in_batch=4, + eval_num_utter_per_class=2, + num_loader_workers=1, + epochs=1, print_step=1, - save_step=1, + save_step=2, print_eval=True, + run_eval=True, audio=BaseAudioConfig(num_mels=80), ) config.audio.do_trim_silence = True config.audio.trim_db = 60 +config.loss = "ge2e" config.save_json(config_path) print(config) @@ -69,14 +73,14 @@ run_cli(command_train) shutil.rmtree(continue_path) # test model with ge2e loss function -config.loss = "ge2e" -config.save_json(config_path) -run_test_train() +# config.loss = "ge2e" +# config.save_json(config_path) +# run_test_train() # test model with angleproto loss function -config.loss = "angleproto" -config.save_json(config_path) -run_test_train() +# config.loss = "angleproto" +# config.save_json(config_path) +# 
run_test_train() # test model with softmaxproto loss function config.loss = "softmaxproto" diff --git a/tests/aux_tests/test_speaker_manager.py b/tests/aux_tests/test_speaker_manager.py index fff49b13..7552e0a5 100644 --- a/tests/aux_tests/test_speaker_manager.py +++ b/tests/aux_tests/test_speaker_manager.py @@ -6,13 +6,13 @@ import torch from tests import get_tests_input_path from TTS.config import load_config -from TTS.speaker_encoder.utils.generic_utils import setup_speaker_encoder_model -from TTS.speaker_encoder.utils.io import save_checkpoint +from TTS.encoder.utils.generic_utils import setup_encoder_model +from TTS.encoder.utils.io import save_checkpoint from TTS.tts.utils.speakers import SpeakerManager from TTS.utils.audio import AudioProcessor encoder_config_path = os.path.join(get_tests_input_path(), "test_speaker_encoder_config.json") -encoder_model_path = os.path.join(get_tests_input_path(), "checkpoint_0.pth.tar") +encoder_model_path = os.path.join(get_tests_input_path(), "checkpoint_0.pth") sample_wav_path = os.path.join(get_tests_input_path(), "../data/ljspeech/wavs/LJ001-0001.wav") sample_wav_path2 = os.path.join(get_tests_input_path(), "../data/ljspeech/wavs/LJ001-0002.wav") d_vectors_file_path = os.path.join(get_tests_input_path(), "../data/dummy_speakers.json") @@ -28,7 +28,7 @@ class SpeakerManagerTest(unittest.TestCase): config.audio.resample = True # create a dummy speaker encoder - model = setup_speaker_encoder_model(config) + model = setup_encoder_model(config) save_checkpoint(model, None, None, get_tests_input_path(), 0) # load audio processor and speaker encoder @@ -38,19 +38,19 @@ class SpeakerManagerTest(unittest.TestCase): # load a sample audio and compute embedding waveform = ap.load_wav(sample_wav_path) mel = ap.melspectrogram(waveform) - d_vector = manager.compute_d_vector(mel) + d_vector = manager.compute_embeddings(mel) assert d_vector.shape[1] == 256 # compute d_vector directly from an input file - d_vector = manager.compute_d_vector_from_clip(sample_wav_path) - d_vector2 = manager.compute_d_vector_from_clip(sample_wav_path) + d_vector = manager.compute_embedding_from_clip(sample_wav_path) + d_vector2 = manager.compute_embedding_from_clip(sample_wav_path) d_vector = torch.FloatTensor(d_vector) d_vector2 = torch.FloatTensor(d_vector2) assert d_vector.shape[0] == 256 assert (d_vector - d_vector2).sum() == 0.0 # compute d_vector from a list of wav files. 
- d_vector3 = manager.compute_d_vector_from_clip([sample_wav_path, sample_wav_path2]) + d_vector3 = manager.compute_embedding_from_clip([sample_wav_path, sample_wav_path2]) d_vector3 = torch.FloatTensor(d_vector3) assert d_vector3.shape[0] == 256 assert (d_vector - d_vector3).sum() != 0.0 @@ -62,14 +62,14 @@ class SpeakerManagerTest(unittest.TestCase): def test_speakers_file_processing(): manager = SpeakerManager(d_vectors_file_path=d_vectors_file_path) print(manager.num_speakers) - print(manager.d_vector_dim) + print(manager.embedding_dim) print(manager.clip_ids) - d_vector = manager.get_d_vector_by_clip(manager.clip_ids[0]) + d_vector = manager.get_embedding_by_clip(manager.clip_ids[0]) assert len(d_vector) == 256 - d_vectors = manager.get_d_vectors_by_speaker(manager.speaker_names[0]) + d_vectors = manager.get_embeddings_by_name(manager.speaker_names[0]) assert len(d_vectors[0]) == 256 - d_vector1 = manager.get_mean_d_vector(manager.speaker_names[0], num_samples=2, randomize=True) + d_vector1 = manager.get_mean_embedding(manager.speaker_names[0], num_samples=2, randomize=True) assert len(d_vector1) == 256 - d_vector2 = manager.get_mean_d_vector(manager.speaker_names[0], num_samples=2, randomize=False) + d_vector2 = manager.get_mean_embedding(manager.speaker_names[0], num_samples=2, randomize=False) assert len(d_vector2) == 256 assert np.sum(np.array(d_vector1) - np.array(d_vector2)) != 0 diff --git a/tests/data_tests/test_samplers.py b/tests/data_tests/test_samplers.py index 497a3fb5..42f1bfd5 100644 --- a/tests/data_tests/test_samplers.py +++ b/tests/data_tests/test_samplers.py @@ -1,10 +1,13 @@ import functools +import unittest import torch from TTS.config.shared_configs import BaseDatasetConfig +from TTS.encoder.utils.samplers import PerfectBatchSampler from TTS.tts.datasets import load_tts_samples -from TTS.tts.utils.languages import get_language_weighted_sampler +from TTS.tts.utils.languages import get_language_balancer_weights +from TTS.tts.utils.speakers import get_speaker_balancer_weights # Fixing random state to avoid random fails torch.manual_seed(0) @@ -25,34 +28,111 @@ dataset_config_pt = BaseDatasetConfig( language="pt-br", ) -# Adding the EN samples twice to create an unbalanced dataset +# Adding the EN samples twice to create a language unbalanced dataset train_samples, eval_samples = load_tts_samples( [dataset_config_en, dataset_config_en, dataset_config_pt], eval_split=True ) +# gerenate a speaker unbalanced dataset +for i, sample in enumerate(train_samples): + if i < 5: + sample["speaker_name"] = "ljspeech-0" + else: + sample["speaker_name"] = "ljspeech-1" + def is_balanced(lang_1, lang_2): return 0.85 < lang_1 / lang_2 < 1.2 -random_sampler = torch.utils.data.RandomSampler(train_samples) -ids = functools.reduce(lambda a, b: a + b, [list(random_sampler) for i in range(100)]) -en, pt = 0, 0 -for index in ids: - if train_samples[index]["language"] == "en": - en += 1 - else: - pt += 1 +class TestSamplers(unittest.TestCase): + def test_language_random_sampler(self): # pylint: disable=no-self-use + random_sampler = torch.utils.data.RandomSampler(train_samples) + ids = functools.reduce(lambda a, b: a + b, [list(random_sampler) for i in range(100)]) + en, pt = 0, 0 + for index in ids: + if train_samples[index]["language"] == "en": + en += 1 + else: + pt += 1 -assert not is_balanced(en, pt), "Random sampler is supposed to be unbalanced" + assert not is_balanced(en, pt), "Random sampler is supposed to be unbalanced" -weighted_sampler = 
get_language_weighted_sampler(train_samples) -ids = functools.reduce(lambda a, b: a + b, [list(weighted_sampler) for i in range(100)]) -en, pt = 0, 0 -for index in ids: - if train_samples[index]["language"] == "en": - en += 1 - else: - pt += 1 + def test_language_weighted_random_sampler(self): # pylint: disable=no-self-use + weighted_sampler = torch.utils.data.sampler.WeightedRandomSampler( + get_language_balancer_weights(train_samples), len(train_samples) + ) + ids = functools.reduce(lambda a, b: a + b, [list(weighted_sampler) for i in range(100)]) + en, pt = 0, 0 + for index in ids: + if train_samples[index]["language"] == "en": + en += 1 + else: + pt += 1 -assert is_balanced(en, pt), "Weighted sampler is supposed to be balanced" + assert is_balanced(en, pt), "Language Weighted sampler is supposed to be balanced" + + def test_speaker_weighted_random_sampler(self): # pylint: disable=no-self-use + + weighted_sampler = torch.utils.data.sampler.WeightedRandomSampler( + get_speaker_balancer_weights(train_samples), len(train_samples) + ) + ids = functools.reduce(lambda a, b: a + b, [list(weighted_sampler) for i in range(100)]) + spk1, spk2 = 0, 0 + for index in ids: + if train_samples[index]["speaker_name"] == "ljspeech-0": + spk1 += 1 + else: + spk2 += 1 + + assert is_balanced(spk1, spk2), "Speaker Weighted sampler is supposed to be balanced" + + def test_perfect_sampler(self): # pylint: disable=no-self-use + classes = set() + for item in train_samples: + classes.add(item["speaker_name"]) + + sampler = PerfectBatchSampler( + train_samples, + classes, + batch_size=2 * 3, # total batch size + num_classes_in_batch=2, + label_key="speaker_name", + shuffle=False, + drop_last=True, + ) + batchs = functools.reduce(lambda a, b: a + b, [list(sampler) for i in range(100)]) + for batch in batchs: + spk1, spk2 = 0, 0 + # for in each batch + for index in batch: + if train_samples[index]["speaker_name"] == "ljspeech-0": + spk1 += 1 + else: + spk2 += 1 + assert spk1 == spk2, "PerfectBatchSampler is supposed to be perfectly balanced" + + def test_perfect_sampler_shuffle(self): # pylint: disable=no-self-use + classes = set() + for item in train_samples: + classes.add(item["speaker_name"]) + + sampler = PerfectBatchSampler( + train_samples, + classes, + batch_size=2 * 3, # total batch size + num_classes_in_batch=2, + label_key="speaker_name", + shuffle=True, + drop_last=False, + ) + batchs = functools.reduce(lambda a, b: a + b, [list(sampler) for i in range(100)]) + for batch in batchs: + spk1, spk2 = 0, 0 + # for in each batch + for index in batch: + if train_samples[index]["speaker_name"] == "ljspeech-0": + spk1 += 1 + else: + spk2 += 1 + assert spk1 == spk2, "PerfectBatchSampler is supposed to be perfectly balanced" diff --git a/tests/inference_tests/test_synthesizer.py b/tests/inference_tests/test_synthesizer.py index d643cb81..b5350b0f 100644 --- a/tests/inference_tests/test_synthesizer.py +++ b/tests/inference_tests/test_synthesizer.py @@ -20,7 +20,7 @@ class SynthesizerTest(unittest.TestCase): def test_in_out(self): self._create_random_model() tts_root_path = get_tests_output_path() - tts_checkpoint = os.path.join(tts_root_path, "checkpoint_10.pth.tar") + tts_checkpoint = os.path.join(tts_root_path, "checkpoint_10.pth") tts_config = os.path.join(tts_root_path, "dummy_model_config.json") synthesizer = Synthesizer(tts_checkpoint, tts_config, None, None) synthesizer.tts("Better this test works!!") diff --git a/tests/inputs/server_config.json b/tests/inputs/server_config.json index 0cb9b948..f0a92283 100644 
--- a/tests/inputs/server_config.json +++ b/tests/inputs/server_config.json @@ -1,5 +1,5 @@ { - "tts_checkpoint":"checkpoint_10.pth.tar", // tts checkpoint file + "tts_checkpoint":"checkpoint_10.pth", // tts checkpoint file "tts_config":"dummy_model_config.json", // tts config.json file "tts_speakers": null, // json file listing speaker ids. null if no speaker embedding. "wavernn_lib_path": null, // Rootpath to wavernn project folder to be imported. If this is null, model uses GL for speech synthesis. diff --git a/tests/inputs/test_glow_tts.json b/tests/inputs/test_glow_tts.json index 6dd86057..64b09828 100644 --- a/tests/inputs/test_glow_tts.json +++ b/tests/inputs/test_glow_tts.json @@ -66,8 +66,8 @@ "use_mas": false, // use Monotonic Alignment Search if true. Otherwise use pre-computed attention alignments. // TRAINING - "batch_size": 2, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. - "eval_batch_size":1, + "batch_size": 8, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. + "eval_batch_size": 8, "r": 1, // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled. "loss_masking": true, // enable / disable loss masking against the sequence padding. "data_dep_init_iter": 1, diff --git a/tests/inputs/test_speaker_encoder_config.json b/tests/inputs/test_speaker_encoder_config.json index 09a2f6a4..bfcc17ab 100644 --- a/tests/inputs/test_speaker_encoder_config.json +++ b/tests/inputs/test_speaker_encoder_config.json @@ -36,8 +36,8 @@ "warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr" "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. "steps_plot_stats": 10, // number of steps to plot embeddings. - "num_speakers_in_batch": 64, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. - "num_utters_per_speaker": 10, // + "num_classes_in_batch": 64, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. + "num_utter_per_class": 10, // "num_loader_workers": 8, // number of training data loader processes. Don't set it too big. 4-8 are good values. "wd": 0.000001, // Weight decay weight. "checkpoint": true, // If true, it saves checkpoints per "save_step" diff --git a/tests/inputs/test_tacotron2_config.json b/tests/inputs/test_tacotron2_config.json index 6c82891d..69b23560 100644 --- a/tests/inputs/test_tacotron2_config.json +++ b/tests/inputs/test_tacotron2_config.json @@ -61,8 +61,8 @@ "reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers. // TRAINING - "batch_size": 1, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. - "eval_batch_size":1, + "batch_size": 8, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. + "eval_batch_size": 8, "r": 7, // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled. "gradual_training": [[0, 7, 4], [1, 5, 2]], //set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled. 
For Tacotron, you might need to reduce the 'batch_size' as you proceeed. "loss_masking": true, // enable / disable loss masking against the sequence padding. diff --git a/tests/inputs/test_tacotron_config.json b/tests/inputs/test_tacotron_config.json index b60ed35e..90e07fc7 100644 --- a/tests/inputs/test_tacotron_config.json +++ b/tests/inputs/test_tacotron_config.json @@ -61,8 +61,8 @@ "reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers. // TRAINING - "batch_size": 1, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. - "eval_batch_size":1, + "batch_size": 8, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. + "eval_batch_size": 8, "r": 7, // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled. "gradual_training": [[0, 7, 4], [1, 5, 2]], //set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled. For Tacotron, you might need to reduce the 'batch_size' as you proceeed. "loss_masking": true, // enable / disable loss masking against the sequence padding. diff --git a/tests/tts_tests/test_align_tts_train.py b/tests/tts_tests/test_align_tts_train.py index 85dfbbcb..75c5643c 100644 --- a/tests/tts_tests/test_align_tts_train.py +++ b/tests/tts_tests/test_align_tts_train.py @@ -1,4 +1,5 @@ import glob +import json import os import shutil @@ -42,7 +43,7 @@ command_train = ( "--coqpit.datasets.0.meta_file_train metadata.csv " "--coqpit.datasets.0.meta_file_val metadata.csv " "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.test_delay_epochs -1" + "--coqpit.test_delay_epochs 0 " ) run_cli(command_train) @@ -54,6 +55,14 @@ continue_config_path = os.path.join(continue_path, "config.json") continue_restore_path, _ = get_last_checkpoint(continue_path) out_wav_path = os.path.join(get_tests_output_path(), "output.wav") +# Check integrity of the config +with open(continue_config_path, "r", encoding="utf-8") as f: + config_loaded = json.load(f) +assert config_loaded["characters"] is not None +assert config_loaded["output_path"] in continue_path +assert config_loaded["test_delay_epochs"] == 0 + +# Load the model and run inference inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" run_cli(inference_command) diff --git a/tests/tts_tests/test_fast_pitch_speaker_emb_train.py b/tests/tts_tests/test_fast_pitch_speaker_emb_train.py index 37faf449..9553d745 100644 --- a/tests/tts_tests/test_fast_pitch_speaker_emb_train.py +++ b/tests/tts_tests/test_fast_pitch_speaker_emb_train.py @@ -1,4 +1,5 @@ import glob +import json import os import shutil @@ -74,6 +75,14 @@ out_wav_path = os.path.join(get_tests_output_path(), "output.wav") speaker_id = "ljspeech-1" continue_speakers_path = os.path.join(continue_path, "speakers.json") +# Check integrity of the config +with open(continue_config_path, "r", encoding="utf-8") as f: + config_loaded = json.load(f) +assert config_loaded["characters"] is not None +assert config_loaded["output_path"] in continue_path +assert config_loaded["test_delay_epochs"] == 0 + +# Load the model and run inference inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" run_cli(inference_command) diff --git a/tests/tts_tests/test_fast_pitch_train.py b/tests/tts_tests/test_fast_pitch_train.py index d2d78af4..134cd4ba 100644 --- a/tests/tts_tests/test_fast_pitch_train.py +++ b/tests/tts_tests/test_fast_pitch_train.py @@ -1,4 +1,5 @@ import glob +import json import os import shutil @@ -73,6 +74,14 @@ continue_config_path = os.path.join(continue_path, "config.json") continue_restore_path, _ = get_last_checkpoint(continue_path) out_wav_path = os.path.join(get_tests_output_path(), "output.wav") +# Check integrity of the config +with open(continue_config_path, "r", encoding="utf-8") as f: + config_loaded = json.load(f) +assert config_loaded["characters"] is not None +assert config_loaded["output_path"] in continue_path +assert config_loaded["test_delay_epochs"] == 0 + +# Load the model and run inference inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" run_cli(inference_command) diff --git a/tests/tts_tests/test_glow_tts.py b/tests/tts_tests/test_glow_tts.py index 2783e4bd..2a723f10 100644 --- a/tests/tts_tests/test_glow_tts.py +++ b/tests/tts_tests/test_glow_tts.py @@ -86,7 +86,7 @@ class TestGlowTTS(unittest.TestCase): model = GlowTTS(config) model.speaker_manager = speaker_manager model.init_multispeaker(config) - self.assertEqual(model.c_in_channels, speaker_manager.d_vector_dim) + self.assertEqual(model.c_in_channels, speaker_manager.embedding_dim) self.assertEqual(model.num_speakers, speaker_manager.num_speakers) def test_unlock_act_norm_layers(self): diff --git a/tests/tts_tests/test_glow_tts_d-vectors_train.py b/tests/tts_tests/test_glow_tts_d-vectors_train.py index 14f9e4d2..3a9c8fcc 100644 --- a/tests/tts_tests/test_glow_tts_d-vectors_train.py +++ b/tests/tts_tests/test_glow_tts_d-vectors_train.py @@ -1,4 +1,5 @@ import glob +import json import os import shutil @@ -61,6 +62,14 @@ out_wav_path = os.path.join(get_tests_output_path(), "output.wav") speaker_id = "ljspeech-1" continue_speakers_path = config.d_vector_file +# Check integrity of the config +with open(continue_config_path, "r", encoding="utf-8") as f: + config_loaded = json.load(f) +assert config_loaded["characters"] is not None +assert config_loaded["output_path"] in continue_path +assert config_loaded["test_delay_epochs"] == 0 + +# Load the model and run inference inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" run_cli(inference_command) diff --git a/tests/tts_tests/test_glow_tts_speaker_emb_train.py b/tests/tts_tests/test_glow_tts_speaker_emb_train.py index c327332e..322b506e 100644 --- a/tests/tts_tests/test_glow_tts_speaker_emb_train.py +++ b/tests/tts_tests/test_glow_tts_speaker_emb_train.py @@ -1,4 +1,5 @@ import glob +import json import os import shutil @@ -58,6 +59,14 @@ out_wav_path = os.path.join(get_tests_output_path(), "output.wav") speaker_id = "ljspeech-1" continue_speakers_path = os.path.join(continue_path, "speakers.json") +# Check integrity of the config +with open(continue_config_path, "r", encoding="utf-8") as f: + config_loaded = json.load(f) +assert config_loaded["characters"] is not None +assert config_loaded["output_path"] in continue_path +assert config_loaded["test_delay_epochs"] == 0 + +# Load the model and run inference inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" run_cli(inference_command) diff --git a/tests/tts_tests/test_glow_tts_train.py b/tests/tts_tests/test_glow_tts_train.py index b0acf004..cf9a04f4 100644 --- a/tests/tts_tests/test_glow_tts_train.py +++ b/tests/tts_tests/test_glow_tts_train.py @@ -1,4 +1,5 @@ import glob +import json import os import shutil @@ -55,6 +56,14 @@ continue_config_path = os.path.join(continue_path, "config.json") continue_restore_path, _ = get_last_checkpoint(continue_path) out_wav_path = os.path.join(get_tests_output_path(), "output.wav") +# Check integrity of the config +with open(continue_config_path, "r", encoding="utf-8") as f: + config_loaded = json.load(f) +assert config_loaded["characters"] is not None +assert config_loaded["output_path"] in continue_path +assert config_loaded["test_delay_epochs"] == 0 + +# Load the model and run inference inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" run_cli(inference_command) diff --git a/tests/tts_tests/test_speedy_speech_train.py b/tests/tts_tests/test_speedy_speech_train.py index 9a26d253..c4adcee3 100644 --- a/tests/tts_tests/test_speedy_speech_train.py +++ b/tests/tts_tests/test_speedy_speech_train.py @@ -1,4 +1,5 @@ import glob +import json import os import shutil @@ -54,6 +55,14 @@ continue_config_path = os.path.join(continue_path, "config.json") continue_restore_path, _ = get_last_checkpoint(continue_path) out_wav_path = os.path.join(get_tests_output_path(), "output.wav") +# Check integrity of the config +with open(continue_config_path, "r", encoding="utf-8") as f: + config_loaded = json.load(f) +assert config_loaded["characters"] is not None +assert config_loaded["output_path"] in continue_path +assert config_loaded["test_delay_epochs"] == 0 + +# Load the model and run inference inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example for it.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" run_cli(inference_command) diff --git a/tests/tts_tests/test_tacotron2_d-vectors_train.py b/tests/tts_tests/test_tacotron2_d-vectors_train.py index 6b003f2c..0d02fa98 100644 --- a/tests/tts_tests/test_tacotron2_d-vectors_train.py +++ b/tests/tts_tests/test_tacotron2_d-vectors_train.py @@ -1,4 +1,5 @@ import glob +import json import os import shutil @@ -61,6 +62,14 @@ out_wav_path = os.path.join(get_tests_output_path(), "output.wav") speaker_id = "ljspeech-1" continue_speakers_path = config.d_vector_file +# Check integrity of the config +with open(continue_config_path, "r", encoding="utf-8") as f: + config_loaded = json.load(f) +assert config_loaded["characters"] is not None +assert config_loaded["output_path"] in continue_path +assert config_loaded["test_delay_epochs"] == 0 + +# Load the model and run inference inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" run_cli(inference_command) diff --git a/tests/tts_tests/test_tacotron2_speaker_emb_train.py b/tests/tts_tests/test_tacotron2_speaker_emb_train.py index b9f4de0b..2e812d90 100644 --- a/tests/tts_tests/test_tacotron2_speaker_emb_train.py +++ b/tests/tts_tests/test_tacotron2_speaker_emb_train.py @@ -1,4 +1,5 @@ import glob +import json import os import shutil @@ -59,6 +60,14 @@ out_wav_path = os.path.join(get_tests_output_path(), "output.wav") speaker_id = "ljspeech-1" continue_speakers_path = os.path.join(continue_path, "speakers.json") +# Check integrity of the config +with open(continue_config_path, "r", encoding="utf-8") as f: + config_loaded = json.load(f) +assert config_loaded["characters"] is not None +assert config_loaded["output_path"] in continue_path +assert config_loaded["test_delay_epochs"] == 0 + +# Load the model and run inference inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" run_cli(inference_command) diff --git a/tests/tts_tests/test_tacotron2_train.py b/tests/tts_tests/test_tacotron2_train.py index 8c30d9f9..d1941022 100644 --- a/tests/tts_tests/test_tacotron2_train.py +++ b/tests/tts_tests/test_tacotron2_train.py @@ -1,4 +1,5 @@ import glob +import json import os import shutil @@ -54,6 +55,14 @@ continue_config_path = os.path.join(continue_path, "config.json") continue_restore_path, _ = get_last_checkpoint(continue_path) out_wav_path = os.path.join(get_tests_output_path(), "output.wav") +# Check integrity of the config +with open(continue_config_path, "r", encoding="utf-8") as f: + config_loaded = json.load(f) +assert config_loaded["characters"] is not None +assert config_loaded["output_path"] in continue_path +assert config_loaded["test_delay_epochs"] == 0 + +# Load the model and run inference inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" run_cli(inference_command) diff --git a/tests/tts_tests/test_tacotron2_train_fsspec_path.py b/tests/tts_tests/test_tacotron2_train_fsspec_path.py deleted file mode 100644 index 5d14a983..00000000 --- a/tests/tts_tests/test_tacotron2_train_fsspec_path.py +++ /dev/null @@ -1,55 +0,0 @@ -import glob -import os -import shutil - -from tests import get_device_id, get_tests_output_path, run_cli -from TTS.tts.configs.tacotron2_config import Tacotron2Config - -config_path = os.path.join(get_tests_output_path(), "test_model_config.json") -output_path = os.path.join(get_tests_output_path(), "train_outputs") - -config = Tacotron2Config( - r=5, - batch_size=8, - eval_batch_size=8, - num_loader_workers=0, - num_eval_loader_workers=0, - text_cleaner="english_cleaners", - use_phonemes=False, - phoneme_language="en-us", - phoneme_cache_path=os.path.join(get_tests_output_path(), "train_outputs/phoneme_cache/"), - run_eval=True, - test_delay_epochs=-1, - epochs=1, - print_step=1, - test_sentences=[ - "Be a voice, not an echo.", - ], - print_eval=True, - max_decoder_steps=50, -) -config.audio.do_trim_silence = True -config.audio.trim_db = 60 -config.save_json(config_path) - -# train the model for one epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path file://{config_path} " - f"--coqpit.output_path file://{output_path} " - "--coqpit.datasets.0.name ljspeech " - "--coqpit.datasets.0.meta_file_train metadata.csv " - "--coqpit.datasets.0.meta_file_val metadata.csv " - "--coqpit.datasets.0.path tests/data/ljspeech " - "--coqpit.test_delay_epochs 0 " -) -run_cli(command_train) - -# Find latest folder -continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime) - -# restore the model and continue training for one more epoch -command_train = ( - f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path file://{continue_path} " -) -run_cli(command_train) -shutil.rmtree(continue_path) diff --git a/tests/tts_tests/test_vits.py b/tests/tts_tests/test_vits.py index 384234e5..de683c81 100644 --- a/tests/tts_tests/test_vits.py +++ b/tests/tts_tests/test_vits.py @@ -7,7 +7,7 @@ from trainer.logging.tensorboard_logger import TensorboardLogger from tests import assertHasAttr, assertHasNotAttr, get_tests_data_path, get_tests_input_path, get_tests_output_path from TTS.config import load_config -from TTS.speaker_encoder.utils.generic_utils import setup_speaker_encoder_model +from TTS.encoder.utils.generic_utils import setup_encoder_model from TTS.tts.configs.vits_config import VitsConfig from TTS.tts.models.vits import Vits, VitsArgs, amp_to_db, db_to_amp, load_audio, spec_to_mel, wav_to_mel, wav_to_spec from TTS.tts.utils.speakers import SpeakerManager @@ -79,25 +79,25 @@ class TestVits(unittest.TestCase): model = Vits(args) self.assertEqual(model.language_manager, None) self.assertEqual(model.embedded_language_dim, 0) - self.assertEqual(model.emb_l, None) + assertHasNotAttr(self, model, "emb_l") args = VitsArgs(language_ids_file=LANG_FILE) model = Vits(args) self.assertNotEqual(model.language_manager, None) self.assertEqual(model.embedded_language_dim, 0) - self.assertEqual(model.emb_l, None) + assertHasNotAttr(self, model, "emb_l") args = VitsArgs(language_ids_file=LANG_FILE, use_language_embedding=True) model = Vits(args) self.assertNotEqual(model.language_manager, None) 
self.assertEqual(model.embedded_language_dim, args.embedded_language_dim) - self.assertNotEqual(model.emb_l, None) + assertHasAttr(self, model, "emb_l") args = VitsArgs(language_ids_file=LANG_FILE, use_language_embedding=True, embedded_language_dim=102) model = Vits(args) self.assertNotEqual(model.language_manager, None) self.assertEqual(model.embedded_language_dim, args.embedded_language_dim) - self.assertNotEqual(model.emb_l, None) + assertHasAttr(self, model, "emb_l") def test_get_aux_input(self): aux_input = {"speaker_ids": None, "style_wav": None, "d_vectors": None, "language_ids": None} @@ -242,9 +242,9 @@ class TestVits(unittest.TestCase): speaker_encoder_config = load_config(SPEAKER_ENCODER_CONFIG) speaker_encoder_config.model_params["use_torch_spec"] = True - speaker_encoder = setup_speaker_encoder_model(speaker_encoder_config).to(device) + speaker_encoder = setup_encoder_model(speaker_encoder_config).to(device) speaker_manager = SpeakerManager() - speaker_manager.speaker_encoder = speaker_encoder + speaker_manager.encoder = speaker_encoder args = VitsArgs( language_ids_file=LANG_FILE, diff --git a/tests/tts_tests/test_vits_multilingual_speaker_emb_train.py b/tests/tts_tests/test_vits_multilingual_speaker_emb_train.py index 0c7672d7..683bb0a7 100644 --- a/tests/tts_tests/test_vits_multilingual_speaker_emb_train.py +++ b/tests/tts_tests/test_vits_multilingual_speaker_emb_train.py @@ -1,4 +1,5 @@ import glob +import json import os import shutil @@ -92,6 +93,14 @@ languae_id = "en" continue_speakers_path = os.path.join(continue_path, "speakers.json") continue_languages_path = os.path.join(continue_path, "language_ids.json") +# Check integrity of the config +with open(continue_config_path, "r", encoding="utf-8") as f: + config_loaded = json.load(f) +assert config_loaded["characters"] is not None +assert config_loaded["output_path"] in continue_path +assert config_loaded["test_delay_epochs"] == 0 + +# Load the model and run inference inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --language_ids_file_path {continue_languages_path} --language_idx {languae_id} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" run_cli(inference_command) diff --git a/tests/tts_tests/test_vits_multilingual_train-d_vectors.py b/tests/tts_tests/test_vits_multilingual_train-d_vectors.py index a8e2020e..e4a82cdd 100644 --- a/tests/tts_tests/test_vits_multilingual_train-d_vectors.py +++ b/tests/tts_tests/test_vits_multilingual_train-d_vectors.py @@ -1,4 +1,5 @@ import glob +import json import os import shutil @@ -45,7 +46,7 @@ config = VitsConfig( ["Be a voice, not an echo.", "ljspeech-0", None, "en"], ["Be a voice, not an echo.", "ljspeech-1", None, "pt-br"], ], - datasets=[dataset_config_en, dataset_config_pt], + datasets=[dataset_config_en, dataset_config_en, dataset_config_en, dataset_config_pt], ) # set audio config config.audio.do_trim_silence = True @@ -71,8 +72,11 @@ config.d_vector_dim = 256 config.model_args.use_sdp = True config.use_sdp = True -# deactivate language sampler -config.use_language_weighted_sampler = False +# activate language and speaker samplers +config.use_language_weighted_sampler = True +config.language_weighted_sampler_alpha = 10 +config.use_speaker_weighted_sampler = True +config.speaker_weighted_sampler_alpha = 5 config.save_json(config_path) @@ -96,6 +100,14 @@ languae_id = "en" continue_speakers_path = config.d_vector_file continue_languages_path = os.path.join(continue_path, "language_ids.json") +# Check integrity of the config +with open(continue_config_path, "r", encoding="utf-8") as f: + config_loaded = json.load(f) +assert config_loaded["characters"] is not None +assert config_loaded["output_path"] in continue_path +assert config_loaded["test_delay_epochs"] == 0 + +# Load the model and run inference inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --language_ids_file_path {continue_languages_path} --language_idx {languae_id} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" run_cli(inference_command) diff --git a/tests/tts_tests/test_vits_speaker_emb_train.py b/tests/tts_tests/test_vits_speaker_emb_train.py index c928cee4..48597241 100644 --- a/tests/tts_tests/test_vits_speaker_emb_train.py +++ b/tests/tts_tests/test_vits_speaker_emb_train.py @@ -1,4 +1,5 @@ import glob +import json import os import shutil @@ -65,6 +66,14 @@ out_wav_path = os.path.join(get_tests_output_path(), "output.wav") speaker_id = "ljspeech-1" continue_speakers_path = os.path.join(continue_path, "speakers.json") +# Check integrity of the config +with open(continue_config_path, "r", encoding="utf-8") as f: + config_loaded = json.load(f) +assert config_loaded["characters"] is not None +assert config_loaded["output_path"] in continue_path +assert config_loaded["test_delay_epochs"] == 0 + +# Load the model and run inference inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' 
--speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" run_cli(inference_command) diff --git a/tests/tts_tests/test_vits_train.py b/tests/tts_tests/test_vits_train.py index 003f99a8..64ff63f3 100644 --- a/tests/tts_tests/test_vits_train.py +++ b/tests/tts_tests/test_vits_train.py @@ -1,4 +1,5 @@ import glob +import json import os import shutil @@ -54,6 +55,14 @@ continue_config_path = os.path.join(continue_path, "config.json") continue_restore_path, _ = get_last_checkpoint(continue_path) out_wav_path = os.path.join(get_tests_output_path(), "output.wav") +# Check integrity of the config +with open(continue_config_path, "r", encoding="utf-8") as f: + config_loaded = json.load(f) +assert config_loaded["characters"] is not None +assert config_loaded["output_path"] in continue_path +assert config_loaded["test_delay_epochs"] == 0 + +# Load the model and run inference inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}" run_cli(inference_command) diff --git a/tests/zoo_tests/test_models.py b/tests/zoo_tests/test_models.py index 63d9e7ca..e614ce74 100644 --- a/tests/zoo_tests/test_models.py +++ b/tests/zoo_tests/test_models.py @@ -38,7 +38,7 @@ def test_run_all_models(): language_manager = LanguageManager(language_ids_file_path=language_files[0]) language_id = language_manager.language_names[0] - speaker_id = list(speaker_manager.speaker_ids.keys())[0] + speaker_id = list(speaker_manager.ids.keys())[0] run_cli( f"tts --model_name {model_name} " f'--text "This is an example." --out_path "{output_path}" --speaker_idx "{speaker_id}" --language_idx "{language_id}" '