Merge pull request #1942 from coqui-ai/dev

v0.9.0
Commit 56ba616a03 by Eren Gölge on 2022-11-16 16:50:57 +01:00, committed by GitHub
165 changed files with 101831 additions and 443 deletions


@@ -1,2 +1,9 @@
 .git/
 Dockerfile
+build/
+dist/
+TTS.egg-info/
+tests/outputs/*
+tests/train_outputs/*
+__pycache__/
+*.pyc


@@ -21,9 +21,9 @@ jobs:
       python-version: [3.7, 3.8, 3.9, "3.10"]
       experimental: [false]
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
       - name: Set up Python ${{ matrix.python-version }}
-        uses: coqui-ai/setup-python@pip-cache-key-py-ver
+        uses: actions/setup-python@v4
         with:
           python-version: ${{ matrix.python-version }}
           architecture: x64


@@ -21,9 +21,9 @@ jobs:
       python-version: [3.7, 3.8, 3.9, "3.10"]
       experimental: [false]
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
       - name: Set up Python ${{ matrix.python-version }}
-        uses: coqui-ai/setup-python@pip-cache-key-py-ver
+        uses: actions/setup-python@v4
         with:
           python-version: ${{ matrix.python-version }}
           architecture: x64


@@ -15,8 +15,8 @@ jobs:
       matrix:
         arch: ["amd64"]
         base:
-          - "nvcr.io/nvidia/pytorch:22.03-py3" # GPU enabled
-          - "ubuntu:20.04" # CPU only
+          - "nvidia/cuda:11.8.0-base-ubuntu22.04" # GPU enabled
+          - "python:3.10.8-slim" # CPU only
     steps:
       - uses: actions/checkout@v2
       - name: Log in to the Container registry
@@ -32,7 +32,7 @@ jobs:
           base="ghcr.io/coqui-ai/tts"
           tags="" # PR build
-          if [[ ${{ matrix.base }} = "ubuntu:20.04" ]]; then
+          if [[ ${{ matrix.base }} = "python:3.10.8-slim" ]]; then
             base="ghcr.io/coqui-ai/tts-cpu"
           fi


@@ -21,9 +21,9 @@ jobs:
       python-version: [3.7, 3.8, 3.9, "3.10"]
       experimental: [false]
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
       - name: Set up Python ${{ matrix.python-version }}
-        uses: coqui-ai/setup-python@pip-cache-key-py-ver
+        uses: actions/setup-python@v4
         with:
           python-version: ${{ matrix.python-version }}
           architecture: x64


@@ -21,9 +21,9 @@ jobs:
       python-version: [3.9]
       experimental: [false]
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
       - name: Set up Python ${{ matrix.python-version }}
-        uses: coqui-ai/setup-python@pip-cache-key-py-ver
+        uses: actions/setup-python@v4
         with:
           python-version: ${{ matrix.python-version }}
           architecture: x64


@@ -21,9 +21,9 @@ jobs:
       python-version: [3.7, 3.8, 3.9, "3.10"]
       experimental: [false]
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
       - name: Set up Python ${{ matrix.python-version }}
-        uses: coqui-ai/setup-python@pip-cache-key-py-ver
+        uses: actions/setup-python@v4
         with:
           python-version: ${{ matrix.python-version }}
           architecture: x64


@@ -21,9 +21,9 @@ jobs:
       python-version: [3.7, 3.8, 3.9, "3.10"]
       experimental: [false]
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
       - name: Set up Python ${{ matrix.python-version }}
-        uses: coqui-ai/setup-python@pip-cache-key-py-ver
+        uses: actions/setup-python@v4
         with:
           python-version: ${{ matrix.python-version }}
           architecture: x64


@@ -21,9 +21,9 @@ jobs:
       python-version: [3.7, 3.8, 3.9, "3.10"]
       experimental: [false]
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
      - name: Set up Python ${{ matrix.python-version }}
-        uses: coqui-ai/setup-python@pip-cache-key-py-ver
+        uses: actions/setup-python@v4
         with:
           python-version: ${{ matrix.python-version }}
           architecture: x64

.github/workflows/zoo_tests0.yml (new file)

@@ -0,0 +1,52 @@
name: zoo-tests-0
on:
push:
branches:
- main
pull_request:
types: [opened, synchronize, reopened]
jobs:
check_skip:
runs-on: ubuntu-latest
if: "! contains(github.event.head_commit.message, '[ci skip]')"
steps:
- run: echo "${{ github.event.head_commit.message }}"
test:
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
python-version: [3.7, 3.8, 3.9, "3.10"]
experimental: [false]
steps:
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
architecture: x64
cache: 'pip'
cache-dependency-path: 'requirements*'
- name: check OS
run: cat /etc/os-release
- name: Install dependencies
run: |
sudo apt-get update
sudo apt-get install -y git make gcc
sudo apt-get install espeak espeak-ng
make system-deps
- name: Install/upgrade Python setup deps
run: python3 -m pip install --upgrade pip setuptools wheel
- name: Replace scarf urls
run: |
sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json
- name: Install TTS
run: |
python3 -m pip install .[all]
python3 setup.py egg_info
- name: Unit tests
run: |
nose2 -F -v -B TTS tests.zoo_tests.test_models.test_models_offset_0_step_3
nose2 -F -v -B TTS tests.zoo_tests.test_models.test_voice_conversion


@@ -1,4 +1,4 @@
-name: zoo-tests
+name: zoo-tests-1
 on:
   push:
@@ -21,9 +21,9 @@ jobs:
       python-version: [3.7, 3.8, 3.9, "3.10"]
       experimental: [false]
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
       - name: Set up Python ${{ matrix.python-version }}
-        uses: coqui-ai/setup-python@pip-cache-key-py-ver
+        uses: actions/setup-python@v4
         with:
           python-version: ${{ matrix.python-version }}
           architecture: x64
@@ -47,4 +47,4 @@ jobs:
           python3 -m pip install .[all]
           python3 setup.py egg_info
       - name: Unit tests
-        run: make test_zoo
+        run: nose2 -F -v -B --with-coverage --coverage TTS tests.zoo_tests.test_models.test_models_offset_1_step_3

.github/workflows/zoo_tests2.yml (new file)

@@ -0,0 +1,50 @@
name: zoo-tests-2
on:
push:
branches:
- main
pull_request:
types: [opened, synchronize, reopened]
jobs:
check_skip:
runs-on: ubuntu-latest
if: "! contains(github.event.head_commit.message, '[ci skip]')"
steps:
- run: echo "${{ github.event.head_commit.message }}"
test:
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
python-version: [3.7, 3.8, 3.9, "3.10"]
experimental: [false]
steps:
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
architecture: x64
cache: 'pip'
cache-dependency-path: 'requirements*'
- name: check OS
run: cat /etc/os-release
- name: Install dependencies
run: |
sudo apt-get update
sudo apt-get install -y git make gcc
sudo apt-get install espeak espeak-ng
make system-deps
- name: Install/upgrade Python setup deps
run: python3 -m pip install --upgrade pip setuptools wheel
- name: Replace scarf urls
run: |
sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json
- name: Install TTS
run: |
python3 -m pip install .[all]
python3 setup.py egg_info
- name: Unit tests
run: nose2 -F -v -B --with-coverage --coverage TTS tests.zoo_tests.test_models.test_models_offset_2_step_3
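
For orientation, the three zoo-test workflows shard the released models across jobs by an offset and a step of 3 (test_models_offset_0_step_3, _1_, _2_). A minimal sketch of that kind of partitioning is below; the `shard()` helper and the shortened model list are illustrative, not the repository's actual test code.

```python
# Illustrative sketch of an offset/step shard over zoo model names.
# The names are entries added to TTS/.models.json in this release; the
# shard() helper is an assumption, not the project's test implementation.
MODELS = [
    "tts_models/bg/cv/vits",
    "tts_models/cs/cv/vits",
    "tts_models/da/cv/vits",
    "tts_models/et/cv/vits",
    "tts_models/ga/cv/vits",
    "tts_models/uk/mai/vits",
]

def shard(models, offset, step=3):
    """Every `step`-th model starting at `offset`, matching the 0/1/2 split above."""
    return models[offset::step]

for offset in range(3):
    print(f"offset {offset}:", shard(MODELS, offset))
```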


@@ -1,20 +1,12 @@
-ARG BASE=nvcr.io/nvidia/pytorch:22.03-py3
+ARG BASE=nvidia/cuda:11.8.0-base-ubuntu22.04
 FROM ${BASE}
-RUN apt-get update && apt-get install -y --no-install-recommends gcc g++ make python3 python3-dev python3-pip python3-venv python3-wheel espeak espeak-ng libsndfile1-dev && rm -rf /var/lib/apt/lists/*
-RUN pip install llvmlite --ignore-installed
-# Create and activate virtual env
-ENV VIRTUAL_ENV=/venv
-RUN python3 -m venv $VIRTUAL_ENV
-ENV PATH="$VIRTUAL_ENV/bin:$PATH"
-RUN pip install -U pip setuptools wheel
+RUN apt-get update && apt-get upgrade -y
+RUN apt-get install -y --no-install-recommends gcc g++ make python3 python3-dev python3-pip python3-venv python3-wheel espeak-ng libsndfile1-dev && rm -rf /var/lib/apt/lists/*
+RUN pip3 install llvmlite --ignore-installed
 WORKDIR /root
-COPY requirements.txt /root
-COPY requirements.dev.txt /root
-COPY requirements.notebooks.txt /root
-RUN ["/bin/bash", "-c", "pip install -r <(cat requirements.txt requirements.dev.txt requirements.notebooks.txt)"]
 COPY . /root
+RUN pip3 install torch torchaudio --extra-index-url https://download.pytorch.org/whl/cu118
 RUN make install
 ENTRYPOINT ["tts"]
 CMD ["--help"]


@@ -1,9 +1,16 @@
-# <img src="https://raw.githubusercontent.com/coqui-ai/TTS/main/images/coqui-log-green-TTS.png" height="56"/>
+<img src="https://raw.githubusercontent.com/coqui-ai/TTS/main/images/coqui-log-green-TTS.png" height="56"/>
+----
+### 📣 Clone your voice with a single click on [🐸Coqui.ai](https://app.coqui.ai/auth/signin)
+### 📣 🐸Coqui Studio is launching soon!! Join our [waiting list](https://coqui.ai/)!!
+----
 🐸TTS is a library for advanced Text-to-Speech generation. It's built on the latest research, was designed to achieve the best trade-off among ease-of-training, speed and quality.
 🐸TTS comes with pretrained models, tools for measuring dataset quality and already used in **20+ languages** for products and research projects.
-[![Gitter](https://badges.gitter.im/coqui-ai/TTS.svg)](https://gitter.im/coqui-ai/TTS?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge)
+[![Dicord](https://img.shields.io/discord/1037326658807533628?color=%239B59B6&label=chat%20on%20discord)](https://discord.gg/5eXr5seRrv)
 [![License](<https://img.shields.io/badge/License-MPL%202.0-brightgreen.svg>)](https://opensource.org/licenses/MPL-2.0)
 [![PyPI version](https://badge.fury.io/py/TTS.svg)](https://badge.fury.io/py/TTS)
 [![Covenant](https://camo.githubusercontent.com/7d620efaa3eac1c5b060ece5d6aacfcc8b81a74a04d05cd0398689c01c4463bb/68747470733a2f2f696d672e736869656c64732e696f2f62616467652f436f6e7472696275746f72253230436f76656e616e742d76322e3025323061646f707465642d6666363962342e737667)](https://github.com/coqui-ai/TTS/blob/master/CODE_OF_CONDUCT.md)
@@ -36,12 +43,12 @@ Please use our dedicated channels for questions and discussion. Help is much mor
 | ------------------------------- | --------------------------------------- |
 | 🚨 **Bug Reports** | [GitHub Issue Tracker] |
 | 🎁 **Feature Requests & Ideas** | [GitHub Issue Tracker] |
-| 👩‍💻 **Usage Questions** | [Github Discussions] |
+| 👩‍💻 **Usage Questions** | [GitHub Discussions] |
-| 🗯 **General Discussion** | [Github Discussions] or [Gitter Room] |
+| 🗯 **General Discussion** | [GitHub Discussions] or [Discord] |
 [github issue tracker]: https://github.com/coqui-ai/tts/issues
 [github discussions]: https://github.com/coqui-ai/TTS/discussions
-[gitter room]: https://gitter.im/coqui-ai/TTS?utm_source=share-link&utm_medium=link&utm_campaign=share-link
+[discord]: https://discord.gg/5eXr5seRrv
 [Tutorials and Examples]: https://github.com/coqui-ai/TTS/wiki/TTS-Notebooks-and-Tutorials
@@ -75,7 +82,7 @@ Underlined "TTS*" and "Judy*" are 🐸TTS models
 - Modular (but not too much) code base enabling easy implementation of new ideas.
 ## Implemented Models
-### Text-to-Spectrogram
+### Spectrogram models
 - Tacotron: [paper](https://arxiv.org/abs/1703.10135)
 - Tacotron2: [paper](https://arxiv.org/abs/1712.05884)
 - Glow-TTS: [paper](https://arxiv.org/abs/2005.11129)
@@ -83,9 +90,12 @@ Underlined "TTS*" and "Judy*" are 🐸TTS models
 - Align-TTS: [paper](https://arxiv.org/abs/2003.01950)
 - FastPitch: [paper](https://arxiv.org/pdf/2006.06873.pdf)
 - FastSpeech: [paper](https://arxiv.org/abs/1905.09263)
+- SC-GlowTTS: [paper](https://arxiv.org/abs/2104.05557)
+- Capacitron: [paper](https://arxiv.org/abs/1906.03402)
 ### End-to-End Models
 - VITS: [paper](https://arxiv.org/pdf/2106.06103)
+- YourTTS: [paper](https://arxiv.org/abs/2112.02418)
 ### Attention Methods
 - Guided Attention: [paper](https://arxiv.org/abs/1710.08969)
@@ -136,6 +146,21 @@ $ make install
 If you are on Windows, 👑@GuyPaddock wrote installation instructions [here](https://stackoverflow.com/questions/66726331/how-can-i-run-mozilla-tts-coqui-tts-training-with-cuda-on-a-windows-system).
+## Docker Image
+You can also try TTS without install with the docker image.
+Simply run the following command and you will be able to run TTS without installing it.
+```bash
+docker run --rm -it -p 5002:5002 --entrypoint /bin/bash ghcr.io/coqui-ai/tts-cpu
+python3 TTS/server/server.py --list_models #To get the list of available models
+python3 TTS/server/server.py --model_name tts_models/en/vctk/vits # To start a server
+```
+You can then enjoy the TTS server [here](http://[::1]:5002/)
+More details about the docker images (like GPU support) can be found [here](https://tts.readthedocs.io/en/latest/docker_images.html)
 ## Use TTS
 ### Single Speaker Models
@@ -208,7 +233,7 @@ If you are on Windows, 👑@GuyPaddock wrote installation instructions [here](ht
 - Run your own TTS and Vocoder models:
 ```
-$ tts --text "Text for TTS" --model_path path/to/config.json --config_path path/to/model.pth --out_path output/path/speech.wav
+$ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav
     --vocoder_path path/to/vocoder.pth --vocoder_config_path path/to/vocoder_config.json
 ```
@@ -229,7 +254,7 @@ If you are on Windows, 👑@GuyPaddock wrote installation instructions [here](ht
 - Run your own multi-speaker TTS model:
 ```
-$ tts --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/config.json --config_path path/to/model.pth --speakers_file_path path/to/speaker.json --speaker_idx <speaker_id>
+$ tts --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/model.pth --config_path path/to/config.json --speakers_file_path path/to/speaker.json --speaker_idx <speaker_id>
 ```
 ## Directory Structure
@@ -239,8 +264,6 @@ If you are on Windows, 👑@GuyPaddock wrote installation instructions [here](ht
 |- TTS
 |- bin/ (folder for all the executables.)
 |- train*.py (train your target model.)
-|- distribute.py (train your TTS model using Multiple GPUs.)
-|- compute_statistics.py (compute dataset statistics for normalization.)
 |- ...
 |- tts/ (text to speech models)
 |- layers/ (model layer definitions)


@@ -12,6 +12,61 @@
}
}
},
"bg": {
"cv": {
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--bg--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"cs": {
"cv": {
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--cs--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"da": {
"cv": {
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--da--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"et": {
"cv": {
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--et--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"ga": {
"cv": {
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--ga--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"en": {
"ek1": {
"tacotron2": {
@@ -79,6 +134,14 @@
"license": "apache 2.0",
"contact": "egolge@coqui.com"
},
"vits--neon": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--en--ljspeech--vits.zip",
"default_vocoder": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause",
"contact": null,
"commit": null
},
"fast_pitch": {
"description": "FastPitch model trained on LJSpeech using the Aligner Network",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--fast_pitch.zip",
@@ -130,10 +193,10 @@
"license": "apache 2.0",
"contact": "adamfroghyar@gmail.com"
},
-"capacitron-t2-c150": {
+"capacitron-t2-c150_v2": {
"description": "Capacitron additions to Tacotron 2 with Capacity at 150 as in https://arxiv.org/pdf/1906.03402.pdf",
-"github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.0_models/tts_models--en--blizzard2013--capacitron-t2-c150.zip",
+"github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.1_models/tts_models--en--blizzard2013--capacitron-t2-c150_v2.zip",
-"commit": "d6284e7",
+"commit": "a67039d",
"default_vocoder": "vocoder_models/en/blizzard2013/hifigan_v2",
"author": "Adam Froghyar @a-froghyar",
"license": "apache 2.0",
@@ -151,6 +214,15 @@
"license": "MPL",
"contact": "egolge@coqui.com"
}
},
"css10":{
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--es--css10--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"fr": {
@@ -158,11 +230,20 @@
"tacotron2-DDC": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--fr--mai--tacotron2-DDC.zip",
"default_vocoder": "vocoder_models/universal/libri-tts/fullband-melgan",
-"commit": "",
+"commit": null,
"author": "Eren Gölge @erogol",
"license": "MPL",
"contact": "egolge@coqui.com"
}
},
"css10":{
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--fr--css10--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"uk":{
@@ -174,6 +255,13 @@
"license": "MIT",
"contact": "",
"default_vocoder": "vocoder_models/uk/mai/multiband-melgan"
},
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--uk--mai--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
@@ -198,6 +286,15 @@
"stats_file": null,
"commit": "540d811"
}
},
"css10":{
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--nl--css10--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"de": {
@@ -224,6 +321,15 @@
"license": "apache 2.0",
"commit": "unknown"
}
},
"css10": {
"vits-neon":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--de--css10--vits.zip",
"default_vocoder": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause",
"commit": null
}
}
},
"ja": {
@@ -359,6 +465,149 @@
"commit": "1b22f03"
}
}
},
"hu": {
"css10": {
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--hu--css10--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"el": {
"cv": {
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--el--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"fi": {
"css10": {
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--fi--css10--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"hr": {
"cv": {
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--hr--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"lt": {
"cv": {
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--lt--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"lv": {
"cv": {
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--lv--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"mt": {
"cv": {
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--mt--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"pl": {
"mai_female": {
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--pl--mai_female--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"pt": {
"cv": {
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--pt--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"ro": {
"cv": {
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--ro--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"sk": {
"cv": {
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--sk--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"sl": {
"cv": {
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--sl--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"sv": {
"cv": {
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--sv--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
}
},
"vocoder_models": {

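The manifest above follows a `{model_type: {language: {dataset: {model: metadata}}}}` nesting. As a quick sanity check of which TTS models a given `.models.json` exposes, a small stdlib-only sketch (the relative file path is an assumption about where the script is run from):

```python
import json

# Assumes the script runs from the repository root, where the manifest lives at TTS/.models.json.
with open("TTS/.models.json", encoding="utf-8") as f:
    manifest = json.load(f)

# Walk the nested structure and print canonical model names plus their licenses.
for lang, datasets in manifest["tts_models"].items():
    for dataset, models in datasets.items():
        for model, meta in models.items():
            print(f"tts_models/{lang}/{dataset}/{model}", "|", meta.get("license", "unknown"))
```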

@@ -1 +1 @@
-0.8.0
+0.9.0


@@ -6,38 +6,87 @@ import torch
 from tqdm import tqdm
 from TTS.config import load_config
+from TTS.config.shared_configs import BaseDatasetConfig
 from TTS.tts.datasets import load_tts_samples
 from TTS.tts.utils.managers import save_file
 from TTS.tts.utils.speakers import SpeakerManager
 parser = argparse.ArgumentParser(
-    description="""Compute embedding vectors for each wav file in a dataset.\n\n"""
+    description="""Compute embedding vectors for each audio file in a dataset and store them keyed by `{dataset_name}#{file_path}` in a .pth file\n\n"""
     """
     Example runs:
-    python TTS/bin/compute_embeddings.py speaker_encoder_model.pth speaker_encoder_config.json dataset_config.json
+    python TTS/bin/compute_embeddings.py --model_path speaker_encoder_model.pth --config_path speaker_encoder_config.json --config_dataset_path dataset_config.json
+    python TTS/bin/compute_embeddings.py --model_path speaker_encoder_model.pth --config_path speaker_encoder_config.json --fomatter vctk --dataset_path /path/to/vctk/dataset --dataset_name my_vctk --metafile /path/to/vctk/metafile.csv
     """,
     formatter_class=RawTextHelpFormatter,
 )
-parser.add_argument("model_path", type=str, help="Path to model checkpoint file.")
-parser.add_argument("config_path", type=str, help="Path to model config file.")
-parser.add_argument("config_dataset_path", type=str, help="Path to dataset config file.")
+parser.add_argument(
+    "--model_path",
+    type=str,
+    help="Path to model checkpoint file. It defaults to the released speaker encoder.",
+    default="https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar",
+)
+parser.add_argument(
+    "--config_path",
+    type=str,
+    help="Path to model config file. It defaults to the released speaker encoder config.",
+    default="https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json",
+)
+parser.add_argument(
+    "--config_dataset_path",
+    type=str,
+    help="Path to dataset config file. You either need to provide this or `formatter_name`, `dataset_name` and `dataset_path` arguments.",
+    default=None,
+)
 parser.add_argument("--output_path", type=str, help="Path for output `pth` or `json` file.", default="speakers.pth")
 parser.add_argument("--old_file", type=str, help="Previous embedding file to only compute new audios.", default=None)
 parser.add_argument("--disable_cuda", type=bool, help="Flag to disable cuda.", default=False)
 parser.add_argument("--no_eval", type=bool, help="Do not compute eval?. Default False", default=False)
+parser.add_argument(
+    "--formatter_name",
+    type=str,
+    help="Name of the formatter to use. You either need to provide this or `config_dataset_path`",
+    default=None,
+)
+parser.add_argument(
+    "--dataset_name",
+    type=str,
+    help="Name of the dataset to use. You either need to provide this or `config_dataset_path`",
+    default=None,
+)
+parser.add_argument(
+    "--dataset_path",
+    type=str,
+    help="Path to the dataset. You either need to provide this or `config_dataset_path`",
+    default=None,
+)
+parser.add_argument(
+    "--metafile",
+    type=str,
+    help="Path to the meta file. If not set, dataset formatter uses the default metafile if it is defined in the formatter. You either need to provide this or `config_dataset_path`",
+    default=None,
+)
 args = parser.parse_args()
 use_cuda = torch.cuda.is_available() and not args.disable_cuda
-c_dataset = load_config(args.config_dataset_path)
-meta_data_train, meta_data_eval = load_tts_samples(c_dataset.datasets, eval_split=not args.no_eval)
+if args.config_dataset_path is not None:
+    c_dataset = load_config(args.config_dataset_path)
+    meta_data_train, meta_data_eval = load_tts_samples(c_dataset.datasets, eval_split=not args.no_eval)
+else:
+    c_dataset = BaseDatasetConfig()
+    c_dataset.formatter = args.formatter_name
+    c_dataset.dataset_name = args.dataset_name
+    c_dataset.path = args.dataset_path
+    c_dataset.meta_file_train = args.metafile if args.metafile else None
+    meta_data_train, meta_data_eval = load_tts_samples(c_dataset, eval_split=not args.no_eval)
 if meta_data_eval is None:
-    wav_files = meta_data_train
+    samples = meta_data_train
 else:
-    wav_files = meta_data_train + meta_data_eval
+    samples = meta_data_train + meta_data_eval
 encoder_manager = SpeakerManager(
     encoder_model_path=args.model_path,
@@ -50,25 +99,23 @@ class_name_key = encoder_manager.encoder_config.class_name_key
 # compute speaker embeddings
 speaker_mapping = {}
-for idx, wav_file in enumerate(tqdm(wav_files)):
-    if isinstance(wav_file, dict):
-        class_name = wav_file[class_name_key]
-        wav_file = wav_file["audio_file"]
-    else:
-        class_name = None
+for idx, fields in enumerate(tqdm(samples)):
+    class_name = fields[class_name_key]
+    audio_file = fields["audio_file"]
+    embedding_key = fields["audio_unique_name"]
+    root_path = fields["root_path"]
-    wav_file_name = os.path.basename(wav_file)
-    if args.old_file is not None and wav_file_name in encoder_manager.clip_ids:
+    if args.old_file is not None and embedding_key in encoder_manager.clip_ids:
         # get the embedding from the old file
-        embedd = encoder_manager.get_embedding_by_clip(wav_file_name)
+        embedd = encoder_manager.get_embedding_by_clip(embedding_key)
     else:
         # extract the embedding
-        embedd = encoder_manager.compute_embedding_from_clip(wav_file)
+        embedd = encoder_manager.compute_embedding_from_clip(audio_file)
     # create speaker_mapping if target dataset is defined
-    speaker_mapping[wav_file_name] = {}
-    speaker_mapping[wav_file_name]["name"] = class_name
-    speaker_mapping[wav_file_name]["embedding"] = embedd
+    speaker_mapping[embedding_key] = {}
+    speaker_mapping[embedding_key]["name"] = class_name
+    speaker_mapping[embedding_key]["embedding"] = embedd
 if speaker_mapping:
     # save speaker_mapping if target dataset is defined
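
When no dataset config file is given, the updated script builds the dataset description in code. A minimal sketch of the same path, with placeholder paths and the `vctk` formatter name taken from the example run in the help text:

```python
from TTS.config.shared_configs import BaseDatasetConfig
from TTS.tts.datasets import load_tts_samples

# Placeholders: point these at a real dataset before running.
c_dataset = BaseDatasetConfig()
c_dataset.formatter = "vctk"                 # formatter name from TTS.tts.datasets.formatters
c_dataset.dataset_name = "my_vctk"           # used to build `{dataset_name}#{file_path}` embedding keys
c_dataset.path = "/path/to/vctk/dataset"
c_dataset.meta_file_train = None             # fall back to the formatter's default metafile

meta_data_train, meta_data_eval = load_tts_samples(c_dataset, eval_split=True)
samples = meta_data_train + (meta_data_eval or [])
print(f"{len(samples)} samples ready for embedding computation")
```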


@@ -37,7 +37,7 @@ def setup_loader(ap, r, verbose=False):
         precompute_num_workers=0,
         use_noise_augment=False,
         verbose=verbose,
-        speaker_id_mapping=speaker_manager.ids if c.use_speaker_embedding else None,
+        speaker_id_mapping=speaker_manager.name_to_id if c.use_speaker_embedding else None,
         d_vector_mapping=speaker_manager.embeddings if c.use_d_vector_file else None,
     )


@@ -7,30 +7,25 @@ from tqdm.contrib.concurrent import process_map
 from TTS.config import load_config
 from TTS.tts.datasets import load_tts_samples
-from TTS.tts.utils.text.phonemizers.gruut_wrapper import Gruut
+from TTS.tts.utils.text.phonemizers import Gruut
-phonemizer = Gruut(language="en-us")
 def compute_phonemes(item):
-    try:
-        text = item[0]
-        ph = phonemizer.phonemize(text).split("|")
-    except:
-        return []
-    return list(set(ph))
+    text = item["text"]
+    ph = phonemizer.phonemize(text).replace("|", "")
+    return set(list(ph))
 def main():
     # pylint: disable=W0601
-    global c
+    global c, phonemizer
     # pylint: disable=bad-option-value
     parser = argparse.ArgumentParser(
         description="""Find all the unique characters or phonemes in a dataset.\n\n"""
         """
         Example runs:
-        python TTS/bin/find_unique_chars.py --config_path config.json
+        python TTS/bin/find_unique_phonemes.py --config_path config.json
         """,
         formatter_class=RawTextHelpFormatter,
     )
@@ -46,15 +41,24 @@ def main():
     items = train_items + eval_items
     print("Num items:", len(items))
-    is_lang_def = all(item["language"] for item in items)
+    language_list = [item["language"] for item in items]
+    is_lang_def = all(language_list)
     if not c.phoneme_language or not is_lang_def:
         raise ValueError("Phoneme language must be defined in config.")
+    if not language_list.count(language_list[0]) == len(language_list):
+        raise ValueError(
+            "Currently, just one phoneme language per config file is supported !! Please split the dataset config into different configs and run it individually for each language !!"
+        )
+    phonemizer = Gruut(language=language_list[0], keep_puncs=True)
     phonemes = process_map(compute_phonemes, items, max_workers=multiprocessing.cpu_count(), chunksize=15)
     phones = []
     for ph in phonemes:
         phones.extend(ph)
     phones = set(phones)
     lower_phones = filter(lambda c: c.islower(), phones)
     phones_force_lower = [c.lower() for c in phones]
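
A short sketch of how the reworked script drives the Gruut wrapper once the (single) dataset language is known; the sample sentence is arbitrary:

```python
from TTS.tts.utils.text.phonemizers import Gruut

# One phonemizer per config, built from the dataset language as in main().
phonemizer = Gruut(language="en-us", keep_puncs=True)

text = "A quick phoneme coverage check."
# compute_phonemes() strips the "|" separators and keeps the unique symbols.
phonemes = set(phonemizer.phonemize(text).replace("|", ""))
print(sorted(phonemes))
```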


@@ -17,7 +17,7 @@ def adjust_path_and_remove_silence(audio_path):
     # create all directory structure
     pathlib.Path(output_path).parent.mkdir(parents=True, exist_ok=True)
     # remove the silence and save the audio
-    output_path = remove_silence(
+    output_path, is_speech = remove_silence(
         model_and_utils,
         audio_path,
         output_path,
@@ -25,26 +25,34 @@ def adjust_path_and_remove_silence(audio_path):
         use_cuda=args.use_cuda,
     )
-    return output_path
+    return output_path, is_speech
 def preprocess_audios():
     files = sorted(glob.glob(os.path.join(args.input_dir, args.glob), recursive=True))
     print("> Number of files: ", len(files))
     if not args.force:
-        print("> Ignoring files that already exist in the output directory.")
+        print("> Ignoring files that already exist in the output idrectory.")
     if args.trim_just_beginning_and_end:
         print("> Trimming just the beginning and the end with nonspeech parts.")
     else:
         print("> Trimming all nonspeech parts.")
+    filtered_files = []
     if files:
         # create threads
         # num_threads = multiprocessing.cpu_count()
         # process_map(adjust_path_and_remove_silence, files, max_workers=num_threads, chunksize=15)
         for f in tqdm(files):
-            adjust_path_and_remove_silence(f)
+            output_path, is_speech = adjust_path_and_remove_silence(f)
+            if not is_speech:
+                filtered_files.append(output_path)
+        # write files that do not have speech
+        with open(os.path.join(args.output_dir, "filtered_files.txt"), "w", encoding="utf-8") as f:
+            for file in filtered_files:
+                f.write(file + "\n")
     else:
         print("> No files Found !")


@@ -238,6 +238,13 @@ If you don't specify any models, then it uses LJSpeech based English model.
         help="speaker ID of the reference_wav speaker (If not provided the embedding will be computed using the Speaker Encoder).",
         default=None,
     )
+    parser.add_argument(
+        "--progress_bar",
+        type=str2bool,
+        help="If true shows a progress bar for the model download. Defaults to True",
+        default=True,
+    )
     args = parser.parse_args()
     # print the description if either text or list_models is not set
@@ -255,7 +262,7 @@ If you don't specify any models, then it uses LJSpeech based English model.
     # load model manager
     path = Path(__file__).parent / "../.models.json"
-    manager = ModelManager(path)
+    manager = ModelManager(path, progress_bar=args.progress_bar)
     model_path = None
     config_path = None
@@ -323,7 +330,7 @@ If you don't specify any models, then it uses LJSpeech based English model.
         print(
             " > Available speaker ids: (Set --speaker_idx flag to one of these values to use the multi-speaker model."
         )
-        print(synthesizer.tts_model.speaker_manager.ids)
+        print(synthesizer.tts_model.speaker_manager.name_to_id)
         return
     # query langauge ids of a multi-lingual model.
@@ -331,7 +338,7 @@ If you don't specify any models, then it uses LJSpeech based English model.
         print(
             " > Available language ids: (Set --language_idx flag to one of these values to use the multi-lingual model."
         )
-        print(synthesizer.tts_model.language_manager.ids)
+        print(synthesizer.tts_model.language_manager.name_to_id)
         return
     # check the arguments against a multi-speaker model.
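
The new `--progress_bar` flag is simply forwarded to `ModelManager`, so the same switch can be used from Python. A hedged sketch, assuming `ModelManager` is imported from `TTS.utils.manage` as `synthesize.py` does:

```python
from pathlib import Path

from TTS.utils.manage import ModelManager  # assumption: same import used by TTS/bin/synthesize.py

# Silence the download progress bar, e.g. for CI logs.
manager = ModelManager(Path("TTS/.models.json"), progress_bar=False)
manager.list_models()
```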


@@ -62,7 +62,7 @@ class BaseAudioConfig(Coqpit):
             Maximum frequency of the F0 frames. Defaults to ```640```.
         pitch_fmin (float, optional):
-            Minimum frequency of the F0 frames. Defaults to ```0```.
+            Minimum frequency of the F0 frames. Defaults to ```1```.
         trim_db (int):
             Silence threshold used for silence trimming. Defaults to 45.
@@ -144,7 +144,7 @@ class BaseAudioConfig(Coqpit):
     do_amp_to_db_mel: bool = True
     # f0 params
     pitch_fmax: float = 640.0
-    pitch_fmin: float = 0.0
+    pitch_fmin: float = 1.0
     # normalization params
     signal_norm: bool = True
     min_level_db: int = -100
@@ -193,21 +193,24 @@ class BaseDatasetConfig(Coqpit):
     """Base config for TTS datasets.
     Args:
-        name (str):
-            Dataset name that defines the preprocessor in use. Defaults to None.
+        formatter (str):
+            Formatter name that defines used formatter in ```TTS.tts.datasets.formatter```. Defaults to `""`.
+        dataset_name (str):
+            Unique name for the dataset. Defaults to `""`.
         path (str):
-            Root path to the dataset files. Defaults to None.
+            Root path to the dataset files. Defaults to `""`.
         meta_file_train (str):
             Name of the dataset meta file. Or a list of speakers to be ignored at training for multi-speaker datasets.
-            Defaults to None.
+            Defaults to `""`.
         ignored_speakers (List):
             List of speakers IDs that are not used at the training. Default None.
         language (str):
-            Language code of the dataset. If defined, it overrides `phoneme_language`. Defaults to None.
+            Language code of the dataset. If defined, it overrides `phoneme_language`. Defaults to `""`.
         meta_file_val (str):
             Name of the dataset meta file that defines the instances used at validation.
@@ -217,7 +220,8 @@ class BaseDatasetConfig(Coqpit):
         train the duration predictor.
     """
-    name: str = ""
+    formatter: str = ""
+    dataset_name: str = ""
     path: str = ""
     meta_file_train: str = ""
     ignored_speakers: List[str] = None
@@ -230,7 +234,7 @@ class BaseDatasetConfig(Coqpit):
     ):
         """Check config fields"""
         c = asdict(self)
-        check_argument("name", c, restricted=True)
+        check_argument("formatter", c, restricted=True)
         check_argument("path", c, restricted=True)
         check_argument("meta_file_train", c, restricted=True)
         check_argument("meta_file_val", c, restricted=False)
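
In practice the rename means dataset configs are now declared as in the sketch below (all values are placeholders):

```python
from TTS.config.shared_configs import BaseDatasetConfig

dataset_config = BaseDatasetConfig(
    formatter="ljspeech",           # was `name="ljspeech"` before this release
    dataset_name="ljspeech",        # new: a unique identifier for the dataset itself
    path="/path/to/LJSpeech-1.1/",  # placeholder
    meta_file_train="metadata.csv",
    language="en-us",
)
print(dataset_config.formatter, dataset_config.dataset_name)
```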


@@ -107,11 +107,18 @@ class BaseEncoder(nn.Module):
         return criterion
     def load_checkpoint(
-        self, config: Coqpit, checkpoint_path: str, eval: bool = False, use_cuda: bool = False, criterion=None
+        self,
+        config: Coqpit,
+        checkpoint_path: str,
+        eval: bool = False,
+        use_cuda: bool = False,
+        criterion=None,
+        cache=False,
     ):
-        state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"))
+        state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
         try:
             self.load_state_dict(state["model"])
+            print(" > Model fully restored. ")
         except (KeyError, RuntimeError) as error:
             # If eval raise the error
             if eval:


@@ -44,13 +44,16 @@ class BaseTrainerModel(TrainerModel):
         return outputs_dict
     @abstractmethod
-    def load_checkpoint(self, config: Coqpit, checkpoint_path: str, eval: bool = False, strict: bool = True) -> None:
+    def load_checkpoint(
+        self, config: Coqpit, checkpoint_path: str, eval: bool = False, strict: bool = True, cache=False
+    ) -> None:
         """Load a model checkpoint gile and get ready for training or inference.
         Args:
             config (Coqpit): Model configuration.
             checkpoint_path (str): Path to the model checkpoint file.
             eval (bool, optional): If true, init model for inference else for training. Defaults to False.
-            strcit (bool, optional): Match all checkpoint keys to model's keys. Defaults to True.
+            strict (bool, optional): Match all checkpoint keys to model's keys. Defaults to True.
+            cache (bool, optional): If True, cache the file locally for subsequent calls. It is cached under `get_user_data_dir()/tts_cache`. Defaults to False.
         """
         ...
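
From the caller's side the new `cache` flag looks like the sketch below; `setup_model` and both paths are assumptions for illustration, not part of this diff:

```python
from TTS.config import load_config
from TTS.tts.models import setup_model  # assumption: model factory used elsewhere in TTS

config = load_config("/path/to/config.json")  # placeholder
model = setup_model(config)
# cache=True keeps a local copy under get_user_data_dir()/tts_cache for later calls.
model.load_checkpoint(config, "/path/to/model.pth", eval=True, cache=True)
```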


@@ -5,6 +5,7 @@ import json
 import os
 import sys
 from pathlib import Path
+from threading import Lock
 from typing import Union
 from flask import Flask, render_template, request, send_file
@@ -146,7 +147,7 @@ def index():
         "index.html",
         show_details=args.show_details,
         use_multi_speaker=use_multi_speaker,
-        speaker_ids=speaker_manager.ids if speaker_manager is not None else None,
+        speaker_ids=speaker_manager.name_to_id if speaker_manager is not None else None,
         use_gst=use_gst,
     )
@@ -168,17 +169,21 @@ def details():
     )
+lock = Lock()
 @app.route("/api/tts", methods=["GET"])
 def tts():
-    text = request.args.get("text")
-    speaker_idx = request.args.get("speaker_id", "")
-    style_wav = request.args.get("style_wav", "")
-    style_wav = style_wav_uri_to_dict(style_wav)
-    print(" > Model input: {}".format(text))
-    print(" > Speaker Idx: {}".format(speaker_idx))
-    wavs = synthesizer.tts(text, speaker_name=speaker_idx, style_wav=style_wav)
-    out = io.BytesIO()
-    synthesizer.save_wav(wavs, out)
+    with lock:
+        text = request.args.get("text")
+        speaker_idx = request.args.get("speaker_id", "")
+        style_wav = request.args.get("style_wav", "")
+        style_wav = style_wav_uri_to_dict(style_wav)
+        print(" > Model input: {}".format(text))
+        print(" > Speaker Idx: {}".format(speaker_idx))
+        wavs = synthesizer.tts(text, speaker_name=speaker_idx, style_wav=style_wav)
+        out = io.BytesIO()
+        synthesizer.save_wav(wavs, out)
     return send_file(out, mimetype="audio/wav")
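
With requests to `/api/tts` now serialized behind the lock, clients can keep issuing plain GET requests. A hedged example using `requests` against a locally running server on the default port 5002; the model name in the comment is a placeholder:

```python
import requests

# Assumes e.g.: python3 TTS/server/server.py --model_name tts_models/en/ljspeech/tacotron2-DDC
resp = requests.get(
    "http://localhost:5002/api/tts",
    params={"text": "Hello from the Coqui TTS server.", "speaker_id": "", "style_wav": ""},
    timeout=300,
)
resp.raise_for_status()
with open("speech.wav", "wb") as f:
    f.write(resp.content)  # the endpoint returns audio/wav bytes
```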


@@ -1,3 +1,4 @@
+import os
 import sys
 from collections import Counter
 from pathlib import Path
@@ -12,20 +13,16 @@ from TTS.tts.datasets.formatters import *
 def split_dataset(items, eval_split_max_size=None, eval_split_size=0.01):
     """Split a dataset into train and eval. Consider speaker distribution in multi-speaker training.
     Args:
-        <<<<<<< HEAD
         items (List[List]):
             A list of samples. Each sample is a list of `[audio_path, text, speaker_id]`.
         eval_split_max_size (int):
             Number maximum of samples to be used for evaluation in proportion split. Defaults to None (Disabled).
         eval_split_size (float):
             If between 0.0 and 1.0 represents the proportion of the dataset to include in the evaluation set.
             If > 1, represents the absolute number of evaluation samples. Defaults to 0.01 (1%).
-        =======
-        items (List[List]): A list of samples. Each sample is a list of `[text, audio_path, speaker_id]`.
-        >>>>>>> Fix docstring
     """
     speakers = [item["speaker_name"] for item in items]
     is_multi_speaker = len(set(speakers)) > 1
@@ -59,6 +56,17 @@ def split_dataset(items, eval_split_max_size=None, eval_split_size=0.01):
     return items[:eval_split_size], items[eval_split_size:]
+def add_extra_keys(metadata, language, dataset_name):
+    for item in metadata:
+        # add language name
+        item["language"] = language
+        # add unique audio name
+        relfilepath = os.path.splitext(os.path.relpath(item["audio_file"], item["root_path"]))[0]
+        audio_unique_name = f"{dataset_name}#{relfilepath}"
+        item["audio_unique_name"] = audio_unique_name
+    return metadata
 def load_tts_samples(
     datasets: Union[List[Dict], Dict],
     eval_split=True,
@@ -97,7 +105,8 @@ def load_tts_samples(
     if not isinstance(datasets, list):
         datasets = [datasets]
     for dataset in datasets:
-        name = dataset["name"]
+        formatter_name = dataset["formatter"]
+        dataset_name = dataset["dataset_name"]
         root_path = dataset["path"]
         meta_file_train = dataset["meta_file_train"]
         meta_file_val = dataset["meta_file_val"]
@@ -106,17 +115,19 @@ def load_tts_samples(
         # setup the right data processor
         if formatter is None:
-            formatter = _get_formatter_by_name(name)
+            formatter = _get_formatter_by_name(formatter_name)
         # load train set
         meta_data_train = formatter(root_path, meta_file_train, ignored_speakers=ignored_speakers)
-        meta_data_train = [{**item, **{"language": language}} for item in meta_data_train]
+        assert len(meta_data_train) > 0, f" [!] No training samples found in {root_path}/{meta_file_train}"
+        meta_data_train = add_extra_keys(meta_data_train, language, dataset_name)
         print(f" | > Found {len(meta_data_train)} files in {Path(root_path).resolve()}")
         # load evaluation split if set
         if eval_split:
             if meta_file_val:
                 meta_data_eval = formatter(root_path, meta_file_val, ignored_speakers=ignored_speakers)
-                meta_data_eval = [{**item, **{"language": language}} for item in meta_data_eval]
+                meta_data_eval = add_extra_keys(meta_data_eval, language, dataset_name)
             else:
                 meta_data_eval, meta_data_train = split_dataset(meta_data_train, eval_split_max_size, eval_split_size)
             meta_data_eval_all += meta_data_eval
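
The `audio_unique_name` introduced here is simply the dataset name plus the extension-less path of the audio file relative to the dataset root. A small self-contained sketch of the same computation (file names are illustrative):

```python
import os

def audio_unique_name(audio_file, root_path, dataset_name):
    # Mirrors add_extra_keys(): `{dataset_name}#{relative path without extension}`.
    relfilepath = os.path.splitext(os.path.relpath(audio_file, root_path))[0]
    return f"{dataset_name}#{relfilepath}"

print(audio_unique_name("/data/vctk/wav48/p225/p225_001.wav", "/data/vctk", "my_vctk"))
# -> my_vctk#wav48/p225/p225_001
```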


@ -1,3 +1,4 @@
import base64
import collections import collections
import os import os
import random import random
@ -34,6 +35,12 @@ def noise_augment_audio(wav):
return wav + (1.0 / 32768.0) * np.random.rand(*wav.shape) return wav + (1.0 / 32768.0) * np.random.rand(*wav.shape)
def string2filename(string):
# generate a safe and reversible filename based on a string
filename = base64.urlsafe_b64encode(string.encode("utf-8")).decode("utf-8", "ignore")
return filename
class TTSDataset(Dataset): class TTSDataset(Dataset):
def __init__( def __init__(
self, self,
@ -201,7 +208,7 @@ class TTSDataset(Dataset):
def get_f0(self, idx): def get_f0(self, idx):
out_dict = self.f0_dataset[idx] out_dict = self.f0_dataset[idx]
item = self.samples[idx] item = self.samples[idx]
assert item["audio_file"] == out_dict["audio_file"] assert item["audio_unique_name"] == out_dict["audio_unique_name"]
return out_dict return out_dict
@staticmethod @staticmethod
@ -256,6 +263,7 @@ class TTSDataset(Dataset):
"speaker_name": item["speaker_name"], "speaker_name": item["speaker_name"],
"language_name": item["language"], "language_name": item["language"],
"wav_file_name": os.path.basename(item["audio_file"]), "wav_file_name": os.path.basename(item["audio_file"]),
"audio_unique_name": item["audio_unique_name"],
} }
return sample return sample
@ -397,8 +405,8 @@ class TTSDataset(Dataset):
language_ids = None language_ids = None
# get pre-computed d-vectors # get pre-computed d-vectors
if self.d_vector_mapping is not None: if self.d_vector_mapping is not None:
wav_files_names = list(batch["wav_file_name"]) embedding_keys = list(batch["audio_unique_name"])
d_vectors = [self.d_vector_mapping[w]["embedding"] for w in wav_files_names] d_vectors = [self.d_vector_mapping[w]["embedding"] for w in embedding_keys]
else: else:
d_vectors = None d_vectors = None
@ -560,19 +568,18 @@ class PhonemeDataset(Dataset):
def __getitem__(self, index): def __getitem__(self, index):
item = self.samples[index] item = self.samples[index]
ids = self.compute_or_load(item["audio_file"], item["text"]) ids = self.compute_or_load(string2filename(item["audio_unique_name"]), item["text"])
ph_hat = self.tokenizer.ids_to_text(ids) ph_hat = self.tokenizer.ids_to_text(ids)
return {"text": item["text"], "ph_hat": ph_hat, "token_ids": ids, "token_ids_len": len(ids)} return {"text": item["text"], "ph_hat": ph_hat, "token_ids": ids, "token_ids_len": len(ids)}
def __len__(self): def __len__(self):
return len(self.samples) return len(self.samples)
def compute_or_load(self, wav_file, text): def compute_or_load(self, file_name, text):
"""Compute phonemes for the given text. """Compute phonemes for the given text.
If the phonemes are already cached, load them from cache. If the phonemes are already cached, load them from cache.
""" """
file_name = os.path.splitext(os.path.basename(wav_file))[0]
file_ext = "_phoneme.npy" file_ext = "_phoneme.npy"
cache_path = os.path.join(self.cache_path, file_name + file_ext) cache_path = os.path.join(self.cache_path, file_name + file_ext)
try: try:
@ -669,11 +676,11 @@ class F0Dataset:
def __getitem__(self, idx): def __getitem__(self, idx):
item = self.samples[idx] item = self.samples[idx]
f0 = self.compute_or_load(item["audio_file"]) f0 = self.compute_or_load(item["audio_file"], string2filename(item["audio_unique_name"]))
if self.normalize_f0: if self.normalize_f0:
assert self.mean is not None and self.std is not None, " [!] Mean and STD is not available" assert self.mean is not None and self.std is not None, " [!] Mean and STD is not available"
f0 = self.normalize(f0) f0 = self.normalize(f0)
return {"audio_file": item["audio_file"], "f0": f0} return {"audio_unique_name": item["audio_unique_name"], "f0": f0}
def __len__(self): def __len__(self):
return len(self.samples) return len(self.samples)
@ -705,8 +712,7 @@ class F0Dataset:
return self.pad_id return self.pad_id
@staticmethod @staticmethod
def create_pitch_file_path(wav_file, cache_path): def create_pitch_file_path(file_name, cache_path):
file_name = os.path.splitext(os.path.basename(wav_file))[0]
pitch_file = os.path.join(cache_path, file_name + "_pitch.npy") pitch_file = os.path.join(cache_path, file_name + "_pitch.npy")
return pitch_file return pitch_file
@ -744,11 +750,11 @@ class F0Dataset:
pitch[zero_idxs] = 0.0 pitch[zero_idxs] = 0.0
return pitch return pitch
def compute_or_load(self, wav_file): def compute_or_load(self, wav_file, audio_unique_name):
""" """
compute pitch and return a numpy array of pitch values compute pitch and return a numpy array of pitch values
""" """
pitch_file = self.create_pitch_file_path(wav_file, self.cache_path) pitch_file = self.create_pitch_file_path(audio_unique_name, self.cache_path)
if not os.path.exists(pitch_file): if not os.path.exists(pitch_file):
pitch = self._compute_and_save_pitch(self.ap, wav_file, pitch_file) pitch = self._compute_and_save_pitch(self.ap, wav_file, pitch_file)
else: else:
@ -756,14 +762,14 @@ class F0Dataset:
return pitch.astype(np.float32) return pitch.astype(np.float32)
def collate_fn(self, batch): def collate_fn(self, batch):
audio_file = [item["audio_file"] for item in batch] audio_unique_name = [item["audio_unique_name"] for item in batch]
f0s = [item["f0"] for item in batch] f0s = [item["f0"] for item in batch]
f0_lens = [len(item["f0"]) for item in batch] f0_lens = [len(item["f0"]) for item in batch]
f0_lens_max = max(f0_lens) f0_lens_max = max(f0_lens)
f0s_torch = torch.LongTensor(len(f0s), f0_lens_max).fill_(self.get_pad_id()) f0s_torch = torch.LongTensor(len(f0s), f0_lens_max).fill_(self.get_pad_id())
for i, f0_len in enumerate(f0_lens): for i, f0_len in enumerate(f0_lens):
f0s_torch[i, :f0_len] = torch.LongTensor(f0s[i]) f0s_torch[i, :f0_len] = torch.LongTensor(f0s[i])
return {"audio_file": audio_file, "f0": f0s_torch, "f0_lens": f0_lens} return {"audio_unique_name": audio_unique_name, "f0": f0s_torch, "f0_lens": f0_lens}
def print_logs(self, level: int = 0) -> None: def print_logs(self, level: int = 0) -> None:
indent = "\t" * level indent = "\t" * level

View File

@ -15,6 +15,15 @@ from tqdm import tqdm
def coqui(root_path, meta_file, ignored_speakers=None): def coqui(root_path, meta_file, ignored_speakers=None):
"""Interal dataset formatter.""" """Interal dataset formatter."""
filepath = os.path.join(root_path, meta_file)
# ensure there are 4 columns for every line
with open(filepath, "r", encoding="utf8") as f:
lines = f.readlines()
num_cols = len(lines[0].split("|")) # take the first row as reference
for idx, line in enumerate(lines[1:]):
if len(line.split("|")) != num_cols:
print(f" > Missing column in line {idx + 1} -> {line.strip()}")
# load metadata
metadata = pd.read_csv(os.path.join(root_path, meta_file), sep="|") metadata = pd.read_csv(os.path.join(root_path, meta_file), sep="|")
assert all(x in metadata.columns for x in ["audio_file", "text"]) assert all(x in metadata.columns for x in ["audio_file", "text"])
speaker_name = None if "speaker_name" in metadata.columns else "coqui" speaker_name = None if "speaker_name" in metadata.columns else "coqui"
@ -97,9 +106,9 @@ def mailabs(root_path, meta_files=None, ignored_speakers=None):
meta_files (str): list of meta files to be used in the training. If None, finds all the csv files meta_files (str): list of meta files to be used in the training. If None, finds all the csv files
recursively. Defaults to None recursively. Defaults to None
""" """
speaker_regex = re.compile("by_book/(male|female)/(?P<speaker_name>[^/]+)/") speaker_regex = re.compile(f"by_book{os.sep}(male|female){os.sep}(?P<speaker_name>[^{os.sep}]+){os.sep}")
if not meta_files: if not meta_files:
csv_files = glob(root_path + "/**/metadata.csv", recursive=True) csv_files = glob(root_path + f"{os.sep}**{os.sep}metadata.csv", recursive=True)
else: else:
csv_files = meta_files csv_files = meta_files
@ -578,3 +587,17 @@ def kokoro(root_path, meta_file, **kwargs): # pylint: disable=unused-argument
text = cols[2].replace(" ", "") text = cols[2].replace(" ", "")
items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path}) items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path})
return items return items
def kss(root_path, meta_file, **kwargs): # pylint: disable=unused-argument
"""Korean single-speaker dataset from https://www.kaggle.com/datasets/bryanpark/korean-single-speaker-speech-dataset"""
txt_file = os.path.join(root_path, meta_file)
items = []
speaker_name = "kss"
with open(txt_file, "r", encoding="utf-8") as ttf:
for line in ttf:
cols = line.split("|")
wav_file = os.path.join(root_path, cols[0])
text = cols[2] # cols[1] => 6월, cols[2] => 유월
items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name})
return items

View File

@ -398,9 +398,9 @@ class AlignTTS(BaseTTS):
logger.eval_audios(steps, audios, self.ap.sample_rate) logger.eval_audios(steps, audios, self.ap.sample_rate)
def load_checkpoint( def load_checkpoint(
self, config, checkpoint_path, eval=False self, config, checkpoint_path, eval=False, cache=False
): # pylint: disable=unused-argument, redefined-builtin ): # pylint: disable=unused-argument, redefined-builtin
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu")) state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
self.load_state_dict(state["model"]) self.load_state_dict(state["model"])
if eval: if eval:
self.eval() self.eval()

View File

@ -92,16 +92,17 @@ class BaseTacotron(BaseTTS):
pass pass
def load_checkpoint( def load_checkpoint(
self, config, checkpoint_path, eval=False self, config, checkpoint_path, eval=False, cache=False
): # pylint: disable=unused-argument, redefined-builtin ): # pylint: disable=unused-argument, redefined-builtin
"""Load model checkpoint and set up internals. """Load model checkpoint and set up internals.
Args: Args:
config (Coqpit): model configuration. config (Coqpit): model configuration.
checkpoint_path (str): path to checkpoint file. checkpoint_path (str): path to checkpoint file.
eval (bool): whether to load model for evaluation. eval (bool, optional): whether to load model for evaluation.
cache (bool, optional): If True, cache the file locally for subsequent calls. It is cached under `get_user_data_dir()/tts_cache`. Defaults to False.
""" """
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu")) state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
self.load_state_dict(state["model"]) self.load_state_dict(state["model"])
# TODO: set r in run-time by taking it from the new config # TODO: set r in run-time by taking it from the new config
if "r" in state: if "r" in state:

View File

@ -144,11 +144,11 @@ class BaseTTS(BaseTrainerModel):
if speaker_name is None: if speaker_name is None:
speaker_id = self.speaker_manager.get_random_id() speaker_id = self.speaker_manager.get_random_id()
else: else:
speaker_id = self.speaker_manager.ids[speaker_name] speaker_id = self.speaker_manager.name_to_id[speaker_name]
# get language id # get language id
if hasattr(self, "language_manager") and config.use_language_embedding and language_name is not None: if hasattr(self, "language_manager") and config.use_language_embedding and language_name is not None:
language_id = self.language_manager.ids[language_name] language_id = self.language_manager.name_to_id[language_name]
return { return {
"text": text, "text": text,
@ -288,11 +288,13 @@ class BaseTTS(BaseTrainerModel):
# setup multi-speaker attributes # setup multi-speaker attributes
if hasattr(self, "speaker_manager") and self.speaker_manager is not None: if hasattr(self, "speaker_manager") and self.speaker_manager is not None:
if hasattr(config, "model_args"): if hasattr(config, "model_args"):
speaker_id_mapping = self.speaker_manager.ids if config.model_args.use_speaker_embedding else None speaker_id_mapping = (
self.speaker_manager.name_to_id if config.model_args.use_speaker_embedding else None
)
d_vector_mapping = self.speaker_manager.embeddings if config.model_args.use_d_vector_file else None d_vector_mapping = self.speaker_manager.embeddings if config.model_args.use_d_vector_file else None
config.use_d_vector_file = config.model_args.use_d_vector_file config.use_d_vector_file = config.model_args.use_d_vector_file
else: else:
speaker_id_mapping = self.speaker_manager.ids if config.use_speaker_embedding else None speaker_id_mapping = self.speaker_manager.name_to_id if config.use_speaker_embedding else None
d_vector_mapping = self.speaker_manager.embeddings if config.use_d_vector_file else None d_vector_mapping = self.speaker_manager.embeddings if config.use_d_vector_file else None
else: else:
speaker_id_mapping = None speaker_id_mapping = None
@ -300,7 +302,7 @@ class BaseTTS(BaseTrainerModel):
# setup multi-lingual attributes # setup multi-lingual attributes
if hasattr(self, "language_manager") and self.language_manager is not None: if hasattr(self, "language_manager") and self.language_manager is not None:
language_id_mapping = self.language_manager.ids if self.args.use_language_embedding else None language_id_mapping = self.language_manager.name_to_id if self.args.use_language_embedding else None
else: else:
language_id_mapping = None language_id_mapping = None
@ -342,7 +344,7 @@ class BaseTTS(BaseTrainerModel):
loader = DataLoader( loader = DataLoader(
dataset, dataset,
batch_size=config.eval_batch_size if is_eval else config.batch_size, batch_size=config.eval_batch_size if is_eval else config.batch_size,
shuffle=False, # shuffle is done in the dataset. shuffle=True, # if there is no other sampler
collate_fn=dataset.collate_fn, collate_fn=dataset.collate_fn,
drop_last=False, # setting this False might cause issues in AMP training. drop_last=False, # setting this False might cause issues in AMP training.
sampler=sampler, sampler=sampler,
@ -363,7 +365,7 @@ class BaseTTS(BaseTrainerModel):
aux_inputs = { aux_inputs = {
"speaker_id": None "speaker_id": None
if not self.config.use_speaker_embedding if not self.config.use_speaker_embedding
else random.sample(sorted(self.speaker_manager.ids.values()), 1), else random.sample(sorted(self.speaker_manager.name_to_id.values()), 1),
"d_vector": d_vector, "d_vector": d_vector,
"style_wav": None, # TODO: handle GST style input "style_wav": None, # TODO: handle GST style input
} }

View File

@ -16,6 +16,7 @@ from TTS.tts.utils.helpers import average_over_durations, generate_path, maximum
from TTS.tts.utils.speakers import SpeakerManager from TTS.tts.utils.speakers import SpeakerManager
from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.tts.utils.visual import plot_alignment, plot_avg_pitch, plot_spectrogram from TTS.tts.utils.visual import plot_alignment, plot_avg_pitch, plot_spectrogram
from TTS.utils.io import load_fsspec
@dataclass @dataclass
@ -707,9 +708,9 @@ class ForwardTTS(BaseTTS):
logger.eval_audios(steps, audios, self.ap.sample_rate) logger.eval_audios(steps, audios, self.ap.sample_rate)
def load_checkpoint( def load_checkpoint(
self, config, checkpoint_path, eval=False self, config, checkpoint_path, eval=False, cache=False
): # pylint: disable=unused-argument, redefined-builtin ): # pylint: disable=unused-argument, redefined-builtin
state = torch.load(checkpoint_path, map_location=torch.device("cpu")) state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
self.load_state_dict(state["model"]) self.load_state_dict(state["model"])
if eval: if eval:
self.eval() self.eval()

View File

@ -284,6 +284,7 @@ class VitsDataset(TTSDataset):
"wav_file": wav_filename, "wav_file": wav_filename,
"speaker_name": item["speaker_name"], "speaker_name": item["speaker_name"],
"language_name": item["language"], "language_name": item["language"],
"audio_unique_name": item["audio_unique_name"],
} }
@property @property
@ -308,6 +309,7 @@ class VitsDataset(TTSDataset):
- language_names: :math:`[B]` - language_names: :math:`[B]`
- audiofile_paths: :math:`[B]` - audiofile_paths: :math:`[B]`
- raw_texts: :math:`[B]` - raw_texts: :math:`[B]`
- audio_unique_names: :math:`[B]`
""" """
# convert list of dicts to dict of lists # convert list of dicts to dict of lists
B = len(batch) B = len(batch)
@ -348,6 +350,7 @@ class VitsDataset(TTSDataset):
"language_names": batch["language_name"], "language_names": batch["language_name"],
"audio_files": batch["wav_file"], "audio_files": batch["wav_file"],
"raw_text": batch["raw_text"], "raw_text": batch["raw_text"],
"audio_unique_names": batch["audio_unique_name"],
} }
@ -718,6 +721,10 @@ class Vits(BaseTTS):
use_spectral_norm=self.args.use_spectral_norm_disriminator, use_spectral_norm=self.args.use_spectral_norm_disriminator,
) )
@property
def device(self):
return next(self.parameters()).device
def init_multispeaker(self, config: Coqpit): def init_multispeaker(self, config: Coqpit):
"""Initialize multi-speaker modules of a model. A model can be trained either with a speaker embedding layer """Initialize multi-speaker modules of a model. A model can be trained either with a speaker embedding layer
or with external `d_vectors` computed from a speaker encoder model. or with external `d_vectors` computed from a speaker encoder model.
@ -755,17 +762,12 @@ class Vits(BaseTTS):
if ( if (
hasattr(self.speaker_manager.encoder, "audio_config") hasattr(self.speaker_manager.encoder, "audio_config")
and self.config.audio["sample_rate"] != self.speaker_manager.encoder.audio_config["sample_rate"] and self.config.audio.sample_rate != self.speaker_manager.encoder.audio_config["sample_rate"]
): ):
self.audio_transform = torchaudio.transforms.Resample( self.audio_transform = torchaudio.transforms.Resample(
orig_freq=self.audio_config["sample_rate"], orig_freq=self.config.audio.sample_rate,
new_freq=self.speaker_manager.encoder.audio_config["sample_rate"], new_freq=self.speaker_manager.encoder.audio_config["sample_rate"],
) )
# pylint: disable=W0101,W0105
self.audio_transform = torchaudio.transforms.Resample(
orig_freq=self.config.audio.sample_rate,
new_freq=self.speaker_manager.encoder.audio_config["sample_rate"],
)
def _init_speaker_embedding(self): def _init_speaker_embedding(self):
# pylint: disable=attribute-defined-outside-init # pylint: disable=attribute-defined-outside-init
@ -808,6 +810,13 @@ class Vits(BaseTTS):
orig_freq=self.config.audio["sample_rate"], new_freq=self.args.encoder_sample_rate orig_freq=self.config.audio["sample_rate"], new_freq=self.args.encoder_sample_rate
) # pylint: disable=W0201 ) # pylint: disable=W0201
def on_epoch_start(self, trainer): # pylint: disable=W0613
"""Freeze layers at the beginning of an epoch"""
self._freeze_layers()
# set the device of speaker encoder
if self.args.use_speaker_encoder_as_loss:
self.speaker_manager.encoder = self.speaker_manager.encoder.to(self.device)
def on_init_end(self, trainer): # pylint: disable=W0613 def on_init_end(self, trainer): # pylint: disable=W0613
"""Reinit layes if needed""" """Reinit layes if needed"""
if self.args.reinit_DP: if self.args.reinit_DP:
@ -1185,7 +1194,6 @@ class Vits(BaseTTS):
y_lengths = torch.tensor([y.size(-1)]).to(y.device) y_lengths = torch.tensor([y.size(-1)]).to(y.device)
speaker_cond_src = reference_speaker_id if reference_speaker_id is not None else reference_d_vector speaker_cond_src = reference_speaker_id if reference_speaker_id is not None else reference_d_vector
speaker_cond_tgt = speaker_id if speaker_id is not None else d_vector speaker_cond_tgt = speaker_id if speaker_id is not None else d_vector
# print(y.shape, y_lengths.shape)
wav, _, _ = self.voice_conversion(y, y_lengths, speaker_cond_src, speaker_cond_tgt) wav, _, _ = self.voice_conversion(y, y_lengths, speaker_cond_src, speaker_cond_tgt)
return wav return wav
@ -1229,8 +1237,6 @@ class Vits(BaseTTS):
Tuple[Dict, Dict]: Model outputs and computed losses. Tuple[Dict, Dict]: Model outputs and computed losses.
""" """
self._freeze_layers()
spec_lens = batch["spec_lens"] spec_lens = batch["spec_lens"]
if optimizer_idx == 0: if optimizer_idx == 0:
@ -1402,11 +1408,11 @@ class Vits(BaseTTS):
if speaker_name is None: if speaker_name is None:
speaker_id = self.speaker_manager.get_random_id() speaker_id = self.speaker_manager.get_random_id()
else: else:
speaker_id = self.speaker_manager.ids[speaker_name] speaker_id = self.speaker_manager.name_to_id[speaker_name]
# get language id # get language id
if hasattr(self, "language_manager") and config.use_language_embedding and language_name is not None: if hasattr(self, "language_manager") and config.use_language_embedding and language_name is not None:
language_id = self.language_manager.ids[language_name] language_id = self.language_manager.name_to_id[language_name]
return { return {
"text": text, "text": text,
@ -1461,8 +1467,8 @@ class Vits(BaseTTS):
d_vectors = None d_vectors = None
# get numerical speaker ids from speaker names # get numerical speaker ids from speaker names
if self.speaker_manager is not None and self.speaker_manager.ids and self.args.use_speaker_embedding: if self.speaker_manager is not None and self.speaker_manager.name_to_id and self.args.use_speaker_embedding:
speaker_ids = [self.speaker_manager.ids[sn] for sn in batch["speaker_names"]] speaker_ids = [self.speaker_manager.name_to_id[sn] for sn in batch["speaker_names"]]
if speaker_ids is not None: if speaker_ids is not None:
speaker_ids = torch.LongTensor(speaker_ids) speaker_ids = torch.LongTensor(speaker_ids)
@ -1471,12 +1477,12 @@ class Vits(BaseTTS):
# get d_vectors from audio file names # get d_vectors from audio file names
if self.speaker_manager is not None and self.speaker_manager.embeddings and self.args.use_d_vector_file: if self.speaker_manager is not None and self.speaker_manager.embeddings and self.args.use_d_vector_file:
d_vector_mapping = self.speaker_manager.embeddings d_vector_mapping = self.speaker_manager.embeddings
d_vectors = [d_vector_mapping[w]["embedding"] for w in batch["audio_files"]] d_vectors = [d_vector_mapping[w]["embedding"] for w in batch["audio_unique_names"]]
d_vectors = torch.FloatTensor(d_vectors) d_vectors = torch.FloatTensor(d_vectors)
# get language ids from language names # get language ids from language names
if self.language_manager is not None and self.language_manager.ids and self.args.use_language_embedding: if self.language_manager is not None and self.language_manager.name_to_id and self.args.use_language_embedding:
language_ids = [self.language_manager.ids[ln] for ln in batch["language_names"]] language_ids = [self.language_manager.name_to_id[ln] for ln in batch["language_names"]]
if language_ids is not None: if language_ids is not None:
language_ids = torch.LongTensor(language_ids) language_ids = torch.LongTensor(language_ids)
@ -1680,14 +1686,10 @@ class Vits(BaseTTS):
return [VitsDiscriminatorLoss(self.config), VitsGeneratorLoss(self.config)] return [VitsDiscriminatorLoss(self.config), VitsGeneratorLoss(self.config)]
def load_checkpoint( def load_checkpoint(
self, self, config, checkpoint_path, eval=False, strict=True, cache=False
config,
checkpoint_path,
eval=False,
strict=True,
): # pylint: disable=unused-argument, redefined-builtin ): # pylint: disable=unused-argument, redefined-builtin
"""Load the model checkpoint and setup for training or inference""" """Load the model checkpoint and setup for training or inference"""
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu")) state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
# compat band-aid for the pre-trained models to not use the encoder baked into the model # compat band-aid for the pre-trained models to not use the encoder baked into the model
# TODO: consider baking the speaker encoder into the model and call it from there. # TODO: consider baking the speaker encoder into the model and call it from there.
# as it is probably easier for model distribution. # as it is probably easier for model distribution.

View File

@ -37,11 +37,11 @@ class LanguageManager(BaseIDManager):
@property @property
def num_languages(self) -> int: def num_languages(self) -> int:
return len(list(self.ids.keys())) return len(list(self.name_to_id.keys()))
@property @property
def language_names(self) -> List: def language_names(self) -> List:
return list(self.ids.keys()) return list(self.name_to_id.keys())
@staticmethod @staticmethod
def parse_language_ids_from_config(c: Coqpit) -> Dict: def parse_language_ids_from_config(c: Coqpit) -> Dict:
@ -67,7 +67,7 @@ class LanguageManager(BaseIDManager):
Args: Args:
c (Coqpit): Config. c (Coqpit): Config.
""" """
self.ids = self.parse_language_ids_from_config(c) self.name_to_id = self.parse_language_ids_from_config(c)
@staticmethod @staticmethod
def parse_ids_from_data(items: List, parse_key: str) -> Any: def parse_ids_from_data(items: List, parse_key: str) -> Any:
@ -82,7 +82,7 @@ class LanguageManager(BaseIDManager):
Args: Args:
file_path (str): Path to the output file. file_path (str): Path to the output file.
""" """
self._save_json(file_path, self.ids) self._save_json(file_path, self.name_to_id)
@staticmethod @staticmethod
def init_from_config(config: Coqpit) -> "LanguageManager": def init_from_config(config: Coqpit) -> "LanguageManager":

View File

@ -39,7 +39,7 @@ class BaseIDManager:
""" """
def __init__(self, id_file_path: str = ""): def __init__(self, id_file_path: str = ""):
self.ids = {} self.name_to_id = {}
if id_file_path: if id_file_path:
self.load_ids_from_file(id_file_path) self.load_ids_from_file(id_file_path)
@ -60,7 +60,7 @@ class BaseIDManager:
Args: Args:
items (List): Data sampled returned by `load_tts_samples()`. items (List): Data sampled returned by `load_tts_samples()`.
""" """
self.ids = self.parse_ids_from_data(items, parse_key=parse_key) self.name_to_id = self.parse_ids_from_data(items, parse_key=parse_key)
def load_ids_from_file(self, file_path: str) -> None: def load_ids_from_file(self, file_path: str) -> None:
"""Set IDs from a file. """Set IDs from a file.
@ -68,7 +68,7 @@ class BaseIDManager:
Args: Args:
file_path (str): Path to the file. file_path (str): Path to the file.
""" """
self.ids = load_file(file_path) self.name_to_id = load_file(file_path)
def save_ids_to_file(self, file_path: str) -> None: def save_ids_to_file(self, file_path: str) -> None:
"""Save IDs to a json file. """Save IDs to a json file.
@ -76,7 +76,7 @@ class BaseIDManager:
Args: Args:
file_path (str): Path to the output file. file_path (str): Path to the output file.
""" """
save_file(self.ids, file_path) save_file(self.name_to_id, file_path)
def get_random_id(self) -> Any: def get_random_id(self) -> Any:
"""Get a random embedding. """Get a random embedding.
@ -86,8 +86,8 @@ class BaseIDManager:
Returns: Returns:
np.ndarray: embedding. np.ndarray: embedding.
""" """
if self.ids: if self.name_to_id:
return self.ids[random.choices(list(self.ids.keys()))[0]] return self.name_to_id[random.choices(list(self.name_to_id.keys()))[0]]
return None return None
@ -109,11 +109,27 @@ class BaseIDManager:
class EmbeddingManager(BaseIDManager): class EmbeddingManager(BaseIDManager):
"""Base `Embedding` Manager class. Every new `Embedding` manager must inherit this. """Base `Embedding` Manager class. Every new `Embedding` manager must inherit this.
It defines common `Embedding` manager specific functions. It defines common `Embedding` manager specific functions.
It expects embeddings files in the following format:
::
{
'audio_file_key':{
'name': 'category_name',
'embedding': [<embedding_values>]
},
...
}
`audio_file_key` is a unique key for the audio file in the dataset. It can be the file path or any other unique key.
`embedding` is the embedding vector of the audio file.
`name` can be the name of the speaker of the audio file.
""" """
def __init__( def __init__(
self, self,
embedding_file_path: str = "", embedding_file_path: Union[str, List[str]] = "",
id_file_path: str = "", id_file_path: str = "",
encoder_model_path: str = "", encoder_model_path: str = "",
encoder_config_path: str = "", encoder_config_path: str = "",
@ -129,11 +145,24 @@ class EmbeddingManager(BaseIDManager):
self.use_cuda = use_cuda self.use_cuda = use_cuda
if embedding_file_path: if embedding_file_path:
self.load_embeddings_from_file(embedding_file_path) if isinstance(embedding_file_path, list):
self.load_embeddings_from_list_of_files(embedding_file_path)
else:
self.load_embeddings_from_file(embedding_file_path)
if encoder_model_path and encoder_config_path: if encoder_model_path and encoder_config_path:
self.init_encoder(encoder_model_path, encoder_config_path, use_cuda) self.init_encoder(encoder_model_path, encoder_config_path, use_cuda)
@property
def num_embeddings(self):
"""Get number of embeddings."""
return len(self.embeddings)
@property
def num_names(self):
"""Get number of embeddings."""
return len(self.embeddings_by_names)
@property @property
def embedding_dim(self): def embedding_dim(self):
"""Dimensionality of embeddings. If embeddings are not loaded, returns zero.""" """Dimensionality of embeddings. If embeddings are not loaded, returns zero."""
@ -141,6 +170,11 @@ class EmbeddingManager(BaseIDManager):
return len(self.embeddings[list(self.embeddings.keys())[0]]["embedding"]) return len(self.embeddings[list(self.embeddings.keys())[0]]["embedding"])
return 0 return 0
@property
def embedding_names(self):
"""Get embedding names."""
return list(self.embeddings_by_names.keys())
def save_embeddings_to_file(self, file_path: str) -> None: def save_embeddings_to_file(self, file_path: str) -> None:
"""Save embeddings to a json file. """Save embeddings to a json file.
@ -149,20 +183,57 @@ class EmbeddingManager(BaseIDManager):
""" """
save_file(self.embeddings, file_path) save_file(self.embeddings, file_path)
@staticmethod
def read_embeddings_from_file(file_path: str):
"""Load embeddings from a json file.
Args:
file_path (str): Path to the file.
"""
embeddings = load_file(file_path)
speakers = sorted({x["name"] for x in embeddings.values()})
name_to_id = {name: i for i, name in enumerate(speakers)}
clip_ids = list(set(sorted(clip_name for clip_name in embeddings.keys())))
# cache embeddings_by_names for fast inference using a bigger speakers.json
embeddings_by_names = {}
for x in embeddings.values():
if x["name"] not in embeddings_by_names.keys():
embeddings_by_names[x["name"]] = [x["embedding"]]
else:
embeddings_by_names[x["name"]].append(x["embedding"])
return name_to_id, clip_ids, embeddings, embeddings_by_names
def load_embeddings_from_file(self, file_path: str) -> None: def load_embeddings_from_file(self, file_path: str) -> None:
"""Load embeddings from a json file. """Load embeddings from a json file.
Args: Args:
file_path (str): Path to the target json file. file_path (str): Path to the target json file.
""" """
self.embeddings = load_file(file_path) self.name_to_id, self.clip_ids, self.embeddings, self.embeddings_by_names = self.read_embeddings_from_file(
file_path
)
speakers = sorted({x["name"] for x in self.embeddings.values()}) def load_embeddings_from_list_of_files(self, file_paths: List[str]) -> None:
self.ids = {name: i for i, name in enumerate(speakers)} """Load embeddings from a list of json files and don't allow duplicate keys.
self.clip_ids = list(set(sorted(clip_name for clip_name in self.embeddings.keys()))) Args:
# cache embeddings_by_names for fast inference using a bigger speakers.json file_paths (List[str]): List of paths to the target json files.
self.embeddings_by_names = self.get_embeddings_by_names() """
self.name_to_id = {}
self.clip_ids = []
self.embeddings_by_names = {}
self.embeddings = {}
for file_path in file_paths:
ids, clip_ids, embeddings, embeddings_by_names = self.read_embeddings_from_file(file_path)
# check colliding keys
duplicates = set(self.embeddings.keys()) & set(embeddings.keys())
if duplicates:
raise ValueError(f" [!] Duplicate embedding names <{duplicates}> in {file_path}")
# store values
self.name_to_id.update(ids)
self.clip_ids.extend(clip_ids)
self.embeddings_by_names.update(embeddings_by_names)
self.embeddings.update(embeddings)
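A minimal usage sketch of the multi-file loading added above; the file names are placeholders and the import path is assumed from this module's location:

```python
from TTS.tts.utils.managers import EmbeddingManager  # assumed module path

# Each JSON maps an audio key to {"name": <speaker_name>, "embedding": [<floats>]}.
manager = EmbeddingManager(embedding_file_path=["speakers_train.json", "speakers_extra.json"])
print(manager.num_embeddings, manager.num_names, manager.embedding_dim)
# A key that appears in both files raises:
#   ValueError:  [!] Duplicate embedding names <...> in speakers_extra.json
```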
def get_embedding_by_clip(self, clip_idx: str) -> List: def get_embedding_by_clip(self, clip_idx: str) -> List:
"""Get embedding by clip ID. """Get embedding by clip ID.

View File

@ -73,14 +73,14 @@ class SpeakerManager(EmbeddingManager):
@property @property
def num_speakers(self): def num_speakers(self):
return len(self.ids) return len(self.name_to_id)
@property @property
def speaker_names(self): def speaker_names(self):
return list(self.ids.keys()) return list(self.name_to_id.keys())
def get_speakers(self) -> List: def get_speakers(self) -> List:
return self.ids return self.name_to_id
@staticmethod @staticmethod
def init_from_config(config: "Coqpit", samples: Union[List[List], List[Dict]] = None) -> "SpeakerManager": def init_from_config(config: "Coqpit", samples: Union[List[List], List[Dict]] = None) -> "SpeakerManager":
@ -182,10 +182,10 @@ def get_speaker_manager(c: Coqpit, data: List = None, restore_path: str = None,
speaker_manager.load_embeddings_from_file(c.d_vector_file) speaker_manager.load_embeddings_from_file(c.d_vector_file)
speaker_manager.load_embeddings_from_file(speakers_file) speaker_manager.load_embeddings_from_file(speakers_file)
elif not c.use_d_vector_file: # restore speaker manager with speaker ID file. elif not c.use_d_vector_file: # restore speaker manager with speaker ID file.
speaker_ids_from_data = speaker_manager.ids speaker_ids_from_data = speaker_manager.name_to_id
speaker_manager.load_ids_from_file(speakers_file) speaker_manager.load_ids_from_file(speakers_file)
assert all( assert all(
speaker in speaker_manager.ids for speaker in speaker_ids_from_data speaker in speaker_manager.name_to_id for speaker in speaker_ids_from_data
), " [!] You cannot introduce new speakers to a pre-trained model." ), " [!] You cannot introduce new speakers to a pre-trained model."
elif c.use_d_vector_file and c.d_vector_file: elif c.use_d_vector_file and c.d_vector_file:
# new speaker manager with external speaker embeddings. # new speaker manager with external speaker embeddings.
@ -199,7 +199,7 @@ def get_speaker_manager(c: Coqpit, data: List = None, restore_path: str = None,
if speaker_manager.num_speakers > 0: if speaker_manager.num_speakers > 0:
print( print(
" > Speaker manager is loaded with {} speakers: {}".format( " > Speaker manager is loaded with {} speakers: {}".format(
speaker_manager.num_speakers, ", ".join(speaker_manager.ids) speaker_manager.num_speakers, ", ".join(speaker_manager.name_to_id)
) )
) )

View File

@ -295,7 +295,12 @@ def transfer_voice(
reference_d_vector = embedding_to_torch(reference_d_vector, cuda=use_cuda) reference_d_vector = embedding_to_torch(reference_d_vector, cuda=use_cuda)
# load reference_wav audio # load reference_wav audio
reference_wav = embedding_to_torch(model.ap.load_wav(reference_wav, sr=model.ap.sample_rate), cuda=use_cuda) reference_wav = embedding_to_torch(
model.ap.load_wav(
reference_wav, sr=model.args.encoder_sample_rate if model.args.encoder_sample_rate else model.ap.sample_rate
),
cuda=use_cuda,
)
if hasattr(model, "module"): if hasattr(model, "module"):
_func = model.module.inference_voice_conversion _func = model.module.inference_voice_conversion

View File

View File

@ -0,0 +1,44 @@
# coding: utf-8
# Add the word you want to the dictionary.
etc_dictionary = {"1+1": "원플러스원", "2+1": "투플러스원"}
english_dictionary = {
"KOREA": "코리아",
"IDOL": "아이돌",
"IT": "아이티",
"IQ": "아이큐",
"UP": "",
"DOWN": "다운",
"PC": "피씨",
"CCTV": "씨씨티비",
"SNS": "에스엔에스",
"AI": "에이아이",
"CEO": "씨이오",
"A": "에이",
"B": "",
"C": "",
"D": "",
"E": "",
"F": "에프",
"G": "",
"H": "에이치",
"I": "아이",
"J": "제이",
"K": "케이",
"L": "",
"M": "",
"N": "",
"O": "",
"P": "",
"Q": "",
"R": "",
"S": "에스",
"T": "",
"U": "",
"V": "브이",
"W": "더블유",
"X": "엑스",
"Y": "와이",
"Z": "제트",
}

View File

@ -0,0 +1,32 @@
# coding: utf-8
# Code based on https://github.com/carpedm20/multi-speaker-tacotron-tensorflow/blob/master/text/korean.py
import re
from TTS.tts.utils.text.korean.ko_dictionary import english_dictionary, etc_dictionary
def normalize(text):
text = text.strip()
text = re.sub("[⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]", "", text)
text = normalize_with_dictionary(text, etc_dictionary)
text = normalize_english(text)
text = text.lower()
return text
def normalize_with_dictionary(text, dic):
if any(key in text for key in dic.keys()):
pattern = re.compile("|".join(re.escape(key) for key in dic.keys()))
return pattern.sub(lambda x: dic[x.group()], text)
return text
def normalize_english(text):
def fn(m):
word = m.group()
if word in english_dictionary:
return english_dictionary.get(word)
return word
text = re.sub("([A-Za-z]+)", fn, text)
return text
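A small usage sketch of the normalizer; the expected outputs follow directly from the dictionaries imported above:

```python
from TTS.tts.utils.text.korean.korean import normalize  # same import used by the phonemizer

print(normalize("1+1 이벤트"))  # -> "원플러스원 이벤트" via etc_dictionary
print(normalize("IT 강국"))     # -> "아이티 강국" via english_dictionary
```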

View File

@ -0,0 +1,36 @@
from jamo import hangul_to_jamo
from TTS.tts.utils.text.korean.korean import normalize
g2p = None
def korean_text_to_phonemes(text, character: str = "hangeul") -> str:
"""
The input and output values look the same, but they are different in Unicode.
example :
input = '하늘' (Unicode : \ud558\ub298), (하 + 늘)
output = '하늘' (Unicode :\u1112\u1161\u1102\u1173\u11af), (ᄒ + ᅡ + ᄂ + ᅳ + ᆯ)
"""
global g2p # pylint: disable=global-statement
if g2p is None:
from g2pkk import G2p
g2p = G2p()
if character == "english":
from anyascii import anyascii
text = normalize(text)
text = g2p(text)
text = anyascii(text)
return text
text = normalize(text)
text = g2p(text)
text = list(hangul_to_jamo(text)) # '하늘' --> ['ᄒ', 'ᅡ', 'ᄂ', 'ᅳ', 'ᆯ']
return "".join(text)

View File

@ -2,6 +2,7 @@ from TTS.tts.utils.text.phonemizers.base import BasePhonemizer
from TTS.tts.utils.text.phonemizers.espeak_wrapper import ESpeak from TTS.tts.utils.text.phonemizers.espeak_wrapper import ESpeak
from TTS.tts.utils.text.phonemizers.gruut_wrapper import Gruut from TTS.tts.utils.text.phonemizers.gruut_wrapper import Gruut
from TTS.tts.utils.text.phonemizers.ja_jp_phonemizer import JA_JP_Phonemizer from TTS.tts.utils.text.phonemizers.ja_jp_phonemizer import JA_JP_Phonemizer
from TTS.tts.utils.text.phonemizers.ko_kr_phonemizer import KO_KR_Phonemizer
from TTS.tts.utils.text.phonemizers.zh_cn_phonemizer import ZH_CN_Phonemizer from TTS.tts.utils.text.phonemizers.zh_cn_phonemizer import ZH_CN_Phonemizer
PHONEMIZERS = {b.name(): b for b in (ESpeak, Gruut, JA_JP_Phonemizer)} PHONEMIZERS = {b.name(): b for b in (ESpeak, Gruut, JA_JP_Phonemizer)}
@ -26,6 +27,7 @@ DEF_LANG_TO_PHONEMIZER.update(_new_dict)
DEF_LANG_TO_PHONEMIZER["en"] = DEF_LANG_TO_PHONEMIZER["en-us"] DEF_LANG_TO_PHONEMIZER["en"] = DEF_LANG_TO_PHONEMIZER["en-us"]
DEF_LANG_TO_PHONEMIZER["ja-jp"] = JA_JP_Phonemizer.name() DEF_LANG_TO_PHONEMIZER["ja-jp"] = JA_JP_Phonemizer.name()
DEF_LANG_TO_PHONEMIZER["zh-cn"] = ZH_CN_Phonemizer.name() DEF_LANG_TO_PHONEMIZER["zh-cn"] = ZH_CN_Phonemizer.name()
DEF_LANG_TO_PHONEMIZER["ko-kr"] = KO_KR_Phonemizer.name()
def get_phonemizer_by_name(name: str, **kwargs) -> BasePhonemizer: def get_phonemizer_by_name(name: str, **kwargs) -> BasePhonemizer:
@ -46,6 +48,8 @@ def get_phonemizer_by_name(name: str, **kwargs) -> BasePhonemizer:
return ZH_CN_Phonemizer(**kwargs) return ZH_CN_Phonemizer(**kwargs)
if name == "ja_jp_phonemizer": if name == "ja_jp_phonemizer":
return JA_JP_Phonemizer(**kwargs) return JA_JP_Phonemizer(**kwargs)
if name == "ko_kr_phonemizer":
return KO_KR_Phonemizer(**kwargs)
raise ValueError(f"Phonemizer {name} not found") raise ValueError(f"Phonemizer {name} not found")

View File

@ -94,6 +94,8 @@ class ESpeak(BasePhonemizer):
# band-aid for backwards compatibility # band-aid for backwards compatibility
if language == "en": if language == "en":
language = "en-us" language = "en-us"
if language == "zh-cn":
language = "cmn"
super().__init__(language, punctuations=punctuations, keep_puncs=keep_puncs) super().__init__(language, punctuations=punctuations, keep_puncs=keep_puncs)
if backend is not None: if backend is not None:

View File

@ -0,0 +1,65 @@
from typing import Dict
from TTS.tts.utils.text.korean.phonemizer import korean_text_to_phonemes
from TTS.tts.utils.text.phonemizers.base import BasePhonemizer
_DEF_KO_PUNCS = "、.,[]()?!〽~『』「」【】"
class KO_KR_Phonemizer(BasePhonemizer):
"""🐸TTS ko_kr_phonemizer using functions in `TTS.tts.utils.text.korean.phonemizer`
TODO: Add Korean to character (ᄀᄁᄂᄃᄄᄅᄆᄇᄈᄉᄊᄋᄌᄍᄎᄏᄐᄑ하ᅢᅣᅤᅥᅦᅧᅨᅩᅪᅫᅬᅭᅮᅯᅰᅱᅲᅳᅴᅵᆨᆩᆪᆫᆬᆭᆮᆯᆰᆱᆲᆳᆴᆵᆶᆷᆸᆹᆺᆻᆼᆽᆾᆿᇀᇁᇂ)
Example:
>>> from TTS.tts.utils.text.phonemizers import KO_KR_Phonemizer
>>> phonemizer = KO_KR_Phonemizer()
>>> phonemizer.phonemize("이 문장은 음성합성 테스트를 위한 문장입니다.", separator="|")
'ᄋ|ᅵ| |ᄆ|ᅮ|ᆫ|ᄌ|ᅡ|ᆼ|ᄋ|ᅳ| |ᄂ|ᅳ|ᆷ|ᄉ|ᅥ|ᆼ|ᄒ|ᅡ|ᆸ|ᄊ|ᅥ|ᆼ| |ᄐ|ᅦ|ᄉ|ᅳ|ᄐ|ᅳ|ᄅ|ᅳ| |ᄅ|ᅱ|ᄒ|ᅡ|ᆫ| |ᄆ|ᅮ|ᆫ|ᄌ|ᅡ|ᆼ|ᄋ|ᅵ|ᆷ|ᄂ|ᅵ|ᄃ|ᅡ|.'
>>> from TTS.tts.utils.text.phonemizers import KO_KR_Phonemizer
>>> phonemizer = KO_KR_Phonemizer()
>>> phonemizer.phonemize("이 문장은 음성합성 테스트를 위한 문장입니다.", separator="|", character='english')
'I| |M|u|n|J|a|n|g|E|u| |N|e|u|m|S|e|o|n|g|H|a|b|S|s|e|o|n|g| |T|e|S|e|u|T|e|u|L|e|u| |L|w|i|H|a|n| |M|u|n|J|a|n|g|I|m|N|i|D|a|.'
"""
language = "ko-kr"
def __init__(self, punctuations=_DEF_KO_PUNCS, keep_puncs=True, **kwargs): # pylint: disable=unused-argument
super().__init__(self.language, punctuations=punctuations, keep_puncs=keep_puncs)
@staticmethod
def name():
return "ko_kr_phonemizer"
def _phonemize(self, text: str, separator: str = "", character: str = "hangeul") -> str:
ph = korean_text_to_phonemes(text, character=character)
if separator is not None or separator != "":
return separator.join(ph)
return ph
def phonemize(self, text: str, separator: str = "", character: str = "hangeul") -> str:
return self._phonemize(text, separator, character)
@staticmethod
def supported_languages() -> Dict:
return {"ko-kr": "hangeul(korean)"}
def version(self) -> str:
return "0.0.2"
def is_available(self) -> bool:
return True
if __name__ == "__main__":
texts = "이 문장은 음성합성 테스트를 위한 문장입니다."
e = KO_KR_Phonemizer()
print(e.supported_languages())
print(e.version())
print(e.language)
print(e.name())
print(e.is_available())
print(e.phonemize(texts))

View File

@ -42,7 +42,7 @@ class ZH_CN_Phonemizer(BasePhonemizer):
@staticmethod @staticmethod
def supported_languages() -> Dict: def supported_languages() -> Dict:
return {"zh-cn": "Japanese (Japan)"} return {"zh-cn": "Chinese (China)"}
def version(self) -> str: def version(self) -> str:
return "0.0.1" return "0.0.1"

View File

@ -2,9 +2,9 @@ from typing import Tuple
import librosa import librosa
import numpy as np import numpy as np
import pyworld as pw
import scipy import scipy
import soundfile as sf import soundfile as sf
from librosa import pyin
# For using kwargs # For using kwargs
# pylint: disable=unused-argument # pylint: disable=unused-argument
@ -242,12 +242,28 @@ def compute_stft_paddings(
def compute_f0( def compute_f0(
*, x: np.ndarray = None, pitch_fmax: float = None, hop_length: int = None, sample_rate: int = None, **kwargs *,
x: np.ndarray = None,
pitch_fmax: float = None,
pitch_fmin: float = None,
hop_length: int = None,
win_length: int = None,
sample_rate: int = None,
stft_pad_mode: str = "reflect",
center: bool = True,
**kwargs,
) -> np.ndarray: ) -> np.ndarray:
"""Compute pitch (f0) of a waveform using the same parameters used for computing melspectrogram. """Compute pitch (f0) of a waveform using the same parameters used for computing melspectrogram.
Args: Args:
x (np.ndarray): Waveform. Shape :math:`[T_wav,]` x (np.ndarray): Waveform. Shape :math:`[T_wav,]`
pitch_fmax (float): Pitch max value.
pitch_fmin (float): Pitch min value.
hop_length (int): Number of frames between STFT columns.
win_length (int): STFT window length.
sample_rate (int): Audio sampling rate.
stft_pad_mode (str): Padding mode for STFT.
center (bool): Centered padding.
Returns: Returns:
np.ndarray: Pitch. Shape :math:`[T_pitch,]`. :math:`T_pitch == T_wav / hop_length` np.ndarray: Pitch. Shape :math:`[T_pitch,]`. :math:`T_pitch == T_wav / hop_length`
@ -255,20 +271,35 @@ def compute_f0(
Examples: Examples:
>>> WAV_FILE = filename = librosa.util.example_audio_file() >>> WAV_FILE = filename = librosa.util.example_audio_file()
>>> from TTS.config import BaseAudioConfig >>> from TTS.config import BaseAudioConfig
>>> from TTS.utils.audio.processor import AudioProcessor >>> conf = BaseAudioConfig(pitch_fmax=8000) >>> from TTS.utils.audio import AudioProcessor
>>> conf = BaseAudioConfig(pitch_fmax=640, pitch_fmin=1)
>>> ap = AudioProcessor(**conf) >>> ap = AudioProcessor(**conf)
>>> wav = ap.load_wav(WAV_FILE, sr=22050)[:5 * 22050] >>> wav = ap.load_wav(WAV_FILE, sr=ap.sample_rate)[:5 * ap.sample_rate]
>>> pitch = ap.compute_f0(wav) >>> pitch = ap.compute_f0(wav)
""" """
assert pitch_fmax is not None, " [!] Set `pitch_fmax` before calling `compute_f0`." assert pitch_fmax is not None, " [!] Set `pitch_fmax` before calling `compute_f0`."
assert pitch_fmin is not None, " [!] Set `pitch_fmin` before calling `compute_f0`."
f0, t = pw.dio( f0, voiced_mask, _ = pyin(
x.astype(np.double), y=x.astype(np.double),
fs=sample_rate, fmin=pitch_fmin,
f0_ceil=pitch_fmax, fmax=pitch_fmax,
frame_period=1000 * hop_length / sample_rate, sr=sample_rate,
frame_length=win_length,
win_length=win_length // 2,
hop_length=hop_length,
pad_mode=stft_pad_mode,
center=center,
n_thresholds=100,
beta_parameters=(2, 18),
boltzmann_parameter=2,
resolution=0.1,
max_transition_rate=35.92,
switch_prob=0.01,
no_trough_prob=0.01,
) )
f0 = pw.stonemask(x.astype(np.double), f0, t, sample_rate) f0[~voiced_mask] = 0.0
return f0 return f0
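A hedged usage sketch of the new keyword-only signature (module path matches the import added in the AudioProcessor diff below); the synthetic tone stands in for real speech and the exact frame count depends on padding:

```python
import numpy as np

from TTS.utils.audio.numpy_transforms import compute_f0

# One second of a synthetic 220 Hz tone as a stand-in for speech (illustrative only).
sr = 22050
t = np.linspace(0, 1, sr, endpoint=False)
wav = 0.5 * np.sin(2 * np.pi * 220 * t).astype(np.float32)

f0 = compute_f0(
    x=wav,
    pitch_fmin=65,   # now required alongside pitch_fmax
    pitch_fmax=600,
    hop_length=256,
    win_length=1024,
    sample_rate=sr,
)
print(f0.shape)  # roughly len(wav) // hop_length frames; unvoiced frames are zeroed
```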

View File

@ -2,12 +2,12 @@ from typing import Dict, Tuple
import librosa import librosa
import numpy as np import numpy as np
import pyworld as pw
import scipy.io.wavfile import scipy.io.wavfile
import scipy.signal import scipy.signal
import soundfile as sf import soundfile as sf
from TTS.tts.utils.helpers import StandardScaler from TTS.tts.utils.helpers import StandardScaler
from TTS.utils.audio.numpy_transforms import compute_f0
# pylint: disable=too-many-public-methods # pylint: disable=too-many-public-methods
@ -573,23 +573,28 @@ class AudioProcessor(object):
>>> WAV_FILE = filename = librosa.util.example_audio_file() >>> WAV_FILE = filename = librosa.util.example_audio_file()
>>> from TTS.config import BaseAudioConfig >>> from TTS.config import BaseAudioConfig
>>> from TTS.utils.audio import AudioProcessor >>> from TTS.utils.audio import AudioProcessor
>>> conf = BaseAudioConfig(pitch_fmax=8000) >>> conf = BaseAudioConfig(pitch_fmax=640, pitch_fmin=1)
>>> ap = AudioProcessor(**conf) >>> ap = AudioProcessor(**conf)
>>> wav = ap.load_wav(WAV_FILE, sr=22050)[:5 * 22050] >>> wav = ap.load_wav(WAV_FILE, sr=ap.sample_rate)[:5 * ap.sample_rate]
>>> pitch = ap.compute_f0(wav) >>> pitch = ap.compute_f0(wav)
""" """
assert self.pitch_fmax is not None, " [!] Set `pitch_fmax` before calling `compute_f0`." assert self.pitch_fmax is not None, " [!] Set `pitch_fmax` before calling `compute_f0`."
assert self.pitch_fmin is not None, " [!] Set `pitch_fmin` before calling `compute_f0`."
# align F0 length to the spectrogram length # align F0 length to the spectrogram length
if len(x) % self.hop_length == 0: if len(x) % self.hop_length == 0:
x = np.pad(x, (0, self.hop_length // 2), mode="reflect") x = np.pad(x, (0, self.hop_length // 2), mode=self.stft_pad_mode)
f0, t = pw.dio( f0 = compute_f0(
x.astype(np.double), x=x,
fs=self.sample_rate, pitch_fmax=self.pitch_fmax,
f0_ceil=self.pitch_fmax, pitch_fmin=self.pitch_fmin,
frame_period=1000 * self.hop_length / self.sample_rate, hop_length=self.hop_length,
win_length=self.win_length,
sample_rate=self.sample_rate,
stft_pad_mode=self.stft_pad_mode,
center=True,
) )
f0 = pw.stonemask(x.astype(np.double), f0, t, self.sample_rate)
return f0 return f0
### Audio Processing ### ### Audio Processing ###

View File

@ -38,9 +38,9 @@ class CapacitronOptimizer:
self.param_groups = self.primary_optimizer.param_groups self.param_groups = self.primary_optimizer.param_groups
self.primary_optimizer.step() self.primary_optimizer.step()
def zero_grad(self): def zero_grad(self, set_to_none=False):
self.primary_optimizer.zero_grad() self.primary_optimizer.zero_grad(set_to_none)
self.secondary_optimizer.zero_grad() self.secondary_optimizer.zero_grad(set_to_none)
def load_state_dict(self, state_dict): def load_state_dict(self, state_dict):
self.primary_optimizer.load_state_dict(state_dict[0]) self.primary_optimizer.load_state_dict(state_dict[0])

View File

@ -9,6 +9,8 @@ import fsspec
import torch import torch
from coqpit import Coqpit from coqpit import Coqpit
from TTS.utils.generic_utils import get_user_data_dir
class RenamingUnpickler(pickle_tts.Unpickler): class RenamingUnpickler(pickle_tts.Unpickler):
"""Overload default pickler to solve module renaming problem""" """Overload default pickler to solve module renaming problem"""
@ -57,6 +59,7 @@ def copy_model_files(config: Coqpit, out_path, new_fields=None):
def load_fsspec( def load_fsspec(
path: str, path: str,
map_location: Union[str, Callable, torch.device, Dict[Union[str, torch.device], Union[str, torch.device]]] = None, map_location: Union[str, Callable, torch.device, Dict[Union[str, torch.device], Union[str, torch.device]]] = None,
cache: bool = True,
**kwargs, **kwargs,
) -> Any: ) -> Any:
"""Like torch.load but can load from other locations (e.g. s3:// , gs://). """Like torch.load but can load from other locations (e.g. s3:// , gs://).
@ -64,21 +67,33 @@ def load_fsspec(
Args: Args:
path: Any path or url supported by fsspec. path: Any path or url supported by fsspec.
map_location: torch.device or str. map_location: torch.device or str.
cache: If True, cache a remote file locally for subsequent calls. It is cached under `get_user_data_dir()/tts_cache`. Defaults to True.
**kwargs: Keyword arguments forwarded to torch.load. **kwargs: Keyword arguments forwarded to torch.load.
Returns: Returns:
Object stored in path. Object stored in path.
""" """
with fsspec.open(path, "rb") as f: is_local = os.path.isdir(path) or os.path.isfile(path)
return torch.load(f, map_location=map_location, **kwargs) if cache and not is_local:
with fsspec.open(
f"filecache::{path}",
filecache={"cache_storage": str(get_user_data_dir("tts_cache"))},
mode="rb",
) as f:
return torch.load(f, map_location=map_location, **kwargs)
else:
with fsspec.open(path, "rb") as f:
return torch.load(f, map_location=map_location, **kwargs)
def load_checkpoint(model, checkpoint_path, use_cuda=False, eval=False): # pylint: disable=redefined-builtin def load_checkpoint(
model, checkpoint_path, use_cuda=False, eval=False, cache=False
): # pylint: disable=redefined-builtin
try: try:
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu")) state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
except ModuleNotFoundError: except ModuleNotFoundError:
pickle_tts.Unpickler = RenamingUnpickler pickle_tts.Unpickler = RenamingUnpickler
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), pickle_module=pickle_tts) state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), pickle_module=pickle_tts, cache=cache)
model.load_state_dict(state["model"]) model.load_state_dict(state["model"])
if use_cuda: if use_cuda:
model.cuda() model.cuda()
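A usage sketch of the new caching behaviour; the URL is a placeholder for any fsspec-supported remote path:

```python
import torch

from TTS.utils.io import load_fsspec  # module path taken from this file

# Hypothetical remote checkpoint; with cache=True it is stored under get_user_data_dir()/tts_cache
state = load_fsspec(
    "https://example.com/checkpoints/best_model.pth",
    map_location=torch.device("cpu"),
    cache=True,
)
model_state = state["model"]  # checkpoints loaded above expose their weights under "model"
```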

View File

@ -32,11 +32,14 @@ class ModelManager(object):
home path. home path.
Args: Args:
models_file (str): path to .model.json models_file (str): path to .model.json file. Defaults to None.
output_prefix (str): prefix to `tts` to download models. Defaults to None
progress_bar (bool): print a progress bar when downloading a file. Defaults to False.
""" """
def __init__(self, models_file=None, output_prefix=None): def __init__(self, models_file=None, output_prefix=None, progress_bar=False):
super().__init__() super().__init__()
self.progress_bar = progress_bar
if output_prefix is None: if output_prefix is None:
self.output_prefix = get_user_data_dir("tts") self.output_prefix = get_user_data_dir("tts")
else: else:
@ -236,7 +239,7 @@ class ModelManager(object):
os.makedirs(output_path, exist_ok=True) os.makedirs(output_path, exist_ok=True)
print(f" > Downloading model to {output_path}") print(f" > Downloading model to {output_path}")
# download from github release # download from github release
self._download_zip_file(model_item["github_rls_url"], output_path) self._download_zip_file(model_item["github_rls_url"], output_path, self.progress_bar)
self.print_model_license(model_item=model_item) self.print_model_license(model_item=model_item)
# find downloaded files # find downloaded files
output_model_path, output_config_path = self._find_files(output_path) output_model_path, output_config_path = self._find_files(output_path)
@ -334,7 +337,7 @@ class ModelManager(object):
config.save_json(config_path) config.save_json(config_path)
@staticmethod @staticmethod
def _download_zip_file(file_url, output_folder): def _download_zip_file(file_url, output_folder, progress_bar):
"""Download the github releases""" """Download the github releases"""
# download the file # download the file
r = requests.get(file_url, stream=True) r = requests.get(file_url, stream=True)
@ -342,11 +345,13 @@ class ModelManager(object):
try: try:
total_size_in_bytes = int(r.headers.get("content-length", 0)) total_size_in_bytes = int(r.headers.get("content-length", 0))
block_size = 1024 # 1 Kibibyte block_size = 1024 # 1 Kibibyte
progress_bar = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True) if progress_bar:
progress_bar = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True)
temp_zip_name = os.path.join(output_folder, file_url.split("/")[-1]) temp_zip_name = os.path.join(output_folder, file_url.split("/")[-1])
with open(temp_zip_name, "wb") as file: with open(temp_zip_name, "wb") as file:
for data in r.iter_content(block_size): for data in r.iter_content(block_size):
progress_bar.update(len(data)) if progress_bar:
progress_bar.update(len(data))
file.write(data) file.write(data)
with zipfile.ZipFile(temp_zip_name) as z: with zipfile.ZipFile(temp_zip_name) as z:
z.extractall(output_folder) z.extractall(output_folder)
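A usage sketch of the new `progress_bar` flag; the model name is illustrative and `download_model` is assumed to be the public entry point that reaches `_download_zip_file`:

```python
from TTS.utils.manage import ModelManager  # assumed module path

manager = ModelManager(progress_bar=True)  # progress_bar now defaults to False
manager.download_model("tts_models/en/ljspeech/tacotron2-DDC")  # illustrative model name
```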

View File

@ -212,8 +212,13 @@ class Synthesizer(object):
# handle multi-speaker # handle multi-speaker
speaker_embedding = None speaker_embedding = None
speaker_id = None speaker_id = None
if self.tts_speakers_file or hasattr(self.tts_model.speaker_manager, "ids"): if self.tts_speakers_file or hasattr(self.tts_model.speaker_manager, "name_to_id"):
if speaker_name and isinstance(speaker_name, str):
# handle Neon models with single speaker.
if len(self.tts_model.speaker_manager.name_to_id) == 1:
speaker_id = list(self.tts_model.speaker_manager.name_to_id.values())[0]
elif speaker_name and isinstance(speaker_name, str):
if self.tts_config.use_d_vector_file: if self.tts_config.use_d_vector_file:
# get the average speaker embedding from the saved d_vectors. # get the average speaker embedding from the saved d_vectors.
speaker_embedding = self.tts_model.speaker_manager.get_mean_embedding( speaker_embedding = self.tts_model.speaker_manager.get_mean_embedding(
@ -222,7 +227,7 @@ class Synthesizer(object):
speaker_embedding = np.array(speaker_embedding)[None, :] # [1 x embedding_dim] speaker_embedding = np.array(speaker_embedding)[None, :] # [1 x embedding_dim]
else: else:
# get speaker idx from the speaker name # get speaker idx from the speaker name
speaker_id = self.tts_model.speaker_manager.ids[speaker_name] speaker_id = self.tts_model.speaker_manager.name_to_id[speaker_name]
elif not speaker_name and not speaker_wav: elif not speaker_name and not speaker_wav:
raise ValueError( raise ValueError(
@ -243,8 +248,12 @@ class Synthesizer(object):
if self.tts_languages_file or ( if self.tts_languages_file or (
hasattr(self.tts_model, "language_manager") and self.tts_model.language_manager is not None hasattr(self.tts_model, "language_manager") and self.tts_model.language_manager is not None
): ):
if language_name and isinstance(language_name, str):
language_id = self.tts_model.language_manager.ids[language_name] if len(self.tts_model.language_manager.name_to_id) == 1:
language_id = list(self.tts_model.language_manager.name_to_id.values())[0]
elif language_name and isinstance(language_name, str):
language_id = self.tts_model.language_manager.name_to_id[language_name]
elif not language_name: elif not language_name:
raise ValueError( raise ValueError(
@ -316,7 +325,7 @@ class Synthesizer(object):
# get the speaker embedding or speaker id for the reference wav file # get the speaker embedding or speaker id for the reference wav file
reference_speaker_embedding = None reference_speaker_embedding = None
reference_speaker_id = None reference_speaker_id = None
if self.tts_speakers_file or hasattr(self.tts_model.speaker_manager, "ids"): if self.tts_speakers_file or hasattr(self.tts_model.speaker_manager, "name_to_id"):
if reference_speaker_name and isinstance(reference_speaker_name, str): if reference_speaker_name and isinstance(reference_speaker_name, str):
if self.tts_config.use_d_vector_file: if self.tts_config.use_d_vector_file:
# get the speaker embedding from the saved d_vectors. # get the speaker embedding from the saved d_vectors.
@ -328,12 +337,11 @@ class Synthesizer(object):
] # [1 x embedding_dim] ] # [1 x embedding_dim]
else: else:
# get speaker idx from the speaker name # get speaker idx from the speaker name
reference_speaker_id = self.tts_model.speaker_manager.ids[reference_speaker_name] reference_speaker_id = self.tts_model.speaker_manager.name_to_id[reference_speaker_name]
else: else:
reference_speaker_embedding = self.tts_model.speaker_manager.compute_embedding_from_clip( reference_speaker_embedding = self.tts_model.speaker_manager.compute_embedding_from_clip(
reference_wav reference_wav
) )
outputs = transfer_voice( outputs = transfer_voice(
model=self.tts_model, model=self.tts_model,
CONFIG=self.tts_config, CONFIG=self.tts_config,

View File

@ -1,3 +1,4 @@
import soundfile as sf
import torch import torch
import torchaudio import torchaudio
@ -48,7 +49,7 @@ def remove_silence(
): ):
# get the VAD model and utils functions # get the VAD model and utils functions
model, get_speech_timestamps, save_audio, collect_chunks = model_and_utils model, get_speech_timestamps, _, collect_chunks = model_and_utils
# read ground truth wav and resample the audio for the VAD # read ground truth wav and resample the audio for the VAD
wav, gt_sample_rate = read_audio(audio_path) wav, gt_sample_rate = read_audio(audio_path)
@ -73,9 +74,11 @@ def remove_silence(
# if have speech timestamps else save the wav # if have speech timestamps else save the wav
if new_speech_timestamps: if new_speech_timestamps:
wav = collect_chunks(new_speech_timestamps, wav) wav = collect_chunks(new_speech_timestamps, wav)
is_speech = True
else: else:
print(f"> The file {audio_path} probably does not have speech please check it !!") print(f"> The file {audio_path} probably does not have speech please check it !!")
is_speech = False
# save audio # save audio
save_audio(out_path, wav, sampling_rate=gt_sample_rate) sf.write(out_path, wav, gt_sample_rate, subtype="PCM_16")
return out_path return out_path, is_speech

View File

@ -22,14 +22,12 @@ class HifiganConfig(BaseGANVocoderConfig):
generator_model_params (dict): Parameters of the generator model. Defaults to generator_model_params (dict): Parameters of the generator model. Defaults to
` `
{ {
"use_mel": True, "upsample_factors": [8, 8, 2, 2],
"sample_rate": 22050, "upsample_kernel_sizes": [16, 16, 4, 4],
"n_fft": 1024, "upsample_initial_channel": 512,
"hop_length": 256, "resblock_kernel_sizes": [3, 7, 11],
"win_length": 1024, "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
"n_mels": 80, "resblock_type": "1",
"mel_fmin": 0.0,
"mel_fmax": None,
} }
` `
batch_size (int): batch_size (int):

View File

@ -231,6 +231,7 @@ class GAN(BaseVocoder):
config: Coqpit, config: Coqpit,
checkpoint_path: str, checkpoint_path: str,
eval: bool = False, # pylint: disable=unused-argument, redefined-builtin eval: bool = False, # pylint: disable=unused-argument, redefined-builtin
cache: bool = False,
) -> None: ) -> None:
"""Load a GAN checkpoint and initialize model parameters. """Load a GAN checkpoint and initialize model parameters.
@ -239,7 +240,7 @@ class GAN(BaseVocoder):
checkpoint_path (str): Checkpoint file path. checkpoint_path (str): Checkpoint file path.
eval (bool, optional): If true, load the model for inference. Defaults to False. eval (bool, optional): If true, load the model for inference. Defaults to False.
""" """
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu")) state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
# band-aid for older than v0.0.15 GAN models # band-aid for older than v0.0.15 GAN models
if "model_disc" in state: if "model_disc" in state:
self.model_g.load_checkpoint(config, checkpoint_path, eval) self.model_g.load_checkpoint(config, checkpoint_path, eval)

View File

@ -290,9 +290,9 @@ class HifiganGenerator(torch.nn.Module):
remove_weight_norm(self.conv_post) remove_weight_norm(self.conv_post)
def load_checkpoint( def load_checkpoint(
self, config, checkpoint_path, eval=False self, config, checkpoint_path, eval=False, cache=False
): # pylint: disable=unused-argument, redefined-builtin ): # pylint: disable=unused-argument, redefined-builtin
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu")) state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
self.load_state_dict(state["model"]) self.load_state_dict(state["model"])
if eval: if eval:
self.eval() self.eval()

View File

@ -85,9 +85,9 @@ class MelganGenerator(nn.Module):
layer.remove_weight_norm() layer.remove_weight_norm()
def load_checkpoint( def load_checkpoint(
self, config, checkpoint_path, eval=False self, config, checkpoint_path, eval=False, cache=False
): # pylint: disable=unused-argument, redefined-builtin ): # pylint: disable=unused-argument, redefined-builtin
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu")) state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
self.load_state_dict(state["model"]) self.load_state_dict(state["model"])
if eval: if eval:
self.eval() self.eval()

View File

@ -153,9 +153,9 @@ class ParallelWaveganGenerator(torch.nn.Module):
return self._get_receptive_field_size(self.layers, self.stacks, self.kernel_size) return self._get_receptive_field_size(self.layers, self.stacks, self.kernel_size)
def load_checkpoint( def load_checkpoint(
self, config, checkpoint_path, eval=False self, config, checkpoint_path, eval=False, cache=False
): # pylint: disable=unused-argument, redefined-builtin ): # pylint: disable=unused-argument, redefined-builtin
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu")) state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
self.load_state_dict(state["model"]) self.load_state_dict(state["model"])
if eval: if eval:
self.eval() self.eval()

View File

@ -218,9 +218,9 @@ class Wavegrad(BaseVocoder):
self.y_conv = weight_norm(self.y_conv) self.y_conv = weight_norm(self.y_conv)
def load_checkpoint( def load_checkpoint(
self, config, checkpoint_path, eval=False self, config, checkpoint_path, eval=False, cache=False
): # pylint: disable=unused-argument, redefined-builtin ): # pylint: disable=unused-argument, redefined-builtin
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu")) state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
self.load_state_dict(state["model"]) self.load_state_dict(state["model"])
if eval: if eval:
self.eval() self.eval()

View File

@ -542,9 +542,9 @@ class Wavernn(BaseVocoder):
return unfolded return unfolded
def load_checkpoint( def load_checkpoint(
self, config, checkpoint_path, eval=False self, config, checkpoint_path, eval=False, cache=False
): # pylint: disable=unused-argument, redefined-builtin ): # pylint: disable=unused-argument, redefined-builtin
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu")) state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
self.load_state_dict(state["model"]) self.load_state_dict(state["model"])
if eval: if eval:
self.eval() self.eval()

View File

@ -0,0 +1,56 @@
(docker_images)=
## Docker images
We provide Docker images so you can test TTS without having to set up your own environment.
### Using premade images
You can use premade images built automatically from the latest TTS version.
#### CPU version
```bash
docker pull ghcr.io/coqui-ai/tts-cpu
```
#### GPU version
```bash
docker pull ghcr.io/coqui-ai/tts
```
### Building your own image
```bash
docker build -t tts .
```
## Basic inference
Basic usage: generating an audio file from text passed as an argument.
You can pass any tts argument after the image name.
### CPU version
```bash
docker run --rm -v ~/tts-output:/root/tts-output ghcr.io/coqui-ai/tts-cpu --text "Hello." --out_path /root/tts-output/hello.wav
```
### GPU version
For the GPU version, you need to have the latest NVIDIA drivers installed.
You can check the supported CUDA version with `nvidia-smi`; it must be >= 11.8.
```bash
docker run --rm --gpus all -v ~/tts-output:/root/tts-output ghcr.io/coqui-ai/tts --text "Hello." --out_path /root/tts-output/hello.wav --use_cuda true
```
## Start a server
Starting a TTS server:
Start the container and get a shell inside it.
### CPU version
```bash
docker run --rm -it -p 5002:5002 --entrypoint /bin/bash ghcr.io/coqui-ai/tts-cpu
python3 TTS/server/server.py --list_models #To get the list of available models
python3 TTS/server/server.py --model_name tts_models/en/vctk/vits
```
### GPU version
```bash
docker run --rm -it -p 5002:5002 --gpus all --entrypoint /bin/bash ghcr.io/coqui-ai/tts
python3 TTS/server/server.py --list_models #To get the list of available models
python3 TTS/server/server.py --model_name tts_models/en/vctk/vits --use_cuda true
```
Click [here](http://[::1]:5002/) and have fun with the server!
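Once the server is up, you can also drive it programmatically; a minimal sketch, assuming the demo server exposes an `/api/tts` endpoint that accepts a `text` query parameter and returns WAV bytes (multi-speaker models may additionally require a speaker id):

```python
import requests

response = requests.get(
    "http://localhost:5002/api/tts",  # server started as shown above
    params={"text": "Hello from the Docker container."},
)
response.raise_for_status()
with open("hello.wav", "wb") as f:
    f.write(response.content)
```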

View File

@ -53,7 +53,7 @@ We tried to collect common issues and questions we receive about 🐸TTS. It is
"mixed_precision": false, "mixed_precision": false,
"output_path": "recipes/ljspeech/glow_tts/", "output_path": "recipes/ljspeech/glow_tts/",
"test_sentences": ["Test this sentence.", "This test sentence.", "Sentence this test."], "test_sentences": ["Test this sentence.", "This test sentence.", "Sentence this test."],
"datasets":[{"name": "ljspeech", "meta_file_train":"metadata.csv", "path": "recipes/ljspeech/LJSpeech-1.1/"}] "datasets":[{"formatter": "ljspeech", "meta_file_train":"metadata.csv", "path": "recipes/ljspeech/LJSpeech-1.1/"}]
} }
``` ```

View File

@ -88,7 +88,7 @@ from TTS.tts.datasets import load_tts_samples
# dataset config for one of the pre-defined datasets # dataset config for one of the pre-defined datasets
dataset_config = BaseDatasetConfig( dataset_config = BaseDatasetConfig(
name="vctk", meta_file_train="", language="en-us", path="dataset-path") formatter="vctk", meta_file_train="", language="en-us", path="dataset-path")
) )
# load training samples # load training samples
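The `name` → `formatter` rename in `BaseDatasetConfig` recurs throughout the recipes and tests below; a minimal before/after sketch based on the snippet above:

```python
from TTS.tts.configs.shared_configs import BaseDatasetConfig

# Before v0.9.0 the dataset formatter was selected via `name=...`:
# dataset_config = BaseDatasetConfig(name="vctk", meta_file_train="", language="en-us", path="dataset-path")

# From v0.9.0 onwards the argument is called `formatter`:
dataset_config = BaseDatasetConfig(
    formatter="vctk", meta_file_train="", language="en-us", path="dataset-path"
)
```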

View File

@ -20,6 +20,7 @@
:caption: Using 🐸TTS :caption: Using 🐸TTS
inference inference
docker_images
implementing_a_new_model implementing_a_new_model
training_a_model training_a_model
finetuning finetuning

View File

@ -12,7 +12,7 @@ Currently we provide the following pre-configured architectures:
- **FastPitch:** - **FastPitch:**
    It uses the same FastSpeech architecture that us conditioned on fundemental frequency (f0) contours with the     It uses the same FastSpeech architecture that is conditioned on fundamental frequency (f0) contours with the
promise of more expressive speech. promise of more expressive speech.
- **SpeedySpeech:** - **SpeedySpeech:**

View File

@ -84,7 +84,7 @@ We still support running training from CLI like in the old days. The same traini
"print_eval": true, "print_eval": true,
"mixed_precision": false, "mixed_precision": false,
"output_path": "recipes/ljspeech/glow_tts/", "output_path": "recipes/ljspeech/glow_tts/",
"datasets":[{"name": "ljspeech", "meta_file_train":"metadata.csv", "path": "recipes/ljspeech/LJSpeech-1.1/"}] "datasets":[{"formatter": "ljspeech", "meta_file_train":"metadata.csv", "path": "recipes/ljspeech/LJSpeech-1.1/"}]
} }
``` ```
@ -120,6 +120,3 @@ $ tts-server -h # see the help
$ tts-server --list_models # list the available models. $ tts-server --list_models # list the available models.
``` ```
![server.gif](https://github.com/coqui-ai/TTS/raw/main/images/demo_server.gif) ![server.gif](https://github.com/coqui-ai/TTS/raw/main/images/demo_server.gif)

View File

@ -74,7 +74,7 @@
"<span style=\"color:purple;font-size:15px\">\n", "<span style=\"color:purple;font-size:15px\">\n",
"/MyTTSDataset <br /> \n", "/MyTTSDataset <br /> \n",
"&emsp;| <br /> \n", "&emsp;| <br /> \n",
"&emsp;| -> metadata.txt<br /> \n", "&emsp;| -> metadata.csv<br /> \n",
"&emsp;| -> /wavs<br /> \n", "&emsp;| -> /wavs<br /> \n",
"&emsp;&emsp;| -> audio1.wav<br /> \n", "&emsp;&emsp;| -> audio1.wav<br /> \n",
"&emsp;&emsp;| -> audio2.wav<br /> \n", "&emsp;&emsp;| -> audio2.wav<br /> \n",

View File

@ -15,7 +15,7 @@ output_path = os.path.dirname(os.path.abspath(__file__))
data_path = "/srv/data/" data_path = "/srv/data/"
# Using LJSpeech like dataset processing for the blizzard dataset # Using LJSpeech like dataset processing for the blizzard dataset
dataset_config = BaseDatasetConfig(name="ljspeech", meta_file_train="metadata.csv", path=data_path) dataset_config = BaseDatasetConfig(formatter="ljspeech", meta_file_train="metadata.csv", path=data_path)
audio_config = BaseAudioConfig( audio_config = BaseAudioConfig(
sample_rate=24000, sample_rate=24000,

View File

@ -16,7 +16,7 @@ data_path = "/srv/data/blizzard2013/segmented"
# Using LJSpeech like dataset processing for the blizzard dataset # Using LJSpeech like dataset processing for the blizzard dataset
dataset_config = BaseDatasetConfig( dataset_config = BaseDatasetConfig(
name="ljspeech", formatter="ljspeech",
meta_file_train="metadata.csv", meta_file_train="metadata.csv",
path=data_path, path=data_path,
) )

View File

@ -1,7 +1,7 @@
{ {
"datasets": [ "datasets": [
{ {
"name": "kokoro", "formatter": "kokoro",
"path": "DEFINE THIS", "path": "DEFINE THIS",
"meta_file_train": "metadata.csv", "meta_file_train": "metadata.csv",
"meta_file_val": null "meta_file_val": null

View File

@ -13,7 +13,7 @@ output_path = os.path.dirname(os.path.abspath(__file__))
# init configs # init configs
dataset_config = BaseDatasetConfig( dataset_config = BaseDatasetConfig(
name="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/") formatter="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/")
) )
config = AlignTTSConfig( config = AlignTTSConfig(
batch_size=32, batch_size=32,

View File

@ -14,7 +14,7 @@ output_path = os.path.dirname(os.path.abspath(__file__))
# init configs # init configs
dataset_config = BaseDatasetConfig( dataset_config = BaseDatasetConfig(
name="ljspeech", formatter="ljspeech",
meta_file_train="metadata.csv", meta_file_train="metadata.csv",
# meta_file_attn_mask=os.path.join(output_path, "../LJSpeech-1.1/metadata_attn_mask.txt"), # meta_file_attn_mask=os.path.join(output_path, "../LJSpeech-1.1/metadata_attn_mask.txt"),
path=os.path.join(output_path, "../LJSpeech-1.1/"), path=os.path.join(output_path, "../LJSpeech-1.1/"),

View File

@ -14,7 +14,7 @@ output_path = os.path.dirname(os.path.abspath(__file__))
# init configs # init configs
dataset_config = BaseDatasetConfig( dataset_config = BaseDatasetConfig(
name="ljspeech", formatter="ljspeech",
meta_file_train="metadata.csv", meta_file_train="metadata.csv",
# meta_file_attn_mask=os.path.join(output_path, "../LJSpeech-1.1/metadata_attn_mask.txt"), # meta_file_attn_mask=os.path.join(output_path, "../LJSpeech-1.1/metadata_attn_mask.txt"),
path=os.path.join(output_path, "../LJSpeech-1.1/"), path=os.path.join(output_path, "../LJSpeech-1.1/"),

View File

@ -21,7 +21,7 @@ output_path = os.path.dirname(os.path.abspath(__file__))
# Set LJSpeech as our target dataset and define its path. # Set LJSpeech as our target dataset and define its path.
# You can also use a simple Dict to define the dataset and pass it to your custom formatter. # You can also use a simple Dict to define the dataset and pass it to your custom formatter.
dataset_config = BaseDatasetConfig( dataset_config = BaseDatasetConfig(
name="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/") formatter="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/")
) )
# INITIALIZE THE TRAINING CONFIGURATION # INITIALIZE THE TRAINING CONFIGURATION

View File

@ -11,7 +11,7 @@ from TTS.utils.audio import AudioProcessor
output_path = os.path.dirname(os.path.abspath(__file__)) output_path = os.path.dirname(os.path.abspath(__file__))
dataset_config = BaseDatasetConfig( dataset_config = BaseDatasetConfig(
name="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/") formatter="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/")
) )
audio_config = BaseAudioConfig( audio_config = BaseAudioConfig(

View File

@ -16,7 +16,7 @@ data_path = "/srv/data/"
# Using LJSpeech like dataset processing for the blizzard dataset # Using LJSpeech like dataset processing for the blizzard dataset
dataset_config = BaseDatasetConfig( dataset_config = BaseDatasetConfig(
name="ljspeech", formatter="ljspeech",
meta_file_train="metadata.csv", meta_file_train="metadata.csv",
path=data_path, path=data_path,
) )

View File

@ -16,7 +16,7 @@ output_path = os.path.dirname(os.path.abspath(__file__))
# init configs # init configs
dataset_config = BaseDatasetConfig( dataset_config = BaseDatasetConfig(
name="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/") formatter="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/")
) )
audio_config = BaseAudioConfig( audio_config = BaseAudioConfig(

View File

@ -16,7 +16,7 @@ output_path = os.path.dirname(os.path.abspath(__file__))
# init configs # init configs
dataset_config = BaseDatasetConfig( dataset_config = BaseDatasetConfig(
name="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/") formatter="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/")
) )
audio_config = BaseAudioConfig( audio_config = BaseAudioConfig(

View File

@ -11,7 +11,7 @@ from TTS.utils.audio import AudioProcessor
output_path = os.path.dirname(os.path.abspath(__file__)) output_path = os.path.dirname(os.path.abspath(__file__))
dataset_config = BaseDatasetConfig( dataset_config = BaseDatasetConfig(
name="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/") formatter="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/")
) )
audio_config = VitsAudioConfig( audio_config = VitsAudioConfig(
sample_rate=22050, win_length=1024, hop_length=256, num_mels=80, mel_fmin=0, mel_fmax=None sample_rate=22050, win_length=1024, hop_length=256, num_mels=80, mel_fmin=0, mel_fmax=None

View File

@ -17,7 +17,7 @@ output_path = os.path.dirname(os.path.abspath(__file__))
mailabs_path = "/home/julian/workspace/mailabs/**" mailabs_path = "/home/julian/workspace/mailabs/**"
dataset_paths = glob(mailabs_path) dataset_paths = glob(mailabs_path)
dataset_config = [ dataset_config = [
BaseDatasetConfig(name="mailabs", meta_file_train=None, path=path, language=path.split("/")[-1]) BaseDatasetConfig(formatter="mailabs", meta_file_train=None, path=path, language=path.split("/")[-1])
for path in dataset_paths for path in dataset_paths
] ]

View File

@ -14,7 +14,7 @@ output_path = os.path.dirname(os.path.abspath(__file__))
# init configs # init configs
dataset_config = BaseDatasetConfig( dataset_config = BaseDatasetConfig(
name="thorsten", meta_file_train="metadata.csv", path=os.path.join(output_path, "../thorsten-de/") formatter="thorsten", meta_file_train="metadata.csv", path=os.path.join(output_path, "../thorsten-de/")
) )
# download dataset if not already present # download dataset if not already present

View File

@ -22,7 +22,7 @@ output_path = os.path.dirname(os.path.abspath(__file__))
# Set LJSpeech as our target dataset and define its path. # Set LJSpeech as our target dataset and define its path.
# You can also use a simple Dict to define the dataset and pass it to your custom formatter. # You can also use a simple Dict to define the dataset and pass it to your custom formatter.
dataset_config = BaseDatasetConfig( dataset_config = BaseDatasetConfig(
name="thorsten", meta_file_train="metadata.csv", path=os.path.join(output_path, "../thorsten-de/") formatter="thorsten", meta_file_train="metadata.csv", path=os.path.join(output_path, "../thorsten-de/")
) )
# download dataset if not already present # download dataset if not already present

View File

@ -12,7 +12,7 @@ from TTS.utils.downloaders import download_thorsten_de
output_path = os.path.dirname(os.path.abspath(__file__)) output_path = os.path.dirname(os.path.abspath(__file__))
dataset_config = BaseDatasetConfig( dataset_config = BaseDatasetConfig(
name="thorsten", meta_file_train="metadata.csv", path=os.path.join(output_path, "../thorsten-de/") formatter="thorsten", meta_file_train="metadata.csv", path=os.path.join(output_path, "../thorsten-de/")
) )
# download dataset if not already present # download dataset if not already present

View File

@ -16,7 +16,7 @@ output_path = os.path.dirname(os.path.abspath(__file__))
# init configs # init configs
dataset_config = BaseDatasetConfig( dataset_config = BaseDatasetConfig(
name="thorsten", meta_file_train="metadata.csv", path=os.path.join(output_path, "../thorsten-de/") formatter="thorsten", meta_file_train="metadata.csv", path=os.path.join(output_path, "../thorsten-de/")
) )
# download dataset if not already present # download dataset if not already present

View File

@ -12,7 +12,7 @@ from TTS.utils.downloaders import download_thorsten_de
output_path = os.path.dirname(os.path.abspath(__file__)) output_path = os.path.dirname(os.path.abspath(__file__))
dataset_config = BaseDatasetConfig( dataset_config = BaseDatasetConfig(
name="thorsten", meta_file_train="metadata.csv", path=os.path.join(output_path, "../thorsten-de/") formatter="thorsten", meta_file_train="metadata.csv", path=os.path.join(output_path, "../thorsten-de/")
) )
# download dataset if not already present # download dataset if not already present

View File

@ -11,7 +11,7 @@ from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor from TTS.utils.audio import AudioProcessor
output_path = os.path.dirname(os.path.abspath(__file__)) output_path = os.path.dirname(os.path.abspath(__file__))
dataset_config = BaseDatasetConfig(name="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/")) dataset_config = BaseDatasetConfig(formatter="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/"))
audio_config = BaseAudioConfig( audio_config = BaseAudioConfig(
sample_rate=22050, sample_rate=22050,

View File

@ -11,7 +11,7 @@ from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor from TTS.utils.audio import AudioProcessor
output_path = os.path.dirname(os.path.abspath(__file__)) output_path = os.path.dirname(os.path.abspath(__file__))
dataset_config = BaseDatasetConfig(name="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/")) dataset_config = BaseDatasetConfig(formatter="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/"))
audio_config = BaseAudioConfig( audio_config = BaseAudioConfig(
sample_rate=22050, sample_rate=22050,

View File

@ -22,7 +22,7 @@ if not os.path.exists(dataset_path):
download_vctk(dataset_path) download_vctk(dataset_path)
# define dataset config # define dataset config
dataset_config = BaseDatasetConfig(name="vctk", meta_file_train="", path=dataset_path) dataset_config = BaseDatasetConfig(formatter="vctk", meta_file_train="", path=dataset_path)
# define audio config # define audio config
# ❗ resample the dataset externally using `TTS/bin/resample.py` and set `resample=False` for faster training # ❗ resample the dataset externally using `TTS/bin/resample.py` and set `resample=False` for faster training

View File

@ -0,0 +1,139 @@
import os
from TTS.encoder.configs.speaker_encoder_config import SpeakerEncoderConfig
# from TTS.encoder.configs.emotion_encoder_config import EmotionEncoderConfig
from TTS.tts.configs.shared_configs import BaseDatasetConfig
CURRENT_PATH = os.getcwd()
# change the root path to the TTS root path
os.chdir("../../../")
### Definitions ###
# dataset
VCTK_PATH = "/raid/datasets/VCTK_NEW_16khz_removed_silence_silero_vad/"  # download: https://datashare.ed.ac.uk/bitstream/handle/10283/3443/VCTK-Corpus-0.92.zip
RIR_SIMULATED_PATH = "/raid/datasets/DA/RIRS_NOISES/simulated_rirs/" # download: https://www.openslr.org/17/
MUSAN_PATH = "/raid/datasets/DA/musan/" # download: https://www.openslr.org/17/
# training
OUTPUT_PATH = os.path.join(
CURRENT_PATH, "resnet_speaker_encoder_training_output/"
) # path to save the train logs and checkpoint
CONFIG_OUT_PATH = os.path.join(OUTPUT_PATH, "config_se.json")
RESTORE_PATH = None  # Checkpoint to use for transfer learning; if None, it is ignored
# instance the config
# to speaker encoder
config = SpeakerEncoderConfig()
# to emotion encoder
# config = EmotionEncoderConfig()
#### DATASET CONFIG ####
# The formatter needs to return the key "speaker_name" for the speaker encoder and "emotion_name" for the emotion encoder
dataset_config = BaseDatasetConfig(formatter="vctk", meta_file_train="", language="en-us", path=VCTK_PATH)
# add the dataset to the config
config.datasets = [dataset_config]
#### TRAINING CONFIG ####
# The encoder data loader balances dataset items equally to guarantee better training and to meet the loss requirements
# It has two parameters that control the final batch size: the total number of speakers used in each batch and the number of samples per speaker
# total number of speakers per batch during training
config.num_classes_in_batch = 100
# number of utterances per class/speaker in the batch during training
config.num_utter_per_class = 4
# final batch size = config.num_classes_in_batch * config.num_utter_per_class
# total number of speakers per batch during evaluation
config.eval_num_classes_in_batch = 100
# number of utterances per class/speaker in the batch during evaluation
config.eval_num_utter_per_class = 4
# number of data loader workers
config.num_loader_workers = 8
config.num_val_loader_workers = 8
# number of epochs
config.epochs = 10000
# loss to be used in training
config.loss = "softmaxproto"
# run eval
config.run_eval = False
# output path for the checkpoints
config.output_path = OUTPUT_PATH
# Save local checkpoint every save_step steps
config.save_step = 2000
### Model Config ###
config.model_params = {
"model_name": "resnet", # supported "lstm" and "resnet"
"input_dim": 64,
"use_torch_spec": True,
"log_input": True,
"proj_dim": 512, # embedding dim
}
### Audio Config ###
# To speed up training, the model divides the audio into small parts. This parameter defines the length of these parts in seconds.
config.voice_len = 2.0
# all other configs
config.audio = {
"fft_size": 512,
"win_length": 400,
"hop_length": 160,
"frame_shift_ms": None,
"frame_length_ms": None,
"stft_pad_mode": "reflect",
"sample_rate": 16000,
"resample": False,
"preemphasis": 0.97,
"ref_level_db": 20,
"do_sound_norm": False,
"do_trim_silence": False,
"trim_db": 60,
"power": 1.5,
"griffin_lim_iters": 60,
"num_mels": 64,
"mel_fmin": 0.0,
"mel_fmax": 8000.0,
"spec_gain": 20,
"signal_norm": False,
"min_level_db": -100,
"symmetric_norm": False,
"max_norm": 4.0,
"clip_norm": False,
"stats_path": None,
"do_rms_norm": True,
"db_level": -27.0,
}
### Augmentation Config ###
config.audio_augmentation = {
# additive noise and room impulse response (RIR) simulation similar to: https://arxiv.org/pdf/2009.14153.pdf
"p": 0.5, # probability to the use of one of the augmentation - 0 means disabled
"rir": {"rir_path": RIR_SIMULATED_PATH, "conv_mode": "full"}, # download: https://www.openslr.org/17/
"additive": {
"sounds_path": MUSAN_PATH,
"speech": {"min_snr_in_db": 13, "max_snr_in_db": 20, "min_num_noises": 1, "max_num_noises": 1},
"noise": {"min_snr_in_db": 0, "max_snr_in_db": 15, "min_num_noises": 1, "max_num_noises": 1},
"music": {"min_snr_in_db": 5, "max_snr_in_db": 15, "min_num_noises": 1, "max_num_noises": 1},
},
"gaussian": {"p": 0.7, "min_amplitude": 0.0, "max_amplitude": 1e-05},
}
config.save_json(CONFIG_OUT_PATH)
print(CONFIG_OUT_PATH)
if RESTORE_PATH is not None:
command = f"python TTS/bin/train_encoder.py --config_path {CONFIG_OUT_PATH} --restore_path {RESTORE_PATH}"
else:
command = f"python TTS/bin/train_encoder.py --config_path {CONFIG_OUT_PATH}"
os.system(command)
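For reference, the two balancer knobs in this recipe multiply into the effective batch size noted in the comments above; a quick sanity check:

```python
num_classes_in_batch = 100  # speakers per training batch
num_utter_per_class = 4     # utterances per speaker
print("effective train batch size:", num_classes_in_batch * num_utter_per_class)  # 400
```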

View File

@ -11,7 +11,7 @@ from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor from TTS.utils.audio import AudioProcessor
output_path = os.path.dirname(os.path.abspath(__file__)) output_path = os.path.dirname(os.path.abspath(__file__))
dataset_config = BaseDatasetConfig(name="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/")) dataset_config = BaseDatasetConfig(formatter="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/"))
audio_config = BaseAudioConfig( audio_config = BaseAudioConfig(
sample_rate=22050, sample_rate=22050,

View File

@ -12,7 +12,7 @@ from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor from TTS.utils.audio import AudioProcessor
output_path = os.path.dirname(os.path.abspath(__file__)) output_path = os.path.dirname(os.path.abspath(__file__))
dataset_config = BaseDatasetConfig(name="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/")) dataset_config = BaseDatasetConfig(formatter="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/"))
audio_config = BaseAudioConfig( audio_config = BaseAudioConfig(
sample_rate=22050, sample_rate=22050,

View File

@ -12,7 +12,7 @@ from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor from TTS.utils.audio import AudioProcessor
output_path = os.path.dirname(os.path.abspath(__file__)) output_path = os.path.dirname(os.path.abspath(__file__))
dataset_config = BaseDatasetConfig(name="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/")) dataset_config = BaseDatasetConfig(formatter="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/"))
audio_config = BaseAudioConfig( audio_config = BaseAudioConfig(
sample_rate=22050, sample_rate=22050,

View File

@ -12,7 +12,7 @@ from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor from TTS.utils.audio import AudioProcessor
output_path = os.path.dirname(os.path.abspath(__file__)) output_path = os.path.dirname(os.path.abspath(__file__))
dataset_config = BaseDatasetConfig(name="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/")) dataset_config = BaseDatasetConfig(formatter="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/"))
audio_config = BaseAudioConfig( audio_config = BaseAudioConfig(
sample_rate=22050, sample_rate=22050,

View File

@ -12,7 +12,7 @@ from TTS.utils.audio import AudioProcessor
output_path = os.path.dirname(os.path.abspath(__file__)) output_path = os.path.dirname(os.path.abspath(__file__))
dataset_config = BaseDatasetConfig( dataset_config = BaseDatasetConfig(
name="vctk", meta_file_train="", language="en-us", path=os.path.join(output_path, "../VCTK/") formatter="vctk", meta_file_train="", language="en-us", path=os.path.join(output_path, "../VCTK/")
) )

View File

@ -23,7 +23,6 @@ umap-learn==0.5.1
pandas pandas
# deps for training # deps for training
matplotlib matplotlib
pyworld==0.2.10 # > 0.2.10 is not p3.10.x compatible
# coqui stack # coqui stack
trainer trainer
# config management # config management
@ -35,4 +34,8 @@ pypinyin
mecab-python3==1.0.5 mecab-python3==1.0.5
unidic-lite==1.0.8 unidic-lite==1.0.8
# gruut+supported langs # gruut+supported langs
gruut[cs,de,es,fr,it,nl,pt,ru,sv]==2.2.3 gruut[de]==2.2.3
# deps for korean
jamo
nltk
g2pkk>=0.1.1

View File

@ -33,7 +33,9 @@ def get_tests_data_path():
def get_tests_output_path(): def get_tests_output_path():
"""Returns the path to the directory for test outputs.""" """Returns the path to the directory for test outputs."""
return os.path.join(get_tests_path(), "outputs") path = os.path.join(get_tests_path(), "outputs")
os.makedirs(path, exist_ok=True)
return path
def run_cli(command): def run_cli(command):
@ -42,7 +44,7 @@ def run_cli(command):
def get_test_data_config(): def get_test_data_config():
return BaseDatasetConfig(name="ljspeech", path="tests/data/ljspeech/", meta_file_train="metadata.csv") return BaseDatasetConfig(formatter="ljspeech", path="tests/data/ljspeech/", meta_file_train="metadata.csv")
def assertHasAttr(test_obj, obj, intendedAttr): def assertHasAttr(test_obj, obj, intendedAttr):

View File

@ -10,7 +10,7 @@ OUT_PATH = os.path.join(get_tests_output_path(), "audio_tests")
WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav") WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav")
os.makedirs(OUT_PATH, exist_ok=True) os.makedirs(OUT_PATH, exist_ok=True)
conf = BaseAudioConfig(mel_fmax=8000) conf = BaseAudioConfig(mel_fmax=8000, pitch_fmax=640, pitch_fmin=1)
# pylint: disable=protected-access # pylint: disable=protected-access

View File

@ -0,0 +1,92 @@
import os
import unittest
import numpy as np
import torch
from tests import get_tests_input_path
from TTS.config import load_config
from TTS.encoder.utils.generic_utils import setup_encoder_model
from TTS.encoder.utils.io import save_checkpoint
from TTS.tts.utils.managers import EmbeddingManager
from TTS.utils.audio import AudioProcessor
encoder_config_path = os.path.join(get_tests_input_path(), "test_speaker_encoder_config.json")
encoder_model_path = os.path.join(get_tests_input_path(), "checkpoint_0.pth")
sample_wav_path = os.path.join(get_tests_input_path(), "../data/ljspeech/wavs/LJ001-0001.wav")
sample_wav_path2 = os.path.join(get_tests_input_path(), "../data/ljspeech/wavs/LJ001-0002.wav")
embedding_file_path = os.path.join(get_tests_input_path(), "../data/dummy_speakers.json")
embeddings_file_path2 = os.path.join(get_tests_input_path(), "../data/dummy_speakers2.json")
embeddings_file_pth_path = os.path.join(get_tests_input_path(), "../data/dummy_speakers.pth")
class EmbeddingManagerTest(unittest.TestCase):
"""Test emEeddingManager for loading embedding files and computing embeddings from waveforms"""
@staticmethod
def test_speaker_embedding():
# load config
config = load_config(encoder_config_path)
config.audio.resample = True
# create a dummy speaker encoder
model = setup_encoder_model(config)
save_checkpoint(model, None, None, get_tests_input_path(), 0)
# load audio processor and speaker encoder
manager = EmbeddingManager(encoder_model_path=encoder_model_path, encoder_config_path=encoder_config_path)
# load a sample audio and compute embedding
ap = AudioProcessor(**config.audio)
waveform = ap.load_wav(sample_wav_path)
mel = ap.melspectrogram(waveform)
embedding = manager.compute_embeddings(mel)
assert embedding.shape[1] == 256
# compute embedding directly from an input file
embedding = manager.compute_embedding_from_clip(sample_wav_path)
embedding2 = manager.compute_embedding_from_clip(sample_wav_path)
embedding = torch.FloatTensor(embedding)
embedding2 = torch.FloatTensor(embedding2)
assert embedding.shape[0] == 256
assert (embedding - embedding2).sum() == 0.0
# compute embedding from a list of wav files.
embedding3 = manager.compute_embedding_from_clip([sample_wav_path, sample_wav_path2])
embedding3 = torch.FloatTensor(embedding3)
assert embedding3.shape[0] == 256
assert (embedding - embedding3).sum() != 0.0
# remove dummy model
os.remove(encoder_model_path)
def test_embedding_file_processing(self): # pylint: disable=no-self-use
manager = EmbeddingManager(embedding_file_path=embeddings_file_pth_path)
# test embedding querying
embedding = manager.get_embedding_by_clip(manager.clip_ids[0])
assert len(embedding) == 256
embeddings = manager.get_embeddings_by_name(manager.embedding_names[0])
assert len(embeddings[0]) == 256
embedding1 = manager.get_mean_embedding(manager.embedding_names[0], num_samples=2, randomize=True)
assert len(embedding1) == 256
embedding2 = manager.get_mean_embedding(manager.embedding_names[0], num_samples=2, randomize=False)
assert len(embedding2) == 256
assert np.sum(np.array(embedding1) - np.array(embedding2)) != 0
def test_embedding_file_loading(self):
# test loading a json file
manager = EmbeddingManager(embedding_file_path=embedding_file_path)
self.assertEqual(manager.num_embeddings, 384)
self.assertEqual(manager.embedding_dim, 256)
# test loading a pth file
manager = EmbeddingManager(embedding_file_path=embeddings_file_pth_path)
self.assertEqual(manager.num_embeddings, 384)
self.assertEqual(manager.embedding_dim, 256)
# test loading a pth files with duplicate embedding keys
with self.assertRaises(Exception) as context:
manager = EmbeddingManager(embedding_file_path=[embeddings_file_pth_path, embeddings_file_pth_path])
self.assertTrue("Duplicate embedding names" in str(context.exception))
# test loading embedding files with different embedding keys
manager = EmbeddingManager(embedding_file_path=[embeddings_file_pth_path, embeddings_file_path2])
self.assertEqual(manager.embedding_dim, 256)
self.assertEqual(manager.num_embeddings, 384 * 2)

View File

@ -15,7 +15,7 @@ class TestExtractTTSSpectrograms(unittest.TestCase):
def test_GlowTTS(): def test_GlowTTS():
# set paths # set paths
config_path = os.path.join(get_tests_input_path(), "test_glow_tts.json") config_path = os.path.join(get_tests_input_path(), "test_glow_tts.json")
checkpoint_path = os.path.join(get_tests_output_path(), "checkpoint_test.pth") checkpoint_path = os.path.join(get_tests_output_path(), "glowtts.pth")
output_path = os.path.join(get_tests_output_path(), "output_extract_tts_spectrograms/") output_path = os.path.join(get_tests_output_path(), "output_extract_tts_spectrograms/")
# load config # load config
c = load_config(config_path) c = load_config(config_path)
@ -33,7 +33,7 @@ class TestExtractTTSSpectrograms(unittest.TestCase):
def test_Tacotron2(): def test_Tacotron2():
# set paths # set paths
config_path = os.path.join(get_tests_input_path(), "test_tacotron2_config.json") config_path = os.path.join(get_tests_input_path(), "test_tacotron2_config.json")
checkpoint_path = os.path.join(get_tests_output_path(), "checkpoint_test.pth") checkpoint_path = os.path.join(get_tests_output_path(), "tacotron2.pth")
output_path = os.path.join(get_tests_output_path(), "output_extract_tts_spectrograms/") output_path = os.path.join(get_tests_output_path(), "output_extract_tts_spectrograms/")
# load config # load config
c = load_config(config_path) c = load_config(config_path)
@ -51,7 +51,7 @@ class TestExtractTTSSpectrograms(unittest.TestCase):
def test_Tacotron(): def test_Tacotron():
# set paths # set paths
config_path = os.path.join(get_tests_input_path(), "test_tacotron_config.json") config_path = os.path.join(get_tests_input_path(), "test_tacotron_config.json")
checkpoint_path = os.path.join(get_tests_output_path(), "checkpoint_test.pth") checkpoint_path = os.path.join(get_tests_output_path(), "tacotron.pth")
output_path = os.path.join(get_tests_output_path(), "output_extract_tts_spectrograms/") output_path = os.path.join(get_tests_output_path(), "output_extract_tts_spectrograms/")
# load config # load config
c = load_config(config_path) c = load_config(config_path)

View File

@ -12,20 +12,22 @@ torch.manual_seed(1)
config_path = os.path.join(get_tests_output_path(), "test_model_config.json") config_path = os.path.join(get_tests_output_path(), "test_model_config.json")
dataset_config_en = BaseDatasetConfig( dataset_config_en = BaseDatasetConfig(
name="ljspeech", formatter="ljspeech",
meta_file_train="metadata.csv", meta_file_train="metadata.csv",
meta_file_val="metadata.csv", meta_file_val="metadata.csv",
path="tests/data/ljspeech", path="tests/data/ljspeech",
language="en", language="en",
) )
"""
dataset_config_pt = BaseDatasetConfig( dataset_config_pt = BaseDatasetConfig(
name="ljspeech", formatter="ljspeech",
meta_file_train="metadata.csv", meta_file_train="metadata.csv",
meta_file_val="metadata.csv", meta_file_val="metadata.csv",
path="tests/data/ljspeech", path="tests/data/ljspeech",
language="pt-br", language="pt-br",
) )
"""
# pylint: disable=protected-access # pylint: disable=protected-access
class TestFindUniquePhonemes(unittest.TestCase): class TestFindUniquePhonemes(unittest.TestCase):
@ -46,7 +48,7 @@ class TestFindUniquePhonemes(unittest.TestCase):
epochs=1, epochs=1,
print_step=1, print_step=1,
print_eval=True, print_eval=True,
datasets=[dataset_config_en, dataset_config_pt], datasets=[dataset_config_en],
) )
config.save_json(config_path) config.save_json(config_path)
@ -70,7 +72,7 @@ class TestFindUniquePhonemes(unittest.TestCase):
epochs=1, epochs=1,
print_step=1, print_step=1,
print_eval=True, print_eval=True,
datasets=[dataset_config_en, dataset_config_pt], datasets=[dataset_config_en],
) )
config.save_json(config_path) config.save_json(config_path)

View File

@ -31,7 +31,8 @@ class TestNumpyTransforms(unittest.TestCase):
mel_fmin: int = 0 mel_fmin: int = 0
hop_length: int = 256 hop_length: int = 256
win_length: int = 1024 win_length: int = 1024
pitch_fmax: int = 450 pitch_fmax: int = 640
pitch_fmin: int = 1
trim_db: int = -1 trim_db: int = -1
min_silence_sec: float = 0.01 min_silence_sec: float = 0.01
gain: float = 1.0 gain: float = 1.0

Some files were not shown because too many files have changed in this diff.