Merge pull request #1942 from coqui-ai/dev

v0.9.0
Eren Gölge 2022-11-16 16:50:57 +01:00 committed by GitHub
commit 56ba616a03
165 changed files with 101831 additions and 443 deletions

View File

@ -1,2 +1,9 @@
.git/
Dockerfile
build/
dist/
TTS.egg-info/
tests/outputs/*
tests/train_outputs/*
__pycache__/
*.pyc

View File

@ -21,9 +21,9 @@ jobs:
python-version: [3.7, 3.8, 3.9, "3.10"]
experimental: [false]
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: coqui-ai/setup-python@pip-cache-key-py-ver
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
architecture: x64

View File

@ -21,9 +21,9 @@ jobs:
python-version: [3.7, 3.8, 3.9, "3.10"]
experimental: [false]
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: coqui-ai/setup-python@pip-cache-key-py-ver
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
architecture: x64

View File

@ -15,8 +15,8 @@ jobs:
matrix:
arch: ["amd64"]
base:
- "nvcr.io/nvidia/pytorch:22.03-py3" # GPU enabled
- "ubuntu:20.04" # CPU only
- "nvidia/cuda:11.8.0-base-ubuntu22.04" # GPU enabled
- "python:3.10.8-slim" # CPU only
steps:
- uses: actions/checkout@v2
- name: Log in to the Container registry
@ -32,7 +32,7 @@ jobs:
base="ghcr.io/coqui-ai/tts"
tags="" # PR build
if [[ ${{ matrix.base }} = "ubuntu:20.04" ]]; then
if [[ ${{ matrix.base }} = "python:3.10.8-slim" ]]; then
base="ghcr.io/coqui-ai/tts-cpu"
fi

View File

@ -21,9 +21,9 @@ jobs:
python-version: [3.7, 3.8, 3.9, "3.10"]
experimental: [false]
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: coqui-ai/setup-python@pip-cache-key-py-ver
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
architecture: x64

View File

@ -21,9 +21,9 @@ jobs:
python-version: [3.9]
experimental: [false]
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: coqui-ai/setup-python@pip-cache-key-py-ver
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
architecture: x64

View File

@ -21,9 +21,9 @@ jobs:
python-version: [3.7, 3.8, 3.9, "3.10"]
experimental: [false]
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: coqui-ai/setup-python@pip-cache-key-py-ver
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
architecture: x64

View File

@ -21,9 +21,9 @@ jobs:
python-version: [3.7, 3.8, 3.9, "3.10"]
experimental: [false]
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: coqui-ai/setup-python@pip-cache-key-py-ver
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
architecture: x64

View File

@ -21,9 +21,9 @@ jobs:
python-version: [3.7, 3.8, 3.9, "3.10"]
experimental: [false]
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: coqui-ai/setup-python@pip-cache-key-py-ver
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
architecture: x64

.github/workflows/zoo_tests0.yml vendored Normal file
View File

@ -0,0 +1,52 @@
name: zoo-tests-0
on:
push:
branches:
- main
pull_request:
types: [opened, synchronize, reopened]
jobs:
check_skip:
runs-on: ubuntu-latest
if: "! contains(github.event.head_commit.message, '[ci skip]')"
steps:
- run: echo "${{ github.event.head_commit.message }}"
test:
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
python-version: [3.7, 3.8, 3.9, "3.10"]
experimental: [false]
steps:
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
architecture: x64
cache: 'pip'
cache-dependency-path: 'requirements*'
- name: check OS
run: cat /etc/os-release
- name: Install dependencies
run: |
sudo apt-get update
sudo apt-get install -y git make gcc
sudo apt-get install espeak espeak-ng
make system-deps
- name: Install/upgrade Python setup deps
run: python3 -m pip install --upgrade pip setuptools wheel
- name: Replace scarf urls
run: |
sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json
- name: Install TTS
run: |
python3 -m pip install .[all]
python3 setup.py egg_info
- name: Unit tests
run: |
nose2 -F -v -B TTS tests.zoo_tests.test_models.test_models_offset_0_step_3
nose2 -F -v -B TTS tests.zoo_tests.test_models.test_voice_conversion

View File

@ -1,4 +1,4 @@
name: zoo-tests
name: zoo-tests-1
on:
push:
@ -21,9 +21,9 @@ jobs:
python-version: [3.7, 3.8, 3.9, "3.10"]
experimental: [false]
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: coqui-ai/setup-python@pip-cache-key-py-ver
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
architecture: x64
@ -47,4 +47,4 @@ jobs:
python3 -m pip install .[all]
python3 setup.py egg_info
- name: Unit tests
run: make test_zoo
run: nose2 -F -v -B --with-coverage --coverage TTS tests.zoo_tests.test_models.test_models_offset_1_step_3

.github/workflows/zoo_tests2.yml vendored Normal file
View File

@ -0,0 +1,50 @@
name: zoo-tests-2
on:
push:
branches:
- main
pull_request:
types: [opened, synchronize, reopened]
jobs:
check_skip:
runs-on: ubuntu-latest
if: "! contains(github.event.head_commit.message, '[ci skip]')"
steps:
- run: echo "${{ github.event.head_commit.message }}"
test:
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
python-version: [3.7, 3.8, 3.9, "3.10"]
experimental: [false]
steps:
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
architecture: x64
cache: 'pip'
cache-dependency-path: 'requirements*'
- name: check OS
run: cat /etc/os-release
- name: Install dependencies
run: |
sudo apt-get update
sudo apt-get install -y git make gcc
sudo apt-get install espeak espeak-ng
make system-deps
- name: Install/upgrade Python setup deps
run: python3 -m pip install --upgrade pip setuptools wheel
- name: Replace scarf urls
run: |
sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json
- name: Install TTS
run: |
python3 -m pip install .[all]
python3 setup.py egg_info
- name: Unit tests
run: nose2 -F -v -B --with-coverage --coverage TTS tests.zoo_tests.test_models.test_models_offset_2_step_3

View File

@ -1,20 +1,12 @@
ARG BASE=nvcr.io/nvidia/pytorch:22.03-py3
ARG BASE=nvidia/cuda:11.8.0-base-ubuntu22.04
FROM ${BASE}
RUN apt-get update && apt-get install -y --no-install-recommends gcc g++ make python3 python3-dev python3-pip python3-venv python3-wheel espeak espeak-ng libsndfile1-dev && rm -rf /var/lib/apt/lists/*
RUN pip install llvmlite --ignore-installed
# Create and activate virtual env
ENV VIRTUAL_ENV=/venv
RUN python3 -m venv $VIRTUAL_ENV
ENV PATH="$VIRTUAL_ENV/bin:$PATH"
RUN pip install -U pip setuptools wheel
RUN apt-get update && apt-get upgrade -y
RUN apt-get install -y --no-install-recommends gcc g++ make python3 python3-dev python3-pip python3-venv python3-wheel espeak-ng libsndfile1-dev && rm -rf /var/lib/apt/lists/*
RUN pip3 install llvmlite --ignore-installed
WORKDIR /root
COPY requirements.txt /root
COPY requirements.dev.txt /root
COPY requirements.notebooks.txt /root
RUN ["/bin/bash", "-c", "pip install -r <(cat requirements.txt requirements.dev.txt requirements.notebooks.txt)"]
COPY . /root
RUN pip3 install torch torchaudio --extra-index-url https://download.pytorch.org/whl/cu118
RUN make install
ENTRYPOINT ["tts"]
CMD ["--help"]

View File

@ -1,9 +1,16 @@
# <img src="https://raw.githubusercontent.com/coqui-ai/TTS/main/images/coqui-log-green-TTS.png" height="56"/>
<img src="https://raw.githubusercontent.com/coqui-ai/TTS/main/images/coqui-log-green-TTS.png" height="56"/>
----
### 📣 Clone your voice with a single click on [🐸Coqui.ai](https://app.coqui.ai/auth/signin)
### 📣 🐸Coqui Studio is launching soon!! Join our [waiting list](https://coqui.ai/)!!
----
🐸TTS is a library for advanced Text-to-Speech generation. It's built on the latest research and designed to achieve the best trade-off among ease of training, speed, and quality.
🐸TTS comes with pretrained models and tools for measuring dataset quality, and it is already used in **20+ languages** for products and research projects.
[![Gitter](https://badges.gitter.im/coqui-ai/TTS.svg)](https://gitter.im/coqui-ai/TTS?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge)
[![Discord](https://img.shields.io/discord/1037326658807533628?color=%239B59B6&label=chat%20on%20discord)](https://discord.gg/5eXr5seRrv)
[![License](<https://img.shields.io/badge/License-MPL%202.0-brightgreen.svg>)](https://opensource.org/licenses/MPL-2.0)
[![PyPI version](https://badge.fury.io/py/TTS.svg)](https://badge.fury.io/py/TTS)
[![Covenant](https://camo.githubusercontent.com/7d620efaa3eac1c5b060ece5d6aacfcc8b81a74a04d05cd0398689c01c4463bb/68747470733a2f2f696d672e736869656c64732e696f2f62616467652f436f6e7472696275746f72253230436f76656e616e742d76322e3025323061646f707465642d6666363962342e737667)](https://github.com/coqui-ai/TTS/blob/master/CODE_OF_CONDUCT.md)
@ -36,12 +43,12 @@ Please use our dedicated channels for questions and discussion. Help is much mor
| ------------------------------- | --------------------------------------- |
| 🚨 **Bug Reports** | [GitHub Issue Tracker] |
| 🎁 **Feature Requests & Ideas** | [GitHub Issue Tracker] |
| 👩‍💻 **Usage Questions** | [Github Discussions] |
| 🗯 **General Discussion** | [Github Discussions] or [Gitter Room] |
| 👩‍💻 **Usage Questions** | [GitHub Discussions] |
| 🗯 **General Discussion** | [GitHub Discussions] or [Discord] |
[github issue tracker]: https://github.com/coqui-ai/tts/issues
[github discussions]: https://github.com/coqui-ai/TTS/discussions
[gitter room]: https://gitter.im/coqui-ai/TTS?utm_source=share-link&utm_medium=link&utm_campaign=share-link
[discord]: https://discord.gg/5eXr5seRrv
[Tutorials and Examples]: https://github.com/coqui-ai/TTS/wiki/TTS-Notebooks-and-Tutorials
@ -75,7 +82,7 @@ Underlined "TTS*" and "Judy*" are 🐸TTS models
- Modular (but not too much) code base enabling easy implementation of new ideas.
## Implemented Models
### Text-to-Spectrogram
### Spectrogram models
- Tacotron: [paper](https://arxiv.org/abs/1703.10135)
- Tacotron2: [paper](https://arxiv.org/abs/1712.05884)
- Glow-TTS: [paper](https://arxiv.org/abs/2005.11129)
@ -83,9 +90,12 @@ Underlined "TTS*" and "Judy*" are 🐸TTS models
- Align-TTS: [paper](https://arxiv.org/abs/2003.01950)
- FastPitch: [paper](https://arxiv.org/pdf/2006.06873.pdf)
- FastSpeech: [paper](https://arxiv.org/abs/1905.09263)
- SC-GlowTTS: [paper](https://arxiv.org/abs/2104.05557)
- Capacitron: [paper](https://arxiv.org/abs/1906.03402)
### End-to-End Models
- VITS: [paper](https://arxiv.org/pdf/2106.06103)
- YourTTS: [paper](https://arxiv.org/abs/2112.02418)
### Attention Methods
- Guided Attention: [paper](https://arxiv.org/abs/1710.08969)
@ -136,6 +146,21 @@ $ make install
If you are on Windows, 👑@GuyPaddock wrote installation instructions [here](https://stackoverflow.com/questions/66726331/how-can-i-run-mozilla-tts-coqui-tts-training-with-cuda-on-a-windows-system).
## Docker Image
You can also try TTS without installing it by using the Docker image.
Simply run the following commands to list the available models and start a server:
```bash
docker run --rm -it -p 5002:5002 --entrypoint /bin/bash ghcr.io/coqui-ai/tts-cpu
python3 TTS/server/server.py --list_models  # To get the list of available models
python3 TTS/server/server.py --model_name tts_models/en/vctk/vits # To start a server
```
You can then enjoy the TTS server [here](http://[::1]:5002/).
More details about the Docker images (like GPU support) can be found [here](https://tts.readthedocs.io/en/latest/docker_images.html).
## Use TTS
### Single Speaker Models
@ -147,12 +172,12 @@ If you are on Windows, 👑@GuyPaddock wrote installation instructions [here](ht
```
- Get model info (for both tts_models and vocoder_models):
- Query by type/name:
The model_info_by_name uses the name as it appears in the --list_models output.
```
$ tts --model_info_by_name "<model_type>/<language>/<dataset>/<model_name>"
```
For example:
```
$ tts --model_info_by_name tts_models/tr/common-voice/glow-tts
```
@ -160,16 +185,16 @@ If you are on Windows, 👑@GuyPaddock wrote installation instructions [here](ht
$ tts --model_info_by_name vocoder_models/en/ljspeech/hifigan_v2
```
- Query by type/idx:
The model_query_idx uses the corresponding idx from --list_models.
```
$ tts --model_info_by_idx "<model_type>/<model_query_idx>"
```
For example:
```
$ tts --model_info_by_idx tts_models/3
```
- Run TTS with default models:
```
@ -208,7 +233,7 @@ If you are on Windows, 👑@GuyPaddock wrote installation instructions [here](ht
- Run your own TTS and Vocoder models:
```
$ tts --text "Text for TTS" --model_path path/to/config.json --config_path path/to/model.pth --out_path output/path/speech.wav
$ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav
--vocoder_path path/to/vocoder.pth --vocoder_config_path path/to/vocoder_config.json
```
@ -229,7 +254,7 @@ If you are on Windows, 👑@GuyPaddock wrote installation instructions [here](ht
- Run your own multi-speaker TTS model:
```
$ tts --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/config.json --config_path path/to/model.pth --speakers_file_path path/to/speaker.json --speaker_idx <speaker_id>
$ tts --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/model.pth --config_path path/to/config.json --speakers_file_path path/to/speaker.json --speaker_idx <speaker_id>
```
## Directory Structure
@ -239,8 +264,6 @@ If you are on Windows, 👑@GuyPaddock wrote installation instructions [here](ht
|- TTS
|- bin/ (folder for all the executables.)
|- train*.py (train your target model.)
|- distribute.py (train your TTS model using Multiple GPUs.)
|- compute_statistics.py (compute dataset statistics for normalization.)
|- ...
|- tts/ (text to speech models)
|- layers/ (model layer definitions)

View File

@ -12,6 +12,61 @@
}
}
},
"bg": {
"cv": {
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--bg--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"cs": {
"cv": {
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--cs--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"da": {
"cv": {
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--da--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"et": {
"cv": {
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--et--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"ga": {
"cv": {
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--ga--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"en": {
"ek1": {
"tacotron2": {
@ -79,6 +134,14 @@
"license": "apache 2.0",
"contact": "egolge@coqui.com"
},
"vits--neon": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--en--ljspeech--vits.zip",
"default_vocoder": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause",
"contact": null,
"commit": null
},
"fast_pitch": {
"description": "FastPitch model trained on LJSpeech using the Aligner Network",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--fast_pitch.zip",
@ -130,10 +193,10 @@
"license": "apache 2.0",
"contact": "adamfroghyar@gmail.com"
},
"capacitron-t2-c150": {
"capacitron-t2-c150_v2": {
"description": "Capacitron additions to Tacotron 2 with Capacity at 150 as in https://arxiv.org/pdf/1906.03402.pdf",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.0_models/tts_models--en--blizzard2013--capacitron-t2-c150.zip",
"commit": "d6284e7",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.1_models/tts_models--en--blizzard2013--capacitron-t2-c150_v2.zip",
"commit": "a67039d",
"default_vocoder": "vocoder_models/en/blizzard2013/hifigan_v2",
"author": "Adam Froghyar @a-froghyar",
"license": "apache 2.0",
@ -151,18 +214,36 @@
"license": "MPL",
"contact": "egolge@coqui.com"
}
}
},
"css10":{
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--es--css10--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"fr": {
"mai": {
"tacotron2-DDC": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--fr--mai--tacotron2-DDC.zip",
"default_vocoder": "vocoder_models/universal/libri-tts/fullband-melgan",
"commit": "",
"commit": null,
"author": "Eren Gölge @erogol",
"license": "MPL",
"contact": "egolge@coqui.com"
}
},
"css10":{
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--fr--css10--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"uk":{
@ -174,6 +255,13 @@
"license": "MIT",
"contact": "",
"default_vocoder": "vocoder_models/uk/mai/multiband-melgan"
},
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--uk--mai--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
@ -198,6 +286,15 @@
"stats_file": null,
"commit": "540d811"
}
},
"css10":{
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--nl--css10--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"de": {
@ -224,6 +321,15 @@
"license": "apache 2.0",
"commit": "unknown"
}
},
"css10": {
"vits-neon":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--de--css10--vits.zip",
"default_vocoder": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause",
"commit": null
}
}
},
"ja": {
@ -359,6 +465,149 @@
"commit": "1b22f03"
}
}
},
"hu": {
"css10": {
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--hu--css10--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"el": {
"cv": {
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--el--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"fi": {
"css10": {
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--fi--css10--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"hr": {
"cv": {
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--hr--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"lt": {
"cv": {
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--lt--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"lv": {
"cv": {
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--lv--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"mt": {
"cv": {
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--mt--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"pl": {
"mai_female": {
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--pl--mai_female--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"pt": {
"cv": {
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--pt--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"ro": {
"cv": {
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--ro--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"sk": {
"cv": {
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--sk--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"sl": {
"cv": {
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--sl--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
},
"sv": {
"cv": {
"vits":{
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--sv--cv--vits.zip",
"default_vocoder": null,
"commit": null,
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
}
},
"vocoder_models": {
@ -512,4 +761,4 @@
}
}
}
}
}
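
The model zoo gains a batch of single-speaker VITS models contributed by @NeonGeckoCom. As a rough sketch of fetching one of the new entries programmatically, the snippet below uses the `ModelManager` that `TTS/bin/synthesize.py` instantiates elsewhere in this diff; the local path to `.models.json` and the chosen model name are illustrative assumptions rather than part of this change.

```python
# Sketch: download one of the newly listed models with ModelManager.
# Assumes a repository checkout (TTS/.models.json) and the download_model()
# behaviour used by TTS/bin/synthesize.py; the model name comes from the entries above.
from pathlib import Path

from TTS.utils.manage import ModelManager

models_file = Path("TTS") / ".models.json"  # location inside a repo checkout (assumption)
manager = ModelManager(models_file, progress_bar=True)  # progress_bar is new in this release
model_path, config_path, model_item = manager.download_model("tts_models/bg/cv/vits")
print(model_path, config_path)
```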

View File

@ -1 +1 @@
0.8.0
0.9.0

View File

@ -6,38 +6,87 @@ import torch
from tqdm import tqdm
from TTS.config import load_config
from TTS.config.shared_configs import BaseDatasetConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.utils.managers import save_file
from TTS.tts.utils.speakers import SpeakerManager
parser = argparse.ArgumentParser(
description="""Compute embedding vectors for each wav file in a dataset.\n\n"""
description="""Compute embedding vectors for each audio file in a dataset and store them keyed by `{dataset_name}#{file_path}` in a .pth file\n\n"""
"""
Example runs:
python TTS/bin/compute_embeddings.py speaker_encoder_model.pth speaker_encoder_config.json dataset_config.json
python TTS/bin/compute_embeddings.py --model_path speaker_encoder_model.pth --config_path speaker_encoder_config.json --config_dataset_path dataset_config.json
python TTS/bin/compute_embeddings.py --model_path speaker_encoder_model.pth --config_path speaker_encoder_config.json --formatter_name vctk --dataset_path /path/to/vctk/dataset --dataset_name my_vctk --metafile /path/to/vctk/metafile.csv
""",
formatter_class=RawTextHelpFormatter,
)
parser.add_argument("model_path", type=str, help="Path to model checkpoint file.")
parser.add_argument("config_path", type=str, help="Path to model config file.")
parser.add_argument("config_dataset_path", type=str, help="Path to dataset config file.")
parser.add_argument(
"--model_path",
type=str,
help="Path to model checkpoint file. It defaults to the released speaker encoder.",
default="https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar",
)
parser.add_argument(
"--config_path",
type=str,
help="Path to model config file. It defaults to the released speaker encoder config.",
default="https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json",
)
parser.add_argument(
"--config_dataset_path",
type=str,
help="Path to dataset config file. You either need to provide this or `formatter_name`, `dataset_name` and `dataset_path` arguments.",
default=None,
)
parser.add_argument("--output_path", type=str, help="Path for output `pth` or `json` file.", default="speakers.pth")
parser.add_argument("--old_file", type=str, help="Previous embedding file to only compute new audios.", default=None)
parser.add_argument("--disable_cuda", type=bool, help="Flag to disable cuda.", default=False)
parser.add_argument("--no_eval", type=bool, help="Do not compute eval?. Default False", default=False)
parser.add_argument(
"--formatter_name",
type=str,
help="Name of the formatter to use. You either need to provide this or `config_dataset_path`",
default=None,
)
parser.add_argument(
"--dataset_name",
type=str,
help="Name of the dataset to use. You either need to provide this or `config_dataset_path`",
default=None,
)
parser.add_argument(
"--dataset_path",
type=str,
help="Path to the dataset. You either need to provide this or `config_dataset_path`",
default=None,
)
parser.add_argument(
"--metafile",
type=str,
help="Path to the meta file. If not set, dataset formatter uses the default metafile if it is defined in the formatter. You either need to provide this or `config_dataset_path`",
default=None,
)
args = parser.parse_args()
use_cuda = torch.cuda.is_available() and not args.disable_cuda
c_dataset = load_config(args.config_dataset_path)
if args.config_dataset_path is not None:
c_dataset = load_config(args.config_dataset_path)
meta_data_train, meta_data_eval = load_tts_samples(c_dataset.datasets, eval_split=not args.no_eval)
else:
c_dataset = BaseDatasetConfig()
c_dataset.formatter = args.formatter_name
c_dataset.dataset_name = args.dataset_name
c_dataset.path = args.dataset_path
c_dataset.meta_file_train = args.metafile if args.metafile else None
meta_data_train, meta_data_eval = load_tts_samples(c_dataset, eval_split=not args.no_eval)
meta_data_train, meta_data_eval = load_tts_samples(c_dataset.datasets, eval_split=not args.no_eval)
if meta_data_eval is None:
wav_files = meta_data_train
samples = meta_data_train
else:
wav_files = meta_data_train + meta_data_eval
samples = meta_data_train + meta_data_eval
encoder_manager = SpeakerManager(
encoder_model_path=args.model_path,
@ -50,25 +99,23 @@ class_name_key = encoder_manager.encoder_config.class_name_key
# compute speaker embeddings
speaker_mapping = {}
for idx, wav_file in enumerate(tqdm(wav_files)):
if isinstance(wav_file, dict):
class_name = wav_file[class_name_key]
wav_file = wav_file["audio_file"]
else:
class_name = None
for idx, fields in enumerate(tqdm(samples)):
class_name = fields[class_name_key]
audio_file = fields["audio_file"]
embedding_key = fields["audio_unique_name"]
root_path = fields["root_path"]
wav_file_name = os.path.basename(wav_file)
if args.old_file is not None and wav_file_name in encoder_manager.clip_ids:
if args.old_file is not None and embedding_key in encoder_manager.clip_ids:
# get the embedding from the old file
embedd = encoder_manager.get_embedding_by_clip(wav_file_name)
embedd = encoder_manager.get_embedding_by_clip(embedding_key)
else:
# extract the embedding
embedd = encoder_manager.compute_embedding_from_clip(wav_file)
embedd = encoder_manager.compute_embedding_from_clip(audio_file)
# create speaker_mapping if target dataset is defined
speaker_mapping[wav_file_name] = {}
speaker_mapping[wav_file_name]["name"] = class_name
speaker_mapping[wav_file_name]["embedding"] = embedd
speaker_mapping[embedding_key] = {}
speaker_mapping[embedding_key]["name"] = class_name
speaker_mapping[embedding_key]["embedding"] = embedd
if speaker_mapping:
# save speaker_mapping if target dataset is defined
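
The rewritten script keys every embedding by the new `{dataset_name}#{relative file path}` scheme and saves the mapping to a `.pth` file (default `speakers.pth`). A minimal sketch of inspecting that output, assuming the default `--output_path` and that the `.pth` file is readable with `torch.load`:

```python
# Sketch: inspect the embedding file written by compute_embeddings.py.
# Assumes the default output path "speakers.pth"; keys follow "{dataset_name}#{relative path}".
import torch

speaker_mapping = torch.load("speakers.pth", map_location="cpu")
for key, entry in list(speaker_mapping.items())[:3]:
    print(key, entry["name"], len(entry["embedding"]))
```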

View File

@ -37,7 +37,7 @@ def setup_loader(ap, r, verbose=False):
precompute_num_workers=0,
use_noise_augment=False,
verbose=verbose,
speaker_id_mapping=speaker_manager.ids if c.use_speaker_embedding else None,
speaker_id_mapping=speaker_manager.name_to_id if c.use_speaker_embedding else None,
d_vector_mapping=speaker_manager.embeddings if c.use_d_vector_file else None,
)

View File

@ -7,30 +7,25 @@ from tqdm.contrib.concurrent import process_map
from TTS.config import load_config
from TTS.tts.datasets import load_tts_samples
from TTS.tts.utils.text.phonemizers.gruut_wrapper import Gruut
phonemizer = Gruut(language="en-us")
from TTS.tts.utils.text.phonemizers import Gruut
def compute_phonemes(item):
try:
text = item[0]
ph = phonemizer.phonemize(text).split("|")
except:
return []
return list(set(ph))
text = item["text"]
ph = phonemizer.phonemize(text).replace("|", "")
return set(list(ph))
def main():
# pylint: disable=W0601
global c
global c, phonemizer
# pylint: disable=bad-option-value
parser = argparse.ArgumentParser(
description="""Find all the unique characters or phonemes in a dataset.\n\n"""
"""
Example runs:
python TTS/bin/find_unique_chars.py --config_path config.json
python TTS/bin/find_unique_phonemes.py --config_path config.json
""",
formatter_class=RawTextHelpFormatter,
)
@ -46,15 +41,24 @@ def main():
items = train_items + eval_items
print("Num items:", len(items))
is_lang_def = all(item["language"] for item in items)
language_list = [item["language"] for item in items]
is_lang_def = all(language_list)
if not c.phoneme_language or not is_lang_def:
raise ValueError("Phoneme language must be defined in config.")
if not language_list.count(language_list[0]) == len(language_list):
raise ValueError(
"Currently, just one phoneme language per config file is supported !! Please split the dataset config into different configs and run it individually for each language !!"
)
phonemizer = Gruut(language=language_list[0], keep_puncs=True)
phonemes = process_map(compute_phonemes, items, max_workers=multiprocessing.cpu_count(), chunksize=15)
phones = []
for ph in phonemes:
phones.extend(ph)
phones = set(phones)
lower_phones = filter(lambda c: c.islower(), phones)
phones_force_lower = [c.lower() for c in phones]
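
The script now builds a single Gruut phonemizer for the dataset language and keeps punctuation. A standalone sketch of that setup, using only calls shown in this diff (the language code and sample sentence are illustrative):

```python
# Sketch of the per-language phonemizer setup used above.
# Requires the gruut extra; the language and text are illustrative.
from TTS.tts.utils.text.phonemizers import Gruut

phonemizer = Gruut(language="en-us", keep_puncs=True)
phonemes = set(phonemizer.phonemize("Hello, world!").replace("|", ""))
print(sorted(phonemes))
```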

View File

@ -17,7 +17,7 @@ def adjust_path_and_remove_silence(audio_path):
# create all directory structure
pathlib.Path(output_path).parent.mkdir(parents=True, exist_ok=True)
# remove the silence and save the audio
output_path = remove_silence(
output_path, is_speech = remove_silence(
model_and_utils,
audio_path,
output_path,
@ -25,26 +25,34 @@ def adjust_path_and_remove_silence(audio_path):
use_cuda=args.use_cuda,
)
return output_path
return output_path, is_speech
def preprocess_audios():
files = sorted(glob.glob(os.path.join(args.input_dir, args.glob), recursive=True))
print("> Number of files: ", len(files))
if not args.force:
print("> Ignoring files that already exist in the output directory.")
print("> Ignoring files that already exist in the output idrectory.")
if args.trim_just_beginning_and_end:
print("> Trimming just the beginning and the end with nonspeech parts.")
else:
print("> Trimming all nonspeech parts.")
filtered_files = []
if files:
# create threads
# num_threads = multiprocessing.cpu_count()
# process_map(adjust_path_and_remove_silence, files, max_workers=num_threads, chunksize=15)
for f in tqdm(files):
adjust_path_and_remove_silence(f)
output_path, is_speech = adjust_path_and_remove_silence(f)
if not is_speech:
filtered_files.append(output_path)
# write files that do not have speech
with open(os.path.join(args.output_dir, "filtered_files.txt"), "w", encoding="utf-8") as f:
for file in filtered_files:
f.write(file + "\n")
else:
print("> No files Found !")

View File

@ -238,6 +238,13 @@ If you don't specify any models, then it uses LJSpeech based English model.
help="speaker ID of the reference_wav speaker (If not provided the embedding will be computed using the Speaker Encoder).",
default=None,
)
parser.add_argument(
"--progress_bar",
type=str2bool,
help="If true shows a progress bar for the model download. Defaults to True",
default=True,
)
args = parser.parse_args()
# print the description if either text or list_models is not set
@ -255,7 +262,7 @@ If you don't specify any models, then it uses LJSpeech based English model.
# load model manager
path = Path(__file__).parent / "../.models.json"
manager = ModelManager(path)
manager = ModelManager(path, progress_bar=args.progress_bar)
model_path = None
config_path = None
@ -323,7 +330,7 @@ If you don't specify any models, then it uses LJSpeech based English model.
print(
" > Available speaker ids: (Set --speaker_idx flag to one of these values to use the multi-speaker model."
)
print(synthesizer.tts_model.speaker_manager.ids)
print(synthesizer.tts_model.speaker_manager.name_to_id)
return
# query language ids of a multi-lingual model.
@ -331,7 +338,7 @@ If you don't specify any models, then it uses LJSpeech based English model.
print(
" > Available language ids: (Set --language_idx flag to one of these values to use the multi-lingual model."
)
print(synthesizer.tts_model.language_manager.ids)
print(synthesizer.tts_model.language_manager.name_to_id)
return
# check the arguments against a multi-speaker model.

View File

@ -62,7 +62,7 @@ class BaseAudioConfig(Coqpit):
Maximum frequency of the F0 frames. Defaults to ```640```.
pitch_fmin (float, optional):
Minimum frequency of the F0 frames. Defaults to ```0```.
Minimum frequency of the F0 frames. Defaults to ```1```.
trim_db (int):
Silence threshold used for silence trimming. Defaults to 45.
@ -144,7 +144,7 @@ class BaseAudioConfig(Coqpit):
do_amp_to_db_mel: bool = True
# f0 params
pitch_fmax: float = 640.0
pitch_fmin: float = 0.0
pitch_fmin: float = 1.0
# normalization params
signal_norm: bool = True
min_level_db: int = -100
@ -193,21 +193,24 @@ class BaseDatasetConfig(Coqpit):
"""Base config for TTS datasets.
Args:
name (str):
Dataset name that defines the preprocessor in use. Defaults to None.
formatter (str):
Formatter name that defines used formatter in ```TTS.tts.datasets.formatter```. Defaults to `""`.
dataset_name (str):
Unique name for the dataset. Defaults to `""`.
path (str):
Root path to the dataset files. Defaults to None.
Root path to the dataset files. Defaults to `""`.
meta_file_train (str):
Name of the dataset meta file. Or a list of speakers to be ignored at training for multi-speaker datasets.
Defaults to None.
Defaults to `""`.
ignored_speakers (List):
List of speakers IDs that are not used at the training. Default None.
language (str):
Language code of the dataset. If defined, it overrides `phoneme_language`. Defaults to None.
Language code of the dataset. If defined, it overrides `phoneme_language`. Defaults to `""`.
meta_file_val (str):
Name of the dataset meta file that defines the instances used at validation.
@ -217,7 +220,8 @@ class BaseDatasetConfig(Coqpit):
train the duration predictor.
"""
name: str = ""
formatter: str = ""
dataset_name: str = ""
path: str = ""
meta_file_train: str = ""
ignored_speakers: List[str] = None
@ -230,7 +234,7 @@ class BaseDatasetConfig(Coqpit):
):
"""Check config fields"""
c = asdict(self)
check_argument("name", c, restricted=True)
check_argument("formatter", c, restricted=True)
check_argument("path", c, restricted=True)
check_argument("meta_file_train", c, restricted=True)
check_argument("meta_file_val", c, restricted=False)

View File

@ -107,11 +107,18 @@ class BaseEncoder(nn.Module):
return criterion
def load_checkpoint(
self, config: Coqpit, checkpoint_path: str, eval: bool = False, use_cuda: bool = False, criterion=None
self,
config: Coqpit,
checkpoint_path: str,
eval: bool = False,
use_cuda: bool = False,
criterion=None,
cache=False,
):
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"))
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
try:
self.load_state_dict(state["model"])
print(" > Model fully restored. ")
except (KeyError, RuntimeError) as error:
# If eval raise the error
if eval:

View File

@ -44,13 +44,16 @@ class BaseTrainerModel(TrainerModel):
return outputs_dict
@abstractmethod
def load_checkpoint(self, config: Coqpit, checkpoint_path: str, eval: bool = False, strict: bool = True) -> None:
def load_checkpoint(
self, config: Coqpit, checkpoint_path: str, eval: bool = False, strict: bool = True, cache=False
) -> None:
"""Load a model checkpoint gile and get ready for training or inference.
Args:
config (Coqpit): Model configuration.
checkpoint_path (str): Path to the model checkpoint file.
eval (bool, optional): If true, init model for inference else for training. Defaults to False.
strcit (bool, optional): Match all checkpoint keys to model's keys. Defaults to True.
strict (bool, optional): Match all checkpoint keys to model's keys. Defaults to True.
cache (bool, optional): If True, cache the file locally for subsequent calls. It is cached under `get_user_data_dir()/tts_cache`. Defaults to False.
"""
...

View File

@ -5,6 +5,7 @@ import json
import os
import sys
from pathlib import Path
from threading import Lock
from typing import Union
from flask import Flask, render_template, request, send_file
@ -146,7 +147,7 @@ def index():
"index.html",
show_details=args.show_details,
use_multi_speaker=use_multi_speaker,
speaker_ids=speaker_manager.ids if speaker_manager is not None else None,
speaker_ids=speaker_manager.name_to_id if speaker_manager is not None else None,
use_gst=use_gst,
)
@ -168,17 +169,21 @@ def details():
)
lock = Lock()
@app.route("/api/tts", methods=["GET"])
def tts():
text = request.args.get("text")
speaker_idx = request.args.get("speaker_id", "")
style_wav = request.args.get("style_wav", "")
style_wav = style_wav_uri_to_dict(style_wav)
print(" > Model input: {}".format(text))
print(" > Speaker Idx: {}".format(speaker_idx))
wavs = synthesizer.tts(text, speaker_name=speaker_idx, style_wav=style_wav)
out = io.BytesIO()
synthesizer.save_wav(wavs, out)
with lock:
text = request.args.get("text")
speaker_idx = request.args.get("speaker_id", "")
style_wav = request.args.get("style_wav", "")
style_wav = style_wav_uri_to_dict(style_wav)
print(" > Model input: {}".format(text))
print(" > Speaker Idx: {}".format(speaker_idx))
wavs = synthesizer.tts(text, speaker_name=speaker_idx, style_wav=style_wav)
out = io.BytesIO()
synthesizer.save_wav(wavs, out)
return send_file(out, mimetype="audio/wav")
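
With the new `Lock`, concurrent requests to `/api/tts` are synthesized one at a time instead of racing on the shared synthesizer. A rough client-side sketch, assuming the server runs on the default port 5002 and the `requests` package is available:

```python
# Sketch: call the demo server's GET /api/tts endpoint.
# Assumes `python3 TTS/server/server.py --model_name ...` is running on localhost:5002.
import requests

resp = requests.get(
    "http://localhost:5002/api/tts",
    params={"text": "Hello from Coqui TTS.", "speaker_id": "", "style_wav": ""},
)
resp.raise_for_status()
with open("speech.wav", "wb") as f:
    f.write(resp.content)
```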

View File

@ -1,3 +1,4 @@
import os
import sys
from collections import Counter
from pathlib import Path
@ -12,20 +13,16 @@ from TTS.tts.datasets.formatters import *
def split_dataset(items, eval_split_max_size=None, eval_split_size=0.01):
"""Split a dataset into train and eval. Consider speaker distribution in multi-speaker training.
Args:
items (List[List]):
    A list of samples. Each sample is a list of `[audio_path, text, speaker_id]`.
eval_split_max_size (int):
    Maximum number of samples to be used for evaluation in a proportional split. Defaults to None (disabled).
eval_split_size (float):
    If between 0.0 and 1.0, represents the proportion of the dataset to include in the evaluation set.
    If > 1, represents the absolute number of evaluation samples. Defaults to 0.01 (1%).
"""
speakers = [item["speaker_name"] for item in items]
is_multi_speaker = len(set(speakers)) > 1
@ -59,6 +56,17 @@ def split_dataset(items, eval_split_max_size=None, eval_split_size=0.01):
return items[:eval_split_size], items[eval_split_size:]
def add_extra_keys(metadata, language, dataset_name):
for item in metadata:
# add language name
item["language"] = language
# add unique audio name
relfilepath = os.path.splitext(os.path.relpath(item["audio_file"], item["root_path"]))[0]
audio_unique_name = f"{dataset_name}#{relfilepath}"
item["audio_unique_name"] = audio_unique_name
return metadata
def load_tts_samples(
datasets: Union[List[Dict], Dict],
eval_split=True,
@ -97,7 +105,8 @@ def load_tts_samples(
if not isinstance(datasets, list):
datasets = [datasets]
for dataset in datasets:
name = dataset["name"]
formatter_name = dataset["formatter"]
dataset_name = dataset["dataset_name"]
root_path = dataset["path"]
meta_file_train = dataset["meta_file_train"]
meta_file_val = dataset["meta_file_val"]
@ -106,17 +115,19 @@ def load_tts_samples(
# setup the right data processor
if formatter is None:
formatter = _get_formatter_by_name(name)
formatter = _get_formatter_by_name(formatter_name)
# load train set
meta_data_train = formatter(root_path, meta_file_train, ignored_speakers=ignored_speakers)
meta_data_train = [{**item, **{"language": language}} for item in meta_data_train]
assert len(meta_data_train) > 0, f" [!] No training samples found in {root_path}/{meta_file_train}"
meta_data_train = add_extra_keys(meta_data_train, language, dataset_name)
print(f" | > Found {len(meta_data_train)} files in {Path(root_path).resolve()}")
# load evaluation split if set
if eval_split:
if meta_file_val:
meta_data_eval = formatter(root_path, meta_file_val, ignored_speakers=ignored_speakers)
meta_data_eval = [{**item, **{"language": language}} for item in meta_data_eval]
meta_data_eval = add_extra_keys(meta_data_eval, language, dataset_name)
else:
meta_data_eval, meta_data_train = split_dataset(meta_data_train, eval_split_max_size, eval_split_size)
meta_data_eval_all += meta_data_eval

View File

@ -1,3 +1,4 @@
import base64
import collections
import os
import random
@ -34,6 +35,12 @@ def noise_augment_audio(wav):
return wav + (1.0 / 32768.0) * np.random.rand(*wav.shape)
def string2filename(string):
# generate a safe and reversible filename based on a string
filename = base64.urlsafe_b64encode(string.encode("utf-8")).decode("utf-8", "ignore")
return filename
class TTSDataset(Dataset):
def __init__(
self,
@ -201,7 +208,7 @@ class TTSDataset(Dataset):
def get_f0(self, idx):
out_dict = self.f0_dataset[idx]
item = self.samples[idx]
assert item["audio_file"] == out_dict["audio_file"]
assert item["audio_unique_name"] == out_dict["audio_unique_name"]
return out_dict
@staticmethod
@ -256,6 +263,7 @@ class TTSDataset(Dataset):
"speaker_name": item["speaker_name"],
"language_name": item["language"],
"wav_file_name": os.path.basename(item["audio_file"]),
"audio_unique_name": item["audio_unique_name"],
}
return sample
@ -397,8 +405,8 @@ class TTSDataset(Dataset):
language_ids = None
# get pre-computed d-vectors
if self.d_vector_mapping is not None:
wav_files_names = list(batch["wav_file_name"])
d_vectors = [self.d_vector_mapping[w]["embedding"] for w in wav_files_names]
embedding_keys = list(batch["audio_unique_name"])
d_vectors = [self.d_vector_mapping[w]["embedding"] for w in embedding_keys]
else:
d_vectors = None
@ -560,19 +568,18 @@ class PhonemeDataset(Dataset):
def __getitem__(self, index):
item = self.samples[index]
ids = self.compute_or_load(item["audio_file"], item["text"])
ids = self.compute_or_load(string2filename(item["audio_unique_name"]), item["text"])
ph_hat = self.tokenizer.ids_to_text(ids)
return {"text": item["text"], "ph_hat": ph_hat, "token_ids": ids, "token_ids_len": len(ids)}
def __len__(self):
return len(self.samples)
def compute_or_load(self, wav_file, text):
def compute_or_load(self, file_name, text):
"""Compute phonemes for the given text.
If the phonemes are already cached, load them from cache.
"""
file_name = os.path.splitext(os.path.basename(wav_file))[0]
file_ext = "_phoneme.npy"
cache_path = os.path.join(self.cache_path, file_name + file_ext)
try:
@ -669,11 +676,11 @@ class F0Dataset:
def __getitem__(self, idx):
item = self.samples[idx]
f0 = self.compute_or_load(item["audio_file"])
f0 = self.compute_or_load(item["audio_file"], string2filename(item["audio_unique_name"]))
if self.normalize_f0:
assert self.mean is not None and self.std is not None, " [!] Mean and STD is not available"
f0 = self.normalize(f0)
return {"audio_file": item["audio_file"], "f0": f0}
return {"audio_unique_name": item["audio_unique_name"], "f0": f0}
def __len__(self):
return len(self.samples)
@ -705,8 +712,7 @@ class F0Dataset:
return self.pad_id
@staticmethod
def create_pitch_file_path(wav_file, cache_path):
file_name = os.path.splitext(os.path.basename(wav_file))[0]
def create_pitch_file_path(file_name, cache_path):
pitch_file = os.path.join(cache_path, file_name + "_pitch.npy")
return pitch_file
@ -744,11 +750,11 @@ class F0Dataset:
pitch[zero_idxs] = 0.0
return pitch
def compute_or_load(self, wav_file):
def compute_or_load(self, wav_file, audio_unique_name):
"""
compute pitch and return a numpy array of pitch values
"""
pitch_file = self.create_pitch_file_path(wav_file, self.cache_path)
pitch_file = self.create_pitch_file_path(audio_unique_name, self.cache_path)
if not os.path.exists(pitch_file):
pitch = self._compute_and_save_pitch(self.ap, wav_file, pitch_file)
else:
@ -756,14 +762,14 @@ class F0Dataset:
return pitch.astype(np.float32)
def collate_fn(self, batch):
audio_file = [item["audio_file"] for item in batch]
audio_unique_name = [item["audio_unique_name"] for item in batch]
f0s = [item["f0"] for item in batch]
f0_lens = [len(item["f0"]) for item in batch]
f0_lens_max = max(f0_lens)
f0s_torch = torch.LongTensor(len(f0s), f0_lens_max).fill_(self.get_pad_id())
for i, f0_len in enumerate(f0_lens):
f0s_torch[i, :f0_len] = torch.LongTensor(f0s[i])
return {"audio_file": audio_file, "f0": f0s_torch, "f0_lens": f0_lens}
return {"audio_unique_name": audio_unique_name, "f0": f0s_torch, "f0_lens": f0_lens}
def print_logs(self, level: int = 0) -> None:
indent = "\t" * level

View File

@ -15,6 +15,15 @@ from tqdm import tqdm
def coqui(root_path, meta_file, ignored_speakers=None):
"""Interal dataset formatter."""
filepath = os.path.join(root_path, meta_file)
# ensure there are 4 columns for every line
with open(filepath, "r", encoding="utf8") as f:
lines = f.readlines()
num_cols = len(lines[0].split("|")) # take the first row as reference
for idx, line in enumerate(lines[1:]):
if len(line.split("|")) != num_cols:
print(f" > Missing column in line {idx + 1} -> {line.strip()}")
# load metadata
metadata = pd.read_csv(os.path.join(root_path, meta_file), sep="|")
assert all(x in metadata.columns for x in ["audio_file", "text"])
speaker_name = None if "speaker_name" in metadata.columns else "coqui"
@ -97,9 +106,9 @@ def mailabs(root_path, meta_files=None, ignored_speakers=None):
meta_files (str): list of meta files to be used in the training. If None, finds all the csv files
recursively. Defaults to None
"""
speaker_regex = re.compile("by_book/(male|female)/(?P<speaker_name>[^/]+)/")
speaker_regex = re.compile(f"by_book{os.sep}(male|female){os.sep}(?P<speaker_name>[^{os.sep}]+){os.sep}")
if not meta_files:
csv_files = glob(root_path + "/**/metadata.csv", recursive=True)
csv_files = glob(root_path + f"{os.sep}**{os.sep}metadata.csv", recursive=True)
else:
csv_files = meta_files
@ -578,3 +587,17 @@ def kokoro(root_path, meta_file, **kwargs): # pylint: disable=unused-argument
text = cols[2].replace(" ", "")
items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path})
return items
def kss(root_path, meta_file, **kwargs): # pylint: disable=unused-argument
"""Korean single-speaker dataset from https://www.kaggle.com/datasets/bryanpark/korean-single-speaker-speech-dataset"""
txt_file = os.path.join(root_path, meta_file)
items = []
speaker_name = "kss"
with open(txt_file, "r", encoding="utf-8") as ttf:
for line in ttf:
cols = line.split("|")
wav_file = os.path.join(root_path, cols[0])
text = cols[2]  # cols[1] => "6월" (digit form), cols[2] => "유월" (spelled-out form)
items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name})
return items

View File

@ -398,9 +398,9 @@ class AlignTTS(BaseTTS):
logger.eval_audios(steps, audios, self.ap.sample_rate)
def load_checkpoint(
self, config, checkpoint_path, eval=False
self, config, checkpoint_path, eval=False, cache=False
): # pylint: disable=unused-argument, redefined-builtin
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"))
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
self.load_state_dict(state["model"])
if eval:
self.eval()

View File

@ -92,16 +92,17 @@ class BaseTacotron(BaseTTS):
pass
def load_checkpoint(
self, config, checkpoint_path, eval=False
self, config, checkpoint_path, eval=False, cache=False
): # pylint: disable=unused-argument, redefined-builtin
"""Load model checkpoint and set up internals.
Args:
config (Coqpi): model configuration.
checkpoint_path (str): path to checkpoint file.
eval (bool): whether to load model for evaluation.
eval (bool, optional): whether to load model for evaluation.
cache (bool, optional): If True, cache the file locally for subsequent calls. It is cached under `get_user_data_dir()/tts_cache`. Defaults to False.
"""
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"))
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
self.load_state_dict(state["model"])
# TODO: set r in run-time by taking it from the new config
if "r" in state:

View File

@ -144,11 +144,11 @@ class BaseTTS(BaseTrainerModel):
if speaker_name is None:
speaker_id = self.speaker_manager.get_random_id()
else:
speaker_id = self.speaker_manager.ids[speaker_name]
speaker_id = self.speaker_manager.name_to_id[speaker_name]
# get language id
if hasattr(self, "language_manager") and config.use_language_embedding and language_name is not None:
language_id = self.language_manager.ids[language_name]
language_id = self.language_manager.name_to_id[language_name]
return {
"text": text,
@ -288,11 +288,13 @@ class BaseTTS(BaseTrainerModel):
# setup multi-speaker attributes
if hasattr(self, "speaker_manager") and self.speaker_manager is not None:
if hasattr(config, "model_args"):
speaker_id_mapping = self.speaker_manager.ids if config.model_args.use_speaker_embedding else None
speaker_id_mapping = (
self.speaker_manager.name_to_id if config.model_args.use_speaker_embedding else None
)
d_vector_mapping = self.speaker_manager.embeddings if config.model_args.use_d_vector_file else None
config.use_d_vector_file = config.model_args.use_d_vector_file
else:
speaker_id_mapping = self.speaker_manager.ids if config.use_speaker_embedding else None
speaker_id_mapping = self.speaker_manager.name_to_id if config.use_speaker_embedding else None
d_vector_mapping = self.speaker_manager.embeddings if config.use_d_vector_file else None
else:
speaker_id_mapping = None
@ -300,7 +302,7 @@ class BaseTTS(BaseTrainerModel):
# setup multi-lingual attributes
if hasattr(self, "language_manager") and self.language_manager is not None:
language_id_mapping = self.language_manager.ids if self.args.use_language_embedding else None
language_id_mapping = self.language_manager.name_to_id if self.args.use_language_embedding else None
else:
language_id_mapping = None
@ -342,7 +344,7 @@ class BaseTTS(BaseTrainerModel):
loader = DataLoader(
dataset,
batch_size=config.eval_batch_size if is_eval else config.batch_size,
shuffle=False, # shuffle is done in the dataset.
shuffle=True, # if there is no other sampler
collate_fn=dataset.collate_fn,
drop_last=False, # setting this False might cause issues in AMP training.
sampler=sampler,
@ -363,7 +365,7 @@ class BaseTTS(BaseTrainerModel):
aux_inputs = {
"speaker_id": None
if not self.config.use_speaker_embedding
else random.sample(sorted(self.speaker_manager.ids.values()), 1),
else random.sample(sorted(self.speaker_manager.name_to_id.values()), 1),
"d_vector": d_vector,
"style_wav": None, # TODO: handle GST style input
}

View File

@ -16,6 +16,7 @@ from TTS.tts.utils.helpers import average_over_durations, generate_path, maximum
from TTS.tts.utils.speakers import SpeakerManager
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.tts.utils.visual import plot_alignment, plot_avg_pitch, plot_spectrogram
from TTS.utils.io import load_fsspec
@dataclass
@ -707,9 +708,9 @@ class ForwardTTS(BaseTTS):
logger.eval_audios(steps, audios, self.ap.sample_rate)
def load_checkpoint(
self, config, checkpoint_path, eval=False
self, config, checkpoint_path, eval=False, cache=False
): # pylint: disable=unused-argument, redefined-builtin
state = torch.load(checkpoint_path, map_location=torch.device("cpu"))
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
self.load_state_dict(state["model"])
if eval:
self.eval()

View File

@ -284,6 +284,7 @@ class VitsDataset(TTSDataset):
"wav_file": wav_filename,
"speaker_name": item["speaker_name"],
"language_name": item["language"],
"audio_unique_name": item["audio_unique_name"],
}
@property
@ -308,6 +309,7 @@ class VitsDataset(TTSDataset):
- language_names: :math:`[B]`
- audiofile_paths: :math:`[B]`
- raw_texts: :math:`[B]`
- audio_unique_names: :math:`[B]`
"""
# convert list of dicts to dict of lists
B = len(batch)
@ -348,6 +350,7 @@ class VitsDataset(TTSDataset):
"language_names": batch["language_name"],
"audio_files": batch["wav_file"],
"raw_text": batch["raw_text"],
"audio_unique_names": batch["audio_unique_name"],
}
@ -718,6 +721,10 @@ class Vits(BaseTTS):
use_spectral_norm=self.args.use_spectral_norm_disriminator,
)
@property
def device(self):
return next(self.parameters()).device
def init_multispeaker(self, config: Coqpit):
"""Initialize multi-speaker modules of a model. A model can be trained either with a speaker embedding layer
or with external `d_vectors` computed from a speaker encoder model.
@ -755,17 +762,12 @@ class Vits(BaseTTS):
if (
hasattr(self.speaker_manager.encoder, "audio_config")
and self.config.audio["sample_rate"] != self.speaker_manager.encoder.audio_config["sample_rate"]
and self.config.audio.sample_rate != self.speaker_manager.encoder.audio_config["sample_rate"]
):
self.audio_transform = torchaudio.transforms.Resample(
orig_freq=self.audio_config["sample_rate"],
orig_freq=self.config.audio.sample_rate,
new_freq=self.speaker_manager.encoder.audio_config["sample_rate"],
)
# pylint: disable=W0101,W0105
self.audio_transform = torchaudio.transforms.Resample(
orig_freq=self.config.audio.sample_rate,
new_freq=self.speaker_manager.encoder.audio_config["sample_rate"],
)
def _init_speaker_embedding(self):
# pylint: disable=attribute-defined-outside-init
@ -808,6 +810,13 @@ class Vits(BaseTTS):
orig_freq=self.config.audio["sample_rate"], new_freq=self.args.encoder_sample_rate
) # pylint: disable=W0201
def on_epoch_start(self, trainer): # pylint: disable=W0613
"""Freeze layers at the beginning of an epoch"""
self._freeze_layers()
# set the device of speaker encoder
if self.args.use_speaker_encoder_as_loss:
self.speaker_manager.encoder = self.speaker_manager.encoder.to(self.device)
def on_init_end(self, trainer): # pylint: disable=W0613
"""Reinit layes if needed"""
if self.args.reinit_DP:
@ -1185,7 +1194,6 @@ class Vits(BaseTTS):
y_lengths = torch.tensor([y.size(-1)]).to(y.device)
speaker_cond_src = reference_speaker_id if reference_speaker_id is not None else reference_d_vector
speaker_cond_tgt = speaker_id if speaker_id is not None else d_vector
# print(y.shape, y_lengths.shape)
wav, _, _ = self.voice_conversion(y, y_lengths, speaker_cond_src, speaker_cond_tgt)
return wav
@ -1229,8 +1237,6 @@ class Vits(BaseTTS):
Tuple[Dict, Dict]: Model outputs and computed losses.
"""
self._freeze_layers()
spec_lens = batch["spec_lens"]
if optimizer_idx == 0:
@ -1402,11 +1408,11 @@ class Vits(BaseTTS):
if speaker_name is None:
speaker_id = self.speaker_manager.get_random_id()
else:
speaker_id = self.speaker_manager.ids[speaker_name]
speaker_id = self.speaker_manager.name_to_id[speaker_name]
# get language id
if hasattr(self, "language_manager") and config.use_language_embedding and language_name is not None:
language_id = self.language_manager.ids[language_name]
language_id = self.language_manager.name_to_id[language_name]
return {
"text": text,
@ -1461,8 +1467,8 @@ class Vits(BaseTTS):
d_vectors = None
# get numerical speaker ids from speaker names
if self.speaker_manager is not None and self.speaker_manager.ids and self.args.use_speaker_embedding:
speaker_ids = [self.speaker_manager.ids[sn] for sn in batch["speaker_names"]]
if self.speaker_manager is not None and self.speaker_manager.name_to_id and self.args.use_speaker_embedding:
speaker_ids = [self.speaker_manager.name_to_id[sn] for sn in batch["speaker_names"]]
if speaker_ids is not None:
speaker_ids = torch.LongTensor(speaker_ids)
@ -1471,12 +1477,12 @@ class Vits(BaseTTS):
# get d_vectors from audio file names
if self.speaker_manager is not None and self.speaker_manager.embeddings and self.args.use_d_vector_file:
d_vector_mapping = self.speaker_manager.embeddings
d_vectors = [d_vector_mapping[w]["embedding"] for w in batch["audio_files"]]
d_vectors = [d_vector_mapping[w]["embedding"] for w in batch["audio_unique_names"]]
d_vectors = torch.FloatTensor(d_vectors)
# get language ids from language names
if self.language_manager is not None and self.language_manager.ids and self.args.use_language_embedding:
language_ids = [self.language_manager.ids[ln] for ln in batch["language_names"]]
if self.language_manager is not None and self.language_manager.name_to_id and self.args.use_language_embedding:
language_ids = [self.language_manager.name_to_id[ln] for ln in batch["language_names"]]
if language_ids is not None:
language_ids = torch.LongTensor(language_ids)
@ -1680,14 +1686,10 @@ class Vits(BaseTTS):
return [VitsDiscriminatorLoss(self.config), VitsGeneratorLoss(self.config)]
def load_checkpoint(
self,
config,
checkpoint_path,
eval=False,
strict=True,
self, config, checkpoint_path, eval=False, strict=True, cache=False
): # pylint: disable=unused-argument, redefined-builtin
"""Load the model checkpoint and setup for training or inference"""
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"))
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
# compat band-aid for the pre-trained models to not use the encoder baked into the model
# TODO: consider baking the speaker encoder into the model and calling it from there,
# as it is probably easier for model distribution.

View File

@ -37,11 +37,11 @@ class LanguageManager(BaseIDManager):
@property
def num_languages(self) -> int:
return len(list(self.ids.keys()))
return len(list(self.name_to_id.keys()))
@property
def language_names(self) -> List:
return list(self.ids.keys())
return list(self.name_to_id.keys())
@staticmethod
def parse_language_ids_from_config(c: Coqpit) -> Dict:
@ -67,7 +67,7 @@ class LanguageManager(BaseIDManager):
Args:
c (Coqpit): Config.
"""
self.ids = self.parse_language_ids_from_config(c)
self.name_to_id = self.parse_language_ids_from_config(c)
@staticmethod
def parse_ids_from_data(items: List, parse_key: str) -> Any:
@ -82,7 +82,7 @@ class LanguageManager(BaseIDManager):
Args:
file_path (str): Path to the output file.
"""
self._save_json(file_path, self.ids)
self._save_json(file_path, self.name_to_id)
@staticmethod
def init_from_config(config: Coqpit) -> "LanguageManager":

View File

@ -39,7 +39,7 @@ class BaseIDManager:
"""
def __init__(self, id_file_path: str = ""):
self.ids = {}
self.name_to_id = {}
if id_file_path:
self.load_ids_from_file(id_file_path)
@ -60,7 +60,7 @@ class BaseIDManager:
Args:
items (List): Data sampled returned by `load_tts_samples()`.
"""
self.ids = self.parse_ids_from_data(items, parse_key=parse_key)
self.name_to_id = self.parse_ids_from_data(items, parse_key=parse_key)
def load_ids_from_file(self, file_path: str) -> None:
"""Set IDs from a file.
@ -68,7 +68,7 @@ class BaseIDManager:
Args:
file_path (str): Path to the file.
"""
self.ids = load_file(file_path)
self.name_to_id = load_file(file_path)
def save_ids_to_file(self, file_path: str) -> None:
"""Save IDs to a json file.
@ -76,7 +76,7 @@ class BaseIDManager:
Args:
file_path (str): Path to the output file.
"""
save_file(self.ids, file_path)
save_file(self.name_to_id, file_path)
def get_random_id(self) -> Any:
"""Get a random embedding.
@ -86,8 +86,8 @@ class BaseIDManager:
Returns:
Any: ID.
"""
if self.ids:
return self.ids[random.choices(list(self.ids.keys()))[0]]
if self.name_to_id:
return self.name_to_id[random.choices(list(self.name_to_id.keys()))[0]]
return None
@ -109,11 +109,27 @@ class BaseIDManager:
class EmbeddingManager(BaseIDManager):
"""Base `Embedding` Manager class. Every new `Embedding` manager must inherit this.
It defines common `Embedding` manager specific functions.
It expects embeddings files in the following format:
::
{
'audio_file_key':{
'name': 'category_name',
'embedding': [<embedding_values>]
},
...
}
`audio_file_key` is a unique key for the audio file in the dataset. It can be the path to the file or any other unique key.
`embedding` is the embedding vector of the audio file.
`name` can be the name of the speaker of the audio file.
"""
def __init__(
self,
embedding_file_path: str = "",
embedding_file_path: Union[str, List[str]] = "",
id_file_path: str = "",
encoder_model_path: str = "",
encoder_config_path: str = "",
@ -129,11 +145,24 @@ class EmbeddingManager(BaseIDManager):
self.use_cuda = use_cuda
if embedding_file_path:
self.load_embeddings_from_file(embedding_file_path)
if isinstance(embedding_file_path, list):
self.load_embeddings_from_list_of_files(embedding_file_path)
else:
self.load_embeddings_from_file(embedding_file_path)
if encoder_model_path and encoder_config_path:
self.init_encoder(encoder_model_path, encoder_config_path, use_cuda)
@property
def num_embeddings(self):
"""Get number of embeddings."""
return len(self.embeddings)
@property
def num_names(self):
"""Get number of embeddings."""
return len(self.embeddings_by_names)
@property
def embedding_dim(self):
"""Dimensionality of embeddings. If embeddings are not loaded, returns zero."""
@ -141,6 +170,11 @@ class EmbeddingManager(BaseIDManager):
return len(self.embeddings[list(self.embeddings.keys())[0]]["embedding"])
return 0
@property
def embedding_names(self):
"""Get embedding names."""
return list(self.embeddings_by_names.keys())
def save_embeddings_to_file(self, file_path: str) -> None:
"""Save embeddings to a json file.
@ -149,20 +183,57 @@ class EmbeddingManager(BaseIDManager):
"""
save_file(self.embeddings, file_path)
@staticmethod
def read_embeddings_from_file(file_path: str):
"""Load embeddings from a json file.
Args:
file_path (str): Path to the file.
"""
embeddings = load_file(file_path)
speakers = sorted({x["name"] for x in embeddings.values()})
name_to_id = {name: i for i, name in enumerate(speakers)}
clip_ids = list(set(sorted(clip_name for clip_name in embeddings.keys())))
# cache embeddings_by_names for fast inference using a bigger speakers.json
embeddings_by_names = {}
for x in embeddings.values():
if x["name"] not in embeddings_by_names.keys():
embeddings_by_names[x["name"]] = [x["embedding"]]
else:
embeddings_by_names[x["name"]].append(x["embedding"])
return name_to_id, clip_ids, embeddings, embeddings_by_names
def load_embeddings_from_file(self, file_path: str) -> None:
"""Load embeddings from a json file.
Args:
file_path (str): Path to the target json file.
"""
self.embeddings = load_file(file_path)
self.name_to_id, self.clip_ids, self.embeddings, self.embeddings_by_names = self.read_embeddings_from_file(
file_path
)
speakers = sorted({x["name"] for x in self.embeddings.values()})
self.ids = {name: i for i, name in enumerate(speakers)}
def load_embeddings_from_list_of_files(self, file_paths: List[str]) -> None:
"""Load embeddings from a list of json files and don't allow duplicate keys.
self.clip_ids = list(set(sorted(clip_name for clip_name in self.embeddings.keys())))
# cache embeddings_by_names for fast inference using a bigger speakers.json
self.embeddings_by_names = self.get_embeddings_by_names()
Args:
file_paths (List[str]): List of paths to the target json files.
"""
self.name_to_id = {}
self.clip_ids = []
self.embeddings_by_names = {}
self.embeddings = {}
for file_path in file_paths:
ids, clip_ids, embeddings, embeddings_by_names = self.read_embeddings_from_file(file_path)
# check colliding keys
duplicates = set(self.embeddings.keys()) & set(embeddings.keys())
if duplicates:
raise ValueError(f" [!] Duplicate embedding names <{duplicates}> in {file_path}")
# store values
self.name_to_id.update(ids)
self.clip_ids.extend(clip_ids)
self.embeddings_by_names.update(embeddings_by_names)
self.embeddings.update(embeddings)
def get_embedding_by_clip(self, clip_idx: str) -> List:
"""Get embedding by clip ID.

View File

@ -73,14 +73,14 @@ class SpeakerManager(EmbeddingManager):
@property
def num_speakers(self):
return len(self.ids)
return len(self.name_to_id)
@property
def speaker_names(self):
return list(self.ids.keys())
return list(self.name_to_id.keys())
def get_speakers(self) -> List:
return self.ids
return self.name_to_id
@staticmethod
def init_from_config(config: "Coqpit", samples: Union[List[List], List[Dict]] = None) -> "SpeakerManager":
@ -182,10 +182,10 @@ def get_speaker_manager(c: Coqpit, data: List = None, restore_path: str = None,
speaker_manager.load_embeddings_from_file(c.d_vector_file)
speaker_manager.load_embeddings_from_file(speakers_file)
elif not c.use_d_vector_file: # restore speaker manager with speaker ID file.
speaker_ids_from_data = speaker_manager.ids
speaker_ids_from_data = speaker_manager.name_to_id
speaker_manager.load_ids_from_file(speakers_file)
assert all(
speaker in speaker_manager.ids for speaker in speaker_ids_from_data
speaker in speaker_manager.name_to_id for speaker in speaker_ids_from_data
), " [!] You cannot introduce new speakers to a pre-trained model."
elif c.use_d_vector_file and c.d_vector_file:
# new speaker manager with external speaker embeddings.
@ -199,7 +199,7 @@ def get_speaker_manager(c: Coqpit, data: List = None, restore_path: str = None,
if speaker_manager.num_speakers > 0:
print(
" > Speaker manager is loaded with {} speakers: {}".format(
speaker_manager.num_speakers, ", ".join(speaker_manager.ids)
speaker_manager.num_speakers, ", ".join(speaker_manager.name_to_id)
)
)

View File

@ -295,7 +295,12 @@ def transfer_voice(
reference_d_vector = embedding_to_torch(reference_d_vector, cuda=use_cuda)
# load reference_wav audio
reference_wav = embedding_to_torch(model.ap.load_wav(reference_wav, sr=model.ap.sample_rate), cuda=use_cuda)
reference_wav = embedding_to_torch(
model.ap.load_wav(
reference_wav, sr=model.args.encoder_sample_rate if model.args.encoder_sample_rate else model.ap.sample_rate
),
cuda=use_cuda,
)
if hasattr(model, "module"):
_func = model.module.inference_voice_conversion

View File

View File

@ -0,0 +1,44 @@
# coding: utf-8
# Add the word you want to the dictionary.
etc_dictionary = {"1+1": "원플러스원", "2+1": "투플러스원"}
english_dictionary = {
"KOREA": "코리아",
"IDOL": "아이돌",
"IT": "아이티",
"IQ": "아이큐",
"UP": "",
"DOWN": "다운",
"PC": "피씨",
"CCTV": "씨씨티비",
"SNS": "에스엔에스",
"AI": "에이아이",
"CEO": "씨이오",
"A": "에이",
"B": "",
"C": "",
"D": "",
"E": "",
"F": "에프",
"G": "",
"H": "에이치",
"I": "아이",
"J": "제이",
"K": "케이",
"L": "",
"M": "",
"N": "",
"O": "",
"P": "",
"Q": "",
"R": "",
"S": "에스",
"T": "",
"U": "",
"V": "브이",
"W": "더블유",
"X": "엑스",
"Y": "와이",
"Z": "제트",
}

View File

@ -0,0 +1,32 @@
# coding: utf-8
# Code based on https://github.com/carpedm20/multi-speaker-tacotron-tensorflow/blob/master/text/korean.py
import re
from TTS.tts.utils.text.korean.ko_dictionary import english_dictionary, etc_dictionary
def normalize(text):
text = text.strip()
text = re.sub("[⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]", "", text)
text = normalize_with_dictionary(text, etc_dictionary)
text = normalize_english(text)
text = text.lower()
return text
def normalize_with_dictionary(text, dic):
if any(key in text for key in dic.keys()):
pattern = re.compile("|".join(re.escape(key) for key in dic.keys()))
return pattern.sub(lambda x: dic[x.group()], text)
return text
def normalize_english(text):
def fn(m):
word = m.group()
if word in english_dictionary:
return english_dictionary.get(word)
return word
text = re.sub("([A-Za-z]+)", fn, text)
return text
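A brief usage sketch of the normalization helpers above; the outputs in the comments follow from the dictionary substitutions shown in `ko_dictionary.py` and are noted as expectations, not verified results:

```python
# Assumes the modules added in this PR are importable.
from TTS.tts.utils.text.korean.ko_dictionary import etc_dictionary
from TTS.tts.utils.text.korean.korean import normalize, normalize_with_dictionary

print(normalize_with_dictionary("1+1", etc_dictionary))  # expected: 원플러스원
print(normalize("IT 1+1"))                               # expected: 아이티 원플러스원
```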

View File

@ -0,0 +1,36 @@
from jamo import hangul_to_jamo
from TTS.tts.utils.text.korean.korean import normalize
g2p = None
def korean_text_to_phonemes(text, character: str = "hangeul") -> str:
"""
The input and output values look the same, but they are different in Unicode.
example :
input = '하늘' (Unicode : \ud558\ub298), (하 + 늘)
output = '하늘' (Unicode : \u1112\u1161\u1102\u1173\u11af), (ᄒ + ᅡ + ᄂ + ᅳ + ᆯ)
"""
global g2p # pylint: disable=global-statement
if g2p is None:
from g2pkk import G2p
g2p = G2p()
if character == "english":
from anyascii import anyascii
text = normalize(text)
text = g2p(text)
text = anyascii(text)
return text
text = normalize(text)
text = g2p(text)
text = list(hangul_to_jamo(text)) # '하늘' --> ['ᄒ', 'ᅡ', 'ᄂ', 'ᅳ', 'ᆯ']
return "".join(text)

View File

@ -1,53 +1,57 @@
from TTS.tts.utils.text.phonemizers.base import BasePhonemizer
from TTS.tts.utils.text.phonemizers.espeak_wrapper import ESpeak
from TTS.tts.utils.text.phonemizers.gruut_wrapper import Gruut
from TTS.tts.utils.text.phonemizers.ja_jp_phonemizer import JA_JP_Phonemizer
from TTS.tts.utils.text.phonemizers.zh_cn_phonemizer import ZH_CN_Phonemizer
PHONEMIZERS = {b.name(): b for b in (ESpeak, Gruut, JA_JP_Phonemizer)}
ESPEAK_LANGS = list(ESpeak.supported_languages().keys())
GRUUT_LANGS = list(Gruut.supported_languages())
# Dict setting default phonemizers for each language
# Add Gruut languages
_ = [Gruut.name()] * len(GRUUT_LANGS)
DEF_LANG_TO_PHONEMIZER = dict(list(zip(GRUUT_LANGS, _)))
# Add ESpeak languages and override any existing ones
_ = [ESpeak.name()] * len(ESPEAK_LANGS)
_new_dict = dict(list(zip(list(ESPEAK_LANGS), _)))
DEF_LANG_TO_PHONEMIZER.update(_new_dict)
# Force default for some languages
DEF_LANG_TO_PHONEMIZER["en"] = DEF_LANG_TO_PHONEMIZER["en-us"]
DEF_LANG_TO_PHONEMIZER["ja-jp"] = JA_JP_Phonemizer.name()
DEF_LANG_TO_PHONEMIZER["zh-cn"] = ZH_CN_Phonemizer.name()
def get_phonemizer_by_name(name: str, **kwargs) -> BasePhonemizer:
"""Initiate a phonemizer by name
Args:
name (str):
Name of the phonemizer that should match `phonemizer.name()`.
kwargs (dict):
Extra keyword arguments that should be passed to the phonemizer.
"""
if name == "espeak":
return ESpeak(**kwargs)
if name == "gruut":
return Gruut(**kwargs)
if name == "zh_cn_phonemizer":
return ZH_CN_Phonemizer(**kwargs)
if name == "ja_jp_phonemizer":
return JA_JP_Phonemizer(**kwargs)
raise ValueError(f"Phonemizer {name} not found")
if __name__ == "__main__":
print(DEF_LANG_TO_PHONEMIZER)
from TTS.tts.utils.text.phonemizers.base import BasePhonemizer
from TTS.tts.utils.text.phonemizers.espeak_wrapper import ESpeak
from TTS.tts.utils.text.phonemizers.gruut_wrapper import Gruut
from TTS.tts.utils.text.phonemizers.ja_jp_phonemizer import JA_JP_Phonemizer
from TTS.tts.utils.text.phonemizers.ko_kr_phonemizer import KO_KR_Phonemizer
from TTS.tts.utils.text.phonemizers.zh_cn_phonemizer import ZH_CN_Phonemizer
PHONEMIZERS = {b.name(): b for b in (ESpeak, Gruut, JA_JP_Phonemizer)}
ESPEAK_LANGS = list(ESpeak.supported_languages().keys())
GRUUT_LANGS = list(Gruut.supported_languages())
# Dict setting default phonemizers for each language
# Add Gruut languages
_ = [Gruut.name()] * len(GRUUT_LANGS)
DEF_LANG_TO_PHONEMIZER = dict(list(zip(GRUUT_LANGS, _)))
# Add ESpeak languages and override any existing ones
_ = [ESpeak.name()] * len(ESPEAK_LANGS)
_new_dict = dict(list(zip(list(ESPEAK_LANGS), _)))
DEF_LANG_TO_PHONEMIZER.update(_new_dict)
# Force default for some languages
DEF_LANG_TO_PHONEMIZER["en"] = DEF_LANG_TO_PHONEMIZER["en-us"]
DEF_LANG_TO_PHONEMIZER["ja-jp"] = JA_JP_Phonemizer.name()
DEF_LANG_TO_PHONEMIZER["zh-cn"] = ZH_CN_Phonemizer.name()
DEF_LANG_TO_PHONEMIZER["ko-kr"] = KO_KR_Phonemizer.name()
def get_phonemizer_by_name(name: str, **kwargs) -> BasePhonemizer:
"""Initiate a phonemizer by name
Args:
name (str):
Name of the phonemizer that should match `phonemizer.name()`.
kwargs (dict):
Extra keyword arguments that should be passed to the phonemizer.
"""
if name == "espeak":
return ESpeak(**kwargs)
if name == "gruut":
return Gruut(**kwargs)
if name == "zh_cn_phonemizer":
return ZH_CN_Phonemizer(**kwargs)
if name == "ja_jp_phonemizer":
return JA_JP_Phonemizer(**kwargs)
if name == "ko_kr_phonemizer":
return KO_KR_Phonemizer(**kwargs)
raise ValueError(f"Phonemizer {name} not found")
if __name__ == "__main__":
print(DEF_LANG_TO_PHONEMIZER)
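A hedged usage sketch of the updated registry, resolving the new Korean phonemizer by name (requires the `jamo` and `g2pkk` dependencies added elsewhere in this PR):

```python
# Sketch: look up and use the phonemizer registered above.
from TTS.tts.utils.text.phonemizers import DEF_LANG_TO_PHONEMIZER, get_phonemizer_by_name

print(DEF_LANG_TO_PHONEMIZER["ko-kr"])  # expected: ko_kr_phonemizer
phonemizer = get_phonemizer_by_name("ko_kr_phonemizer")
print(phonemizer.phonemize("이 문장은 음성합성 테스트를 위한 문장입니다.", separator="|"))
```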

View File

@ -94,6 +94,8 @@ class ESpeak(BasePhonemizer):
# band-aid for backwards compatibility
if language == "en":
language = "en-us"
if language == "zh-cn":
language = "cmn"
super().__init__(language, punctuations=punctuations, keep_puncs=keep_puncs)
if backend is not None:

View File

@ -0,0 +1,65 @@
from typing import Dict
from TTS.tts.utils.text.korean.phonemizer import korean_text_to_phonemes
from TTS.tts.utils.text.phonemizers.base import BasePhonemizer
_DEF_KO_PUNCS = "、.,[]()?!〽~『』「」【】"
class KO_KR_Phonemizer(BasePhonemizer):
"""🐸TTS ko_kr_phonemizer using functions in `TTS.tts.utils.text.korean.phonemizer`
TODO: Add Korean to character (ᄀᄁᄂᄃᄄᄅᄆᄇᄈᄉᄊᄋᄌᄍᄎᄏᄐᄑ하ᅢᅣᅤᅥᅦᅧᅨᅩᅪᅫᅬᅭᅮᅯᅰᅱᅲᅳᅴᅵᆨᆩᆪᆫᆬᆭᆮᆯᆰᆱᆲᆳᆴᆵᆶᆷᆸᆹᆺᆻᆼᆽᆾᆿᇀᇁᇂ)
Example:
>>> from TTS.tts.utils.text.phonemizers import KO_KR_Phonemizer
>>> phonemizer = KO_KR_Phonemizer()
>>> phonemizer.phonemize("이 문장은 음성합성 테스트를 위한 문장입니다.", separator="|")
'ᄋ|ᅵ| |ᄆ|ᅮ|ᆫ|ᄌ|ᅡ|ᆼ|ᄋ|ᅳ| |ᄂ|ᅳ|ᆷ|ᄉ|ᅥ|ᆼ|ᄒ|ᅡ|ᆸ|ᄊ|ᅥ|ᆼ| |ᄐ|ᅦ|ᄉ|ᅳ|ᄐ|ᅳ|ᄅ|ᅳ| |ᄅ|ᅱ|ᄒ|ᅡ|ᆫ| |ᄆ|ᅮ|ᆫ|ᄌ|ᅡ|ᆼ|ᄋ|ᅵ|ᆷ|ᄂ|ᅵ|ᄃ|ᅡ|.'
>>> from TTS.tts.utils.text.phonemizers import KO_KR_Phonemizer
>>> phonemizer = KO_KR_Phonemizer()
>>> phonemizer.phonemize("이 문장은 음성합성 테스트를 위한 문장입니다.", separator="|", character='english')
'I| |M|u|n|J|a|n|g|E|u| |N|e|u|m|S|e|o|n|g|H|a|b|S|s|e|o|n|g| |T|e|S|e|u|T|e|u|L|e|u| |L|w|i|H|a|n| |M|u|n|J|a|n|g|I|m|N|i|D|a|.'
"""
language = "ko-kr"
def __init__(self, punctuations=_DEF_KO_PUNCS, keep_puncs=True, **kwargs): # pylint: disable=unused-argument
super().__init__(self.language, punctuations=punctuations, keep_puncs=keep_puncs)
@staticmethod
def name():
return "ko_kr_phonemizer"
def _phonemize(self, text: str, separator: str = "", character: str = "hangeul") -> str:
ph = korean_text_to_phonemes(text, character=character)
if separator is not None or separator != "":
return separator.join(ph)
return ph
def phonemize(self, text: str, separator: str = "", character: str = "hangeul") -> str:
return self._phonemize(text, separator, character)
@staticmethod
def supported_languages() -> Dict:
return {"ko-kr": "hangeul(korean)"}
def version(self) -> str:
return "0.0.2"
def is_available(self) -> bool:
return True
if __name__ == "__main__":
texts = "이 문장은 음성합성 테스트를 위한 문장입니다."
e = KO_KR_Phonemizer()
print(e.supported_languages())
print(e.version())
print(e.language)
print(e.name())
print(e.is_available())
print(e.phonemize(texts))

View File

@ -42,7 +42,7 @@ class ZH_CN_Phonemizer(BasePhonemizer):
@staticmethod
def supported_languages() -> Dict:
return {"zh-cn": "Japanese (Japan)"}
return {"zh-cn": "Chinese (China)"}
def version(self) -> str:
return "0.0.1"

View File

@ -2,9 +2,9 @@ from typing import Tuple
import librosa
import numpy as np
import pyworld as pw
import scipy
import soundfile as sf
from librosa import pyin
# For using kwargs
# pylint: disable=unused-argument
@ -242,12 +242,28 @@ def compute_stft_paddings(
def compute_f0(
*, x: np.ndarray = None, pitch_fmax: float = None, hop_length: int = None, sample_rate: int = None, **kwargs
*,
x: np.ndarray = None,
pitch_fmax: float = None,
pitch_fmin: float = None,
hop_length: int = None,
win_length: int = None,
sample_rate: int = None,
stft_pad_mode: str = "reflect",
center: bool = True,
**kwargs,
) -> np.ndarray:
"""Compute pitch (f0) of a waveform using the same parameters used for computing melspectrogram.
Args:
x (np.ndarray): Waveform. Shape :math:`[T_wav,]`
pitch_fmax (float): Pitch max value.
pitch_fmin (float): Pitch min value.
hop_length (int): Number of frames between STFT columns.
win_length (int): STFT window length.
sample_rate (int): Audio sampling rate.
stft_pad_mode (str): Padding mode for STFT.
center (bool): Centered padding.
Returns:
np.ndarray: Pitch. Shape :math:`[T_pitch,]`. :math:`T_pitch == T_wav / hop_length`
@ -255,20 +271,35 @@ def compute_f0(
Examples:
>>> WAV_FILE = filename = librosa.util.example_audio_file()
>>> from TTS.config import BaseAudioConfig
>>> from TTS.utils.audio.processor import AudioProcessor
>>> conf = BaseAudioConfig(pitch_fmax=8000)
>>> from TTS.utils.audio import AudioProcessor
>>> conf = BaseAudioConfig(pitch_fmax=640, pitch_fmin=1)
>>> ap = AudioProcessor(**conf)
>>> wav = ap.load_wav(WAV_FILE, sr=22050)[:5 * 22050]
>>> wav = ap.load_wav(WAV_FILE, sr=ap.sample_rate)[:5 * ap.sample_rate]
>>> pitch = ap.compute_f0(wav)
"""
assert pitch_fmax is not None, " [!] Set `pitch_fmax` before calling `compute_f0`."
assert pitch_fmin is not None, " [!] Set `pitch_fmin` before calling `compute_f0`."
f0, t = pw.dio(
x.astype(np.double),
fs=sample_rate,
f0_ceil=pitch_fmax,
frame_period=1000 * hop_length / sample_rate,
f0, voiced_mask, _ = pyin(
y=x.astype(np.double),
fmin=pitch_fmin,
fmax=pitch_fmax,
sr=sample_rate,
frame_length=win_length,
win_length=win_length // 2,
hop_length=hop_length,
pad_mode=stft_pad_mode,
center=center,
n_thresholds=100,
beta_parameters=(2, 18),
boltzmann_parameter=2,
resolution=0.1,
max_transition_rate=35.92,
switch_prob=0.01,
no_trough_prob=0.01,
)
f0 = pw.stonemask(x.astype(np.double), f0, t, sample_rate)
f0[~voiced_mask] = 0.0
return f0
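For context, a self-contained sketch of the `librosa.pyin` call that replaces `pyworld` here, run on a synthetic tone; the parameter values mirror the ones above but are otherwise arbitrary:

```python
import numpy as np
from librosa import pyin

sr, hop_length, win_length = 22050, 256, 1024
t = np.arange(sr) / sr
x = np.sin(2 * np.pi * 220.0 * t)  # one second of a 220 Hz tone

# fmin=1 / fmax=640 mirror the pitch_fmin / pitch_fmax values used above;
# such a wide search range makes pyin slower but still valid.
f0, voiced_mask, _ = pyin(
    y=x.astype(np.double),
    fmin=1,
    fmax=640,
    sr=sr,
    frame_length=win_length,
    win_length=win_length // 2,
    hop_length=hop_length,
    center=True,
)
f0[~voiced_mask] = 0.0  # zero out unvoiced frames, as in the function above
print(f0.shape)         # roughly len(x) / hop_length frames, ~220 Hz where voiced
```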

View File

@ -2,12 +2,12 @@ from typing import Dict, Tuple
import librosa
import numpy as np
import pyworld as pw
import scipy.io.wavfile
import scipy.signal
import soundfile as sf
from TTS.tts.utils.helpers import StandardScaler
from TTS.utils.audio.numpy_transforms import compute_f0
# pylint: disable=too-many-public-methods
@ -573,23 +573,28 @@ class AudioProcessor(object):
>>> WAV_FILE = filename = librosa.util.example_audio_file()
>>> from TTS.config import BaseAudioConfig
>>> from TTS.utils.audio import AudioProcessor
>>> conf = BaseAudioConfig(pitch_fmax=8000)
>>> conf = BaseAudioConfig(pitch_fmax=640, pitch_fmin=1)
>>> ap = AudioProcessor(**conf)
>>> wav = ap.load_wav(WAV_FILE, sr=22050)[:5 * 22050]
>>> wav = ap.load_wav(WAV_FILE, sr=ap.sample_rate)[:5 * ap.sample_rate]
>>> pitch = ap.compute_f0(wav)
"""
assert self.pitch_fmax is not None, " [!] Set `pitch_fmax` before calling `compute_f0`."
assert self.pitch_fmin is not None, " [!] Set `pitch_fmin` before calling `compute_f0`."
# align F0 length to the spectrogram length
if len(x) % self.hop_length == 0:
x = np.pad(x, (0, self.hop_length // 2), mode="reflect")
x = np.pad(x, (0, self.hop_length // 2), mode=self.stft_pad_mode)
f0, t = pw.dio(
x.astype(np.double),
fs=self.sample_rate,
f0_ceil=self.pitch_fmax,
frame_period=1000 * self.hop_length / self.sample_rate,
f0 = compute_f0(
x=x,
pitch_fmax=self.pitch_fmax,
pitch_fmin=self.pitch_fmin,
hop_length=self.hop_length,
win_length=self.win_length,
sample_rate=self.sample_rate,
stft_pad_mode=self.stft_pad_mode,
center=True,
)
f0 = pw.stonemask(x.astype(np.double), f0, t, self.sample_rate)
return f0
### Audio Processing ###

View File

@ -38,9 +38,9 @@ class CapacitronOptimizer:
self.param_groups = self.primary_optimizer.param_groups
self.primary_optimizer.step()
def zero_grad(self):
self.primary_optimizer.zero_grad()
self.secondary_optimizer.zero_grad()
def zero_grad(self, set_to_none=False):
self.primary_optimizer.zero_grad(set_to_none)
self.secondary_optimizer.zero_grad(set_to_none)
def load_state_dict(self, state_dict):
self.primary_optimizer.load_state_dict(state_dict[0])
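A short illustration of what forwarding `set_to_none` changes: with `set_to_none=True`, PyTorch drops the gradient tensors instead of zero-filling them, which is why the wrapper now accepts and passes the flag through:

```python
import torch

p = torch.nn.Parameter(torch.ones(3))
opt = torch.optim.SGD([p], lr=0.1)

p.sum().backward()
opt.zero_grad(set_to_none=False)
print(p.grad)  # tensor([0., 0., 0.]) - the gradient buffer is kept and zeroed

p.sum().backward()
opt.zero_grad(set_to_none=True)
print(p.grad)  # None - the gradient tensor is freed
```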

View File

@ -9,6 +9,8 @@ import fsspec
import torch
from coqpit import Coqpit
from TTS.utils.generic_utils import get_user_data_dir
class RenamingUnpickler(pickle_tts.Unpickler):
"""Overload default pickler to solve module renaming problem"""
@ -57,6 +59,7 @@ def copy_model_files(config: Coqpit, out_path, new_fields=None):
def load_fsspec(
path: str,
map_location: Union[str, Callable, torch.device, Dict[Union[str, torch.device], Union[str, torch.device]]] = None,
cache: bool = True,
**kwargs,
) -> Any:
"""Like torch.load but can load from other locations (e.g. s3:// , gs://).
@ -64,21 +67,33 @@ def load_fsspec(
Args:
path: Any path or url supported by fsspec.
map_location: torch.device or str.
cache: If True, cache a remote file locally for subsequent calls. It is cached under `get_user_data_dir()/tts_cache`. Defaults to True.
**kwargs: Keyword arguments forwarded to torch.load.
Returns:
Object stored in path.
"""
with fsspec.open(path, "rb") as f:
return torch.load(f, map_location=map_location, **kwargs)
is_local = os.path.isdir(path) or os.path.isfile(path)
if cache and not is_local:
with fsspec.open(
f"filecache::{path}",
filecache={"cache_storage": str(get_user_data_dir("tts_cache"))},
mode="rb",
) as f:
return torch.load(f, map_location=map_location, **kwargs)
else:
with fsspec.open(path, "rb") as f:
return torch.load(f, map_location=map_location, **kwargs)
def load_checkpoint(model, checkpoint_path, use_cuda=False, eval=False): # pylint: disable=redefined-builtin
def load_checkpoint(
model, checkpoint_path, use_cuda=False, eval=False, cache=False
): # pylint: disable=redefined-builtin
try:
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"))
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
except ModuleNotFoundError:
pickle_tts.Unpickler = RenamingUnpickler
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), pickle_module=pickle_tts)
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), pickle_module=pickle_tts, cache=cache)
model.load_state_dict(state["model"])
if use_cuda:
model.cuda()
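A hedged sketch of the `filecache::` chaining the new `cache` flag relies on; the checkpoint URL is a placeholder and the cache directory only approximates what `get_user_data_dir("tts_cache")` resolves to on Linux:

```python
# Sketch only: the URL below is a placeholder, not a real release asset.
import os

import fsspec
import torch

url = "https://example.com/path/to/model_file.pth"
cache_dir = os.path.expanduser("~/.local/share/tts_cache")  # approximate Linux location

with fsspec.open(f"filecache::{url}", filecache={"cache_storage": cache_dir}, mode="rb") as f:
    state = torch.load(f, map_location="cpu")  # later calls reuse the cached local copy
```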

View File

@ -32,11 +32,14 @@ class ModelManager(object):
home path.
Args:
models_file (str): path to .model.json
models_file (str): path to .model.json file. Defaults to None.
output_prefix (str): prefix to `tts` to download models. Defaults to None
progress_bar (bool): print a progress bar when downloading a file. Defaults to False.
"""
def __init__(self, models_file=None, output_prefix=None):
def __init__(self, models_file=None, output_prefix=None, progress_bar=False):
super().__init__()
self.progress_bar = progress_bar
if output_prefix is None:
self.output_prefix = get_user_data_dir("tts")
else:
@ -236,7 +239,7 @@ class ModelManager(object):
os.makedirs(output_path, exist_ok=True)
print(f" > Downloading model to {output_path}")
# download from github release
self._download_zip_file(model_item["github_rls_url"], output_path)
self._download_zip_file(model_item["github_rls_url"], output_path, self.progress_bar)
self.print_model_license(model_item=model_item)
# find downloaded files
output_model_path, output_config_path = self._find_files(output_path)
@ -334,7 +337,7 @@ class ModelManager(object):
config.save_json(config_path)
@staticmethod
def _download_zip_file(file_url, output_folder):
def _download_zip_file(file_url, output_folder, progress_bar):
"""Download the github releases"""
# download the file
r = requests.get(file_url, stream=True)
@ -342,11 +345,13 @@ class ModelManager(object):
try:
total_size_in_bytes = int(r.headers.get("content-length", 0))
block_size = 1024 # 1 Kibibyte
progress_bar = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True)
if progress_bar:
progress_bar = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True)
temp_zip_name = os.path.join(output_folder, file_url.split("/")[-1])
with open(temp_zip_name, "wb") as file:
for data in r.iter_content(block_size):
progress_bar.update(len(data))
if progress_bar:
progress_bar.update(len(data))
file.write(data)
with zipfile.ZipFile(temp_zip_name) as z:
z.extractall(output_folder)

View File

@ -212,8 +212,13 @@ class Synthesizer(object):
# handle multi-speaker
speaker_embedding = None
speaker_id = None
if self.tts_speakers_file or hasattr(self.tts_model.speaker_manager, "ids"):
if speaker_name and isinstance(speaker_name, str):
if self.tts_speakers_file or hasattr(self.tts_model.speaker_manager, "name_to_id"):
# handle Neon models with single speaker.
if len(self.tts_model.speaker_manager.name_to_id) == 1:
speaker_id = list(self.tts_model.speaker_manager.name_to_id.values())[0]
elif speaker_name and isinstance(speaker_name, str):
if self.tts_config.use_d_vector_file:
# get the average speaker embedding from the saved d_vectors.
speaker_embedding = self.tts_model.speaker_manager.get_mean_embedding(
@ -222,7 +227,7 @@ class Synthesizer(object):
speaker_embedding = np.array(speaker_embedding)[None, :] # [1 x embedding_dim]
else:
# get speaker idx from the speaker name
speaker_id = self.tts_model.speaker_manager.ids[speaker_name]
speaker_id = self.tts_model.speaker_manager.name_to_id[speaker_name]
elif not speaker_name and not speaker_wav:
raise ValueError(
@ -243,8 +248,12 @@ class Synthesizer(object):
if self.tts_languages_file or (
hasattr(self.tts_model, "language_manager") and self.tts_model.language_manager is not None
):
if language_name and isinstance(language_name, str):
language_id = self.tts_model.language_manager.ids[language_name]
if len(self.tts_model.language_manager.name_to_id) == 1:
language_id = list(self.tts_model.language_manager.name_to_id.values())[0]
elif language_name and isinstance(language_name, str):
language_id = self.tts_model.language_manager.name_to_id[language_name]
elif not language_name:
raise ValueError(
@ -316,7 +325,7 @@ class Synthesizer(object):
# get the speaker embedding or speaker id for the reference wav file
reference_speaker_embedding = None
reference_speaker_id = None
if self.tts_speakers_file or hasattr(self.tts_model.speaker_manager, "ids"):
if self.tts_speakers_file or hasattr(self.tts_model.speaker_manager, "name_to_id"):
if reference_speaker_name and isinstance(reference_speaker_name, str):
if self.tts_config.use_d_vector_file:
# get the speaker embedding from the saved d_vectors.
@ -328,12 +337,11 @@ class Synthesizer(object):
] # [1 x embedding_dim]
else:
# get speaker idx from the speaker name
reference_speaker_id = self.tts_model.speaker_manager.ids[reference_speaker_name]
reference_speaker_id = self.tts_model.speaker_manager.name_to_id[reference_speaker_name]
else:
reference_speaker_embedding = self.tts_model.speaker_manager.compute_embedding_from_clip(
reference_wav
)
outputs = transfer_voice(
model=self.tts_model,
CONFIG=self.tts_config,

View File

@ -1,3 +1,4 @@
import soundfile as sf
import torch
import torchaudio
@ -48,7 +49,7 @@ def remove_silence(
):
# get the VAD model and utils functions
model, get_speech_timestamps, save_audio, collect_chunks = model_and_utils
model, get_speech_timestamps, _, collect_chunks = model_and_utils
# read ground truth wav and resample the audio for the VAD
wav, gt_sample_rate = read_audio(audio_path)
@ -73,9 +74,11 @@ def remove_silence(
# if have speech timestamps else save the wav
if new_speech_timestamps:
wav = collect_chunks(new_speech_timestamps, wav)
is_speech = True
else:
print(f"> The file {audio_path} probably does not have speech please check it !!")
is_speech = False
# save audio
save_audio(out_path, wav, sampling_rate=gt_sample_rate)
return out_path
sf.write(out_path, wav, gt_sample_rate, subtype="PCM_16")
return out_path, is_speech
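A small sketch of the new 16-bit PCM write path and the `(out_path, is_speech)` return contract; the waveform is synthetic and the VAD step itself is omitted:

```python
import numpy as np
import soundfile as sf

sr = 16000
wav = 0.1 * np.sin(2 * np.pi * 440.0 * np.arange(sr) / sr).astype(np.float32)

out_path = "trimmed.wav"
sf.write(out_path, wav, sr, subtype="PCM_16")  # same write call as above

is_speech = True              # set to False when no speech timestamps were found
print((out_path, is_speech))  # callers of remove_silence now unpack both values
```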

View File

@ -22,14 +22,12 @@ class HifiganConfig(BaseGANVocoderConfig):
generator_model_params (dict): Parameters of the generator model. Defaults to
`
{
"use_mel": True,
"sample_rate": 22050,
"n_fft": 1024,
"hop_length": 256,
"win_length": 1024,
"n_mels": 80,
"mel_fmin": 0.0,
"mel_fmax": None,
"upsample_factors": [8, 8, 2, 2],
"upsample_kernel_sizes": [16, 16, 4, 4],
"upsample_initial_channel": 512,
"resblock_kernel_sizes": [3, 7, 11],
"resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
"resblock_type": "1",
}
`
batch_size (int):

View File

@ -231,6 +231,7 @@ class GAN(BaseVocoder):
config: Coqpit,
checkpoint_path: str,
eval: bool = False, # pylint: disable=unused-argument, redefined-builtin
cache: bool = False,
) -> None:
"""Load a GAN checkpoint and initialize model parameters.
@ -239,7 +240,7 @@ class GAN(BaseVocoder):
checkpoint_path (str): Checkpoint file path.
eval (bool, optional): If True, load the model for inference. Defaults to False.
"""
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"))
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
# band-aid for older than v0.0.15 GAN models
if "model_disc" in state:
self.model_g.load_checkpoint(config, checkpoint_path, eval)

View File

@ -290,9 +290,9 @@ class HifiganGenerator(torch.nn.Module):
remove_weight_norm(self.conv_post)
def load_checkpoint(
self, config, checkpoint_path, eval=False
self, config, checkpoint_path, eval=False, cache=False
): # pylint: disable=unused-argument, redefined-builtin
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"))
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
self.load_state_dict(state["model"])
if eval:
self.eval()

View File

@ -85,9 +85,9 @@ class MelganGenerator(nn.Module):
layer.remove_weight_norm()
def load_checkpoint(
self, config, checkpoint_path, eval=False
self, config, checkpoint_path, eval=False, cache=False
): # pylint: disable=unused-argument, redefined-builtin
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"))
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
self.load_state_dict(state["model"])
if eval:
self.eval()

View File

@ -153,9 +153,9 @@ class ParallelWaveganGenerator(torch.nn.Module):
return self._get_receptive_field_size(self.layers, self.stacks, self.kernel_size)
def load_checkpoint(
self, config, checkpoint_path, eval=False
self, config, checkpoint_path, eval=False, cache=False
): # pylint: disable=unused-argument, redefined-builtin
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"))
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
self.load_state_dict(state["model"])
if eval:
self.eval()

View File

@ -218,9 +218,9 @@ class Wavegrad(BaseVocoder):
self.y_conv = weight_norm(self.y_conv)
def load_checkpoint(
self, config, checkpoint_path, eval=False
self, config, checkpoint_path, eval=False, cache=False
): # pylint: disable=unused-argument, redefined-builtin
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"))
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
self.load_state_dict(state["model"])
if eval:
self.eval()

View File

@ -542,9 +542,9 @@ class Wavernn(BaseVocoder):
return unfolded
def load_checkpoint(
self, config, checkpoint_path, eval=False
self, config, checkpoint_path, eval=False, cache=False
): # pylint: disable=unused-argument, redefined-builtin
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"))
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
self.load_state_dict(state["model"])
if eval:
self.eval()

View File

@ -0,0 +1,56 @@
(docker_images)=
## Docker images
We provide docker images so you can test TTS without having to set up your own environment.
### Using premade images
You can use premade images built automatically from the latest TTS version.
#### CPU version
```bash
docker pull ghcr.io/coqui-ai/tts-cpu
```
#### GPU version
```bash
docker pull ghcr.io/coqui-ai/tts
```
### Building your own image
```bash
docker build -t tts .
```
## Basic inference
Basic usage: generate an audio file from text passed as an argument.
You can pass any tts argument after the image name.
### CPU version
```bash
docker run --rm -v ~/tts-output:/root/tts-output ghcr.io/coqui-ai/tts-cpu --text "Hello." --out_path /root/tts-output/hello.wav
```
### GPU version
For the GPU version, you need to have the latest NVIDIA drivers installed.
With `nvidia-smi` you can check the supported CUDA version; it must be >= 11.8.
```bash
docker run --rm --gpus all -v ~/tts-output:/root/tts-output ghcr.io/coqui-ai/tts --text "Hello." --out_path /root/tts-output/hello.wav --use_cuda true
```
## Start a server
Starting a TTS server:
Start the container and get a shell inside it.
### CPU version
```bash
docker run --rm -it -p 5002:5002 --entrypoint /bin/bash ghcr.io/coqui-ai/tts-cpu
python3 TTS/server/server.py --list_models #To get the list of available models
python3 TTS/server/server.py --model_name tts_models/en/vctk/vits
```
### GPU version
```bash
docker run --rm -it -p 5002:5002 --gpus all --entrypoint /bin/bash ghcr.io/coqui-ai/tts
python3 TTS/server/server.py --list_models #To get the list of available models
python3 TTS/server/server.py --model_name tts_models/en/vctk/vits --use_cuda true
```
Click [here](http://[::1]:5002/) and have fun with the server!
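If you prefer to script against the running container instead of using the browser, a rough sketch with `requests` is below; the `/api/tts` endpoint and its `text` parameter are assumptions based on `TTS/server/server.py`, so check that file if the call does not match:

```python
# Rough sketch, assuming the server started above is listening on port 5002
# and exposes GET /api/tts?text=... (verify against TTS/server/server.py).
import requests

resp = requests.get("http://localhost:5002/api/tts", params={"text": "Hello."}, timeout=120)
resp.raise_for_status()
with open("hello.wav", "wb") as f:
    f.write(resp.content)
```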

View File

@ -53,7 +53,7 @@ We tried to collect common issues and questions we receive about 🐸TTS. It is
"mixed_precision": false,
"output_path": "recipes/ljspeech/glow_tts/",
"test_sentences": ["Test this sentence.", "This test sentence.", "Sentence this test."],
"datasets":[{"name": "ljspeech", "meta_file_train":"metadata.csv", "path": "recipes/ljspeech/LJSpeech-1.1/"}]
"datasets":[{"formatter": "ljspeech", "meta_file_train":"metadata.csv", "path": "recipes/ljspeech/LJSpeech-1.1/"}]
}
```

View File

@ -88,7 +88,7 @@ from TTS.tts.datasets import load_tts_samples
# dataset config for one of the pre-defined datasets
dataset_config = BaseDatasetConfig(
name="vctk", meta_file_train="", language="en-us", path="dataset-path")
formatter="vctk", meta_file_train="", language="en-us", path="dataset-path")
)
# load training samples

View File

@ -20,6 +20,7 @@
:caption: Using 🐸TTS
inference
docker_images
implementing_a_new_model
training_a_model
finetuning

View File

@ -12,7 +12,7 @@ Currently we provide the following pre-configured architectures:
- **FastPitch:**
It uses the same FastSpeech architecture that us conditioned on fundemental frequency (f0) contours with the
It uses the same FastSpeech architecture that is conditioned on fundamental frequency (f0) contours with the
promise of more expressive speech.
- **SpeedySpeech:**

View File

@ -84,7 +84,7 @@ We still support running training from CLI like in the old days. The same traini
"print_eval": true,
"mixed_precision": false,
"output_path": "recipes/ljspeech/glow_tts/",
"datasets":[{"name": "ljspeech", "meta_file_train":"metadata.csv", "path": "recipes/ljspeech/LJSpeech-1.1/"}]
"datasets":[{"formatter": "ljspeech", "meta_file_train":"metadata.csv", "path": "recipes/ljspeech/LJSpeech-1.1/"}]
}
```
@ -120,6 +120,3 @@ $ tts-server -h # see the help
$ tts-server --list_models # list the available models.
```
![server.gif](https://github.com/coqui-ai/TTS/raw/main/images/demo_server.gif)

View File

@ -74,7 +74,7 @@
"<span style=\"color:purple;font-size:15px\">\n",
"/MyTTSDataset <br /> \n",
"&emsp;| <br /> \n",
"&emsp;| -> metadata.txt<br /> \n",
"&emsp;| -> metadata.csv<br /> \n",
"&emsp;| -> /wavs<br /> \n",
"&emsp;&emsp;| -> audio1.wav<br /> \n",
"&emsp;&emsp;| -> audio2.wav<br /> \n",

View File

@ -15,7 +15,7 @@ output_path = os.path.dirname(os.path.abspath(__file__))
data_path = "/srv/data/"
# Using LJSpeech like dataset processing for the blizzard dataset
dataset_config = BaseDatasetConfig(name="ljspeech", meta_file_train="metadata.csv", path=data_path)
dataset_config = BaseDatasetConfig(formatter="ljspeech", meta_file_train="metadata.csv", path=data_path)
audio_config = BaseAudioConfig(
sample_rate=24000,

View File

@ -16,7 +16,7 @@ data_path = "/srv/data/blizzard2013/segmented"
# Using LJSpeech like dataset processing for the blizzard dataset
dataset_config = BaseDatasetConfig(
name="ljspeech",
formatter="ljspeech",
meta_file_train="metadata.csv",
path=data_path,
)

View File

@ -1,7 +1,7 @@
{
"datasets": [
{
"name": "kokoro",
"formatter": "kokoro",
"path": "DEFINE THIS",
"meta_file_train": "metadata.csv",
"meta_file_val": null
@ -119,7 +119,7 @@
"phonemes": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
},
"use_speaker_embedding": false,
"use_gst": false,
"use_gst": false,
"use_external_speaker_embedding_file": false,
"external_speaker_embedding_file": "../../speakers-vctk-en.json"
}

View File

@ -13,7 +13,7 @@ output_path = os.path.dirname(os.path.abspath(__file__))
# init configs
dataset_config = BaseDatasetConfig(
name="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/")
formatter="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/")
)
config = AlignTTSConfig(
batch_size=32,

View File

@ -14,7 +14,7 @@ output_path = os.path.dirname(os.path.abspath(__file__))
# init configs
dataset_config = BaseDatasetConfig(
name="ljspeech",
formatter="ljspeech",
meta_file_train="metadata.csv",
# meta_file_attn_mask=os.path.join(output_path, "../LJSpeech-1.1/metadata_attn_mask.txt"),
path=os.path.join(output_path, "../LJSpeech-1.1/"),

View File

@ -14,7 +14,7 @@ output_path = os.path.dirname(os.path.abspath(__file__))
# init configs
dataset_config = BaseDatasetConfig(
name="ljspeech",
formatter="ljspeech",
meta_file_train="metadata.csv",
# meta_file_attn_mask=os.path.join(output_path, "../LJSpeech-1.1/metadata_attn_mask.txt"),
path=os.path.join(output_path, "../LJSpeech-1.1/"),

View File

@ -21,7 +21,7 @@ output_path = os.path.dirname(os.path.abspath(__file__))
# Set LJSpeech as our target dataset and define its path.
# You can also use a simple Dict to define the dataset and pass it to your custom formatter.
dataset_config = BaseDatasetConfig(
name="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/")
formatter="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/")
)
# INITIALIZE THE TRAINING CONFIGURATION

View File

@ -11,7 +11,7 @@ from TTS.utils.audio import AudioProcessor
output_path = os.path.dirname(os.path.abspath(__file__))
dataset_config = BaseDatasetConfig(
name="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/")
formatter="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/")
)
audio_config = BaseAudioConfig(

View File

@ -16,7 +16,7 @@ data_path = "/srv/data/"
# Using LJSpeech like dataset processing for the blizzard dataset
dataset_config = BaseDatasetConfig(
name="ljspeech",
formatter="ljspeech",
meta_file_train="metadata.csv",
path=data_path,
)

View File

@ -16,7 +16,7 @@ output_path = os.path.dirname(os.path.abspath(__file__))
# init configs
dataset_config = BaseDatasetConfig(
name="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/")
formatter="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/")
)
audio_config = BaseAudioConfig(

View File

@ -16,7 +16,7 @@ output_path = os.path.dirname(os.path.abspath(__file__))
# init configs
dataset_config = BaseDatasetConfig(
name="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/")
formatter="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/")
)
audio_config = BaseAudioConfig(

View File

@ -11,7 +11,7 @@ from TTS.utils.audio import AudioProcessor
output_path = os.path.dirname(os.path.abspath(__file__))
dataset_config = BaseDatasetConfig(
name="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/")
formatter="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/")
)
audio_config = VitsAudioConfig(
sample_rate=22050, win_length=1024, hop_length=256, num_mels=80, mel_fmin=0, mel_fmax=None

View File

@ -17,7 +17,7 @@ output_path = os.path.dirname(os.path.abspath(__file__))
mailabs_path = "/home/julian/workspace/mailabs/**"
dataset_paths = glob(mailabs_path)
dataset_config = [
BaseDatasetConfig(name="mailabs", meta_file_train=None, path=path, language=path.split("/")[-1])
BaseDatasetConfig(formatter="mailabs", meta_file_train=None, path=path, language=path.split("/")[-1])
for path in dataset_paths
]

View File

@ -14,7 +14,7 @@ output_path = os.path.dirname(os.path.abspath(__file__))
# init configs
dataset_config = BaseDatasetConfig(
name="thorsten", meta_file_train="metadata.csv", path=os.path.join(output_path, "../thorsten-de/")
formatter="thorsten", meta_file_train="metadata.csv", path=os.path.join(output_path, "../thorsten-de/")
)
# download dataset if not already present

View File

@ -22,7 +22,7 @@ output_path = os.path.dirname(os.path.abspath(__file__))
# Set LJSpeech as our target dataset and define its path.
# You can also use a simple Dict to define the dataset and pass it to your custom formatter.
dataset_config = BaseDatasetConfig(
name="thorsten", meta_file_train="metadata.csv", path=os.path.join(output_path, "../thorsten-de/")
formatter="thorsten", meta_file_train="metadata.csv", path=os.path.join(output_path, "../thorsten-de/")
)
# download dataset if not already present

View File

@ -12,7 +12,7 @@ from TTS.utils.downloaders import download_thorsten_de
output_path = os.path.dirname(os.path.abspath(__file__))
dataset_config = BaseDatasetConfig(
name="thorsten", meta_file_train="metadata.csv", path=os.path.join(output_path, "../thorsten-de/")
formatter="thorsten", meta_file_train="metadata.csv", path=os.path.join(output_path, "../thorsten-de/")
)
# download dataset if not already present

View File

@ -16,7 +16,7 @@ output_path = os.path.dirname(os.path.abspath(__file__))
# init configs
dataset_config = BaseDatasetConfig(
name="thorsten", meta_file_train="metadata.csv", path=os.path.join(output_path, "../thorsten-de/")
formatter="thorsten", meta_file_train="metadata.csv", path=os.path.join(output_path, "../thorsten-de/")
)
# download dataset if not already present

View File

@ -12,7 +12,7 @@ from TTS.utils.downloaders import download_thorsten_de
output_path = os.path.dirname(os.path.abspath(__file__))
dataset_config = BaseDatasetConfig(
name="thorsten", meta_file_train="metadata.csv", path=os.path.join(output_path, "../thorsten-de/")
formatter="thorsten", meta_file_train="metadata.csv", path=os.path.join(output_path, "../thorsten-de/")
)
# download dataset if not already present

View File

@ -11,7 +11,7 @@ from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor
output_path = os.path.dirname(os.path.abspath(__file__))
dataset_config = BaseDatasetConfig(name="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/"))
dataset_config = BaseDatasetConfig(formatter="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/"))
audio_config = BaseAudioConfig(
sample_rate=22050,

View File

@ -11,7 +11,7 @@ from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor
output_path = os.path.dirname(os.path.abspath(__file__))
dataset_config = BaseDatasetConfig(name="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/"))
dataset_config = BaseDatasetConfig(formatter="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/"))
audio_config = BaseAudioConfig(
sample_rate=22050,

View File

@ -22,7 +22,7 @@ if not os.path.exists(dataset_path):
download_vctk(dataset_path)
# define dataset config
dataset_config = BaseDatasetConfig(name="vctk", meta_file_train="", path=dataset_path)
dataset_config = BaseDatasetConfig(formatter="vctk", meta_file_train="", path=dataset_path)
# define audio config
# ❗ resample the dataset externally using `TTS/bin/resample.py` and set `resample=False` for faster training

View File

@ -0,0 +1,139 @@
import os
from TTS.encoder.configs.speaker_encoder_config import SpeakerEncoderConfig
# from TTS.encoder.configs.emotion_encoder_config import EmotionEncoderConfig
from TTS.tts.configs.shared_configs import BaseDatasetConfig
CURRENT_PATH = os.getcwd()
# change the root path to the TTS root path
os.chdir("../../../")
### Definitions ###
# dataset
VCTK_PATH = "/raid/datasets/VCTK_NEW_16khz_removed_silence_silero_vad/" # download: https://datashare.ed.ac.uk/bitstream/handle/10283/3443/VCTK-Corpus-0.92.zipdddddddddd
RIR_SIMULATED_PATH = "/raid/datasets/DA/RIRS_NOISES/simulated_rirs/" # download: https://www.openslr.org/17/
MUSAN_PATH = "/raid/datasets/DA/musan/" # download: https://www.openslr.org/17/
# training
OUTPUT_PATH = os.path.join(
CURRENT_PATH, "resnet_speaker_encoder_training_output/"
) # path to save the train logs and checkpoint
CONFIG_OUT_PATH = os.path.join(OUTPUT_PATH, "config_se.json")
RESTORE_PATH = None # Checkpoint to use for transfer learning if None ignore
# instance the config
# to speaker encoder
config = SpeakerEncoderConfig()
# to emotion encoder
# config = EmotionEncoderConfig()
#### DATASET CONFIG ####
# The formatter needs to return the key "speaker_name" for the speaker encoder and "emotion_name" for the emotion encoder
dataset_config = BaseDatasetConfig(formatter="vctk", meta_file_train="", language="en-us", path=VCTK_PATH)
# add the dataset to the config
config.datasets = [dataset_config]
#### TRAINING CONFIG ####
# The encoder data loader balances dataset items equally to improve training and to meet the loss requirements
# It has two parameters that control the final batch size: the total number of speakers used in each batch and the number of samples per speaker
# total number of speakers per batch during training
config.num_classes_in_batch = 100
# number of utterances per class/speaker in the batch during training
config.num_utter_per_class = 4
# final batch size = config.num_classes_in_batch * config.num_utter_per_class
# total number of speakers per batch during evaluation
config.eval_num_classes_in_batch = 100
# number of utterances per class/speaker in the batch during evaluation
config.eval_num_utter_per_class = 4
# number of data loader workers
config.num_loader_workers = 8
config.num_val_loader_workers = 8
# number of epochs
config.epochs = 10000
# loss to be used in training
config.loss = "softmaxproto"
# run eval
config.run_eval = False
# output path for the checkpoints
config.output_path = OUTPUT_PATH
# Save local checkpoint every save_step steps
config.save_step = 2000
### Model Config ###
config.model_params = {
"model_name": "resnet", # supported "lstm" and "resnet"
"input_dim": 64,
"use_torch_spec": True,
"log_input": True,
"proj_dim": 512, # embedding dim
}
### Audio Config ###
# To speed up training, the model divides the audio into small parts. This parameter defines the length in seconds of these "parts"
config.voice_len = 2.0
# all others configs
config.audio = {
"fft_size": 512,
"win_length": 400,
"hop_length": 160,
"frame_shift_ms": None,
"frame_length_ms": None,
"stft_pad_mode": "reflect",
"sample_rate": 16000,
"resample": False,
"preemphasis": 0.97,
"ref_level_db": 20,
"do_sound_norm": False,
"do_trim_silence": False,
"trim_db": 60,
"power": 1.5,
"griffin_lim_iters": 60,
"num_mels": 64,
"mel_fmin": 0.0,
"mel_fmax": 8000.0,
"spec_gain": 20,
"signal_norm": False,
"min_level_db": -100,
"symmetric_norm": False,
"max_norm": 4.0,
"clip_norm": False,
"stats_path": None,
"do_rms_norm": True,
"db_level": -27.0,
}
### Augmentation Config ###
config.audio_augmentation = {
# additive noise and room impulse response (RIR) simulation similar to: https://arxiv.org/pdf/2009.14153.pdf
"p": 0.5, # probability to the use of one of the augmentation - 0 means disabled
"rir": {"rir_path": RIR_SIMULATED_PATH, "conv_mode": "full"}, # download: https://www.openslr.org/17/
"additive": {
"sounds_path": MUSAN_PATH,
"speech": {"min_snr_in_db": 13, "max_snr_in_db": 20, "min_num_noises": 1, "max_num_noises": 1},
"noise": {"min_snr_in_db": 0, "max_snr_in_db": 15, "min_num_noises": 1, "max_num_noises": 1},
"music": {"min_snr_in_db": 5, "max_snr_in_db": 15, "min_num_noises": 1, "max_num_noises": 1},
},
"gaussian": {"p": 0.7, "min_amplitude": 0.0, "max_amplitude": 1e-05},
}
config.save_json(CONFIG_OUT_PATH)
print(CONFIG_OUT_PATH)
if RESTORE_PATH is not None:
command = f"python TTS/bin/train_encoder.py --config_path {CONFIG_OUT_PATH} --restore_path {RESTORE_PATH}"
else:
command = f"python TTS/bin/train_encoder.py --config_path {CONFIG_OUT_PATH}"
os.system(command)

View File

@ -11,7 +11,7 @@ from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor
output_path = os.path.dirname(os.path.abspath(__file__))
dataset_config = BaseDatasetConfig(name="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/"))
dataset_config = BaseDatasetConfig(formatter="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/"))
audio_config = BaseAudioConfig(
sample_rate=22050,

View File

@ -12,7 +12,7 @@ from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor
output_path = os.path.dirname(os.path.abspath(__file__))
dataset_config = BaseDatasetConfig(name="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/"))
dataset_config = BaseDatasetConfig(formatter="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/"))
audio_config = BaseAudioConfig(
sample_rate=22050,

View File

@ -12,7 +12,7 @@ from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor
output_path = os.path.dirname(os.path.abspath(__file__))
dataset_config = BaseDatasetConfig(name="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/"))
dataset_config = BaseDatasetConfig(formatter="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/"))
audio_config = BaseAudioConfig(
sample_rate=22050,

View File

@ -12,7 +12,7 @@ from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor
output_path = os.path.dirname(os.path.abspath(__file__))
dataset_config = BaseDatasetConfig(name="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/"))
dataset_config = BaseDatasetConfig(formatter="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/"))
audio_config = BaseAudioConfig(
sample_rate=22050,

View File

@ -12,7 +12,7 @@ from TTS.utils.audio import AudioProcessor
output_path = os.path.dirname(os.path.abspath(__file__))
dataset_config = BaseDatasetConfig(
name="vctk", meta_file_train="", language="en-us", path=os.path.join(output_path, "../VCTK/")
formatter="vctk", meta_file_train="", language="en-us", path=os.path.join(output_path, "../VCTK/")
)

View File

@ -23,7 +23,6 @@ umap-learn==0.5.1
pandas
# deps for training
matplotlib
pyworld==0.2.10 # > 0.2.10 is not p3.10.x compatible
# coqui stack
trainer
# config management
@ -35,4 +34,8 @@ pypinyin
mecab-python3==1.0.5
unidic-lite==1.0.8
# gruut+supported langs
gruut[cs,de,es,fr,it,nl,pt,ru,sv]==2.2.3
gruut[de]==2.2.3
# deps for korean
jamo
nltk
g2pkk>=0.1.1

View File

@ -33,7 +33,9 @@ def get_tests_data_path():
def get_tests_output_path():
"""Returns the path to the directory for test outputs."""
return os.path.join(get_tests_path(), "outputs")
path = os.path.join(get_tests_path(), "outputs")
os.makedirs(path, exist_ok=True)
return path
def run_cli(command):
@ -42,7 +44,7 @@ def run_cli(command):
def get_test_data_config():
return BaseDatasetConfig(name="ljspeech", path="tests/data/ljspeech/", meta_file_train="metadata.csv")
return BaseDatasetConfig(formatter="ljspeech", path="tests/data/ljspeech/", meta_file_train="metadata.csv")
def assertHasAttr(test_obj, obj, intendedAttr):

View File

@ -10,7 +10,7 @@ OUT_PATH = os.path.join(get_tests_output_path(), "audio_tests")
WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav")
os.makedirs(OUT_PATH, exist_ok=True)
conf = BaseAudioConfig(mel_fmax=8000)
conf = BaseAudioConfig(mel_fmax=8000, pitch_fmax=640, pitch_fmin=1)
# pylint: disable=protected-access

View File

@ -0,0 +1,92 @@
import os
import unittest
import numpy as np
import torch
from tests import get_tests_input_path
from TTS.config import load_config
from TTS.encoder.utils.generic_utils import setup_encoder_model
from TTS.encoder.utils.io import save_checkpoint
from TTS.tts.utils.managers import EmbeddingManager
from TTS.utils.audio import AudioProcessor
encoder_config_path = os.path.join(get_tests_input_path(), "test_speaker_encoder_config.json")
encoder_model_path = os.path.join(get_tests_input_path(), "checkpoint_0.pth")
sample_wav_path = os.path.join(get_tests_input_path(), "../data/ljspeech/wavs/LJ001-0001.wav")
sample_wav_path2 = os.path.join(get_tests_input_path(), "../data/ljspeech/wavs/LJ001-0002.wav")
embedding_file_path = os.path.join(get_tests_input_path(), "../data/dummy_speakers.json")
embeddings_file_path2 = os.path.join(get_tests_input_path(), "../data/dummy_speakers2.json")
embeddings_file_pth_path = os.path.join(get_tests_input_path(), "../data/dummy_speakers.pth")
class EmbeddingManagerTest(unittest.TestCase):
"""Test emEeddingManager for loading embedding files and computing embeddings from waveforms"""
@staticmethod
def test_speaker_embedding():
# load config
config = load_config(encoder_config_path)
config.audio.resample = True
# create a dummy speaker encoder
model = setup_encoder_model(config)
save_checkpoint(model, None, None, get_tests_input_path(), 0)
# load audio processor and speaker encoder
manager = EmbeddingManager(encoder_model_path=encoder_model_path, encoder_config_path=encoder_config_path)
# load a sample audio and compute embedding
ap = AudioProcessor(**config.audio)
waveform = ap.load_wav(sample_wav_path)
mel = ap.melspectrogram(waveform)
embedding = manager.compute_embeddings(mel)
assert embedding.shape[1] == 256
# compute embedding directly from an input file
embedding = manager.compute_embedding_from_clip(sample_wav_path)
embedding2 = manager.compute_embedding_from_clip(sample_wav_path)
embedding = torch.FloatTensor(embedding)
embedding2 = torch.FloatTensor(embedding2)
assert embedding.shape[0] == 256
assert (embedding - embedding2).sum() == 0.0
# compute embedding from a list of wav files.
embedding3 = manager.compute_embedding_from_clip([sample_wav_path, sample_wav_path2])
embedding3 = torch.FloatTensor(embedding3)
assert embedding3.shape[0] == 256
assert (embedding - embedding3).sum() != 0.0
# remove dummy model
os.remove(encoder_model_path)
def test_embedding_file_processing(self): # pylint: disable=no-self-use
manager = EmbeddingManager(embedding_file_path=embeddings_file_pth_path)
# test embedding querying
embedding = manager.get_embedding_by_clip(manager.clip_ids[0])
assert len(embedding) == 256
embeddings = manager.get_embeddings_by_name(manager.embedding_names[0])
assert len(embeddings[0]) == 256
embedding1 = manager.get_mean_embedding(manager.embedding_names[0], num_samples=2, randomize=True)
assert len(embedding1) == 256
embedding2 = manager.get_mean_embedding(manager.embedding_names[0], num_samples=2, randomize=False)
assert len(embedding2) == 256
assert np.sum(np.array(embedding1) - np.array(embedding2)) != 0
def test_embedding_file_loading(self):
# test loading a json file
manager = EmbeddingManager(embedding_file_path=embedding_file_path)
self.assertEqual(manager.num_embeddings, 384)
self.assertEqual(manager.embedding_dim, 256)
# test loading a pth file
manager = EmbeddingManager(embedding_file_path=embeddings_file_pth_path)
self.assertEqual(manager.num_embeddings, 384)
self.assertEqual(manager.embedding_dim, 256)
# test loading pth files with duplicate embedding keys
with self.assertRaises(Exception) as context:
manager = EmbeddingManager(embedding_file_path=[embeddings_file_pth_path, embeddings_file_pth_path])
self.assertTrue("Duplicate embedding names" in str(context.exception))
# test loading embedding files with different embedding keys
manager = EmbeddingManager(embedding_file_path=[embeddings_file_pth_path, embeddings_file_path2])
self.assertEqual(manager.embedding_dim, 256)
self.assertEqual(manager.num_embeddings, 384 * 2)
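The new test above doubles as a reference for the EmbeddingManager API. Here is a minimal usage sketch assembled only from the calls exercised in the test; the file paths below are placeholders, not files shipped with the repository.

from TTS.tts.utils.managers import EmbeddingManager

# Query precomputed speaker embeddings from a file (path is a placeholder).
manager = EmbeddingManager(embedding_file_path="dummy_speakers.pth")
print(manager.num_embeddings, manager.embedding_dim)
mean_emb = manager.get_mean_embedding(manager.embedding_names[0], num_samples=2, randomize=False)

# Or compute an embedding directly from a wav clip with a trained speaker encoder (paths are placeholders).
manager = EmbeddingManager(
    encoder_model_path="checkpoint_0.pth",
    encoder_config_path="test_speaker_encoder_config.json",
)
embedding = manager.compute_embedding_from_clip("LJ001-0001.wav")  # 256-dim, per the assertions above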

View File

@@ -15,7 +15,7 @@ class TestExtractTTSSpectrograms(unittest.TestCase):
def test_GlowTTS():
# set paths
config_path = os.path.join(get_tests_input_path(), "test_glow_tts.json")
checkpoint_path = os.path.join(get_tests_output_path(), "checkpoint_test.pth")
checkpoint_path = os.path.join(get_tests_output_path(), "glowtts.pth")
output_path = os.path.join(get_tests_output_path(), "output_extract_tts_spectrograms/")
# load config
c = load_config(config_path)
@@ -33,7 +33,7 @@ class TestExtractTTSSpectrograms(unittest.TestCase):
def test_Tacotron2():
# set paths
config_path = os.path.join(get_tests_input_path(), "test_tacotron2_config.json")
checkpoint_path = os.path.join(get_tests_output_path(), "checkpoint_test.pth")
checkpoint_path = os.path.join(get_tests_output_path(), "tacotron2.pth")
output_path = os.path.join(get_tests_output_path(), "output_extract_tts_spectrograms/")
# load config
c = load_config(config_path)
@@ -51,7 +51,7 @@ class TestExtractTTSSpectrograms(unittest.TestCase):
def test_Tacotron():
# set paths
config_path = os.path.join(get_tests_input_path(), "test_tacotron_config.json")
checkpoint_path = os.path.join(get_tests_output_path(), "checkpoint_test.pth")
checkpoint_path = os.path.join(get_tests_output_path(), "tacotron.pth")
output_path = os.path.join(get_tests_output_path(), "output_extract_tts_spectrograms/")
# load config
c = load_config(config_path)

View File

@@ -12,20 +12,22 @@ torch.manual_seed(1)
config_path = os.path.join(get_tests_output_path(), "test_model_config.json")
dataset_config_en = BaseDatasetConfig(
name="ljspeech",
formatter="ljspeech",
meta_file_train="metadata.csv",
meta_file_val="metadata.csv",
path="tests/data/ljspeech",
language="en",
)
"""
dataset_config_pt = BaseDatasetConfig(
name="ljspeech",
formatter="ljspeech",
meta_file_train="metadata.csv",
meta_file_val="metadata.csv",
path="tests/data/ljspeech",
language="pt-br",
)
"""
# pylint: disable=protected-access
class TestFindUniquePhonemes(unittest.TestCase):
@@ -46,7 +48,7 @@ class TestFindUniquePhonemes(unittest.TestCase):
epochs=1,
print_step=1,
print_eval=True,
datasets=[dataset_config_en, dataset_config_pt],
datasets=[dataset_config_en],
)
config.save_json(config_path)
@@ -70,7 +72,7 @@ class TestFindUniquePhonemes(unittest.TestCase):
epochs=1,
print_step=1,
print_eval=True,
datasets=[dataset_config_en, dataset_config_pt],
datasets=[dataset_config_en],
)
config.save_json(config_path)

View File

@@ -31,7 +31,8 @@ class TestNumpyTransforms(unittest.TestCase):
mel_fmin: int = 0
hop_length: int = 256
win_length: int = 1024
pitch_fmax: int = 450
pitch_fmax: int = 640
pitch_fmin: int = 1
trim_db: int = -1
min_silence_sec: float = 0.01
gain: float = 1.0

Some files were not shown because too many files have changed in this diff.