Merge pull request #1537 from coqui-ai/dev

v0.7.0
Eren Gölge 2022-06-20 23:55:22 +02:00 committed by GitHub
commit c7cca4135d
78 changed files with 3667 additions and 421 deletions

View File

@@ -18,7 +18,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.6, 3.7, 3.8, 3.9]
+        python-version: [3.7, 3.8, 3.9, "3.10"]
         experimental: [false]
     steps:
       - uses: actions/checkout@v2
@@ -38,6 +38,9 @@ jobs:
           make system-deps
       - name: Install/upgrade Python setup deps
         run: python3 -m pip install --upgrade pip setuptools wheel
+      - name: Replace scarf urls
+        run: |
+          sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json
       - name: Install TTS
         run: |
           python3 -m pip install .[all]

View File

@@ -18,7 +18,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.6, 3.7, 3.8, 3.9]
+        python-version: [3.7, 3.8, 3.9, "3.10"]
         experimental: [false]
     steps:
       - uses: actions/checkout@v2
@@ -38,6 +38,9 @@ jobs:
           make system-deps
       - name: Install/upgrade Python setup deps
         run: python3 -m pip install --upgrade pip setuptools wheel
+      - name: Replace scarf urls
+        run: |
+          sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json
       - name: Install TTS
         run: |
           python3 -m pip install .[all]

View File

@@ -14,6 +14,9 @@ jobs:
     strategy:
       matrix:
         arch: ["amd64"]
+        base:
+          - "nvcr.io/nvidia/pytorch:22.03-py3" # GPU enabled
+          - "ubuntu:20.04" # CPU only
     steps:
       - uses: actions/checkout@v2
       - name: Log in to the Container registry
@@ -28,6 +31,11 @@ jobs:
           set -ex
           base="ghcr.io/coqui-ai/tts"
           tags="" # PR build
+          if [[ ${{ matrix.base }} = "ubuntu:20.04" ]]; then
+            base="ghcr.io/coqui-ai/tts-cpu"
+          fi
           if [[ "${{ startsWith(github.ref, 'refs/heads/') }}" = "true" ]]; then
             # Push to branch
             github_ref="${{ github.ref }}"
@@ -53,4 +61,5 @@ jobs:
           context: .
           platforms: linux/${{ matrix.arch }}
           push: ${{ github.event_name == 'push' }}
+          build-args: "BASE=${{ matrix.base }}"
           tags: ${{ steps.compute-tag.outputs.tags }}
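Net effect of the matrix: push builds should publish a GPU image under `ghcr.io/coqui-ai/tts` and a CPU-only one under `ghcr.io/coqui-ai/tts-cpu`, so the lighter image can be fetched with a plain `docker pull ghcr.io/coqui-ai/tts-cpu` once a build from this branch has gone through.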

View File

@@ -18,7 +18,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.6, 3.7, 3.8, 3.9]
+        python-version: [3.7, 3.8, 3.9, "3.10"]
         experimental: [false]
     steps:
       - uses: actions/checkout@v2
@@ -38,6 +38,9 @@ jobs:
           make system-deps
       - name: Install/upgrade Python setup deps
         run: python3 -m pip install --upgrade pip setuptools wheel
+      - name: Replace scarf urls
+        run: |
+          sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json
       - name: Install TTS
         run: |
           python3 -m pip install .[all]

View File

@@ -36,7 +36,7 @@ jobs:
     runs-on: ubuntu-20.04
     strategy:
       matrix:
-        python-version: ["3.6", "3.7", "3.8", "3.9"]
+        python-version: ["3.7", "3.8", "3.9", "3.10"]
     steps:
       - uses: actions/checkout@v2
       - uses: actions/setup-python@v2
@@ -62,10 +62,6 @@ jobs:
         with:
           name: "sdist"
           path: "dist/"
-      - uses: actions/download-artifact@v2
-        with:
-          name: "wheel-3.6"
-          path: "dist/"
       - uses: actions/download-artifact@v2
         with:
           name: "wheel-3.7"
@@ -78,6 +74,10 @@ jobs:
         with:
           name: "wheel-3.9"
           path: "dist/"
+      - uses: actions/download-artifact@v2
+        with:
+          name: "wheel-3.10"
+          path: "dist/"
       - run: |
           ls -lh dist/
       - name: Setup PyPI config

View File

@@ -18,7 +18,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.6, 3.7, 3.8, 3.9]
+        python-version: [3.7, 3.8, 3.9, "3.10"]
         experimental: [false]
     steps:
       - uses: actions/checkout@v2

View File

@@ -18,7 +18,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.6, 3.7, 3.8, 3.9]
+        python-version: [3.7, 3.8, 3.9, "3.10"]
         experimental: [false]
     steps:
       - uses: actions/checkout@v2
@@ -40,6 +40,9 @@ jobs:
           make system-deps
       - name: Install/upgrade Python setup deps
         run: python3 -m pip install --upgrade pip setuptools wheel
+      - name: Replace scarf urls
+        run: |
+          sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json
       - name: Install TTS
         run: |
           python3 -m pip install .[all]

View File

@@ -18,7 +18,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.6, 3.7, 3.8, 3.9]
+        python-version: [3.7, 3.8, 3.9, "3.10"]
         experimental: [false]
     steps:
       - uses: actions/checkout@v2

View File

@@ -18,7 +18,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.6, 3.7, 3.8, 3.9]
+        python-version: [3.7, 3.8, 3.9, "3.10"]
         experimental: [false]
     steps:
       - uses: actions/checkout@v2
@@ -39,6 +39,9 @@ jobs:
           make system-deps
       - name: Install/upgrade Python setup deps
         run: python3 -m pip install --upgrade pip setuptools wheel
+      - name: Replace scarf urls
+        run: |
+          sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json
       - name: Install TTS
         run: |
           python3 -m pip install .[all]

.gitignore
View File

@@ -117,6 +117,7 @@ venv.bak/
 # pytorch models
 *.pth
 *.pth.tar
+!dummy_speakers.pth
 result/
 # setup.py

View File

@@ -1,10 +1,19 @@
-FROM nvcr.io/nvidia/pytorch:22.03-py3
-RUN apt-get update && apt-get install -y --no-install-recommends espeak && rm -rf /var/lib/apt/lists/*
+ARG BASE=nvcr.io/nvidia/pytorch:22.03-py3
+FROM ${BASE}
+RUN apt-get update && apt-get install -y --no-install-recommends gcc g++ make python3 python3-dev python3-pip python3-venv python3-wheel espeak espeak-ng libsndfile1-dev && rm -rf /var/lib/apt/lists/*
+RUN pip install llvmlite --ignore-installed
+
+# Create and activate virtual env
+ENV VIRTUAL_ENV=/venv
+RUN python3 -m venv $VIRTUAL_ENV
+ENV PATH="$VIRTUAL_ENV/bin:$PATH"
+RUN pip install -U pip setuptools wheel
 WORKDIR /root
 COPY requirements.txt /root
 COPY requirements.dev.txt /root
 COPY requirements.notebooks.txt /root
-RUN pip install -r <(cat requirements.txt requirements.dev.txt requirements.notebooks.txt)
+RUN ["/bin/bash", "-c", "pip install -r <(cat requirements.txt requirements.dev.txt requirements.notebooks.txt)"]
 COPY . /root
 RUN make install
 ENTRYPOINT ["tts"]
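With the new `BASE` build argument, one Dockerfile now backs both the GPU and the CPU image; a local CPU-only build would look something like `docker build --build-arg BASE=ubuntu:20.04 -t tts-cpu .`, mirroring the values the Docker workflow passes in through `build-args`.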

View File

@@ -7,36 +7,36 @@ help:
 target_dirs := tests TTS notebooks recipes

 test_all: ## run tests and don't stop on an error.
-	nosetests --with-cov -cov --cover-erase --cover-package TTS tests --nologcapture --with-id
+	nose2 --with-coverage --coverage TTS tests
 	./run_bash_tests.sh

 test: ## run tests.
-	nosetests -x --with-cov -cov --cover-erase --cover-package TTS tests --nologcapture --with-id
+	nose2 -F -v -B --with-coverage --coverage TTS tests

 test_vocoder: ## run vocoder tests.
-	nosetests tests.vocoder_tests -x --with-cov -cov --cover-erase --cover-package TTS tests.vocoder_tests --nologcapture --with-id
+	nose2 -F -v -B --with-coverage --coverage TTS tests.vocoder_tests

 test_tts: ## run tts tests.
-	nosetests tests.tts_tests -x --with-cov -cov --cover-erase --cover-package TTS tests.tts_tests --nologcapture --with-id
+	nose2 -F -v -B --with-coverage --coverage TTS tests.tts_tests

 test_aux: ## run aux tests.
-	nosetests tests.aux_tests -x --with-cov -cov --cover-erase --cover-package TTS tests.aux_tests --nologcapture --with-id
+	nose2 -F -v -B --with-coverage --coverage TTS tests.aux_tests
 	./run_bash_tests.sh

 test_zoo: ## run zoo tests.
-	nosetests tests.zoo_tests -x --with-cov -cov --cover-erase --cover-package TTS tests.zoo_tests --nologcapture --with-id
+	nose2 -F -v -B --with-coverage --coverage TTS tests.zoo_tests

 inference_tests: ## run inference tests.
-	nosetests tests.inference_tests -x --with-cov -cov --cover-erase --cover-package TTS tests.inference_tests --nologcapture --with-id
+	nose2 -F -v -B --with-coverage --coverage TTS tests.inference_tests

 data_tests: ## run data tests.
-	nosetests tests.data_tests -x --with-cov -cov --cover-erase --cover-package TTS tests.data_tests --nologcapture --with-id
+	nose2 -F -v -B --with-coverage --coverage TTS tests.data_tests

 test_text: ## run text tests.
-	nosetests tests.text_tests -x --with-cov -cov --cover-erase --cover-package TTS tests.text_tests --nologcapture --with-id
+	nose2 -F -v -B --with-coverage --coverage TTS tests.text_tests

 test_failed: ## only run tests failed the last time.
-	nosetests -x --with-cov -cov --cover-erase --cover-package TTS tests --nologcapture --failed
+	nose2 -F -v -B --with-coverage --coverage TTS tests

 style: ## update code style.
 	black ${target_dirs}
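The target names stay the same (e.g. `make test_tts`, `make inference_tests`); they now shell out to `nose2 -F -v -B --with-coverage --coverage TTS <suite>` instead of the unmaintained nosetests runner, which is what makes the Python 3.10 entries in the CI matrices above workable.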

View File

@@ -3,15 +3,23 @@
 🐸TTS is a library for advanced Text-to-Speech generation. It's built on the latest research, was designed to achieve the best trade-off among ease-of-training, speed and quality.
 🐸TTS comes with pretrained models, tools for measuring dataset quality and already used in **20+ languages** for products and research projects.

-[![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/main.yml/badge.svg)](https://github.com/coqui-ai/TTS/actions)
+[![Gitter](https://badges.gitter.im/coqui-ai/TTS.svg)](https://gitter.im/coqui-ai/TTS?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge)
+[![License](<https://img.shields.io/badge/License-MPL%202.0-brightgreen.svg>)](https://opensource.org/licenses/MPL-2.0)
 [![PyPI version](https://badge.fury.io/py/TTS.svg)](https://badge.fury.io/py/TTS)
 [![Covenant](https://camo.githubusercontent.com/7d620efaa3eac1c5b060ece5d6aacfcc8b81a74a04d05cd0398689c01c4463bb/68747470733a2f2f696d672e736869656c64732e696f2f62616467652f436f6e7472696275746f72253230436f76656e616e742d76322e3025323061646f707465642d6666363962342e737667)](https://github.com/coqui-ai/TTS/blob/master/CODE_OF_CONDUCT.md)
 [![Downloads](https://pepy.tech/badge/tts)](https://pepy.tech/project/tts)
 [![DOI](https://zenodo.org/badge/265612440.svg)](https://zenodo.org/badge/latestdoi/265612440)
+
+![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/aux_tests.yml/badge.svg)
+![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/data_tests.yml/badge.svg)
+![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/docker.yaml/badge.svg)
+![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/inference_tests.yml/badge.svg)
+![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/style_check.yml/badge.svg)
+![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/text_tests.yml/badge.svg)
+![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/tts_tests.yml/badge.svg)
+![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/vocoder_tests.yml/badge.svg)
+![GithubActions](https://github.com/coqui-ai/TTS/actions/workflows/zoo_tests.yml/badge.svg)
 [![Docs](<https://readthedocs.org/projects/tts/badge/?version=latest&style=plastic>)](https://tts.readthedocs.io/en/latest/)
-[![Gitter](https://badges.gitter.im/coqui-ai/TTS.svg)](https://gitter.im/coqui-ai/TTS?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge)
-[![License](<https://img.shields.io/badge/License-MPL%202.0-brightgreen.svg>)](https://opensource.org/licenses/MPL-2.0)

 📰 [**Subscribe to 🐸Coqui.ai Newsletter**](https://coqui.ai/?subscription=true)
@@ -104,7 +112,7 @@ Underlined "TTS*" and "Judy*" are 🐸TTS models
 You can also help us implement more models.

 ## Install TTS
-🐸TTS is tested on Ubuntu 18.04 with **python >= 3.6, < 3.9**.
+🐸TTS is tested on Ubuntu 18.04 with **python >= 3.7, < 3.11.**.
 If you are only interested in [synthesizing speech](https://tts.readthedocs.io/en/latest/inference.html) with the released 🐸TTS models, installing from PyPI is the easiest option.

View File

@@ -119,6 +119,26 @@
                 "license": "apache 2.0",
                 "contact": "egolge@coqui.com"
             }
+        },
+        "blizzard2013": {
+            "capacitron-t2-c50": {
+                "description": "Capacitron additions to Tacotron 2 with Capacity at 50 as in https://arxiv.org/pdf/1906.03402.pdf",
+                "github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.0_models/tts_models--en--blizzard2013--capacitron-t2-c50.zip",
+                "commit": "d6284e7",
+                "default_vocoder": "vocoder_models/en/blizzard2013/hifigan_v2",
+                "author": "Adam Froghyar @a-froghyar",
+                "license": "apache 2.0",
+                "contact": "adamfroghyar@gmail.com"
+            },
+            "capacitron-t2-c150": {
+                "description": "Capacitron additions to Tacotron 2 with Capacity at 150 as in https://arxiv.org/pdf/1906.03402.pdf",
+                "github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.0_models/tts_models--en--blizzard2013--capacitron-t2-c150.zip",
+                "commit": "d6284e7",
+                "default_vocoder": "vocoder_models/en/blizzard2013/hifigan_v2",
+                "author": "Adam Froghyar @a-froghyar",
+                "license": "apache 2.0",
+                "contact": "adamfroghyar@gmail.com"
+            }
         }
     },
     "es": {
@@ -379,6 +399,16 @@
                 "contact": "egolge@coqui.ai"
             }
         },
+        "blizzard2013": {
+            "hifigan_v2": {
+                "description": "HiFiGAN_v2 LJSpeech vocoder from https://arxiv.org/abs/2010.05646.",
+                "github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.0_models/vocoder_models--en--blizzard2013--hifigan_v2.zip",
+                "commit": "d6284e7",
+                "author": "Adam Froghyar @a-froghyar",
+                "license": "apache 2.0",
+                "contact": "adamfroghyar@gmail.com"
+            }
+        },
         "vctk": {
             "hifigan_v2": {
                 "description": "Finetuned and intended to be used with tts_models/en/vctk/sc-glow-tts",

View File

@@ -1 +1 @@
-0.6.2
+0.7.0

View File

@@ -2,51 +2,48 @@ import argparse
 import os
 from argparse import RawTextHelpFormatter

+import torch
 from tqdm import tqdm

 from TTS.config import load_config
 from TTS.tts.datasets import load_tts_samples
+from TTS.tts.utils.managers import save_file
 from TTS.tts.utils.speakers import SpeakerManager

 parser = argparse.ArgumentParser(
     description="""Compute embedding vectors for each wav file in a dataset.\n\n"""
     """
     Example runs:
-    python TTS/bin/compute_embeddings.py speaker_encoder_model.pth speaker_encoder_config.json dataset_config.json embeddings_output_path/
+    python TTS/bin/compute_embeddings.py speaker_encoder_model.pth speaker_encoder_config.json dataset_config.json
     """,
     formatter_class=RawTextHelpFormatter,
 )
 parser.add_argument("model_path", type=str, help="Path to model checkpoint file.")
-parser.add_argument(
-    "config_path",
-    type=str,
-    help="Path to model config file.",
-)
-parser.add_argument(
-    "config_dataset_path",
-    type=str,
-    help="Path to dataset config file.",
-)
-parser.add_argument("output_path", type=str, help="path for output speakers.json and/or speakers.npy.")
-parser.add_argument(
-    "--old_file", type=str, help="Previous speakers.json file, only compute for new audios.", default=None
-)
-parser.add_argument("--use_cuda", type=bool, help="flag to set cuda.", default=True)
-parser.add_argument("--eval", type=bool, help="compute eval.", default=True)
+parser.add_argument("config_path", type=str, help="Path to model config file.")
+parser.add_argument("config_dataset_path", type=str, help="Path to dataset config file.")
+parser.add_argument("--output_path", type=str, help="Path for output `pth` or `json` file.", default="speakers.pth")
+parser.add_argument("--old_file", type=str, help="Previous embedding file to only compute new audios.", default=None)
+parser.add_argument("--disable_cuda", type=bool, help="Flag to disable cuda.", default=False)
+parser.add_argument("--no_eval", type=bool, help="Do not compute eval?. Default False", default=False)

 args = parser.parse_args()

+use_cuda = torch.cuda.is_available() and not args.disable_cuda
+
 c_dataset = load_config(args.config_dataset_path)

-meta_data_train, meta_data_eval = load_tts_samples(c_dataset.datasets, eval_split=args.eval)
-wav_files = meta_data_train + meta_data_eval
+meta_data_train, meta_data_eval = load_tts_samples(c_dataset.datasets, eval_split=not args.no_eval)
+
+if meta_data_eval is None:
+    wav_files = meta_data_train
+else:
+    wav_files = meta_data_train + meta_data_eval

 encoder_manager = SpeakerManager(
     encoder_model_path=args.model_path,
     encoder_config_path=args.config_path,
     d_vectors_file_path=args.old_file,
-    use_cuda=args.use_cuda,
+    use_cuda=use_cuda,
 )

 class_name_key = encoder_manager.encoder_config.class_name_key
@@ -75,13 +72,13 @@ for idx, wav_file in enumerate(tqdm(wav_files)):
 if speaker_mapping:
     # save speaker_mapping if target dataset is defined
-    if ".json" not in args.output_path:
-        mapping_file_path = os.path.join(args.output_path, "speakers.json")
+    if os.path.isdir(args.output_path):
+        mapping_file_path = os.path.join(args.output_path, "speakers.pth")
     else:
         mapping_file_path = args.output_path
-    os.makedirs(os.path.dirname(mapping_file_path), exist_ok=True)
-    # pylint: disable=W0212
-    encoder_manager._save_json(mapping_file_path, speaker_mapping)
+
+    if os.path.dirname(mapping_file_path) != "":
+        os.makedirs(os.path.dirname(mapping_file_path), exist_ok=True)
+
+    save_file(speaker_mapping, mapping_file_path)

     print("Speaker embeddings saved at:", mapping_file_path)

View File

@@ -39,6 +39,18 @@ If you don't specify any models, then it uses LJSpeech based English model.
    $ tts --list_models
    ```

+- Query info for model info by idx:
+    ```
+    $ tts --model_info_by_idx "<model_type>/<model_query_idx>"
+    ```
+
+- Query info for model info by full name:
+    ```
+    $ tts --model_info_by_name "<model_type>/<language>/<dataset>/<model_name>"
+    ```
+
- Run TTS with default models:
    ```
@@ -48,7 +60,7 @@ If you don't specify any models, then it uses LJSpeech based English model.
- Run a TTS model with its default vocoder model:
    ```
-    $ tts --text "Text for TTS" --model_name "<language>/<dataset>/<model_name>
+    $ tts --text "Text for TTS" --model_name "<language>/<dataset>/<model_name>"
    ```

- Run with specific TTS and vocoder models from the list:
@@ -104,6 +116,21 @@ If you don't specify any models, then it uses LJSpeech based English model.
        default=False,
        help="list available pre-trained TTS and vocoder models.",
    )
+
+    parser.add_argument(
+        "--model_info_by_idx",
+        type=str,
+        default=None,
+        help="model info using query format: <model_type>/<model_query_idx>",
+    )
+
+    parser.add_argument(
+        "--model_info_by_name",
+        type=str,
+        default=None,
+        help="model info using query format: <model_type>/<language>/<dataset>/<model_name>",
+    )
+
    parser.add_argument("--text", type=str, default=None, help="Text to generate speech.")

    # Args for running pre-trained TTS models.
@@ -171,7 +198,11 @@ If you don't specify any models, then it uses LJSpeech based English model.
        help="wav file(s) to condition a multi-speaker TTS model with a Speaker Encoder. You can give multiple file paths. The d_vectors is computed as their average.",
        default=None,
    )
-    parser.add_argument("--gst_style", help="Wav path file for GST stylereference.", default=None)
+    parser.add_argument("--gst_style", help="Wav path file for GST style reference.", default=None)
+    parser.add_argument(
+        "--capacitron_style_wav", type=str, help="Wav path file for Capacitron prosody reference.", default=None
+    )
+    parser.add_argument("--capacitron_style_text", type=str, help="Transcription of the reference.", default=None)
    parser.add_argument(
        "--list_speaker_idxs",
        help="List available speaker ids for the defined multi-speaker model.",
@@ -210,13 +241,16 @@ If you don't specify any models, then it uses LJSpeech based English model.
    args = parser.parse_args()

    # print the description if either text or list_models is not set
-    if (
-        not args.text
-        and not args.list_models
-        and not args.list_speaker_idxs
-        and not args.list_language_idxs
-        and not args.reference_wav
-    ):
+    check_args = [
+        args.text,
+        args.list_models,
+        args.list_speaker_idxs,
+        args.list_language_idxs,
+        args.reference_wav,
+        args.model_info_by_idx,
+        args.model_info_by_name,
+    ]
+    if not any(check_args):
        parser.parse_args(["-h"])

    # load model manager
@@ -232,12 +266,23 @@ If you don't specify any models, then it uses LJSpeech based English model.
    encoder_path = None
    encoder_config_path = None

-    # CASE1: list pre-trained TTS models
+    # CASE1 #list : list pre-trained TTS models
    if args.list_models:
        manager.list_models()
        sys.exit()

-    # CASE2: load pre-trained model paths
+    # CASE2 #info : model info of pre-trained TTS models
+    if args.model_info_by_idx:
+        model_query = args.model_info_by_idx
+        manager.model_info_by_idx(model_query)
+        sys.exit()
+
+    if args.model_info_by_name:
+        model_query_full_name = args.model_info_by_name
+        manager.model_info_by_full_name(model_query_full_name)
+        sys.exit()
+
+    # CASE3: load pre-trained model paths
    if args.model_name is not None and not args.model_path:
        model_path, config_path, model_item = manager.download_model(args.model_name)
        args.vocoder_name = model_item["default_vocoder"] if args.vocoder_name is None else args.vocoder_name
@@ -245,7 +290,7 @@ If you don't specify any models, then it uses LJSpeech based English model.
    if args.vocoder_name is not None and not args.vocoder_path:
        vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name)

-    # CASE3: set custom model paths
+    # CASE4: set custom model paths
    if args.model_path is not None:
        model_path = args.model_path
        config_path = args.config_path
@@ -308,6 +353,8 @@ If you don't specify any models, then it uses LJSpeech based English model.
            args.language_idx,
            args.speaker_wav,
            reference_wav=args.reference_wav,
+            style_wav=args.capacitron_style_wav,
+            style_text=args.capacitron_style_text,
            reference_speaker_name=args.reference_speaker_idx,
        )
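For Capacitron checkpoints, inference can now also be conditioned on a prosody reference; an illustrative invocation (file names are placeholders) would be `tts --text "Text for TTS" --model_name tts_models/en/blizzard2013/capacitron-t2-c150 --capacitron_style_wav ref.wav --capacitron_style_text "Transcription of ref.wav" --out_path out.wav`, with the two new flags forwarded to the synthesizer as `style_wav` and `style_text` as shown above.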

View File

@@ -1,9 +1,3 @@
-<!-- ## TTS example web-server
-You'll need a model package (Zip file, includes TTS Python wheel, model files, server configuration, and optional nginx/uwsgi configs). Publicly available models are listed [here](https://github.com/mozilla/TTS/wiki/Released-Models#simple-packaging---self-contained-package-that-runs-an-http-api-for-a-pre-trained-tts-model).
-Instructions below are based on a Ubuntu 18.04 machine, but it should be simple to adapt the package names to other distros if needed. Python 3.6 is recommended, as some of the dependencies' versions predate Python 3.7 and will force building from source, which requires extra dependencies and is not guaranteed to work. -->
 # :frog: TTS demo server
 Before you use the server, make sure you [install](https://github.com/coqui-ai/TTS/tree/dev#install-tts)) :frog: TTS properly. Then, you can follow the steps below.

View File

@@ -111,7 +111,10 @@ synthesizer = Synthesizer(
     use_cuda=args.use_cuda,
 )

-use_multi_speaker = hasattr(synthesizer.tts_model, "num_speakers") and synthesizer.tts_model.num_speakers > 1
+use_multi_speaker = hasattr(synthesizer.tts_model, "num_speakers") and (
+    synthesizer.tts_model.num_speakers > 1 or synthesizer.tts_speakers_file is not None
+)
 speaker_manager = getattr(synthesizer.tts_model, "speaker_manager", None)
 # TODO: set this from SpeakerManager
 use_gst = synthesizer.tts_config.get("use_gst", False)

View File

@@ -48,6 +48,50 @@ class GSTConfig(Coqpit):
         check_argument("gst_num_style_tokens", c, restricted=True, min_val=1, max_val=1000)

+@dataclass
+class CapacitronVAEConfig(Coqpit):
+    """Defines the capacitron VAE Module
+    Args:
+        capacitron_capacity (int):
+            Defines the variational capacity limit of the prosody embeddings. Defaults to 150.
+        capacitron_VAE_embedding_dim (int):
+            Defines the size of the Capacitron embedding vector dimension. Defaults to 128.
+        capacitron_use_text_summary_embeddings (bool):
+            If True, use a text summary embedding in Capacitron. Defaults to True.
+        capacitron_text_summary_embedding_dim (int):
+            Defines the size of the capacitron text embedding vector dimension. Defaults to 128.
+        capacitron_use_speaker_embedding (bool):
+            if True use speaker embeddings in Capacitron. Defaults to False.
+        capacitron_VAE_loss_alpha (float):
+            Weight for the VAE loss of the Tacotron model. If set less than or equal to zero, it disables the
+            corresponding loss function. Defaults to 0.25
+        capacitron_grad_clip (float):
+            Gradient clipping value for all gradients except beta. Defaults to 5.0
+    """
+
+    capacitron_loss_alpha: int = 1
+    capacitron_capacity: int = 150
+    capacitron_VAE_embedding_dim: int = 128
+    capacitron_use_text_summary_embeddings: bool = True
+    capacitron_text_summary_embedding_dim: int = 128
+    capacitron_use_speaker_embedding: bool = False
+    capacitron_VAE_loss_alpha: float = 0.25
+    capacitron_grad_clip: float = 5.0
+
+    def check_values(
+        self,
+    ):
+        """Check config fields"""
+        c = asdict(self)
+        super().check_values()
+        check_argument("capacitron_capacity", c, restricted=True, min_val=10, max_val=500)
+        check_argument("capacitron_VAE_embedding_dim", c, restricted=True, min_val=16, max_val=1024)
+        check_argument("capacitron_use_speaker_embedding", c, restricted=False)
+        check_argument("capacitron_text_summary_embedding_dim", c, restricted=False, min_val=16, max_val=512)
+        check_argument("capacitron_VAE_loss_alpha", c, restricted=False)
+        check_argument("capacitron_grad_clip", c, restricted=False)
+
 @dataclass
 class CharactersConfig(Coqpit):
     """Defines arguments for the `BaseCharacters` or `BaseVocabulary` and their subclasses.
@@ -232,6 +276,14 @@ class BaseTTSConfig(BaseTrainingConfig):
         language_weighted_sampler_alpha (float):
             Number that control the influence of the language sampler weights. Defaults to ```1.0```.

+        use_length_weighted_sampler (bool):
+            Enable / Disable the batch balancer by audio length. If enabled the dataset will be divided
+            into 10 buckets considering the min and max audio of the dataset. The sampler weights will be
+            computed forcing to have the same quantity of data for each bucket in each training batch. Defaults to ```False```.
+
+        length_weighted_sampler_alpha (float):
+            Number that control the influence of the length sampler weights. Defaults to ```1.0```.
     """

     audio: BaseAudioConfig = field(default_factory=BaseAudioConfig)
@@ -279,3 +331,5 @@ class BaseTTSConfig(BaseTrainingConfig):
     speaker_weighted_sampler_alpha: float = 1.0
     use_language_weighted_sampler: bool = False
     language_weighted_sampler_alpha: float = 1.0
+    use_length_weighted_sampler: bool = False
+    length_weighted_sampler_alpha: float = 1.0

View File

@@ -1,7 +1,7 @@
 from dataclasses import dataclass, field
 from typing import List

-from TTS.tts.configs.shared_configs import BaseTTSConfig, GSTConfig
+from TTS.tts.configs.shared_configs import BaseTTSConfig, CapacitronVAEConfig, GSTConfig

 @dataclass
@@ -23,6 +23,10 @@ class TacotronConfig(BaseTTSConfig):
         gst_style_input (str):
             Path to the wav file used at inference to set the speech style through GST. If `GST` is enabled and
             this is not defined, the model uses a zero vector as an input. Defaults to None.
+        use_capacitron_vae (bool):
+            enable / disable the use of Capacitron modules. Defaults to False.
+        capacitron_vae (CapacitronConfig):
+            Instance of `CapacitronConfig` class.
         num_chars (int):
             Number of characters used by the model. It must be defined before initializing the model. Defaults to None.
         num_speakers (int):
@@ -143,6 +147,9 @@ class TacotronConfig(BaseTTSConfig):
     gst: GSTConfig = None
     gst_style_input: str = None

+    use_capacitron_vae: bool = False
+    capacitron_vae: CapacitronVAEConfig = None
+
     # model specific params
     num_speakers: int = 1
     num_chars: int = 0
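Putting the two config additions together, enabling the module from user code could look roughly like the sketch below (a minimal illustration using the field names added in this PR, not a recipe taken from the PR itself; dataset, audio and trainer settings are omitted):

```
from TTS.tts.configs.shared_configs import CapacitronVAEConfig
from TTS.tts.configs.tacotron2_config import Tacotron2Config

# Minimal sketch: switch the Capacitron VAE on for a Tacotron 2 config.
config = Tacotron2Config(
    use_capacitron_vae=True,
    capacitron_vae=CapacitronVAEConfig(
        capacitron_capacity=50,            # the capacity limit "C" used in the KL term
        capacitron_VAE_embedding_dim=128,
        capacitron_use_text_summary_embeddings=True,
    ),
)
config.capacitron_vae.check_values()       # optional: run the bounds checks added above
```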

View File

@@ -5,6 +5,7 @@ from glob import glob
 from pathlib import Path
 from typing import List

+import pandas as pd
 from tqdm import tqdm

 ########################
@@ -12,6 +13,34 @@ from tqdm import tqdm
 ########################

+def coqui(root_path, meta_file, ignored_speakers=None):
+    """Interal dataset formatter."""
+    metadata = pd.read_csv(os.path.join(root_path, meta_file), sep="|")
+    assert all(x in metadata.columns for x in ["audio_file", "text"])
+    speaker_name = None if "speaker_name" in metadata.columns else "coqui"
+    emotion_name = None if "emotion_name" in metadata.columns else "neutral"
+    items = []
+    not_found_counter = 0
+    for row in metadata.itertuples():
+        if speaker_name is None and ignored_speakers is not None and row.speaker_name in ignored_speakers:
+            continue
+        audio_path = os.path.join(root_path, row.audio_file)
+        if not os.path.exists(audio_path):
+            not_found_counter += 1
+            continue
+        items.append(
+            {
+                "text": row.text,
+                "audio_file": audio_path,
+                "speaker_name": speaker_name if speaker_name is not None else row.speaker_name,
+                "emotion_name": emotion_name if emotion_name is not None else row.emotion_name,
+            }
+        )
+    if not_found_counter > 0:
+        print(f" | > [!] {not_found_counter} files not found")
+    return items
+
+
 def tweb(root_path, meta_file, **kwargs):  # pylint: disable=unused-argument
     """Normalize TWEB dataset.
     https://www.kaggle.com/bryanpark/the-world-english-bible-speech-dataset
@@ -141,6 +170,21 @@ def ljspeech_test(root_path, meta_file, **kwargs):  # pylint: disable=unused-arg
     return items

+def thorsten(root_path, meta_file, **kwargs):  # pylint: disable=unused-argument
+    """Normalizes the thorsten meta data file to TTS format
+    https://github.com/thorstenMueller/deep-learning-german-tts/"""
+    txt_file = os.path.join(root_path, meta_file)
+    items = []
+    speaker_name = "thorsten"
+    with open(txt_file, "r", encoding="utf-8") as ttf:
+        for line in ttf:
+            cols = line.split("|")
+            wav_file = os.path.join(root_path, "wavs", cols[0] + ".wav")
+            text = cols[1]
+            items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name})
+    return items
+
+
 def sam_accenture(root_path, meta_file, **kwargs):  # pylint: disable=unused-argument
     """Normalizes the sam-accenture meta data file to TTS format
     https://github.com/Sam-Accenture-Non-Binary-Voice/non-binary-voice-files"""
@@ -352,6 +396,25 @@ def vctk_old(root_path, meta_files=None, wavs_path="wav48", ignored_speakers=Non
     return items

+def synpaflex(root_path, metafiles=None, **kwargs):  # pylint: disable=unused-argument
+    items = []
+    speaker_name = "synpaflex"
+    root_path = os.path.join(root_path, "")
+    wav_files = glob(f"{root_path}**/*.wav", recursive=True)
+    for wav_file in wav_files:
+        if os.sep + "wav" + os.sep in wav_file:
+            txt_file = wav_file.replace("wav", "txt")
+        else:
+            txt_file = os.path.join(
+                os.path.dirname(wav_file), "txt", os.path.basename(wav_file).replace(".wav", ".txt")
+            )
+        if os.path.exists(txt_file) and os.path.exists(wav_file):
+            with open(txt_file, "r", encoding="utf-8") as file_text:
+                text = file_text.readlines()[0]
+            items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name})
+    return items
+
+
 def open_bible(root_path, meta_files="train", ignore_digits_sentences=True, ignored_speakers=None):
     """ToDo: Refer the paper when available"""
     items = []
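For orientation, the new `coqui` formatter reads a pipe-separated metadata file whose required columns are `audio_file` and `text` (`speaker_name` and `emotion_name` are optional). A file of that shape could be produced like this (illustrative only; the paths and sentences are made up):

```
import pandas as pd

# Columns asserted on by coqui(): "audio_file" and "text".
# "speaker_name"/"emotion_name" fall back to "coqui"/"neutral" when absent.
rows = [
    {"audio_file": "wavs/clip_0001.wav", "text": "Hello there.", "speaker_name": "spk1"},
    {"audio_file": "wavs/clip_0002.wav", "text": "A second utterance.", "speaker_name": "spk1"},
]
pd.DataFrame(rows).to_csv("metadata.csv", sep="|", index=False)
```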

View File

@@ -281,6 +281,10 @@ class TacotronLoss(torch.nn.Module):
     def __init__(self, c, ga_sigma=0.4):
         super().__init__()
         self.stopnet_pos_weight = c.stopnet_pos_weight
+        self.use_capacitron_vae = c.use_capacitron_vae
+        if self.use_capacitron_vae:
+            self.capacitron_capacity = c.capacitron_vae.capacitron_capacity
+            self.capacitron_vae_loss_alpha = c.capacitron_vae.capacitron_VAE_loss_alpha
         self.ga_alpha = c.ga_alpha
         self.decoder_diff_spec_alpha = c.decoder_diff_spec_alpha
         self.postnet_diff_spec_alpha = c.postnet_diff_spec_alpha
@@ -308,6 +312,9 @@ class TacotronLoss(torch.nn.Module):
         # pylint: disable=not-callable
         self.criterion_st = BCELossMasked(pos_weight=torch.tensor(self.stopnet_pos_weight)) if c.stopnet else None

+        # For dev pruposes only
+        self.criterion_capacitron_reconstruction_loss = nn.L1Loss(reduction="sum")
+
     def forward(
         self,
         postnet_output,
@@ -317,6 +324,7 @@ class TacotronLoss(torch.nn.Module):
         stopnet_output,
         stopnet_target,
         stop_target_length,
+        capacitron_vae_outputs,
         output_lens,
         decoder_b_output,
         alignments,
@@ -348,6 +356,55 @@ class TacotronLoss(torch.nn.Module):
         return_dict["decoder_loss"] = decoder_loss
         return_dict["postnet_loss"] = postnet_loss

+        if self.use_capacitron_vae:
+            # extract capacitron vae infos
+            posterior_distribution, prior_distribution, beta = capacitron_vae_outputs
+
+            # KL divergence term between the posterior and the prior
+            kl_term = torch.mean(torch.distributions.kl_divergence(posterior_distribution, prior_distribution))
+
+            # Limit the mutual information between the data and latent space by the variational capacity limit
+            kl_capacity = kl_term - self.capacitron_capacity
+
+            # pass beta through softplus to keep it positive
+            beta = torch.nn.functional.softplus(beta)[0]
+
+            # This is the term going to the main ADAM optimiser, we detach beta because
+            # beta is optimised by a separate, SGD optimiser below
+            capacitron_vae_loss = beta.detach() * kl_capacity
+
+            # normalize the capacitron_vae_loss as in L1Loss or MSELoss.
+            # After this, both the standard loss and capacitron_vae_loss will be in the same scale.
+            # For this reason we don't need use L1Loss and MSELoss in "sum" reduction mode.
+            # Note: the batch is not considered because the L1Loss was calculated in "sum" mode
+            # divided by the batch size, So not dividing the capacitron_vae_loss by B is legitimate.
+
+            # get B T D dimension from input
+            B, T, D = mel_input.size()
+            # normalize
+            if self.config.loss_masking:
+                # if mask loss get T using the mask
+                T = output_lens.sum() / B
+
+            # Only for dev purposes to be able to compare the reconstruction loss with the values in the
+            # original Capacitron paper
+            return_dict["capaciton_reconstruction_loss"] = (
+                self.criterion_capacitron_reconstruction_loss(decoder_output, mel_input) / decoder_output.size(0)
+            ) + kl_capacity
+
+            capacitron_vae_loss = capacitron_vae_loss / (T * D)
+            capacitron_vae_loss = capacitron_vae_loss * self.capacitron_vae_loss_alpha
+
+            # This is the term to purely optimise beta and to pass into the SGD optimizer
+            beta_loss = torch.negative(beta) * kl_capacity.detach()
+
+            loss += capacitron_vae_loss
+
+            return_dict["capacitron_vae_loss"] = capacitron_vae_loss
+            return_dict["capacitron_vae_beta_loss"] = beta_loss
+            return_dict["capacitron_vae_kl_term"] = kl_term
+            return_dict["capacitron_beta"] = beta
+
         stop_loss = (
             self.criterion_st(stopnet_output, stopnet_target, stop_target_length)
             if self.config.stopnet
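Read as equations, the block above implements the capacity-constrained objective of the Capacitron paper with a dual optimiser: the main (Adam) loss receives $\mathcal{L}_{\text{VAE}} = \operatorname{sg}[\beta]\,\bigl(\mathrm{KL}(q(z\mid x)\,\|\,p(z)) - C\bigr)$, later scaled by $\alpha/(T \cdot D)$, while the separate SGD step for $\beta$ minimises $\mathcal{L}_{\beta} = -\beta\,\operatorname{sg}\bigl[\mathrm{KL} - C\bigr]$. Here $C$ is `capacitron_capacity`, $\beta$ is kept positive through a softplus, and $\operatorname{sg}[\cdot]$ denotes `detach()` (stop-gradient).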

View File

@@ -484,4 +484,4 @@ def init_attn(
             beta=0.9,
         )
-    raise RuntimeError(" [!] Given Attention Type '{attn_type}' is not exist.")
+    raise RuntimeError(f" [!] Given Attention Type '{attn_type}' is not exist.")

View File

@@ -0,0 +1,205 @@
import torch
from torch import nn
from torch.distributions.multivariate_normal import MultivariateNormal as MVN
from torch.nn import functional as F
class CapacitronVAE(nn.Module):
"""Effective Use of Variational Embedding Capacity for prosody transfer.
See https://arxiv.org/abs/1906.03402"""
def __init__(
self,
num_mel,
capacitron_VAE_embedding_dim,
encoder_output_dim=256,
reference_encoder_out_dim=128,
speaker_embedding_dim=None,
text_summary_embedding_dim=None,
):
super().__init__()
# Init distributions
self.prior_distribution = MVN(
torch.zeros(capacitron_VAE_embedding_dim), torch.eye(capacitron_VAE_embedding_dim)
)
self.approximate_posterior_distribution = None
# define output ReferenceEncoder dim to the capacitron_VAE_embedding_dim
self.encoder = ReferenceEncoder(num_mel, out_dim=reference_encoder_out_dim)
# Init beta, the lagrange-like term for the KL distribution
self.beta = torch.nn.Parameter(torch.log(torch.exp(torch.Tensor([1.0])) - 1), requires_grad=True)
mlp_input_dimension = reference_encoder_out_dim
if text_summary_embedding_dim is not None:
self.text_summary_net = TextSummary(text_summary_embedding_dim, encoder_output_dim=encoder_output_dim)
mlp_input_dimension += text_summary_embedding_dim
if speaker_embedding_dim is not None:
# TODO: Test a multispeaker model!
mlp_input_dimension += speaker_embedding_dim
self.post_encoder_mlp = PostEncoderMLP(mlp_input_dimension, capacitron_VAE_embedding_dim)
def forward(self, reference_mel_info=None, text_info=None, speaker_embedding=None):
# Use reference
if reference_mel_info is not None:
reference_mels = reference_mel_info[0] # [batch_size, num_frames, num_mels]
mel_lengths = reference_mel_info[1] # [batch_size]
enc_out = self.encoder(reference_mels, mel_lengths)
# concat speaker_embedding and/or text summary embedding
if text_info is not None:
text_inputs = text_info[0] # [batch_size, num_characters, num_embedding]
input_lengths = text_info[1]
text_summary_out = self.text_summary_net(text_inputs, input_lengths).to(reference_mels.device)
enc_out = torch.cat([enc_out, text_summary_out], dim=-1)
if speaker_embedding is not None:
enc_out = torch.cat([enc_out, speaker_embedding], dim=-1)
# Feed the output of the ref encoder and information about text/speaker into
# an MLP to produce the parameteres for the approximate poterior distributions
mu, sigma = self.post_encoder_mlp(enc_out)
# convert to cpu because prior_distribution was created on cpu
mu = mu.cpu()
sigma = sigma.cpu()
# Sample from the posterior: z ~ q(z|x)
self.approximate_posterior_distribution = MVN(mu, torch.diag_embed(sigma))
VAE_embedding = self.approximate_posterior_distribution.rsample()
# Infer from the model, bypasses encoding
else:
# Sample from the prior: z ~ p(z)
VAE_embedding = self.prior_distribution.sample().unsqueeze(0)
# reshape to [batch_size, 1, capacitron_VAE_embedding_dim]
return VAE_embedding.unsqueeze(1), self.approximate_posterior_distribution, self.prior_distribution, self.beta
class ReferenceEncoder(nn.Module):
"""NN module creating a fixed size prosody embedding from a spectrogram.
inputs: mel spectrograms [batch_size, num_spec_frames, num_mel]
outputs: [batch_size, embedding_dim]
"""
def __init__(self, num_mel, out_dim):
super().__init__()
self.num_mel = num_mel
filters = [1] + [32, 32, 64, 64, 128, 128]
num_layers = len(filters) - 1
convs = [
nn.Conv2d(
in_channels=filters[i], out_channels=filters[i + 1], kernel_size=(3, 3), stride=(2, 2), padding=(2, 2)
)
for i in range(num_layers)
]
self.convs = nn.ModuleList(convs)
self.training = False
self.bns = nn.ModuleList([nn.BatchNorm2d(num_features=filter_size) for filter_size in filters[1:]])
post_conv_height = self.calculate_post_conv_height(num_mel, 3, 2, 2, num_layers)
self.recurrence = nn.LSTM(
input_size=filters[-1] * post_conv_height, hidden_size=out_dim, batch_first=True, bidirectional=False
)
def forward(self, inputs, input_lengths):
batch_size = inputs.size(0)
x = inputs.view(batch_size, 1, -1, self.num_mel) # [batch_size, num_channels==1, num_frames, num_mel]
valid_lengths = input_lengths.float() # [batch_size]
for conv, bn in zip(self.convs, self.bns):
x = conv(x)
x = bn(x)
x = F.relu(x)
# Create the post conv width mask based on the valid lengths of the output of the convolution.
# The valid lengths for the output of a convolution on varying length inputs is
# ceil(input_length/stride) + 1 for stride=3 and padding=2
# For example (kernel_size=3, stride=2, padding=2):
# 0 0 x x x x x 0 0 -> Input = 5, 0 is zero padding, x is valid values coming from padding=2 in conv2d
# _____
# x _____
# x _____
# x ____
# x
# x x x x -> Output valid length = 4
# Since every example in te batch is zero padded and therefore have separate valid_lengths,
# we need to mask off all the values AFTER the valid length for each example in the batch.
# Otherwise, the convolutions create noise and a lot of not real information
valid_lengths = (valid_lengths / 2).float()
valid_lengths = torch.ceil(valid_lengths).to(dtype=torch.int64) + 1 # 2 is stride -- size: [batch_size]
post_conv_max_width = x.size(2)
mask = torch.arange(post_conv_max_width).to(inputs.device).expand(
len(valid_lengths), post_conv_max_width
) < valid_lengths.unsqueeze(1)
mask = mask.expand(1, 1, -1, -1).transpose(2, 0).transpose(-1, 2) # [batch_size, 1, post_conv_max_width, 1]
x = x * mask
x = x.transpose(1, 2)
# x: 4D tensor [batch_size, post_conv_width,
# num_channels==128, post_conv_height]
post_conv_width = x.size(1)
x = x.contiguous().view(batch_size, post_conv_width, -1)
# x: 3D tensor [batch_size, post_conv_width,
# num_channels*post_conv_height]
# Routine for fetching the last valid output of a dynamic LSTM with varying input lengths and padding
post_conv_input_lengths = valid_lengths
packed_seqs = nn.utils.rnn.pack_padded_sequence(
x, post_conv_input_lengths.tolist(), batch_first=True, enforce_sorted=False
) # dynamic rnn sequence padding
self.recurrence.flatten_parameters()
_, (ht, _) = self.recurrence(packed_seqs)
last_output = ht[-1]
return last_output.to(inputs.device) # [B, 128]
@staticmethod
def calculate_post_conv_height(height, kernel_size, stride, pad, n_convs):
"""Height of spec after n convolutions with fixed kernel/stride/pad."""
for _ in range(n_convs):
height = (height - kernel_size + 2 * pad) // stride + 1
return height
class TextSummary(nn.Module):
def __init__(self, embedding_dim, encoder_output_dim):
super().__init__()
self.lstm = nn.LSTM(
encoder_output_dim, # text embedding dimension from the text encoder
embedding_dim, # fixed length output summary the lstm creates from the input
batch_first=True,
bidirectional=False,
)
def forward(self, inputs, input_lengths):
# Routine for fetching the last valid output of a dynamic LSTM with varying input lengths and padding
packed_seqs = nn.utils.rnn.pack_padded_sequence(
inputs, input_lengths.tolist(), batch_first=True, enforce_sorted=False
) # dynamic rnn sequence padding
self.lstm.flatten_parameters()
_, (ht, _) = self.lstm(packed_seqs)
last_output = ht[-1]
return last_output
class PostEncoderMLP(nn.Module):
def __init__(self, input_size, hidden_size):
super().__init__()
self.hidden_size = hidden_size
modules = [
nn.Linear(input_size, hidden_size), # Hidden Layer
nn.Tanh(),
nn.Linear(hidden_size, hidden_size * 2),
] # Output layer twice the size for mean and variance
self.net = nn.Sequential(*modules)
self.softplus = nn.Softplus()
def forward(self, _input):
mlp_output = self.net(_input)
# The mean parameter is unconstrained
mu = mlp_output[:, : self.hidden_size]
# The standard deviation must be positive. Parameterise with a softplus
sigma = self.softplus(mlp_output[:, self.hidden_size :])
return mu, sigma
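As a quick shape check of the new module (a sketch only: the import path is assumed to be `TTS/tts/layers/tacotron/capacitron_layers.py`, which the hunk header above does not show, and all dimensions are arbitrary):

```
import torch
from TTS.tts.layers.tacotron.capacitron_layers import CapacitronVAE

B, T, num_mel = 2, 64, 80
layer = CapacitronVAE(num_mel=num_mel, capacitron_VAE_embedding_dim=128)

mels = torch.randn(B, T, num_mel)   # [batch, frames, mels]
mel_lens = torch.tensor([T, T])     # per-sample frame counts

emb, posterior, prior, beta = layer(reference_mel_info=[mels, mel_lens])
print(emb.shape)  # torch.Size([2, 1, 128]); concatenated onto the encoder outputs downstream
```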

View File

@@ -139,7 +139,7 @@ class MultiHeadAttention(nn.Module):
         keys = torch.stack(torch.split(keys, split_size, dim=2), dim=0)  # [h, N, T_k, num_units/h]
         values = torch.stack(torch.split(values, split_size, dim=2), dim=0)  # [h, N, T_k, num_units/h]

-        # score = softmax(QK^T / (d_k ** 0.5))
+        # score = softmax(QK^T / (d_k**0.5))
         scores = torch.matmul(queries, keys.transpose(2, 3))  # [h, N, T_q, T_k]
         scores = scores / (self.key_dim**0.5)
         scores = F.softmax(scores, dim=3)

View File

@@ -58,10 +58,8 @@ class VitsDiscriminator(nn.Module):
         use_spectral_norm (bool): if `True` swith to spectral norm instead of weight norm.
     """

-    def __init__(self, use_spectral_norm=False):
+    def __init__(self, periods=(2, 3, 5, 7, 11), use_spectral_norm=False):
         super().__init__()
-        periods = [2, 3, 5, 7, 11]
         self.nets = nn.ModuleList()
         self.nets.append(DiscriminatorS(use_spectral_norm=use_spectral_norm))
         self.nets.extend([DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods])

View File

@@ -1,6 +1,6 @@
 import copy
 from abc import abstractmethod
-from typing import Dict
+from typing import Dict, Tuple

 import torch
 from coqpit import Coqpit
@@ -10,7 +10,9 @@ from TTS.tts.layers.losses import TacotronLoss
 from TTS.tts.models.base_tts import BaseTTS
 from TTS.tts.utils.helpers import sequence_mask
 from TTS.tts.utils.speakers import SpeakerManager
+from TTS.tts.utils.synthesis import synthesis
 from TTS.tts.utils.text.tokenizer import TTSTokenizer
+from TTS.tts.utils.visual import plot_alignment, plot_spectrogram
 from TTS.utils.generic_utils import format_aux_input
 from TTS.utils.io import load_fsspec
 from TTS.utils.training import gradual_training_scheduler
@@ -47,6 +49,11 @@ class BaseTacotron(BaseTTS):
             self.decoder_in_features += self.gst.gst_embedding_dim  # add gst embedding dim
             self.gst_layer = None

+        # Capacitron
+        if self.capacitron_vae and self.use_capacitron_vae:
+            self.decoder_in_features += self.capacitron_vae.capacitron_VAE_embedding_dim  # add capacitron embedding dim
+            self.capacitron_vae_layer = None
+
         # additional layers
         self.decoder_backward = None
         self.coarse_decoder = None
@@ -125,6 +132,53 @@ class BaseTacotron(BaseTTS):
         speaker_manager = SpeakerManager.init_from_config(config)
         return BaseTacotron(config, ap, tokenizer, speaker_manager)

+    ##########################
+    # TEST AND LOG FUNCTIONS #
+    ##########################
+
+    def test_run(self, assets: Dict) -> Tuple[Dict, Dict]:
+        """Generic test run for `tts` models used by `Trainer`.
+
+        You can override this for a different behaviour.
+
+        Args:
+            assets (dict): A dict of training assets. For `tts` models, it must include `{'audio_processor': ap}`.
+
+        Returns:
+            Tuple[Dict, Dict]: Test figures and audios to be projected to Tensorboard.
+        """
+        print(" | > Synthesizing test sentences.")
+        test_audios = {}
+        test_figures = {}
+        test_sentences = self.config.test_sentences
+        aux_inputs = self._get_test_aux_input()
+        for idx, sen in enumerate(test_sentences):
+            outputs_dict = synthesis(
+                self,
+                sen,
+                self.config,
+                "cuda" in str(next(self.parameters()).device),
+                speaker_id=aux_inputs["speaker_id"],
+                d_vector=aux_inputs["d_vector"],
+                style_wav=aux_inputs["style_wav"],
+                use_griffin_lim=True,
+                do_trim_silence=False,
+            )
+            test_audios["{}-audio".format(idx)] = outputs_dict["wav"]
+            test_figures["{}-prediction".format(idx)] = plot_spectrogram(
+                outputs_dict["outputs"]["model_outputs"], self.ap, output_fig=False
+            )
+            test_figures["{}-alignment".format(idx)] = plot_alignment(
+                outputs_dict["outputs"]["alignments"], output_fig=False
+            )
+        return {"figures": test_figures, "audios": test_audios}
+
+    def test_log(
+        self, outputs: dict, logger: "Logger", assets: dict, steps: int  # pylint: disable=unused-argument
+    ) -> None:
+        logger.test_audios(steps, outputs["audios"], self.ap.sample_rate)
+        logger.test_figures(steps, outputs["figures"])
+
     #############################
     # COMMON COMPUTE FUNCTIONS
     #############################
@@ -160,7 +214,9 @@ class BaseTacotron(BaseTTS):
             )
             # scale_factor = self.decoder.r_init / self.decoder.r
             alignments_backward = torch.nn.functional.interpolate(
-                alignments_backward.transpose(1, 2), size=alignments.shape[1], mode="nearest"
+                alignments_backward.transpose(1, 2),
+                size=alignments.shape[1],
+                mode="nearest",
             ).transpose(1, 2)
             decoder_outputs_backward = decoder_outputs_backward.transpose(1, 2)
             decoder_outputs_backward = decoder_outputs_backward[:, :T, :]
@@ -193,6 +249,25 @@ class BaseTacotron(BaseTTS):
             inputs = self._concat_speaker_embedding(inputs, gst_outputs)
         return inputs

+    def compute_capacitron_VAE_embedding(self, inputs, reference_mel_info, text_info=None, speaker_embedding=None):
+        """Capacitron Variational Autoencoder"""
+        (VAE_outputs, posterior_distribution, prior_distribution, capacitron_beta,) = self.capacitron_vae_layer(
+            reference_mel_info,
+            text_info,
+            speaker_embedding,  # pylint: disable=not-callable
+        )
+
+        VAE_outputs = VAE_outputs.to(inputs.device)
+        encoder_output = self._concat_speaker_embedding(
+            inputs, VAE_outputs
+        )  # concatenate to the output of the basic tacotron encoder
+        return (
+            encoder_output,
+            posterior_distribution,
+            prior_distribution,
+            capacitron_beta,
+        )
+
     @staticmethod
     def _add_speaker_embedding(outputs, embedded_speakers):
         embedded_speakers_ = embedded_speakers.expand(outputs.size(0), outputs.size(1), -1)

View File

@ -12,6 +12,7 @@ from trainer.torch import DistributedSampler, DistributedSamplerWrapper
from TTS.model import BaseTrainerModel from TTS.model import BaseTrainerModel
from TTS.tts.datasets.dataset import TTSDataset from TTS.tts.datasets.dataset import TTSDataset
from TTS.tts.utils.data import get_length_balancer_weights
from TTS.tts.utils.languages import LanguageManager, get_language_balancer_weights from TTS.tts.utils.languages import LanguageManager, get_language_balancer_weights
from TTS.tts.utils.speakers import SpeakerManager, get_speaker_balancer_weights, get_speaker_manager from TTS.tts.utils.speakers import SpeakerManager, get_speaker_balancer_weights, get_speaker_manager
from TTS.tts.utils.synthesis import synthesis from TTS.tts.utils.synthesis import synthesis
@ -250,6 +251,14 @@ class BaseTTS(BaseTrainerModel):
else: else:
weights = get_speaker_balancer_weights(data_items) * alpha weights = get_speaker_balancer_weights(data_items) * alpha
if getattr(config, "use_length_weighted_sampler", False):
alpha = getattr(config, "length_weighted_sampler_alpha", 1.0)
print(" > Using Length weighted sampler with alpha:", alpha)
if weights is not None:
weights += get_length_balancer_weights(data_items) * alpha
else:
weights = get_length_balancer_weights(data_items) * alpha
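A minimal sketch of the two new config fields read above; any object exposing these attributes satisfies the `getattr()` checks, so `SimpleNamespace` stands in for the real Coqpit config:

    from types import SimpleNamespace

    config = SimpleNamespace(
        use_length_weighted_sampler=True,   # enable bucket-based length balancing
        length_weighted_sampler_alpha=1.0,  # scale of the length weights when summed with other sampler weights
    )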
if weights is not None: if weights is not None:
sampler = WeightedRandomSampler(weights, len(weights)) sampler = WeightedRandomSampler(weights, len(weights))
else: else:
@ -398,16 +407,16 @@ class BaseTTS(BaseTrainerModel):
return test_figures, test_audios return test_figures, test_audios
def on_init_start(self, trainer): def on_init_start(self, trainer):
"""Save the speaker.json and language_ids.json at the beginning of the training. Also update both paths.""" """Save the speaker.pth and language_ids.json at the beginning of the training. Also update both paths."""
if self.speaker_manager is not None: if self.speaker_manager is not None:
output_path = os.path.join(trainer.output_path, "speakers.json") output_path = os.path.join(trainer.output_path, "speakers.pth")
self.speaker_manager.save_ids_to_file(output_path) self.speaker_manager.save_ids_to_file(output_path)
trainer.config.speakers_file = output_path trainer.config.speakers_file = output_path
# some models don't have `model_args` set # some models don't have `model_args` set
if hasattr(trainer.config, "model_args"): if hasattr(trainer.config, "model_args"):
trainer.config.model_args.speakers_file = output_path trainer.config.model_args.speakers_file = output_path
trainer.config.save_json(os.path.join(trainer.output_path, "config.json")) trainer.config.save_json(os.path.join(trainer.output_path, "config.json"))
print(f" > `speakers.json` is saved to {output_path}.") print(f" > `speakers.pth` is saved to {output_path}.")
print(" > `speakers_file` is updated in the config.json.") print(" > `speakers_file` is updated in the config.json.")
if hasattr(self, "language_manager") and self.language_manager is not None: if hasattr(self, "language_manager") and self.language_manager is not None:

View File

@ -1,11 +1,13 @@
# coding: utf-8 # coding: utf-8
from typing import Dict, List, Union from typing import Dict, List, Tuple, Union
import torch import torch
from torch import nn from torch import nn
from torch.cuda.amp.autocast_mode import autocast from torch.cuda.amp.autocast_mode import autocast
from trainer.trainer_utils import get_optimizer, get_scheduler
from TTS.tts.layers.tacotron.capacitron_layers import CapacitronVAE
from TTS.tts.layers.tacotron.gst_layers import GST from TTS.tts.layers.tacotron.gst_layers import GST
from TTS.tts.layers.tacotron.tacotron import Decoder, Encoder, PostCBHG from TTS.tts.layers.tacotron.tacotron import Decoder, Encoder, PostCBHG
from TTS.tts.models.base_tacotron import BaseTacotron from TTS.tts.models.base_tacotron import BaseTacotron
@ -13,6 +15,7 @@ from TTS.tts.utils.measures import alignment_diagonal_score
from TTS.tts.utils.speakers import SpeakerManager from TTS.tts.utils.speakers import SpeakerManager
from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.tts.utils.visual import plot_alignment, plot_spectrogram from TTS.tts.utils.visual import plot_alignment, plot_spectrogram
from TTS.utils.capacitron_optimizer import CapacitronOptimizer
class Tacotron(BaseTacotron): class Tacotron(BaseTacotron):
@ -51,6 +54,9 @@ class Tacotron(BaseTacotron):
if self.use_gst: if self.use_gst:
self.decoder_in_features += self.gst.gst_embedding_dim self.decoder_in_features += self.gst.gst_embedding_dim
if self.use_capacitron_vae:
self.decoder_in_features += self.capacitron_vae.capacitron_VAE_embedding_dim
# embedding layer # embedding layer
self.embedding = nn.Embedding(self.num_chars, 256, padding_idx=0) self.embedding = nn.Embedding(self.num_chars, 256, padding_idx=0)
self.embedding.weight.data.normal_(0, 0.3) self.embedding.weight.data.normal_(0, 0.3)
@ -90,6 +96,20 @@ class Tacotron(BaseTacotron):
gst_embedding_dim=self.gst.gst_embedding_dim, gst_embedding_dim=self.gst.gst_embedding_dim,
) )
# Capacitron layers
if self.capacitron_vae and self.use_capacitron_vae:
self.capacitron_vae_layer = CapacitronVAE(
num_mel=self.decoder_output_dim,
encoder_output_dim=self.encoder_in_features,
capacitron_VAE_embedding_dim=self.capacitron_vae.capacitron_VAE_embedding_dim,
speaker_embedding_dim=self.embedded_speaker_dim
if self.use_speaker_embedding and self.capacitron_vae.capacitron_use_speaker_embedding
else None,
text_summary_embedding_dim=self.capacitron_vae.capacitron_text_summary_embedding_dim
if self.capacitron_vae.capacitron_use_text_summary_embeddings
else None,
)
# backward pass decoder # backward pass decoder
if self.bidirectional_decoder: if self.bidirectional_decoder:
self._init_backward_decoder() self._init_backward_decoder()
@ -146,6 +166,19 @@ class Tacotron(BaseTacotron):
# B x 1 x speaker_embed_dim # B x 1 x speaker_embed_dim
embedded_speakers = torch.unsqueeze(aux_input["d_vectors"], 1) embedded_speakers = torch.unsqueeze(aux_input["d_vectors"], 1)
encoder_outputs = self._concat_speaker_embedding(encoder_outputs, embedded_speakers) encoder_outputs = self._concat_speaker_embedding(encoder_outputs, embedded_speakers)
# Capacitron
if self.capacitron_vae and self.use_capacitron_vae:
# B x capacitron_VAE_embedding_dim
encoder_outputs, *capacitron_vae_outputs = self.compute_capacitron_VAE_embedding(
encoder_outputs,
reference_mel_info=[mel_specs, mel_lengths],
text_info=[inputs, text_lengths]
if self.capacitron_vae.capacitron_use_text_summary_embeddings
else None,
speaker_embedding=embedded_speakers if self.capacitron_vae.capacitron_use_speaker_embedding else None,
)
else:
capacitron_vae_outputs = None
# decoder_outputs: B x decoder_in_features x T_out # decoder_outputs: B x decoder_in_features x T_out
# alignments: B x T_in x encoder_in_features # alignments: B x T_in x encoder_in_features
# stop_tokens: B x T_in # stop_tokens: B x T_in
@ -178,6 +211,7 @@ class Tacotron(BaseTacotron):
"decoder_outputs": decoder_outputs, "decoder_outputs": decoder_outputs,
"alignments": alignments, "alignments": alignments,
"stop_tokens": stop_tokens, "stop_tokens": stop_tokens,
"capacitron_vae_outputs": capacitron_vae_outputs,
} }
) )
return outputs return outputs
@ -190,6 +224,28 @@ class Tacotron(BaseTacotron):
if self.gst and self.use_gst: if self.gst and self.use_gst:
# B x gst_dim # B x gst_dim
encoder_outputs = self.compute_gst(encoder_outputs, aux_input["style_mel"], aux_input["d_vectors"]) encoder_outputs = self.compute_gst(encoder_outputs, aux_input["style_mel"], aux_input["d_vectors"])
if self.capacitron_vae and self.use_capacitron_vae:
if aux_input["style_text"] is not None:
style_text_embedding = self.embedding(aux_input["style_text"])
style_text_length = torch.tensor([style_text_embedding.size(1)], dtype=torch.int64).to(
encoder_outputs.device
) # pylint: disable=not-callable
reference_mel_length = (
torch.tensor([aux_input["style_mel"].size(1)], dtype=torch.int64).to(encoder_outputs.device)
if aux_input["style_mel"] is not None
else None
) # pylint: disable=not-callable
# B x capacitron_VAE_embedding_dim
encoder_outputs, *_ = self.compute_capacitron_VAE_embedding(
encoder_outputs,
reference_mel_info=[aux_input["style_mel"], reference_mel_length]
if aux_input["style_mel"] is not None
else None,
text_info=[style_text_embedding, style_text_length] if aux_input["style_text"] is not None else None,
speaker_embedding=aux_input["d_vectors"]
if self.capacitron_vae.capacitron_use_speaker_embedding
else None,
)
if self.num_speakers > 1: if self.num_speakers > 1:
if not self.use_d_vector_file: if not self.use_d_vector_file:
# B x 1 x speaker_embed_dim # B x 1 x speaker_embed_dim
@ -215,12 +271,19 @@ class Tacotron(BaseTacotron):
} }
return outputs return outputs
def train_step(self, batch, criterion): def before_backward_pass(self, loss_dict, optimizer) -> None:
"""Perform a single training step by fetching the right set if samples from the batch. # Extracting custom training specific operations for capacitron
# from the trainer
if self.use_capacitron_vae:
loss_dict["capacitron_vae_beta_loss"].backward()
optimizer.first_step()
def train_step(self, batch: Dict, criterion: torch.nn.Module) -> Tuple[Dict, Dict]:
"""Perform a single training step by fetching the right set of samples from the batch.
Args: Args:
batch ([type]): [description] batch ([Dict]): A dictionary of input tensors.
criterion ([type]): [description] criterion ([torch.nn.Module]): Callable criterion to compute model loss.
""" """
text_input = batch["text_input"] text_input = batch["text_input"]
text_lengths = batch["text_lengths"] text_lengths = batch["text_lengths"]
@ -232,14 +295,8 @@ class Tacotron(BaseTacotron):
speaker_ids = batch["speaker_ids"] speaker_ids = batch["speaker_ids"]
d_vectors = batch["d_vectors"] d_vectors = batch["d_vectors"]
# forward pass model aux_input = {"speaker_ids": speaker_ids, "d_vectors": d_vectors}
outputs = self.forward( outputs = self.forward(text_input, text_lengths, mel_input, mel_lengths, aux_input)
text_input,
text_lengths,
mel_input,
mel_lengths,
aux_input={"speaker_ids": speaker_ids, "d_vectors": d_vectors},
)
# set the [alignment] lengths wrt reduction factor for guided attention # set the [alignment] lengths wrt reduction factor for guided attention
if mel_lengths.max() % self.decoder.r != 0: if mel_lengths.max() % self.decoder.r != 0:
@ -249,9 +306,6 @@ class Tacotron(BaseTacotron):
else: else:
alignment_lengths = mel_lengths // self.decoder.r alignment_lengths = mel_lengths // self.decoder.r
aux_input = {"speaker_ids": speaker_ids, "d_vectors": d_vectors}
outputs = self.forward(text_input, text_lengths, mel_input, mel_lengths, aux_input)
# compute loss # compute loss
with autocast(enabled=False): # use float32 for the criterion with autocast(enabled=False): # use float32 for the criterion
loss_dict = criterion( loss_dict = criterion(
@ -262,6 +316,7 @@ class Tacotron(BaseTacotron):
outputs["stop_tokens"].float(), outputs["stop_tokens"].float(),
stop_targets.float(), stop_targets.float(),
stop_target_lengths, stop_target_lengths,
outputs["capacitron_vae_outputs"] if self.capacitron_vae else None,
mel_lengths, mel_lengths,
None if outputs["decoder_outputs_backward"] is None else outputs["decoder_outputs_backward"].float(), None if outputs["decoder_outputs_backward"] is None else outputs["decoder_outputs_backward"].float(),
outputs["alignments"].float(), outputs["alignments"].float(),
@ -275,6 +330,25 @@ class Tacotron(BaseTacotron):
loss_dict["align_error"] = align_error loss_dict["align_error"] = align_error
return outputs, loss_dict return outputs, loss_dict
def get_optimizer(self) -> List:
if self.use_capacitron_vae:
return CapacitronOptimizer(self.config, self.named_parameters())
return get_optimizer(self.config.optimizer, self.config.optimizer_params, self.config.lr, self)
def get_scheduler(self, optimizer: object):
opt = optimizer.primary_optimizer if self.use_capacitron_vae else optimizer
return get_scheduler(self.config.lr_scheduler, self.config.lr_scheduler_params, opt)
def before_gradient_clipping(self):
if self.use_capacitron_vae:
# Capacitron model specific gradient clipping
model_params_to_clip = []
for name, param in self.named_parameters():
if param.requires_grad:
if name != "capacitron_vae_layer.beta":
model_params_to_clip.append(param)
torch.nn.utils.clip_grad_norm_(model_params_to_clip, self.capacitron_vae.capacitron_grad_clip)
def _create_logs(self, batch, outputs, ap): def _create_logs(self, batch, outputs, ap):
postnet_outputs = outputs["model_outputs"] postnet_outputs = outputs["model_outputs"]
decoder_outputs = outputs["decoder_outputs"] decoder_outputs = outputs["decoder_outputs"]

View File

@ -5,7 +5,9 @@ from typing import Dict, List, Union
import torch import torch
from torch import nn from torch import nn
from torch.cuda.amp.autocast_mode import autocast from torch.cuda.amp.autocast_mode import autocast
from trainer.trainer_utils import get_optimizer, get_scheduler
from TTS.tts.layers.tacotron.capacitron_layers import CapacitronVAE
from TTS.tts.layers.tacotron.gst_layers import GST from TTS.tts.layers.tacotron.gst_layers import GST
from TTS.tts.layers.tacotron.tacotron2 import Decoder, Encoder, Postnet from TTS.tts.layers.tacotron.tacotron2 import Decoder, Encoder, Postnet
from TTS.tts.models.base_tacotron import BaseTacotron from TTS.tts.models.base_tacotron import BaseTacotron
@ -13,6 +15,7 @@ from TTS.tts.utils.measures import alignment_diagonal_score
from TTS.tts.utils.speakers import SpeakerManager from TTS.tts.utils.speakers import SpeakerManager
from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.tts.utils.visual import plot_alignment, plot_spectrogram from TTS.tts.utils.visual import plot_alignment, plot_spectrogram
from TTS.utils.capacitron_optimizer import CapacitronOptimizer
class Tacotron2(BaseTacotron): class Tacotron2(BaseTacotron):
@ -65,6 +68,9 @@ class Tacotron2(BaseTacotron):
if self.use_gst: if self.use_gst:
self.decoder_in_features += self.gst.gst_embedding_dim self.decoder_in_features += self.gst.gst_embedding_dim
if self.use_capacitron_vae:
self.decoder_in_features += self.capacitron_vae.capacitron_VAE_embedding_dim
# embedding layer # embedding layer
self.embedding = nn.Embedding(self.num_chars, 512, padding_idx=0) self.embedding = nn.Embedding(self.num_chars, 512, padding_idx=0)
@ -102,6 +108,20 @@ class Tacotron2(BaseTacotron):
gst_embedding_dim=self.gst.gst_embedding_dim, gst_embedding_dim=self.gst.gst_embedding_dim,
) )
# Capacitron VAE Layers
if self.capacitron_vae and self.use_capacitron_vae:
self.capacitron_vae_layer = CapacitronVAE(
num_mel=self.decoder_output_dim,
encoder_output_dim=self.encoder_in_features,
capacitron_VAE_embedding_dim=self.capacitron_vae.capacitron_VAE_embedding_dim,
speaker_embedding_dim=self.embedded_speaker_dim
if self.capacitron_vae.capacitron_use_speaker_embedding
else None,
text_summary_embedding_dim=self.capacitron_vae.capacitron_text_summary_embedding_dim
if self.capacitron_vae.capacitron_use_text_summary_embeddings
else None,
)
# backward pass decoder # backward pass decoder
if self.bidirectional_decoder: if self.bidirectional_decoder:
self._init_backward_decoder() self._init_backward_decoder()
@ -166,6 +186,20 @@ class Tacotron2(BaseTacotron):
embedded_speakers = torch.unsqueeze(aux_input["d_vectors"], 1) embedded_speakers = torch.unsqueeze(aux_input["d_vectors"], 1)
encoder_outputs = self._concat_speaker_embedding(encoder_outputs, embedded_speakers) encoder_outputs = self._concat_speaker_embedding(encoder_outputs, embedded_speakers)
# capacitron
if self.capacitron_vae and self.use_capacitron_vae:
# B x capacitron_VAE_embedding_dim
encoder_outputs, *capacitron_vae_outputs = self.compute_capacitron_VAE_embedding(
encoder_outputs,
reference_mel_info=[mel_specs, mel_lengths],
text_info=[embedded_inputs.transpose(1, 2), text_lengths]
if self.capacitron_vae.capacitron_use_text_summary_embeddings
else None,
speaker_embedding=embedded_speakers if self.capacitron_vae.capacitron_use_speaker_embedding else None,
)
else:
capacitron_vae_outputs = None
encoder_outputs = encoder_outputs * input_mask.unsqueeze(2).expand_as(encoder_outputs) encoder_outputs = encoder_outputs * input_mask.unsqueeze(2).expand_as(encoder_outputs)
# B x mel_dim x T_out -- B x T_out//r x T_in -- B x T_out//r # B x mel_dim x T_out -- B x T_out//r x T_in -- B x T_out//r
@ -197,6 +231,7 @@ class Tacotron2(BaseTacotron):
"decoder_outputs": decoder_outputs, "decoder_outputs": decoder_outputs,
"alignments": alignments, "alignments": alignments,
"stop_tokens": stop_tokens, "stop_tokens": stop_tokens,
"capacitron_vae_outputs": capacitron_vae_outputs,
} }
) )
return outputs return outputs
@ -217,6 +252,29 @@ class Tacotron2(BaseTacotron):
# B x gst_dim # B x gst_dim
encoder_outputs = self.compute_gst(encoder_outputs, aux_input["style_mel"], aux_input["d_vectors"]) encoder_outputs = self.compute_gst(encoder_outputs, aux_input["style_mel"], aux_input["d_vectors"])
if self.capacitron_vae and self.use_capacitron_vae:
if aux_input["style_text"] is not None:
style_text_embedding = self.embedding(aux_input["style_text"])
style_text_length = torch.tensor([style_text_embedding.size(1)], dtype=torch.int64).to(
encoder_outputs.device
) # pylint: disable=not-callable
reference_mel_length = (
torch.tensor([aux_input["style_mel"].size(1)], dtype=torch.int64).to(encoder_outputs.device)
if aux_input["style_mel"] is not None
else None
) # pylint: disable=not-callable
# B x capacitron_VAE_embedding_dim
encoder_outputs, *_ = self.compute_capacitron_VAE_embedding(
encoder_outputs,
reference_mel_info=[aux_input["style_mel"], reference_mel_length]
if aux_input["style_mel"] is not None
else None,
text_info=[style_text_embedding, style_text_length] if aux_input["style_text"] is not None else None,
speaker_embedding=aux_input["d_vectors"]
if self.capacitron_vae.capacitron_use_speaker_embedding
else None,
)
if self.num_speakers > 1: if self.num_speakers > 1:
if not self.use_d_vector_file: if not self.use_d_vector_file:
embedded_speakers = self.speaker_embedding(aux_input["speaker_ids"])[None] embedded_speakers = self.speaker_embedding(aux_input["speaker_ids"])[None]
@ -242,6 +300,13 @@ class Tacotron2(BaseTacotron):
} }
return outputs return outputs
def before_backward_pass(self, loss_dict, optimizer) -> None:
# Extracting custom training specific operations for capacitron
# from the trainer
if self.use_capacitron_vae:
loss_dict["capacitron_vae_beta_loss"].backward()
optimizer.first_step()
def train_step(self, batch: Dict, criterion: torch.nn.Module): def train_step(self, batch: Dict, criterion: torch.nn.Module):
"""A single training step. Forward pass and loss computation. """A single training step. Forward pass and loss computation.
@ -258,14 +323,8 @@ class Tacotron2(BaseTacotron):
speaker_ids = batch["speaker_ids"] speaker_ids = batch["speaker_ids"]
d_vectors = batch["d_vectors"] d_vectors = batch["d_vectors"]
# forward pass model aux_input = {"speaker_ids": speaker_ids, "d_vectors": d_vectors}
outputs = self.forward( outputs = self.forward(text_input, text_lengths, mel_input, mel_lengths, aux_input)
text_input,
text_lengths,
mel_input,
mel_lengths,
aux_input={"speaker_ids": speaker_ids, "d_vectors": d_vectors},
)
# set the [alignment] lengths wrt reduction factor for guided attention # set the [alignment] lengths wrt reduction factor for guided attention
if mel_lengths.max() % self.decoder.r != 0: if mel_lengths.max() % self.decoder.r != 0:
@ -275,9 +334,6 @@ class Tacotron2(BaseTacotron):
else: else:
alignment_lengths = mel_lengths // self.decoder.r alignment_lengths = mel_lengths // self.decoder.r
aux_input = {"speaker_ids": speaker_ids, "d_vectors": d_vectors}
outputs = self.forward(text_input, text_lengths, mel_input, mel_lengths, aux_input)
# compute loss # compute loss
with autocast(enabled=False): # use float32 for the criterion with autocast(enabled=False): # use float32 for the criterion
loss_dict = criterion( loss_dict = criterion(
@ -288,6 +344,7 @@ class Tacotron2(BaseTacotron):
outputs["stop_tokens"].float(), outputs["stop_tokens"].float(),
stop_targets.float(), stop_targets.float(),
stop_target_lengths, stop_target_lengths,
outputs["capacitron_vae_outputs"] if self.capacitron_vae else None,
mel_lengths, mel_lengths,
None if outputs["decoder_outputs_backward"] is None else outputs["decoder_outputs_backward"].float(), None if outputs["decoder_outputs_backward"] is None else outputs["decoder_outputs_backward"].float(),
outputs["alignments"].float(), outputs["alignments"].float(),
@ -301,6 +358,25 @@ class Tacotron2(BaseTacotron):
loss_dict["align_error"] = align_error loss_dict["align_error"] = align_error
return outputs, loss_dict return outputs, loss_dict
def get_optimizer(self) -> List:
if self.use_capacitron_vae:
return CapacitronOptimizer(self.config, self.named_parameters())
return get_optimizer(self.config.optimizer, self.config.optimizer_params, self.config.lr, self)
def get_scheduler(self, optimizer: object):
opt = optimizer.primary_optimizer if self.use_capacitron_vae else optimizer
return get_scheduler(self.config.lr_scheduler, self.config.lr_scheduler_params, opt)
def before_gradient_clipping(self):
if self.use_capacitron_vae:
# Capacitron model specific gradient clipping
model_params_to_clip = []
for name, param in self.named_parameters():
if param.requires_grad:
if name != "capacitron_vae_layer.beta":
model_params_to_clip.append(param)
torch.nn.utils.clip_grad_norm_(model_params_to_clip, self.capacitron_vae.capacitron_grad_clip)
def _create_logs(self, batch, outputs, ap): def _create_logs(self, batch, outputs, ap):
"""Create dashboard log information.""" """Create dashboard log information."""
postnet_outputs = outputs["model_outputs"] postnet_outputs = outputs["model_outputs"]

View File

@ -41,6 +41,23 @@ hann_window = {}
mel_basis = {} mel_basis = {}
@torch.no_grad()
def weights_reset(m: nn.Module):
# check if the current module has reset_parameters and, if so, reset its weights
reset_parameters = getattr(m, "reset_parameters", None)
if callable(reset_parameters):
m.reset_parameters()
def get_module_weights_sum(mdl: nn.Module):
dict_sums = {}
for name, w in mdl.named_parameters():
if "weight" in name:
value = w.data.sum().item()
dict_sums[name] = value
return dict_sums
def load_audio(file_path): def load_audio(file_path):
"""Load the audio file normalized in [-1, 1] """Load the audio file normalized in [-1, 1]
@ -189,15 +206,20 @@ def wav_to_mel(y, n_fft, num_mels, sample_rate, hop_length, win_length, fmin, fm
class VitsDataset(TTSDataset): class VitsDataset(TTSDataset):
def __init__(self, *args, **kwargs): def __init__(self, model_args, *args, **kwargs):
super().__init__(*args, **kwargs) super().__init__(*args, **kwargs)
self.pad_id = self.tokenizer.characters.pad_id self.pad_id = self.tokenizer.characters.pad_id
self.model_args = model_args
def __getitem__(self, idx): def __getitem__(self, idx):
item = self.samples[idx] item = self.samples[idx]
raw_text = item["text"] raw_text = item["text"]
wav, _ = load_audio(item["audio_file"]) wav, _ = load_audio(item["audio_file"])
if self.model_args.encoder_sample_rate is not None:
if wav.size(1) % self.model_args.encoder_sample_rate != 0:
wav = wav[:, : -int(wav.size(1) % self.model_args.encoder_sample_rate)]
wav_filename = os.path.basename(item["audio_file"]) wav_filename = os.path.basename(item["audio_file"])
token_ids = self.get_token_ids(idx, item["text"]) token_ids = self.get_token_ids(idx, item["text"])
@ -362,6 +384,9 @@ class VitsArgs(Coqpit):
upsample_kernel_sizes_decoder (List[int]): upsample_kernel_sizes_decoder (List[int]):
Kernel sizes for each upsampling layer of the decoder network. Defaults to `[16, 16, 4, 4]`. Kernel sizes for each upsampling layer of the decoder network. Defaults to `[16, 16, 4, 4]`.
periods_multi_period_discriminator (List[int]):
Period values for the Vits Multi-Period Discriminator. Defaults to `[2, 3, 5, 7, 11]`.
use_sdp (bool): use_sdp (bool):
Use Stochastic Duration Predictor. Defaults to True. Use Stochastic Duration Predictor. Defaults to True.
@ -451,6 +476,18 @@ class VitsArgs(Coqpit):
freeze_waveform_decoder (bool): freeze_waveform_decoder (bool):
Freeze the waveform decoder weights during training. Defaults to False. Freeze the waveform decoder weights during training. Defaults to False.
encoder_sample_rate (int):
If not None, this sample rate is used for training the Posterior Encoder,
flow, text_encoder and duration predictor. The decoder part (vocoder) is
trained with `config.audio.sample_rate`. Defaults to None.
interpolate_z (bool):
If `encoder_sample_rate` is not None and this parameter is True, nearest interpolation
is used to upsample the latent variable z from the sampling rate `encoder_sample_rate`
to `config.audio.sample_rate`. If it is False, you will need to add extra
`upsample_rates_decoder` entries to match the shape. Defaults to True.
""" """
num_chars: int = 100 num_chars: int = 100
@ -475,6 +512,7 @@ class VitsArgs(Coqpit):
upsample_rates_decoder: List[int] = field(default_factory=lambda: [8, 8, 2, 2]) upsample_rates_decoder: List[int] = field(default_factory=lambda: [8, 8, 2, 2])
upsample_initial_channel_decoder: int = 512 upsample_initial_channel_decoder: int = 512
upsample_kernel_sizes_decoder: List[int] = field(default_factory=lambda: [16, 16, 4, 4]) upsample_kernel_sizes_decoder: List[int] = field(default_factory=lambda: [16, 16, 4, 4])
periods_multi_period_discriminator: List[int] = field(default_factory=lambda: [2, 3, 5, 7, 11])
use_sdp: bool = True use_sdp: bool = True
noise_scale: float = 1.0 noise_scale: float = 1.0
inference_noise_scale: float = 0.667 inference_noise_scale: float = 0.667
@ -505,6 +543,10 @@ class VitsArgs(Coqpit):
freeze_PE: bool = False freeze_PE: bool = False
freeze_flow_decoder: bool = False freeze_flow_decoder: bool = False
freeze_waveform_decoder: bool = False freeze_waveform_decoder: bool = False
encoder_sample_rate: int = None
interpolate_z: bool = True
reinit_DP: bool = False
reinit_text_encoder: bool = False
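A hedged sketch of how these new `VitsArgs` fields might be combined, assuming an audio config with `sample_rate=44100` (the values are illustrative, not defaults):

    from TTS.tts.models.vits import VitsArgs

    args = VitsArgs(
        encoder_sample_rate=22050,  # posterior encoder / flow / text encoder / duration predictor rate
        interpolate_z=True,         # upsample z by 44100 / 22050 = 2x with nearest interpolation
        reinit_DP=False,            # optionally re-initialize the duration predictor before fine-tuning
        reinit_text_encoder=False,  # optionally re-initialize the text encoder before fine-tuning
    )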
class Vits(BaseTTS): class Vits(BaseTTS):
@ -548,6 +590,7 @@ class Vits(BaseTTS):
self.init_multispeaker(config) self.init_multispeaker(config)
self.init_multilingual(config) self.init_multilingual(config)
self.init_upsampling()
self.length_scale = self.args.length_scale self.length_scale = self.args.length_scale
self.noise_scale = self.args.noise_scale self.noise_scale = self.args.noise_scale
@ -625,7 +668,10 @@ class Vits(BaseTTS):
) )
if self.args.init_discriminator: if self.args.init_discriminator:
self.disc = VitsDiscriminator(use_spectral_norm=self.args.use_spectral_norm_disriminator) self.disc = VitsDiscriminator(
periods=self.args.periods_multi_period_discriminator,
use_spectral_norm=self.args.use_spectral_norm_disriminator,
)
def init_multispeaker(self, config: Coqpit): def init_multispeaker(self, config: Coqpit):
"""Initialize multi-speaker modules of a model. A model can be trained either with a speaker embedding layer """Initialize multi-speaker modules of a model. A model can be trained either with a speaker embedding layer
@ -707,6 +753,38 @@ class Vits(BaseTTS):
else: else:
self.embedded_language_dim = 0 self.embedded_language_dim = 0
def init_upsampling(self):
"""
Initialize upsampling modules of a model.
"""
if self.args.encoder_sample_rate:
self.interpolate_factor = self.config.audio["sample_rate"] / self.args.encoder_sample_rate
self.audio_resampler = torchaudio.transforms.Resample(
orig_freq=self.config.audio["sample_rate"], new_freq=self.args.encoder_sample_rate
) # pylint: disable=W0201
def on_init_end(self, trainer): # pylint: disable=W0613
"""Reinit layes if needed"""
if self.args.reinit_DP:
before_dict = get_module_weights_sum(self.duration_predictor)
# Applies weights_reset recursively to every submodule of the duration predictor
self.duration_predictor.apply(fn=weights_reset)
after_dict = get_module_weights_sum(self.duration_predictor)
for key, value in after_dict.items():
if value == before_dict[key]:
raise RuntimeError(" [!] The weights of Duration Predictor was not reinit check it !")
print(" > Duration Predictor was reinit.")
if self.args.reinit_text_encoder:
before_dict = get_module_weights_sum(self.text_encoder)
# Applies weights_reset recursively to every submodule of the text encoder
self.text_encoder.apply(fn=weights_reset)
after_dict = get_module_weights_sum(self.text_encoder)
for key, value in after_dict.items():
if value == before_dict[key]:
raise RuntimeError(" [!] The weights of Text Encoder was not reinit check it !")
print(" > Text Encoder was reinit.")
def get_aux_input(self, aux_input: Dict): def get_aux_input(self, aux_input: Dict):
sid, g, lid = self._set_cond_input(aux_input) sid, g, lid = self._set_cond_input(aux_input)
return {"speaker_ids": sid, "style_wav": None, "d_vectors": g, "language_ids": lid} return {"speaker_ids": sid, "style_wav": None, "d_vectors": g, "language_ids": lid}
@ -804,6 +882,23 @@ class Vits(BaseTTS):
outputs["loss_duration"] = loss_duration outputs["loss_duration"] = loss_duration
return outputs, attn return outputs, attn
def upsampling_z(self, z, slice_ids=None, y_lengths=None, y_mask=None):
spec_segment_size = self.spec_segment_size
if self.args.encoder_sample_rate:
# recompute the slices and spec_segment_size if needed
slice_ids = slice_ids * int(self.interpolate_factor) if slice_ids is not None else slice_ids
spec_segment_size = spec_segment_size * int(self.interpolate_factor)
# interpolate z if needed
if self.args.interpolate_z:
z = torch.nn.functional.interpolate(z, scale_factor=[self.interpolate_factor], mode="linear").squeeze(0)
# recompute the mask if needed
if y_lengths is not None and y_mask is not None:
y_mask = (
sequence_mask(y_lengths * self.interpolate_factor, None).to(y_mask.dtype).unsqueeze(1)
) # [B, 1, T_dec_resampled]
return z, spec_segment_size, slice_ids, y_mask
def forward( # pylint: disable=dangerous-default-value def forward( # pylint: disable=dangerous-default-value
self, self,
x: torch.tensor, x: torch.tensor,
@ -878,12 +973,16 @@ class Vits(BaseTTS):
# select a random feature segment for the waveform decoder # select a random feature segment for the waveform decoder
z_slice, slice_ids = rand_segments(z, y_lengths, self.spec_segment_size, let_short_samples=True, pad_short=True) z_slice, slice_ids = rand_segments(z, y_lengths, self.spec_segment_size, let_short_samples=True, pad_short=True)
# interpolate z if needed
z_slice, spec_segment_size, slice_ids, _ = self.upsampling_z(z_slice, slice_ids=slice_ids)
o = self.waveform_decoder(z_slice, g=g) o = self.waveform_decoder(z_slice, g=g)
wav_seg = segment( wav_seg = segment(
waveform, waveform,
slice_ids * self.config.audio.hop_length, slice_ids * self.config.audio.hop_length,
self.args.spec_segment_size * self.config.audio.hop_length, spec_segment_size * self.config.audio.hop_length,
pad_short=True, pad_short=True,
) )
@ -927,6 +1026,7 @@ class Vits(BaseTTS):
return aux_input["x_lengths"] return aux_input["x_lengths"]
return torch.tensor(x.shape[1:2]).to(x.device) return torch.tensor(x.shape[1:2]).to(x.device)
@torch.no_grad()
def inference( def inference(
self, x, aux_input={"x_lengths": None, "d_vectors": None, "speaker_ids": None, "language_ids": None} self, x, aux_input={"x_lengths": None, "d_vectors": None, "speaker_ids": None, "language_ids": None}
): # pylint: disable=dangerous-default-value ): # pylint: disable=dangerous-default-value
@ -989,9 +1089,22 @@ class Vits(BaseTTS):
z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * self.inference_noise_scale z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * self.inference_noise_scale
z = self.flow(z_p, y_mask, g=g, reverse=True) z = self.flow(z_p, y_mask, g=g, reverse=True)
# upsampling if needed
z, _, _, y_mask = self.upsampling_z(z, y_lengths=y_lengths, y_mask=y_mask)
o = self.waveform_decoder((z * y_mask)[:, :, : self.max_inference_len], g=g) o = self.waveform_decoder((z * y_mask)[:, :, : self.max_inference_len], g=g)
outputs = {"model_outputs": o, "alignments": attn.squeeze(1), "z": z, "z_p": z_p, "m_p": m_p, "logs_p": logs_p} outputs = {
"model_outputs": o,
"alignments": attn.squeeze(1),
"durations": w_ceil,
"z": z,
"z_p": z_p,
"m_p": m_p,
"logs_p": logs_p,
"y_mask": y_mask,
}
return outputs return outputs
@torch.no_grad() @torch.no_grad()
@ -1014,7 +1127,7 @@ class Vits(BaseTTS):
self.config.audio.hop_length, self.config.audio.hop_length,
self.config.audio.win_length, self.config.audio.win_length,
center=False, center=False,
).transpose(1, 2) )
y_lengths = torch.tensor([y.size(-1)]).to(y.device) y_lengths = torch.tensor([y.size(-1)]).to(y.device)
speaker_cond_src = reference_speaker_id if reference_speaker_id is not None else reference_d_vector speaker_cond_src = reference_speaker_id if reference_speaker_id is not None else reference_d_vector
speaker_cond_tgt = speaker_id if speaker_id is not None else d_vector speaker_cond_tgt = speaker_id if speaker_id is not None else d_vector
@ -1044,7 +1157,7 @@ class Vits(BaseTTS):
else: else:
raise RuntimeError(" [!] Voice conversion is only supported on multi-speaker models.") raise RuntimeError(" [!] Voice conversion is only supported on multi-speaker models.")
z, _, _, y_mask = self.posterior_encoder(y.transpose(1, 2), y_lengths, g=g_src) z, _, _, y_mask = self.posterior_encoder(y, y_lengths, g=g_src)
z_p = self.flow(z, y_mask, g=g_src) z_p = self.flow(z, y_mask, g=g_src)
z_hat = self.flow(z_p, y_mask, g=g_tgt, reverse=True) z_hat = self.flow(z_p, y_mask, g=g_tgt, reverse=True)
o_hat = self.waveform_decoder(z_hat * y_mask, g=g_tgt) o_hat = self.waveform_decoder(z_hat * y_mask, g=g_tgt)
@ -1064,13 +1177,12 @@ class Vits(BaseTTS):
self._freeze_layers() self._freeze_layers()
mel_lens = batch["mel_lens"] spec_lens = batch["spec_lens"]
if optimizer_idx == 0: if optimizer_idx == 0:
tokens = batch["tokens"] tokens = batch["tokens"]
token_lenghts = batch["token_lens"] token_lenghts = batch["token_lens"]
spec = batch["spec"] spec = batch["spec"]
spec_lens = batch["spec_lens"]
d_vectors = batch["d_vectors"] d_vectors = batch["d_vectors"]
speaker_ids = batch["speaker_ids"] speaker_ids = batch["speaker_ids"]
@ -1108,8 +1220,14 @@ class Vits(BaseTTS):
# compute melspec segment # compute melspec segment
with autocast(enabled=False): with autocast(enabled=False):
if self.args.encoder_sample_rate:
spec_segment_size = self.spec_segment_size * int(self.interpolate_factor)
else:
spec_segment_size = self.spec_segment_size
mel_slice = segment( mel_slice = segment(
mel.float(), self.model_outputs_cache["slice_ids"], self.spec_segment_size, pad_short=True mel.float(), self.model_outputs_cache["slice_ids"], spec_segment_size, pad_short=True
) )
mel_slice_hat = wav_to_mel( mel_slice_hat = wav_to_mel(
y=self.model_outputs_cache["model_outputs"].float(), y=self.model_outputs_cache["model_outputs"].float(),
@ -1137,7 +1255,7 @@ class Vits(BaseTTS):
logs_q=self.model_outputs_cache["logs_q"].float(), logs_q=self.model_outputs_cache["logs_q"].float(),
m_p=self.model_outputs_cache["m_p"].float(), m_p=self.model_outputs_cache["m_p"].float(),
logs_p=self.model_outputs_cache["logs_p"].float(), logs_p=self.model_outputs_cache["logs_p"].float(),
z_len=mel_lens, z_len=spec_lens,
scores_disc_fake=scores_disc_fake, scores_disc_fake=scores_disc_fake,
feats_disc_fake=feats_disc_fake, feats_disc_fake=feats_disc_fake,
feats_disc_real=feats_disc_real, feats_disc_real=feats_disc_real,
@ -1318,21 +1436,48 @@ class Vits(BaseTTS):
"""Compute spectrograms on the device.""" """Compute spectrograms on the device."""
ac = self.config.audio ac = self.config.audio
if self.args.encoder_sample_rate:
wav = self.audio_resampler(batch["waveform"])
else:
wav = batch["waveform"]
# compute spectrograms # compute spectrograms
batch["spec"] = wav_to_spec(batch["waveform"], ac.fft_size, ac.hop_length, ac.win_length, center=False) batch["spec"] = wav_to_spec(wav, ac.fft_size, ac.hop_length, ac.win_length, center=False)
if self.args.encoder_sample_rate:
# recompute the spectrogram at the original (higher) sampling rate for the loss
spec_mel = wav_to_spec(batch["waveform"], ac.fft_size, ac.hop_length, ac.win_length, center=False)
# remove extra stft frames if needed
if spec_mel.size(2) > int(batch["spec"].size(2) * self.interpolate_factor):
spec_mel = spec_mel[:, :, : int(batch["spec"].size(2) * self.interpolate_factor)]
else:
batch["spec"] = batch["spec"][:, :, : int(spec_mel.size(2) / self.interpolate_factor)]
else:
spec_mel = batch["spec"]
batch["mel"] = spec_to_mel( batch["mel"] = spec_to_mel(
spec=batch["spec"], spec=spec_mel,
n_fft=ac.fft_size, n_fft=ac.fft_size,
num_mels=ac.num_mels, num_mels=ac.num_mels,
sample_rate=ac.sample_rate, sample_rate=ac.sample_rate,
fmin=ac.mel_fmin, fmin=ac.mel_fmin,
fmax=ac.mel_fmax, fmax=ac.mel_fmax,
) )
if self.args.encoder_sample_rate:
assert batch["spec"].shape[2] == int(
batch["mel"].shape[2] / self.interpolate_factor
), f"{batch['spec'].shape[2]}, {batch['mel'].shape[2]}"
else:
assert batch["spec"].shape[2] == batch["mel"].shape[2], f"{batch['spec'].shape[2]}, {batch['mel'].shape[2]}" assert batch["spec"].shape[2] == batch["mel"].shape[2], f"{batch['spec'].shape[2]}, {batch['mel'].shape[2]}"
# compute spectrogram frame lengths # compute spectrogram frame lengths
batch["spec_lens"] = (batch["spec"].shape[2] * batch["waveform_rel_lens"]).int() batch["spec_lens"] = (batch["spec"].shape[2] * batch["waveform_rel_lens"]).int()
batch["mel_lens"] = (batch["mel"].shape[2] * batch["waveform_rel_lens"]).int() batch["mel_lens"] = (batch["mel"].shape[2] * batch["waveform_rel_lens"]).int()
if self.args.encoder_sample_rate:
assert (batch["spec_lens"] - (batch["mel_lens"] / self.interpolate_factor).int()).sum() == 0
else:
assert (batch["spec_lens"] - batch["mel_lens"]).sum() == 0 assert (batch["spec_lens"] - batch["mel_lens"]).sum() == 0
# zero the padding frames # zero the padding frames
@ -1355,8 +1500,9 @@ class Vits(BaseTTS):
else: else:
# init dataloader # init dataloader
dataset = VitsDataset( dataset = VitsDataset(
model_args=self.args,
samples=samples, samples=samples,
# batch_group_size=0 if is_eval else config.batch_group_size * config.batch_size, batch_group_size=0 if is_eval else config.batch_group_size * config.batch_size,
min_text_len=config.min_text_len, min_text_len=config.min_text_len,
max_text_len=config.max_text_len, max_text_len=config.max_text_len,
min_audio_len=config.min_audio_len, min_audio_len=config.min_audio_len,
@ -1449,6 +1595,11 @@ class Vits(BaseTTS):
# TODO: consider baking the speaker encoder into the model and call it from there. # TODO: consider baking the speaker encoder into the model and call it from there.
# as it is probably easier for model distribution. # as it is probably easier for model distribution.
state["model"] = {k: v for k, v in state["model"].items() if "speaker_encoder" not in k} state["model"] = {k: v for k, v in state["model"].items() if "speaker_encoder" not in k}
if self.args.encoder_sample_rate is not None and eval:
# the audio resampler is not used at inference time
self.audio_resampler = None
# handle fine-tuning from a checkpoint with additional speakers # handle fine-tuning from a checkpoint with additional speakers
if hasattr(self, "emb_g") and state["model"]["emb_g.weight"].shape != self.emb_g.weight.shape: if hasattr(self, "emb_g") and state["model"]["emb_g.weight"].shape != self.emb_g.weight.shape:
num_new_speakers = self.emb_g.weight.shape[0] - state["model"]["emb_g.weight"].shape[0] num_new_speakers = self.emb_g.weight.shape[0] - state["model"]["emb_g.weight"].shape[0]
@ -1476,9 +1627,17 @@ class Vits(BaseTTS):
from TTS.utils.audio import AudioProcessor from TTS.utils.audio import AudioProcessor
upsample_rate = torch.prod(torch.as_tensor(config.model_args.upsample_rates_decoder)).item() upsample_rate = torch.prod(torch.as_tensor(config.model_args.upsample_rates_decoder)).item()
if not config.model_args.encoder_sample_rate:
assert ( assert (
upsample_rate == config.audio.hop_length upsample_rate == config.audio.hop_length
), f" [!] Product of upsample rates must be equal to the hop length - {upsample_rate} vs {config.audio.hop_length}" ), f" [!] Product of upsample rates must be equal to the hop length - {upsample_rate} vs {config.audio.hop_length}"
else:
encoder_to_vocoder_upsampling_factor = config.audio.sample_rate / config.model_args.encoder_sample_rate
effective_hop_length = config.audio.hop_length * encoder_to_vocoder_upsampling_factor
assert (
upsample_rate == effective_hop_length
), f" [!] Product of upsample rates must be equal to the hop length - {upsample_rate} vs {effective_hop_length}"
ap = AudioProcessor.init_from_config(config, verbose=verbose) ap = AudioProcessor.init_from_config(config, verbose=verbose)
tokenizer, new_config = TTSTokenizer.init_from_config(config) tokenizer, new_config = TTSTokenizer.init_from_config(config)

View File

@ -1,4 +1,7 @@
import bisect
import numpy as np import numpy as np
import torch
def _pad_data(x, length): def _pad_data(x, length):
@ -51,3 +54,26 @@ def prepare_stop_target(inputs, out_steps):
def pad_per_step(inputs, pad_len): def pad_per_step(inputs, pad_len):
return np.pad(inputs, [[0, 0], [0, 0], [0, pad_len]], mode="constant", constant_values=0.0) return np.pad(inputs, [[0, 0], [0, 0], [0, pad_len]], mode="constant", constant_values=0.0)
def get_length_balancer_weights(items: list, num_buckets=10):
# get all durations
audio_lengths = np.array([item["audio_length"] for item in items])
# create $num_buckets bucket classes based on the dataset's max and min lengths
max_length = int(max(audio_lengths))
min_length = int(min(audio_lengths))
step = int((max_length - min_length) / num_buckets) + 1
buckets_classes = [i + step for i in range(min_length, (max_length - step) + num_buckets + 1, step)]
# assign each sample to its length bucket
buckets_names = np.array(
[buckets_classes[bisect.bisect_left(buckets_classes, item["audio_length"])] for item in items]
)
# count and compute the weights_bucket for each sample
unique_buckets_names = np.unique(buckets_names).tolist()
bucket_ids = [unique_buckets_names.index(l) for l in buckets_names]
bucket_count = np.array([len(np.where(buckets_names == l)[0]) for l in unique_buckets_names])
weight_bucket = 1.0 / bucket_count
dataset_samples_weight = np.array([weight_bucket[l] for l in bucket_ids])
# normalize
dataset_samples_weight = dataset_samples_weight / np.linalg.norm(dataset_samples_weight)
return torch.from_numpy(dataset_samples_weight).float()
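A small worked example of the bucketing above; the items are made up and only need the `audio_length` field the function reads:

    items = [{"audio_length": 2.0}, {"audio_length": 2.1}, {"audio_length": 2.2}, {"audio_length": 9.0}]
    weights = get_length_balancer_weights(items, num_buckets=2)
    # the three clips sharing the short bucket each get weight 1/3, the lone long clip gets 1;
    # after L2 normalization the long clip therefore carries the largest sampling weight
    print(weights)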

View File

@ -11,6 +11,28 @@ from TTS.encoder.utils.generic_utils import setup_encoder_model
from TTS.utils.audio import AudioProcessor from TTS.utils.audio import AudioProcessor
def load_file(path: str):
if path.endswith(".json"):
with fsspec.open(path, "r") as f:
return json.load(f)
elif path.endswith(".pth"):
with fsspec.open(path, "rb") as f:
return torch.load(f, map_location="cpu")
else:
raise ValueError("Unsupported file type")
def save_file(obj: Any, path: str):
if path.endswith(".json"):
with fsspec.open(path, "w") as f:
json.dump(obj, f, indent=4)
elif path.endswith(".pth"):
with fsspec.open(path, "wb") as f:
torch.save(obj, f)
else:
raise ValueError("Unsupported file type")
class BaseIDManager: class BaseIDManager:
"""Base `ID` Manager class. Every new `ID` manager must inherit this. """Base `ID` Manager class. Every new `ID` manager must inherit this.
It defines common `ID` manager specific functions. It defines common `ID` manager specific functions.
@ -46,7 +68,7 @@ class BaseIDManager:
Args: Args:
file_path (str): Path to the file. file_path (str): Path to the file.
""" """
self.ids = self._load_json(file_path) self.ids = load_file(file_path)
def save_ids_to_file(self, file_path: str) -> None: def save_ids_to_file(self, file_path: str) -> None:
"""Save IDs to a json file. """Save IDs to a json file.
@ -54,7 +76,7 @@ class BaseIDManager:
Args: Args:
file_path (str): Path to the output file. file_path (str): Path to the output file.
""" """
self._save_json(file_path, self.ids) save_file(self.ids, file_path)
def get_random_id(self) -> Any: def get_random_id(self) -> Any:
"""Get a random embedding. """Get a random embedding.
@ -110,7 +132,7 @@ class EmbeddingManager(BaseIDManager):
self.load_embeddings_from_file(embedding_file_path) self.load_embeddings_from_file(embedding_file_path)
if encoder_model_path and encoder_config_path: if encoder_model_path and encoder_config_path:
self.init_encoder(encoder_model_path, encoder_config_path) self.init_encoder(encoder_model_path, encoder_config_path, use_cuda)
@property @property
def embedding_dim(self): def embedding_dim(self):
@ -125,7 +147,7 @@ class EmbeddingManager(BaseIDManager):
Args: Args:
file_path (str): Path to the output file. file_path (str): Path to the output file.
""" """
self._save_json(file_path, self.embeddings) save_file(self.embeddings, file_path)
def load_embeddings_from_file(self, file_path: str) -> None: def load_embeddings_from_file(self, file_path: str) -> None:
"""Load embeddings from a json file. """Load embeddings from a json file.
@ -133,7 +155,7 @@ class EmbeddingManager(BaseIDManager):
Args: Args:
file_path (str): Path to the target json file. file_path (str): Path to the target json file.
""" """
self.embeddings = self._load_json(file_path) self.embeddings = load_file(file_path)
speakers = sorted({x["name"] for x in self.embeddings.values()}) speakers = sorted({x["name"] for x in self.embeddings.values()})
self.ids = {name: i for i, name in enumerate(speakers)} self.ids = {name: i for i, name in enumerate(speakers)}
@ -216,17 +238,19 @@ class EmbeddingManager(BaseIDManager):
def get_clips(self) -> List: def get_clips(self) -> List:
return sorted(self.embeddings.keys()) return sorted(self.embeddings.keys())
def init_encoder(self, model_path: str, config_path: str) -> None: def init_encoder(self, model_path: str, config_path: str, use_cuda=False) -> None:
"""Initialize a speaker encoder model. """Initialize a speaker encoder model.
Args: Args:
model_path (str): Model file path. model_path (str): Model file path.
config_path (str): Model config file path. config_path (str): Model config file path.
use_cuda (bool, optional): Use CUDA. Defaults to False.
""" """
self.use_cuda = use_cuda
self.encoder_config = load_config(config_path) self.encoder_config = load_config(config_path)
self.encoder = setup_encoder_model(self.encoder_config) self.encoder = setup_encoder_model(self.encoder_config)
self.encoder_criterion = self.encoder.load_checkpoint( self.encoder_criterion = self.encoder.load_checkpoint(
self.encoder_config, model_path, eval=True, use_cuda=self.use_cuda self.encoder_config, model_path, eval=True, use_cuda=use_cuda
) )
self.encoder_ap = AudioProcessor(**self.encoder_config.audio) self.encoder_ap = AudioProcessor(**self.encoder_config.audio)

View File

@ -108,6 +108,7 @@ class SpeakerManager(EmbeddingManager):
) )
if get_from_config_or_model_args_with_default(config, "use_d_vector_file", False): if get_from_config_or_model_args_with_default(config, "use_d_vector_file", False):
speaker_manager = SpeakerManager()
if get_from_config_or_model_args_with_default(config, "speakers_file", None): if get_from_config_or_model_args_with_default(config, "speakers_file", None):
speaker_manager = SpeakerManager( speaker_manager = SpeakerManager(
d_vectors_file_path=get_from_config_or_model_args_with_default(config, "speaker_file", None) d_vectors_file_path=get_from_config_or_model_args_with_default(config, "speaker_file", None)

View File

@ -26,6 +26,7 @@ def run_model_torch(
inputs: torch.Tensor, inputs: torch.Tensor,
speaker_id: int = None, speaker_id: int = None,
style_mel: torch.Tensor = None, style_mel: torch.Tensor = None,
style_text: str = None,
d_vector: torch.Tensor = None, d_vector: torch.Tensor = None,
language_id: torch.Tensor = None, language_id: torch.Tensor = None,
) -> Dict: ) -> Dict:
@ -53,6 +54,7 @@ def run_model_torch(
"speaker_ids": speaker_id, "speaker_ids": speaker_id,
"d_vectors": d_vector, "d_vectors": d_vector,
"style_mel": style_mel, "style_mel": style_mel,
"style_text": style_text,
"language_ids": language_id, "language_ids": language_id,
}, },
) )
@ -115,6 +117,7 @@ def synthesis(
use_cuda, use_cuda,
speaker_id=None, speaker_id=None,
style_wav=None, style_wav=None,
style_text=None,
use_griffin_lim=False, use_griffin_lim=False,
do_trim_silence=False, do_trim_silence=False,
d_vector=None, d_vector=None,
@ -140,7 +143,12 @@ def synthesis(
Speaker ID passed to the speaker embedding layer in multi-speaker model. Defaults to None. Speaker ID passed to the speaker embedding layer in multi-speaker model. Defaults to None.
style_wav (str | Dict[str, float]): style_wav (str | Dict[str, float]):
Path or tensor to/of a waveform used for computing the style embedding. Defaults to None. Path or tensor to/of a waveform used for computing the style embedding based on GST or Capacitron.
Defaults to None, meaning that Capacitron models will sample from the prior distribution to
generate random but realistic prosody.
style_text (str):
Transcription of style_wav for Capacitron models. Defaults to None.
enable_eos_bos_chars (bool): enable_eos_bos_chars (bool):
enable special chars for end of sentence and start of sentence. Defaults to False. enable special chars for end of sentence and start of sentence. Defaults to False.
@ -154,13 +162,19 @@ def synthesis(
language_id (int): language_id (int):
Language ID passed to the language embedding layer in multi-langual model. Defaults to None. Language ID passed to the language embedding layer in multi-langual model. Defaults to None.
""" """
# GST processing # GST or Capacitron processing
# TODO: need to handle the case of setting both gst and capacitron to true somewhere
style_mel = None style_mel = None
if CONFIG.has("gst") and CONFIG.gst and style_wav is not None: if CONFIG.has("gst") and CONFIG.gst and style_wav is not None:
if isinstance(style_wav, dict): if isinstance(style_wav, dict):
style_mel = style_wav style_mel = style_wav
else: else:
style_mel = compute_style_mel(style_wav, model.ap, cuda=use_cuda) style_mel = compute_style_mel(style_wav, model.ap, cuda=use_cuda)
if CONFIG.has("capacitron_vae") and CONFIG.use_capacitron_vae and style_wav is not None:
style_mel = compute_style_mel(style_wav, model.ap, cuda=use_cuda)
style_mel = style_mel.transpose(1, 2) # [1, time, depth]
# convert text to sequence of token IDs # convert text to sequence of token IDs
text_inputs = np.asarray( text_inputs = np.asarray(
model.tokenizer.text_to_ids(text, language=language_id), model.tokenizer.text_to_ids(text, language=language_id),
@ -177,11 +191,28 @@ def synthesis(
language_id = id_to_torch(language_id, cuda=use_cuda) language_id = id_to_torch(language_id, cuda=use_cuda)
if not isinstance(style_mel, dict): if not isinstance(style_mel, dict):
# GST or Capacitron style mel
style_mel = numpy_to_torch(style_mel, torch.float, cuda=use_cuda) style_mel = numpy_to_torch(style_mel, torch.float, cuda=use_cuda)
if style_text is not None:
style_text = np.asarray(
model.tokenizer.text_to_ids(style_text, language=language_id),
dtype=np.int32,
)
style_text = numpy_to_torch(style_text, torch.long, cuda=use_cuda)
style_text = style_text.unsqueeze(0)
text_inputs = numpy_to_torch(text_inputs, torch.long, cuda=use_cuda) text_inputs = numpy_to_torch(text_inputs, torch.long, cuda=use_cuda)
text_inputs = text_inputs.unsqueeze(0) text_inputs = text_inputs.unsqueeze(0)
# synthesize voice # synthesize voice
outputs = run_model_torch(model, text_inputs, speaker_id, style_mel, d_vector=d_vector, language_id=language_id) outputs = run_model_torch(
model,
text_inputs,
speaker_id,
style_mel,
style_text,
d_vector=d_vector,
language_id=language_id,
)
model_outputs = outputs["model_outputs"] model_outputs = outputs["model_outputs"]
model_outputs = model_outputs[0].data.cpu().numpy() model_outputs = model_outputs[0].data.cpu().numpy()
alignments = outputs["alignments"] alignments = outputs["alignments"]
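Based on the extended `synthesis()` signature above, a hedged usage sketch for a Capacitron model; `model`, `config` and the file path are placeholders, and the transcription is only needed when `capacitron_use_text_summary_embeddings` is enabled:

    wav = synthesis(
        model,                                            # a loaded Capacitron Tacotron2 model (placeholder)
        "This is what I want the model to say.",
        config,                                           # its training config (placeholder)
        use_cuda=False,
        style_wav="reference_prosody.wav",                # reference audio for prosody transfer
        style_text="Transcript of the reference audio.",  # Capacitron-only transcription of style_wav
        use_griffin_lim=True,
        do_trim_silence=True,
    )["wav"]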

View File

@ -107,15 +107,6 @@ class ESpeak(BasePhonemizer):
if backend not in ["espeak", "espeak-ng"]: if backend not in ["espeak", "espeak-ng"]:
raise Exception("Unknown backend: %s" % backend) raise Exception("Unknown backend: %s" % backend)
self._ESPEAK_LIB = backend self._ESPEAK_LIB = backend
# skip first two characters of the retuned text
# "_ p_ɹ_ˈaɪ_ɚ t_ə n_oʊ_v_ˈɛ_m_b_ɚ t_w_ˈɛ_n_t_i t_ˈuː\n"
# ^^
self.num_skip_chars = 2
if backend == "espeak-ng":
# skip the first character of the retuned text
# "_p_ɹ_ˈaɪ_ɚ t_ə n_oʊ_v_ˈɛ_m_b_ɚ t_w_ˈɛ_n_t_i t_ˈuː\n"
# ^
self.num_skip_chars = 1
def auto_set_espeak_lib(self) -> None: def auto_set_espeak_lib(self) -> None:
if is_tool("espeak-ng"): if is_tool("espeak-ng"):
@ -163,7 +154,16 @@ class ESpeak(BasePhonemizer):
phonemes = "" phonemes = ""
for line in _espeak_exe(self._ESPEAK_LIB, args, sync=True): for line in _espeak_exe(self._ESPEAK_LIB, args, sync=True):
logging.debug("line: %s", repr(line)) logging.debug("line: %s", repr(line))
phonemes += line.decode("utf8").strip()[self.num_skip_chars :] # skip initial redundant characters ph_decoded = line.decode("utf8").strip()
# espeak needs to skip the first two characters of the returned text:
# version 1.48.03: "_ p_ɹ_ˈaɪ_ɚ t_ə n_oʊ_v_ˈɛ_m_b_ɚ t_w_ˈɛ_n_t_i t_ˈuː\n"
# version 1.48.15: " p_ɹ_ˈaɪ_ɚ t_ə n_oʊ_v_ˈɛ_m_b_ɚ t_w_ˈɛ_n_t_i t_ˈuː\n"
# espeak-ng needs to skip the first character of the returned text:
# "_p_ɹ_ˈaɪ_ɚ t_ə n_oʊ_v_ˈɛ_m_b_ɚ t_w_ˈɛ_n_t_i t_ˈuː\n"
# deal with the conditions described above
ph_decoded = ph_decoded[:1].replace("_", "") + ph_decoded[1:]
phonemes += ph_decoded.strip()
return phonemes.replace("_", separator) return phonemes.replace("_", separator)
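A quick illustration of the leading-character handling above on a made-up phoneme string (not captured espeak output):

    ph_decoded = "_p_ɹ_ˈaɪ_ɚ t_ə"                        # espeak-ng style line with a leading "_"
    ph_decoded = ph_decoded[:1].replace("_", "") + ph_decoded[1:]
    print(ph_decoded.replace("_", "|"))                  # -> p|ɹ|ˈaɪ|ɚ t|ə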
def _phonemize(self, text, separator=None): def _phonemize(self, text, separator=None):

View File

@ -859,7 +859,11 @@ class AudioProcessor(object):
path (str): Path to a output file. path (str): Path to a output file.
sr (int, optional): Sampling rate used for saving to the file. Defaults to None. sr (int, optional): Sampling rate used for saving to the file. Defaults to None.
""" """
if self.do_rms_norm:
wav_norm = self.rms_volume_norm(wav, self.db_level) * 32767
else:
wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav)))) wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav))))
scipy.io.wavfile.write(path, sr if sr else self.sample_rate, wav_norm.astype(np.int16)) scipy.io.wavfile.write(path, sr if sr else self.sample_rate, wav_norm.astype(np.int16))
def get_duration(self, filename: str) -> float: def get_duration(self, filename: str) -> float:

View File

@ -0,0 +1,65 @@
from typing import Generator
from trainer.trainer_utils import get_optimizer
class CapacitronOptimizer:
"""Double optimizer class for the Capacitron model."""
def __init__(self, config: dict, model_params: Generator) -> None:
self.primary_params, self.secondary_params = self.split_model_parameters(model_params)
optimizer_names = list(config.optimizer_params.keys())
optimizer_parameters = list(config.optimizer_params.values())
self.primary_optimizer = get_optimizer(
optimizer_names[0],
optimizer_parameters[0],
config.lr,
parameters=self.primary_params,
)
self.secondary_optimizer = get_optimizer(
optimizer_names[1],
self.extract_optimizer_parameters(optimizer_parameters[1]),
optimizer_parameters[1]["lr"],
parameters=self.secondary_params,
)
self.param_groups = self.primary_optimizer.param_groups
def first_step(self):
self.secondary_optimizer.step()
self.secondary_optimizer.zero_grad()
self.primary_optimizer.zero_grad()
def step(self):
self.primary_optimizer.step()
def zero_grad(self):
self.primary_optimizer.zero_grad()
self.secondary_optimizer.zero_grad()
def load_state_dict(self, state_dict):
self.primary_optimizer.load_state_dict(state_dict[0])
self.secondary_optimizer.load_state_dict(state_dict[1])
def state_dict(self):
return [self.primary_optimizer.state_dict(), self.secondary_optimizer.state_dict()]
@staticmethod
def split_model_parameters(model_params: Generator) -> list:
primary_params = []
secondary_params = []
for name, param in model_params:
if param.requires_grad:
if name == "capacitron_vae_layer.beta":
secondary_params.append(param)
else:
primary_params.append(param)
return [iter(primary_params), iter(secondary_params)]
@staticmethod
def extract_optimizer_parameters(params: dict) -> dict:
"""Extract parameters that are not the learning rate"""
return {k: v for k, v in params.items() if k != "lr"}
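
A hedged, toy-sized sketch of how this wrapper is meant to be driven; the module path `TTS.utils.capacitron_optimizer`, the toy model, and the two-pass loss order are assumptions for illustration, not the exact Trainer flow.

```python
from types import SimpleNamespace

import torch

from TTS.utils.capacitron_optimizer import CapacitronOptimizer  # assumed module path


class TinyCapacitronModel(torch.nn.Module):
    """Toy stand-in exposing the `capacitron_vae_layer.beta` name the splitter looks for."""

    def __init__(self):
        super().__init__()
        self.linear = torch.nn.Linear(4, 4)
        self.capacitron_vae_layer = torch.nn.Module()
        self.capacitron_vae_layer.beta = torch.nn.Parameter(torch.tensor(1.0))


# config mirroring the Capacitron recipes later in this diff
config = SimpleNamespace(
    lr=1e-3,
    optimizer_params={
        "RAdam": {"betas": [0.9, 0.998], "weight_decay": 1e-6},
        "SGD": {"lr": 1e-5, "momentum": 0.9},
    },
)

model = TinyCapacitronModel()
opt = CapacitronOptimizer(config, model.named_parameters())

# pass 1: a loss touching only beta drives the secondary (SGD) optimizer
beta_loss = (model.capacitron_vae_layer.beta - 0.5) ** 2
beta_loss.backward()
opt.first_step()  # steps the secondary optimizer, then zeroes both gradient sets

# pass 2: the main loss drives the primary (RAdam) optimizer
main_loss = model.linear(torch.randn(2, 4)).pow(2).mean()
main_loss.backward()
opt.step()
opt.zero_grad()
```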

View File

@ -106,6 +106,8 @@ def save_model(config, model, optimizer, scaler, current_step, epoch, output_pat
model_state = model.state_dict() model_state = model.state_dict()
if isinstance(optimizer, list): if isinstance(optimizer, list):
optimizer_state = [optim.state_dict() for optim in optimizer] optimizer_state = [optim.state_dict() for optim in optimizer]
elif optimizer.__class__.__name__ == "CapacitronOptimizer":
optimizer_state = [optimizer.primary_optimizer.state_dict(), optimizer.secondary_optimizer.state_dict()]
else: else:
optimizer_state = optimizer.state_dict() if optimizer is not None else None optimizer_state = optimizer.state_dict() if optimizer is not None else None
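
As a hedged illustration of what the new branch stores: the optimizer entry of the checkpoint becomes a two-element list, which is exactly what `CapacitronOptimizer.load_state_dict` consumes. The file name and the "optimizer" key below are assumptions for illustration.

```python
import torch

# placeholder path; "optimizer" is assumed to be the key written by the surrounding checkpoint code
checkpoint = torch.load("checkpoint_10000.pth", map_location="cpu")
primary_state, secondary_state = checkpoint["optimizer"]

# `capacitron_optimizer` stands for an already-built CapacitronOptimizer instance
capacitron_optimizer.load_state_dict([primary_state, secondary_state])
```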

View File

@ -90,6 +90,81 @@ class ModelManager(object):
models_name_list.extend(model_list) models_name_list.extend(model_list)
return models_name_list return models_name_list
def model_info_by_idx(self, model_query):
"""Print the description of the model from .models.json file using model_idx
Args:
model_query (str): <model_type>/<model_idx>
"""
model_name_list = []
model_type, model_query_idx = model_query.split("/")
try:
model_query_idx = int(model_query_idx)
if model_query_idx <= 0:
print("> model_query_idx should be a positive integer!")
return
except ValueError:
print("> model_query_idx should be an integer!")
return
model_count = 0
if model_type in self.models_dict:
for lang in self.models_dict[model_type]:
for dataset in self.models_dict[model_type][lang]:
for model in self.models_dict[model_type][lang][dataset]:
model_name_list.append(f"{model_type}/{lang}/{dataset}/{model}")
model_count += 1
else:
print(f"> model_type {model_type} does not exist in the list.")
return
if model_query_idx > model_count:
print(f"model query idx exceeds the number of available models [{model_count}] ")
else:
model_type, lang, dataset, model = model_name_list[model_query_idx - 1].split("/")
print(f"> model type : {model_type}")
print(f"> language supported : {lang}")
print(f"> dataset used : {dataset}")
print(f"> model name : {model}")
if "description" in self.models_dict[model_type][lang][dataset][model]:
print(f"> description : {self.models_dict[model_type][lang][dataset][model]['description']}")
else:
print("> description : coming soon")
if "default_vocoder" in self.models_dict[model_type][lang][dataset][model]:
print(f"> default_vocoder : {self.models_dict[model_type][lang][dataset][model]['default_vocoder']}")
def model_info_by_full_name(self, model_query_name):
"""Print the description of the model from .models.json file using model_full_name
Args:
model_query_name (str): Format is <model_type>/<language>/<dataset>/<model_name>
"""
model_type, lang, dataset, model = model_query_name.split("/")
if model_type in self.models_dict:
if lang in self.models_dict[model_type]:
if dataset in self.models_dict[model_type][lang]:
if model in self.models_dict[model_type][lang][dataset]:
print(f"> model type : {model_type}")
print(f"> language supported : {lang}")
print(f"> dataset used : {dataset}")
print(f"> model name : {model}")
if "description" in self.models_dict[model_type][lang][dataset][model]:
print(
f"> description : {self.models_dict[model_type][lang][dataset][model]['description']}"
)
else:
print("> description : coming soon")
if "default_vocoder" in self.models_dict[model_type][lang][dataset][model]:
print(
f"> default_vocoder : {self.models_dict[model_type][lang][dataset][model]['default_vocoder']}"
)
else:
print(f"> model {model} does not exist for {model_type}/{lang}/{dataset}.")
else:
print(f"> dataset {dataset} does not exist for {model_type}/{lang}.")
else:
print(f"> lang {lang} does not exist for {model_type}.")
else:
print(f"> model_type {model_type} does not exist in the list.")
def list_tts_models(self): def list_tts_models(self):
"""Print all `TTS` models and return a list of model names """Print all `TTS` models and return a list of model names

View File

@ -1,5 +1,5 @@
import time import time
from typing import List, Union from typing import List
import numpy as np import numpy as np
import pysbd import pysbd
@ -97,10 +97,10 @@ class Synthesizer(object):
"""Load the TTS model. """Load the TTS model.
1. Load the model config. 1. Load the model config.
2. Init the AudioProcessor. 2. Init the model from the config.
3. Init the model from the config. 3. Load the model weights.
4. Move the model to the GPU if CUDA is enabled. 4. Move the model to the GPU if CUDA is enabled.
5. Init the speaker manager for the model. 5. Init the speaker manager in the model.
Args: Args:
tts_checkpoint (str): path to the model checkpoint. tts_checkpoint (str): path to the model checkpoint.
@ -122,7 +122,7 @@ class Synthesizer(object):
self.tts_model.cuda() self.tts_model.cuda()
if self.encoder_checkpoint and hasattr(self.tts_model, "speaker_manager"): if self.encoder_checkpoint and hasattr(self.tts_model, "speaker_manager"):
self.tts_model.speaker_manager.init_encoder(self.encoder_checkpoint, self.encoder_config) self.tts_model.speaker_manager.init_encoder(self.encoder_checkpoint, self.encoder_config, use_cuda)
def _set_speaker_encoder_paths_from_tts_config(self): def _set_speaker_encoder_paths_from_tts_config(self):
"""Set the encoder paths from the tts model config for models with speaker encoders.""" """Set the encoder paths from the tts model config for models with speaker encoders."""
@ -178,8 +178,9 @@ class Synthesizer(object):
text: str = "", text: str = "",
speaker_name: str = "", speaker_name: str = "",
language_name: str = "", language_name: str = "",
speaker_wav: Union[str, List[str]] = None, speaker_wav=None,
style_wav=None, style_wav=None,
style_text=None,
reference_wav=None, reference_wav=None,
reference_speaker_name=None, reference_speaker_name=None,
) -> List[int]: ) -> List[int]:
@ -191,6 +192,7 @@ class Synthesizer(object):
language_name (str, optional): language id for multi-language models. Defaults to "". language_name (str, optional): language id for multi-language models. Defaults to "".
speaker_wav (Union[str, List[str]], optional): path to the speaker wav. Defaults to None. speaker_wav (Union[str, List[str]], optional): path to the speaker wav. Defaults to None.
style_wav ([type], optional): style waveform for GST. Defaults to None. style_wav ([type], optional): style waveform for GST. Defaults to None.
style_text ([type], optional): transcription of style_wav for Capacitron. Defaults to None.
reference_wav ([type], optional): reference waveform for voice conversion. Defaults to None. reference_wav ([type], optional): reference waveform for voice conversion. Defaults to None.
reference_speaker_name ([type], optional): speaker id of reference waveform. Defaults to None. reference_speaker_name ([type], optional): speaker id of reference waveform. Defaults to None.
Returns: Returns:
@ -273,10 +275,11 @@ class Synthesizer(object):
CONFIG=self.tts_config, CONFIG=self.tts_config,
use_cuda=self.use_cuda, use_cuda=self.use_cuda,
speaker_id=speaker_id, speaker_id=speaker_id,
language_id=language_id,
style_wav=style_wav, style_wav=style_wav,
style_text=style_text,
use_griffin_lim=use_gl, use_griffin_lim=use_gl,
d_vector=speaker_embedding, d_vector=speaker_embedding,
language_id=language_id,
) )
waveform = outputs["wav"] waveform = outputs["wav"]
mel_postnet_spec = outputs["outputs"]["model_outputs"][0].detach().cpu().numpy() mel_postnet_spec = outputs["outputs"]["model_outputs"][0].detach().cpu().numpy()
@ -315,7 +318,7 @@ class Synthesizer(object):
# get the speaker embedding or speaker id for the reference wav file # get the speaker embedding or speaker id for the reference wav file
reference_speaker_embedding = None reference_speaker_embedding = None
reference_speaker_id = None reference_speaker_id = None
if self.tts_speakers_file or hasattr(self.tts_model.speaker_manager, "speaker_ids"): if self.tts_speakers_file or hasattr(self.tts_model.speaker_manager, "ids"):
if reference_speaker_name and isinstance(reference_speaker_name, str): if reference_speaker_name and isinstance(reference_speaker_name, str):
if self.tts_config.use_d_vector_file: if self.tts_config.use_d_vector_file:
# get the speaker embedding from the saved d_vectors. # get the speaker embedding from the saved d_vectors.
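
Since `style_text` is new in this signature, here is a short hedged example of how the extended `tts()` call might look for a Capacitron model; the checkpoint, config, and wav paths are placeholders.

```python
from TTS.utils.synthesizer import Synthesizer

# placeholder paths for a Capacitron model you have trained yourself
synthesizer = Synthesizer(
    tts_checkpoint="capacitron_model.pth",
    tts_config_path="capacitron_config.json",
    use_cuda=False,
)

wav = synthesizer.tts(
    text="Hello there, how are you today?",
    style_wav="prosody_reference.wav",             # reference audio for prosody
    style_text="Hello there, how are you today?",  # transcription of style_wav (Capacitron)
)
synthesizer.save_wav(wav, "output.wav")
```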

View File

@ -115,8 +115,8 @@ class GANDataset(Dataset):
audio, mel = self.cache[idx] audio, mel = self.cache[idx]
else: else:
audio = self.ap.load_wav(wavpath) audio = self.ap.load_wav(wavpath)
audio, _ = self._pad_short_samples(audio)
mel = self.ap.melspectrogram(audio) mel = self.ap.melspectrogram(audio)
audio, mel = self._pad_short_samples(audio, mel)
else: else:
# load precomputed features # load precomputed features

View File

@ -90,50 +90,26 @@ class GAN(BaseVocoder):
raise ValueError(" [!] Unexpected `optimizer_idx`.") raise ValueError(" [!] Unexpected `optimizer_idx`.")
if optimizer_idx == 0: if optimizer_idx == 0:
# GENERATOR # DISCRIMINATOR optimization
# generator pass # generator pass
y_hat = self.model_g(x)[:, :, : y.size(2)] y_hat = self.model_g(x)[:, :, : y.size(2)]
self.y_hat_g = y_hat # save for discriminator
y_hat_sub = None # cache for generator loss
y_sub = None # pylint: disable=W0201
self.y_hat_g = y_hat
self.y_hat_sub = None
self.y_sub_g = None
# PQMF formatting # PQMF formatting
if y_hat.shape[1] > 1: if y_hat.shape[1] > 1:
y_hat_sub = y_hat self.y_hat_sub = y_hat
y_hat = self.model_g.pqmf_synthesis(y_hat) y_hat = self.model_g.pqmf_synthesis(y_hat)
self.y_hat_g = y_hat # save for discriminator self.y_hat_g = y_hat # save for generator loss
y_sub = self.model_g.pqmf_analysis(y) self.y_sub_g = self.model_g.pqmf_analysis(y)
scores_fake, feats_fake, feats_real = None, None, None scores_fake, feats_fake, feats_real = None, None, None
if self.train_disc:
if len(signature(self.model_d.forward).parameters) == 2:
D_out_fake = self.model_d(y_hat, x)
else:
D_out_fake = self.model_d(y_hat)
D_out_real = None
if self.config.use_feat_match_loss:
with torch.no_grad():
D_out_real = self.model_d(y)
# format D outputs
if isinstance(D_out_fake, tuple):
scores_fake, feats_fake = D_out_fake
if D_out_real is None:
feats_real = None
else:
_, feats_real = D_out_real
else:
scores_fake = D_out_fake
feats_fake, feats_real = None, None
# compute losses
loss_dict = criterion[optimizer_idx](y_hat, y, scores_fake, feats_fake, feats_real, y_hat_sub, y_sub)
outputs = {"model_outputs": y_hat}
if optimizer_idx == 1:
# DISCRIMINATOR
if self.train_disc: if self.train_disc:
# use different samples for G and D trainings # use different samples for G and D trainings
if self.config.diff_samples_for_G_and_D: if self.config.diff_samples_for_G_and_D:
@ -177,6 +153,36 @@ class GAN(BaseVocoder):
loss_dict = criterion[optimizer_idx](scores_fake, scores_real) loss_dict = criterion[optimizer_idx](scores_fake, scores_real)
outputs = {"model_outputs": y_hat} outputs = {"model_outputs": y_hat}
if optimizer_idx == 1:
# GENERATOR loss
scores_fake, feats_fake, feats_real = None, None, None
if self.train_disc:
if len(signature(self.model_d.forward).parameters) == 2:
D_out_fake = self.model_d(self.y_hat_g, x)
else:
D_out_fake = self.model_d(self.y_hat_g)
D_out_real = None
if self.config.use_feat_match_loss:
with torch.no_grad():
D_out_real = self.model_d(y)
# format D outputs
if isinstance(D_out_fake, tuple):
scores_fake, feats_fake = D_out_fake
if D_out_real is None:
feats_real = None
else:
_, feats_real = D_out_real
else:
scores_fake = D_out_fake
feats_fake, feats_real = None, None
# compute losses
loss_dict = criterion[optimizer_idx](
self.y_hat_g, y, scores_fake, feats_fake, feats_real, self.y_hat_sub, self.y_sub_g
)
outputs = {"model_outputs": self.y_hat_g}
return outputs, loss_dict return outputs, loss_dict
@staticmethod @staticmethod
@ -210,6 +216,7 @@ class GAN(BaseVocoder):
@torch.no_grad() @torch.no_grad()
def eval_step(self, batch: Dict, criterion: nn.Module, optimizer_idx: int) -> Tuple[Dict, Dict]: def eval_step(self, batch: Dict, criterion: nn.Module, optimizer_idx: int) -> Tuple[Dict, Dict]:
"""Call `train_step()` with `no_grad()`""" """Call `train_step()` with `no_grad()`"""
self.train_disc = True # avoid a bug in training caused by the missing discriminator loss
return self.train_step(batch, criterion, optimizer_idx) return self.train_step(batch, criterion, optimizer_idx)
def eval_log( def eval_log(
@ -266,7 +273,7 @@ class GAN(BaseVocoder):
optimizer2 = get_optimizer( optimizer2 = get_optimizer(
self.config.optimizer, self.config.optimizer_params, self.config.lr_disc, self.model_d self.config.optimizer, self.config.optimizer_params, self.config.lr_disc, self.model_d
) )
return [optimizer1, optimizer2] return [optimizer2, optimizer1]
def get_lr(self) -> List: def get_lr(self) -> List:
"""Set the initial learning rates for each optimizer. """Set the initial learning rates for each optimizer.
@ -274,7 +281,7 @@ class GAN(BaseVocoder):
Returns: Returns:
List: learning rates for each optimizer. List: learning rates for each optimizer.
""" """
return [self.config.lr_gen, self.config.lr_disc] return [self.config.lr_disc, self.config.lr_gen]
def get_scheduler(self, optimizer) -> List: def get_scheduler(self, optimizer) -> List:
"""Set the schedulers for each optimizer. """Set the schedulers for each optimizer.
@ -287,7 +294,7 @@ class GAN(BaseVocoder):
""" """
scheduler1 = get_scheduler(self.config.lr_scheduler_gen, self.config.lr_scheduler_gen_params, optimizer[0]) scheduler1 = get_scheduler(self.config.lr_scheduler_gen, self.config.lr_scheduler_gen_params, optimizer[0])
scheduler2 = get_scheduler(self.config.lr_scheduler_disc, self.config.lr_scheduler_disc_params, optimizer[1]) scheduler2 = get_scheduler(self.config.lr_scheduler_disc, self.config.lr_scheduler_disc_params, optimizer[1])
return [scheduler1, scheduler2] return [scheduler2, scheduler1]
@staticmethod @staticmethod
def format_batch(batch: List) -> Dict: def format_batch(batch: List) -> Dict:
@ -359,7 +366,7 @@ class GAN(BaseVocoder):
def get_criterion(self): def get_criterion(self):
"""Return criterions for the optimizers""" """Return criterions for the optimizers"""
return [GeneratorLoss(self.config), DiscriminatorLoss(self.config)] return [DiscriminatorLoss(self.config), GeneratorLoss(self.config)]
@staticmethod @staticmethod
def init_from_config(config: Coqpit, verbose=True) -> "GAN": def init_from_config(config: Coqpit, verbose=True) -> "GAN":
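
To make the reordering above concrete, a minimal hedged sketch of the index convention after this change; `gan` and `batch` are placeholders and the real loop lives in the Trainer, so treat this as illustration only.

```python
optimizers = gan.get_optimizer()  # [discriminator_optimizer, generator_optimizer] after this change
criteria = gan.get_criterion()    # [DiscriminatorLoss(...), GeneratorLoss(...)]

for optimizer_idx, optimizer in enumerate(optimizers):
    # optimizer_idx == 0 -> discriminator update, optimizer_idx == 1 -> generator update
    outputs, loss_dict = gan.train_step(batch, criteria, optimizer_idx)
    loss_dict["loss"].backward()  # the "loss" key is assumed from the criterion return dicts
    optimizer.step()
    optimizer.zero_grad()
```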

View File

@ -59,8 +59,7 @@ We tried to collect common issues and questions we receive about 🐸TTS. It is
6. Train your model. 6. Train your model.
- SingleGPU training: ```CUDA_VISIBLE_DEVICES="0" python train_tts.py --config_path config.json``` - SingleGPU training: ```CUDA_VISIBLE_DEVICES="0" python train_tts.py --config_path config.json```
- MultiGPU training: ```CUDA_VISIBLE_DEVICES="0,1,2" python distribute.py --script train_tts.py --config_path config.json``` - MultiGPU training: ```python3 -m trainer.distribute --gpus "0,1" --script TTS/bin/train_tts.py --config_path config.json```
- This command uses all the GPUs given in ```CUDA_VISIBLE_DEVICES```. If you don't specify, it uses all the GPUs available.
**Note:** You can also train your model using pure 🐍 python. Check ```{eval-rst} :ref: 'tutorial_for_nervous_beginners'```. **Note:** You can also train your model using pure 🐍 python. Check ```{eval-rst} :ref: 'tutorial_for_nervous_beginners'```.

View File

@ -1,6 +1,6 @@
# Installation # Installation
🐸TTS supports python >=3.6 <=3.9 and tested on Ubuntu 18.10, 19.10, 20.10. 🐸TTS supports python >=3.7 <3.11.0 and is tested on Ubuntu 18.10, 19.10, 20.10.
## Using `pip` ## Using `pip`

View File

@ -2,7 +2,7 @@
1. Decide the model you want to use. 1. Decide the model you want to use.
Each model has a different set of pros and cons that define the run-time efficiency and the voice quality. It is up to you to decide what model servers your needs. Other than referring to the papers, one easy way is to test the 🐸TTS Each model has a different set of pros and cons that define the run-time efficiency and the voice quality. It is up to you to decide what model serves your needs. Other than referring to the papers, one easy way is to test the 🐸TTS
community models and see how fast and good each of the models. Or you can start a discussion on our communication channels. community models and see how fast and good each of the models. Or you can start a discussion on our communication channels.
2. Understand the configuration, its fields and values. 2. Understand the configuration, its fields and values.

View File

@ -1,6 +1,5 @@
{ {
"cells": [ "cells": [{
{
"cell_type": "markdown", "cell_type": "markdown",
"metadata": { "metadata": {
"Collapsed": "false" "Collapsed": "false"
@ -37,9 +36,7 @@
"import librosa.display\n", "import librosa.display\n",
"\n", "\n",
"from TTS.tts.layers import *\n", "from TTS.tts.layers import *\n",
"from TTS.utils.audio import AudioProcessor "from TTS.utils.audio import AudioProcessor\n",
\n",
"from TTS.tts.utils.generic_utils import setup_model\n", "from TTS.tts.utils.generic_utils import setup_model\n",
"from TTS.tts.utils.io import load_config\n", "from TTS.tts.utils.io import load_config\n",
"from TTS.tts.utils.text import text_to_sequence\n", "from TTS.tts.utils.text import text_to_sequence\n",

View File

@ -0,0 +1,272 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "45ea3ef5",
"metadata": {
"tags": []
},
"source": [
"# Easy Inferencing with 🐸 TTS ⚡\n",
"\n",
"#### You want to quicly synthesize speech using Coqui 🐸 TTS model?\n",
"\n",
"💡: Grab a pre-trained model and use it to synthesize speech using any speaker voice, including yours! ⚡\n",
"\n",
"🐸 TTS comes with a list of pretrained models and speaker voices. You can even start a local demo server that you can open it on your favorite web browser and 🗣️ .\n",
"\n",
"In this notebook, we will: \n",
"```\n",
"1. List available pre-trained 🐸 TTS models\n",
"2. Run a 🐸 TTS model\n",
"3. Listen to the synthesized wave 📣\n",
"4. Run multispeaker 🐸 TTS model \n",
"```\n",
"So, let's jump right in!\n"
]
},
{
"cell_type": "markdown",
"id": "a1e5c2a5-46eb-42fd-b550-2a052546857e",
"metadata": {},
"source": [
"## Install 🐸 TTS ⬇️"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fa2aec77",
"metadata": {},
"outputs": [],
"source": [
"! pip install -U pip\n",
"! pip install TTS"
]
},
{
"cell_type": "markdown",
"id": "8c07a273",
"metadata": {},
"source": [
"## ✅ List available pre-trained 🐸 TTS models\n",
"\n",
"Coqui 🐸TTS comes with a list of pretrained models for different model types (ex: TTS, vocoder), languages, datasets used for training and architectures. \n",
"\n",
"You can either use your own model or the release models under 🐸TTS.\n",
"\n",
"Use `tts --list_models` to find out the availble models.\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "608d203f",
"metadata": {},
"outputs": [],
"source": [
"! tts --list_models"
]
},
{
"cell_type": "markdown",
"id": "ed9dd7ab",
"metadata": {},
"source": [
"## ✅ Run a 🐸 TTS model\n",
"\n",
"#### **First things first**: Using a release model and default vocoder:\n",
"\n",
"You can simply copy the full model name from the list above and use it \n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cc9e4608-16ec-4dcd-bd6b-bd10d62286f8",
"metadata": {},
"outputs": [],
"source": [
"!tts --text \"hello world\" \\\n",
"--model_name \"tts_models/en/ljspeech/glow-tts\" \\\n",
"--out_path output.wav\n"
]
},
{
"cell_type": "markdown",
"id": "0ca2cb14-1aba-400e-a219-8ce44d9410be",
"metadata": {},
"source": [
"## 📣 Listen to the synthesized wave 📣"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5fe63ef4-9284-4461-9dda-1ca7483a8f9b",
"metadata": {},
"outputs": [],
"source": [
"import IPython\n",
"IPython.display.Audio(\"output.wav\")"
]
},
{
"cell_type": "markdown",
"id": "5e67d178-1ebe-49c7-9a47-0593251bdb96",
"metadata": {},
"source": [
"### **Second things second**:\n",
"\n",
"🔶 A TTS model can be either trained on a single speaker voice or multispeaker voices. This training choice is directly reflected on the inference ability and the available speaker voices that can be used to synthesize speech. \n",
"\n",
"🔶 If you want to run a multispeaker model from the released models list, you can first check the speaker ids using `--list_speaker_idx` flag and use this speaker voice to synthesize speech."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "87b18839-f750-4a61-bbb0-c964acaecab2",
"metadata": {},
"outputs": [],
"source": [
"# list the possible speaker IDs.\n",
"!tts --model_name \"tts_models/en/vctk/vits\" \\\n",
"--list_speaker_idxs \n"
]
},
{
"cell_type": "markdown",
"id": "c4365a9d-f922-4b14-88b0-d2b22a245b2e",
"metadata": {},
"source": [
"## 💬 Synthesize speech using speaker ID 💬"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "52be0403-d13e-4d9b-99c2-c10b85154063",
"metadata": {},
"outputs": [],
"source": [
"!tts --text \"Trying out specific speaker voice\"\\\n",
"--out_path spkr-out.wav --model_name \"tts_models/en/vctk/vits\" \\\n",
"--speaker_idx \"p341\""
]
},
{
"cell_type": "markdown",
"id": "894a560a-f9c8-48ce-aaa6-afdf516c01f6",
"metadata": {},
"source": [
"## 📣 Listen to the synthesized speaker specific wave 📣"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ed485b0a-dfd5-4a7e-a571-ebf74bdfc41d",
"metadata": {},
"outputs": [],
"source": [
"import IPython\n",
"IPython.display.Audio(\"spkr-out.wav\")"
]
},
{
"cell_type": "markdown",
"id": "84636a38-097e-4dad-933b-0aeaee650e92",
"metadata": {},
"source": [
"🔶 If you want to use an external speaker to synthesize speech, you need to supply `--speaker_wav` flag along with an external speaker encoder path and config file, as follows:"
]
},
{
"cell_type": "markdown",
"id": "cbdb15fa-123a-4282-a127-87b50dc70365",
"metadata": {},
"source": [
"First we need to get the speaker encoder model, its config and a referece `speaker_wav`"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e54f1b13-560c-4fed-bafd-e38ec9712359",
"metadata": {},
"outputs": [],
"source": [
"!wget https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json\n",
"!wget https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar\n",
"!wget https://github.com/coqui-ai/TTS/raw/speaker_encoder_model/tests/data/ljspeech/wavs/LJ001-0001.wav"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6dac1912-5054-4a68-8357-6d20fd99cb10",
"metadata": {},
"outputs": [],
"source": [
"!tts --model_name tts_models/multilingual/multi-dataset/your_tts \\\n",
"--encoder_path model_se.pth.tar \\\n",
"--encoder_config config_se.json \\\n",
"--speaker_wav LJ001-0001.wav \\\n",
"--text \"Are we not allowed to dim the lights so people can see that a bit better?\"\\\n",
"--out_path spkr-out.wav \\\n",
"--language_idx \"en\""
]
},
{
"cell_type": "markdown",
"id": "92ddce58-8aca-4f69-84c3-645ae1b12e7d",
"metadata": {},
"source": [
"## 📣 Listen to the synthesized speaker specific wave 📣"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cc889adc-9c71-4232-8e85-bfc8f76476f4",
"metadata": {},
"outputs": [],
"source": [
"import IPython\n",
"IPython.display.Audio(\"spkr-out.wav\")"
]
},
{
"cell_type": "markdown",
"id": "29101d01-0b01-4153-a216-5dae415a5dd6",
"metadata": {},
"source": [
"## 🎉 Congratulations! 🎉 You now know how to use a TTS model to synthesize speech! \n",
"Follow up with the next tutorials to learn more adnavced material."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@ -0,0 +1,454 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "f79d99ef",
"metadata": {},
"source": [
"# Train your first 🐸 TTS model 💫\n",
"\n",
"### 👋 Hello and welcome to Coqui (🐸) TTS\n",
"\n",
"The goal of this notebook is to show you a **typical workflow** for **training** and **testing** a TTS model with 🐸.\n",
"\n",
"Let's train a very small model on a very small amount of data so we can iterate quickly.\n",
"\n",
"In this notebook, we will:\n",
"\n",
"1. Download data and format it for 🐸 TTS.\n",
"2. Configure the training and testing runs.\n",
"3. Train a new model.\n",
"4. Test the model and display its performance.\n",
"\n",
"So, let's jump right in!\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fa2aec78",
"metadata": {},
"outputs": [],
"source": [
"## Install Coqui TTS\n",
"! pip install -U pip\n",
"! pip install TTS"
]
},
{
"cell_type": "markdown",
"id": "be5fe49c",
"metadata": {},
"source": [
"## ✅ Data Preparation\n",
"\n",
"### **First things first**: we need some data.\n",
"\n",
"We're training a Text-to-Speech model, so we need some _text_ and we need some _speech_. Specificially, we want _transcribed speech_. The speech must be divided into audio clips and each clip needs transcription. More details about data requirements such as recording characteristics, background noise abd vocabulary coverage can be found in the [🐸TTS documentation](https://tts.readthedocs.io/en/latest/formatting_your_dataset.html).\n",
"\n",
"If you have a single audio file and you need to **split** it into clips. It is also important to use a lossless audio file format to prevent compression artifacts. We recommend using **wav** file format.\n",
"\n",
"The data format we will be adopting for this tutorial is taken from the widely-used **LJSpeech** dataset, where **waves** are collected under a folder:\n",
"\n",
"<span style=\"color:purple;font-size:15px\">\n",
"/wavs<br /> \n",
" &emsp;| - audio1.wav<br /> \n",
" &emsp;| - audio2.wav<br /> \n",
" &emsp;| - audio3.wav<br /> \n",
" ...<br /> \n",
"</span>\n",
"\n",
"and a **metadata.csv** file will have the audio file name in parallel to the transcript, delimited by `|`: \n",
" \n",
"<span style=\"color:purple;font-size:15px\">\n",
"# metadata.csv <br /> \n",
"audio1|This is my sentence. <br /> \n",
"audio2|This is maybe my sentence. <br /> \n",
"audio3|This is certainly my sentence. <br /> \n",
"audio4|Let this be your sentence. <br /> \n",
"...\n",
"</span>\n",
"\n",
"In the end, we should have the following **folder structure**:\n",
"\n",
"<span style=\"color:purple;font-size:15px\">\n",
"/MyTTSDataset <br /> \n",
"&emsp;| <br /> \n",
"&emsp;| -> metadata.txt<br /> \n",
"&emsp;| -> /wavs<br /> \n",
"&emsp;&emsp;| -> audio1.wav<br /> \n",
"&emsp;&emsp;| -> audio2.wav<br /> \n",
"&emsp;&emsp;| ...<br /> \n",
"</span>"
]
},
{
"cell_type": "markdown",
"id": "69501a10-3b53-4e75-ae66-90221d6f2271",
"metadata": {},
"source": [
"🐸TTS already provides tooling for the _LJSpeech_. if you use the same format, you can start training your models right away. <br /> \n",
"\n",
"After you collect and format your dataset, you need to check two things. Whether you need a **_formatter_** and a **_text_cleaner_**. <br /> The **_formatter_** loads the text file (created above) as a list and the **_text_cleaner_** performs a sequence of text normalization operations that converts the raw text into the spoken representation (e.g. converting numbers to text, acronyms, and symbols to the spoken format).\n",
"\n",
"If you use a different dataset format then the LJSpeech or the other public datasets that 🐸TTS supports, then you need to write your own **_formatter_** and **_text_cleaner_**."
]
},
{
"cell_type": "markdown",
"id": "e7f226c8-4e55-48fa-937b-8415d539b17c",
"metadata": {},
"source": [
"## ⏳️ Loading your dataset\n",
"Load one of the dataset supported by 🐸TTS.\n",
"\n",
"We will start by defining dataset config and setting LJSpeech as our target dataset and define its path.\n"
]
},
{
"cell_type": "code",
"execution_count": 33,
"id": "b3cb0191-b8fc-4158-bd26-8423c2a8ba66",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"# BaseDatasetConfig: defines name, formatter and path of the dataset.\n",
"from TTS.tts.configs.shared_configs import BaseDatasetConfig\n",
"\n",
"output_path = \"tts_train_dir\"\n",
"if not os.path.exists(output_path):\n",
" os.makedirs(output_path)\n",
" "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ae6b7019-3685-4b48-8917-c152e288d7e3",
"metadata": {},
"outputs": [],
"source": [
"# Download and extract LJSpeech dataset.\n",
"\n",
"!wget -O $output_path/LJSpeech-1.1.tar.bz2 https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2 \n",
"!tar -xf $output_path/LJSpeech-1.1.tar.bz2 -C $output_path"
]
},
{
"cell_type": "code",
"execution_count": 31,
"id": "76cd3ab5-6387-45f1-b488-24734cc1beb5",
"metadata": {},
"outputs": [],
"source": [
"dataset_config = BaseDatasetConfig(\n",
" name=\"ljspeech\", meta_file_train=\"metadata.csv\", path=os.path.join(output_path, \"LJSpeech-1.1/\")\n",
")"
]
},
{
"cell_type": "markdown",
"id": "ae82fd75",
"metadata": {},
"source": [
"## ✅ Train a new model\n",
"\n",
"Let's kick off a training run 🚀🚀🚀.\n",
"\n",
"Deciding on the model architecture you'd want to use is based on your needs and available resources. Each model architecture has it's pros and cons that define the run-time efficiency and the voice quality.\n",
"We have many recipes under `TTS/recipes/` that provide a good starting point. For this tutorial, we will be using `GlowTTS`."
]
},
{
"cell_type": "markdown",
"id": "f5876e46-2aee-4bcf-b6b3-9e3c535c553f",
"metadata": {},
"source": [
"We will begin by initializing the model training configuration."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5483ca28-39d6-49f8-a18e-4fb53c50ad84",
"metadata": {},
"outputs": [],
"source": [
"# GlowTTSConfig: all model related values for training, validating and testing.\n",
"from TTS.tts.configs.glow_tts_config import GlowTTSConfig\n",
"config = GlowTTSConfig(\n",
" batch_size=32,\n",
" eval_batch_size=16,\n",
" num_loader_workers=4,\n",
" num_eval_loader_workers=4,\n",
" run_eval=True,\n",
" test_delay_epochs=-1,\n",
" epochs=100,\n",
" text_cleaner=\"phoneme_cleaners\",\n",
" use_phonemes=True,\n",
" phoneme_language=\"en-us\",\n",
" phoneme_cache_path=os.path.join(output_path, \"phoneme_cache\"),\n",
" print_step=25,\n",
" print_eval=False,\n",
" mixed_precision=True,\n",
" output_path=output_path,\n",
" datasets=[dataset_config],\n",
" save_step=1000,\n",
")"
]
},
{
"cell_type": "markdown",
"id": "b93ed377-80b7-447b-bd92-106bffa777ee",
"metadata": {},
"source": [
"Next we will initialize the audio processor which is used for feature extraction and audio I/O."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b1b12f61-f851-4565-84dd-7640947e04ab",
"metadata": {},
"outputs": [],
"source": [
"from TTS.utils.audio import AudioProcessor\n",
"ap = AudioProcessor.init_from_config(config)"
]
},
{
"cell_type": "markdown",
"id": "1d461683-b05e-403f-815f-8007bda08c38",
"metadata": {},
"source": [
"Next we will initialize the tokenizer which is used to convert text to sequences of token IDs. If characters are not defined in the config, default characters are passed to the config."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "014879b7-f18d-44c0-b24a-e10f8002113a",
"metadata": {},
"outputs": [],
"source": [
"from TTS.tts.utils.text.tokenizer import TTSTokenizer\n",
"tokenizer, config = TTSTokenizer.init_from_config(config)"
]
},
{
"cell_type": "markdown",
"id": "df3016e1-9e99-4c4f-94e3-fa89231fd978",
"metadata": {},
"source": [
"Next we will load data samples. Each sample is a list of ```[text, audio_file_path, speaker_name]```. You can define your custom sample loader returning the list of samples."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cadd6ada-c8eb-4f79-b8fe-6d72850af5a7",
"metadata": {},
"outputs": [],
"source": [
"from TTS.tts.datasets import load_tts_samples\n",
"train_samples, eval_samples = load_tts_samples(\n",
" dataset_config,\n",
" eval_split=True,\n",
" eval_split_max_size=config.eval_split_max_size,\n",
" eval_split_size=config.eval_split_size,\n",
")"
]
},
{
"cell_type": "markdown",
"id": "db8b451e-1fe1-4aa3-b69e-ab22b925bd19",
"metadata": {},
"source": [
"Now we're ready to initialize the model.\n",
"\n",
"Models take a config object and a speaker manager as input. Config defines the details of the model like the number of layers, the size of the embedding, etc. Speaker manager is used by multi-speaker models."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ac2ffe3e-ad0c-443e-800c-9b076ee811b4",
"metadata": {},
"outputs": [],
"source": [
"from TTS.tts.models.glow_tts import GlowTTS\n",
"model = GlowTTS(config, ap, tokenizer, speaker_manager=None)"
]
},
{
"cell_type": "markdown",
"id": "e2832c56-889d-49a6-95b6-eb231892ecc6",
"metadata": {},
"source": [
"Trainer provides a generic API to train all the 🐸TTS models with all its perks like mixed-precision training, distributed training, etc."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0f609945-4fe0-4d0d-b95e-11d7bfb63ebe",
"metadata": {},
"outputs": [],
"source": [
"from trainer import Trainer, TrainerArgs\n",
"trainer = Trainer(\n",
" TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples\n",
")"
]
},
{
"cell_type": "markdown",
"id": "5b320831-dd83-429b-bb6a-473f9d49d321",
"metadata": {},
"source": [
"### AND... 3,2,1... START TRAINING 🚀🚀🚀"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d4c07f99-3d1d-4bea-801e-9f33bbff0e9f",
"metadata": {},
"outputs": [],
"source": [
"trainer.fit()"
]
},
{
"cell_type": "markdown",
"id": "4cff0c40-2734-40a6-a905-e945a9fb3e98",
"metadata": {},
"source": [
"#### 🚀 Run the Tensorboard. 🚀\n",
"On the notebook and Tensorboard, you can monitor the progress of your model. Also Tensorboard provides certain figures and sample outputs."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5a85cd3b-1646-40ad-a6c2-49323e08eeec",
"metadata": {},
"outputs": [],
"source": [
"!pip install tensorboard\n",
"!tensorboard --logdir=tts_train_dir"
]
},
{
"cell_type": "markdown",
"id": "9f6dc959",
"metadata": {},
"source": [
"## ✅ Test the model\n",
"\n",
"We made it! 🙌\n",
"\n",
"Let's kick off the testing run, which displays performance metrics.\n",
"\n",
"We're committing the cardinal sin of ML 😈 (aka - testing on our training data) so you don't want to deploy this model into production. In this notebook we're focusing on the workflow itself, so it's forgivable 😇\n",
"\n",
"You can see from the test output that our tiny model has overfit to the data, and basically memorized this one sentence.\n",
"\n",
"When you start training your own models, make sure your testing data doesn't include your training data 😅"
]
},
{
"cell_type": "markdown",
"id": "99fada7a-592f-4a09-9369-e6f3d82de3a0",
"metadata": {},
"source": [
"Let's get the latest saved checkpoint. "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6dd47ed5-da8e-4bf9-b524-d686630d6961",
"metadata": {},
"outputs": [],
"source": [
"import glob, os\n",
"output_path = \"tts_train_dir\"\n",
"ckpts = sorted([f for f in glob.glob(output_path+\"/*/*.pth\")])\n",
"configs = sorted([f for f in glob.glob(output_path+\"/*/*.json\")])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "dd42bc7a",
"metadata": {},
"outputs": [],
"source": [
" !tts --text \"Text for TTS\" \\\n",
" --model_path $test_ckpt \\\n",
" --config_path $test_config \\\n",
" --out_path out.wav"
]
},
{
"cell_type": "markdown",
"id": "81cbcb3f-d952-469b-a0d8-8941cd7af670",
"metadata": {},
"source": [
"## 📣 Listen to the synthesized wave 📣"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e0000bd6-6763-4a10-a74d-911dd08ebcff",
"metadata": {},
"outputs": [],
"source": [
"import IPython\n",
"IPython.display.Audio(\"out.wav\")"
]
},
{
"cell_type": "markdown",
"id": "13914401-cad1-494a-b701-474e52829138",
"metadata": {},
"source": [
"## 🎉 Congratulations! 🎉 You now have trained your first TTS model! \n",
"Follow up with the next tutorials to learn more advanced material."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "950d9fc6-896f-4a2c-86fd-8fd1fcbbb3f7",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@ -1,5 +1,5 @@
[build-system] [build-system]
requires = ["setuptools", "wheel", "Cython", "numpy==1.19.5"] requires = ["setuptools", "wheel", "cython==0.29.28", "numpy==1.21.6"]
[flake8] [flake8]
max-line-length=120 max-line-length=120

View File

@ -0,0 +1,12 @@
# How to get the Blizzard 2013 Dataset
The Capacitron model is a variational encoder extension of standard Tacotron based models to model prosody.
To take full advantage of the model, it is advised to train the model with a dataset that contains a significant amount of prosodic information in the utterances. A tested candidate for such applications is the blizzard2013 dataset from the Blizzard Challenge, containing many hours of high quality audio book recordings.
To get a license and download link for this dataset, you need to visit the [website](https://www.cstr.ed.ac.uk/projects/blizzard/2013/lessac_blizzard2013/license.html) of the Centre for Speech Technology Research of the University of Edinburgh.
You will get access to the raw dataset within a couple of days. There are a few preprocessing steps you need to complete before you can use the high fidelity dataset.
1. Get the forced time alignments for the blizzard dataset from [here](https://github.com/mueller91/tts_alignments).
2. Segment the high fidelity audio-book files based on the instructions [here](https://github.com/Tomiinek/Blizzard2013_Segmentation).

View File

@ -0,0 +1,101 @@
import os
from trainer import Trainer, TrainerArgs
from TTS.config.shared_configs import BaseAudioConfig
from TTS.tts.configs.shared_configs import BaseDatasetConfig, CapacitronVAEConfig
from TTS.tts.configs.tacotron_config import TacotronConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.tacotron import Tacotron
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor
output_path = os.path.dirname(os.path.abspath(__file__))
data_path = "/srv/data/"
# Using LJSpeech like dataset processing for the blizzard dataset
dataset_config = BaseDatasetConfig(name="ljspeech", meta_file_train="metadata.csv", path=data_path)
audio_config = BaseAudioConfig(
sample_rate=24000,
do_trim_silence=True,
trim_db=60.0,
signal_norm=True,
mel_fmin=80.0,
mel_fmax=12000,
spec_gain=20.0,
log_func="np.log10",
ref_level_db=20,
preemphasis=0.0,
min_level_db=-100,
)
# Using the standard Capacitron config
capacitron_config = CapacitronVAEConfig(capacitron_VAE_loss_alpha=1.0)
config = TacotronConfig(
run_name="Blizzard-Capacitron-T1",
audio=audio_config,
capacitron_vae=capacitron_config,
use_capacitron_vae=True,
batch_size=128, # Tune this to your gpu
max_audio_len=6 * 24000, # Tune this to your gpu
min_audio_len=0.5 * 24000,
eval_batch_size=16,
num_loader_workers=12,
num_eval_loader_workers=8,
precompute_num_workers=24,
run_eval=True,
test_delay_epochs=5,
ga_alpha=0.0,
r=2,
optimizer="CapacitronOptimizer",
optimizer_params={"RAdam": {"betas": [0.9, 0.998], "weight_decay": 1e-6}, "SGD": {"lr": 1e-5, "momentum": 0.9}},
attention_type="graves",
attention_heads=5,
epochs=1000,
text_cleaner="phoneme_cleaners",
use_phonemes=True,
phoneme_language="en-us",
phonemizer="espeak",
phoneme_cache_path=os.path.join(data_path, "phoneme_cache"),
stopnet_pos_weight=15,
print_step=50,
print_eval=True,
mixed_precision=False,
output_path=output_path,
datasets=[dataset_config],
lr=1e-3,
lr_scheduler="StepwiseGradualLR",
lr_scheduler_params={"gradual_learning_rates": [[0, 1e-3], [2e4, 5e-4], [4e5, 3e-4], [6e4, 1e-4], [8e4, 5e-5]]},
scheduler_after_epoch=False, # scheduler doesn't work without this flag
# Need to experiment with these below for capacitron
loss_masking=False,
decoder_loss_alpha=1.0,
postnet_loss_alpha=1.0,
postnet_diff_spec_alpha=0.0,
decoder_diff_spec_alpha=0.0,
decoder_ssim_alpha=0.0,
postnet_ssim_alpha=0.0,
)
ap = AudioProcessor(**config.audio.to_dict())
tokenizer, config = TTSTokenizer.init_from_config(config)
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
model = Tacotron(config, ap, tokenizer, speaker_manager=None)
trainer = Trainer(
TrainerArgs(),
config,
output_path,
model=model,
train_samples=train_samples,
eval_samples=eval_samples,
)
# 🚀
trainer.fit()

View File

@ -0,0 +1,117 @@
import os
from trainer import Trainer, TrainerArgs
from TTS.config.shared_configs import BaseAudioConfig
from TTS.tts.configs.shared_configs import BaseDatasetConfig, CapacitronVAEConfig
from TTS.tts.configs.tacotron2_config import Tacotron2Config
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.tacotron2 import Tacotron2
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor
output_path = os.path.dirname(os.path.abspath(__file__))
data_path = "/srv/data/blizzard2013/segmented"
# Using LJSpeech like dataset processing for the blizzard dataset
dataset_config = BaseDatasetConfig(
name="ljspeech",
meta_file_train="metadata.csv",
path=data_path,
)
audio_config = BaseAudioConfig(
sample_rate=24000,
do_trim_silence=True,
trim_db=60.0,
signal_norm=True,
mel_fmin=80.0,
mel_fmax=12000,
spec_gain=25.0,
log_func="np.log10",
ref_level_db=20,
preemphasis=0.0,
min_level_db=-100,
)
# Using the standard Capacitron config
capacitron_config = CapacitronVAEConfig(capacitron_VAE_loss_alpha=1.0)
config = Tacotron2Config(
run_name="Blizzard-Capacitron-T2",
audio=audio_config,
capacitron_vae=capacitron_config,
use_capacitron_vae=True,
batch_size=246, # Tune this to your gpu
max_audio_len=6 * 24000, # Tune this to your gpu
min_audio_len=1 * 24000,
eval_batch_size=16,
num_loader_workers=12,
num_eval_loader_workers=8,
precompute_num_workers=24,
run_eval=True,
test_delay_epochs=5,
ga_alpha=0.0,
r=2,
optimizer="CapacitronOptimizer",
optimizer_params={"RAdam": {"betas": [0.9, 0.998], "weight_decay": 1e-6}, "SGD": {"lr": 1e-5, "momentum": 0.9}},
attention_type="dynamic_convolution",
grad_clip=0.0, # Important! We overwrite the standard grad_clip with capacitron_grad_clip
double_decoder_consistency=False,
epochs=1000,
text_cleaner="phoneme_cleaners",
use_phonemes=True,
phoneme_language="en-us",
phonemizer="espeak",
phoneme_cache_path=os.path.join(data_path, "phoneme_cache"),
stopnet_pos_weight=15,
print_step=25,
print_eval=True,
mixed_precision=False,
output_path=output_path,
datasets=[dataset_config],
lr=1e-3,
lr_scheduler="StepwiseGradualLR",
lr_scheduler_params={
"gradual_learning_rates": [
[0, 1e-3],
[2e4, 5e-4],
[4e5, 3e-4],
[6e4, 1e-4],
[8e4, 5e-5],
]
},
scheduler_after_epoch=False, # scheduler doesn't work without this flag
# dashboard_logger='wandb',
# sort_by_audio_len=True,
seq_len_norm=True,
# Need to experiment with these below for capacitron
loss_masking=False,
decoder_loss_alpha=1.0,
postnet_loss_alpha=1.0,
postnet_diff_spec_alpha=0.0,
decoder_diff_spec_alpha=0.0,
decoder_ssim_alpha=0.0,
postnet_ssim_alpha=0.0,
)
ap = AudioProcessor(**config.audio.to_dict())
tokenizer, config = TTSTokenizer.init_from_config(config)
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
model = Tacotron2(config, ap, tokenizer, speaker_manager=None)
trainer = Trainer(
TrainerArgs(),
config,
output_path,
model=model,
train_samples=train_samples,
eval_samples=eval_samples,
training_assets={"audio_processor": ap},
)
trainer.fit()

View File

@ -0,0 +1,115 @@
import os
from trainer import Trainer, TrainerArgs
from TTS.config.shared_configs import BaseAudioConfig
from TTS.tts.configs.shared_configs import BaseDatasetConfig, CapacitronVAEConfig
from TTS.tts.configs.tacotron2_config import Tacotron2Config
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.tacotron2 import Tacotron2
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor
output_path = os.path.dirname(os.path.abspath(__file__))
data_path = "/srv/data/"
# Using LJSpeech like dataset processing for the blizzard dataset
dataset_config = BaseDatasetConfig(
name="ljspeech",
meta_file_train="metadata.csv",
path=data_path,
)
audio_config = BaseAudioConfig(
sample_rate=22050,
do_trim_silence=True,
trim_db=60.0,
signal_norm=False,
mel_fmin=0.0,
mel_fmax=11025,
spec_gain=1.0,
log_func="np.log",
ref_level_db=20,
preemphasis=0.0,
)
# Using the standard Capacitron config
capacitron_config = CapacitronVAEConfig(capacitron_VAE_loss_alpha=1.0, capacitron_capacity=50)
config = Tacotron2Config(
run_name="Capacitron-Tacotron2",
audio=audio_config,
capacitron_vae=capacitron_config,
use_capacitron_vae=True,
batch_size=128, # Tune this to your gpu
max_audio_len=8 * 22050, # Tune this to your gpu
min_audio_len=1 * 22050,
eval_batch_size=16,
num_loader_workers=8,
num_eval_loader_workers=8,
precompute_num_workers=24,
run_eval=True,
test_delay_epochs=25,
ga_alpha=0.0,
r=2,
optimizer="CapacitronOptimizer",
optimizer_params={"RAdam": {"betas": [0.9, 0.998], "weight_decay": 1e-6}, "SGD": {"lr": 1e-5, "momentum": 0.9}},
attention_type="dynamic_convolution",
grad_clip=0.0, # Important! We overwrite the standard grad_clip with capacitron_grad_clip
double_decoder_consistency=False,
epochs=1000,
text_cleaner="phoneme_cleaners",
use_phonemes=True,
phoneme_language="en-us",
phonemizer="espeak",
phoneme_cache_path=os.path.join(data_path, "phoneme_cache"),
stopnet_pos_weight=15,
print_step=25,
print_eval=True,
mixed_precision=False,
sort_by_audio_len=True,
seq_len_norm=True,
output_path=output_path,
datasets=[dataset_config],
lr=1e-3,
lr_scheduler="StepwiseGradualLR",
lr_scheduler_params={
"gradual_learning_rates": [
[0, 1e-3],
[2e4, 5e-4],
[4e5, 3e-4],
[6e4, 1e-4],
[8e4, 5e-5],
]
},
scheduler_after_epoch=False, # scheduler doesn't work without this flag
# Need to experiment with these below for capacitron
loss_masking=False,
decoder_loss_alpha=1.0,
postnet_loss_alpha=1.0,
postnet_diff_spec_alpha=0.0,
decoder_diff_spec_alpha=0.0,
decoder_ssim_alpha=0.0,
postnet_ssim_alpha=0.0,
)
ap = AudioProcessor(**config.audio.to_dict())
tokenizer, config = TTSTokenizer.init_from_config(config)
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
model = Tacotron2(config, ap, tokenizer, speaker_manager=None)
trainer = Trainer(
TrainerArgs(),
config,
output_path,
model=model,
train_samples=train_samples,
eval_samples=eval_samples,
training_assets={"audio_processor": ap},
)
trainer.fit()

View File

@ -0,0 +1,15 @@
# 🐸💬 TTS Thorsten Recipes
For running the recipes you need the [Thorsten-Voice](https://github.com/thorstenMueller/Thorsten-Voice) dataset.
You can download it manually from [the official website](https://www.thorsten-voice.de/), use ```download_thorsten_de.sh```, or simply run any of the **train_modelX.py** scripts, which will download the dataset if it is not already present.
Then, go to your desired model folder and run the training.
Run the Python training script (choose the desired GPU ID for your run and set ```CUDA_VISIBLE_DEVICES```):
```terminal
CUDA_VISIBLE_DEVICES="0" python train_modelX.py
```
💡 Note that these runs are just templates to help you start training your first model. They are not optimized for the best
result. Double-check the configurations and feel free to share your experiments to find better parameters together 💪.

View File

@ -0,0 +1,84 @@
import os
from trainer import Trainer, TrainerArgs
from TTS.tts.configs.align_tts_config import AlignTTSConfig
from TTS.tts.configs.shared_configs import BaseDatasetConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.align_tts import AlignTTS
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor
from TTS.utils.downloaders import download_thorsten_de
output_path = os.path.dirname(os.path.abspath(__file__))
# init configs
dataset_config = BaseDatasetConfig(
name="thorsten", meta_file_train="metadata.csv", path=os.path.join(output_path, "../thorsten-de/")
)
# download dataset if not already present
if not os.path.exists(dataset_config.path):
print("Downloading dataset")
download_thorsten_de(os.path.split(os.path.abspath(dataset_config.path))[0])
config = AlignTTSConfig(
batch_size=32,
eval_batch_size=16,
num_loader_workers=4,
num_eval_loader_workers=4,
run_eval=True,
test_delay_epochs=-1,
epochs=1000,
text_cleaner="phoneme_cleaners",
use_phonemes=False,
phoneme_language="de",
phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
print_step=25,
print_eval=True,
mixed_precision=False,
test_sentences=[
"Es hat mich viel Zeit gekostet ein Stimme zu entwickeln, jetzt wo ich sie habe werde ich nicht mehr schweigen.",
"Sei eine Stimme, kein Echo.",
"Es tut mir Leid David. Das kann ich leider nicht machen.",
"Dieser Kuchen ist großartig. Er ist so lecker und feucht.",
"Vor dem 22. November 1963.",
],
output_path=output_path,
datasets=[dataset_config],
)
# INITIALIZE THE AUDIO PROCESSOR
# Audio processor is used for feature extraction and audio I/O.
# It mainly serves to the dataloader and the training loggers.
ap = AudioProcessor.init_from_config(config)
# INITIALIZE THE TOKENIZER
# Tokenizer is used to convert text to sequences of token IDs.
# If characters are not defined in the config, default characters are passed to the config
tokenizer, config = TTSTokenizer.init_from_config(config)
# LOAD DATA SAMPLES
# Each sample is a list of ```[text, audio_file_path, speaker_name]```
# You can define your custom sample loader returning the list of samples.
# Or define your custom formatter and pass it to the `load_tts_samples`.
# Check `TTS.tts.datasets.load_tts_samples` for more details.
train_samples, eval_samples = load_tts_samples(
dataset_config,
eval_split=True,
eval_split_max_size=config.eval_split_max_size,
eval_split_size=config.eval_split_size,
)
# init model
model = AlignTTS(config, ap, tokenizer)
# INITIALIZE THE TRAINER
# Trainer provides a generic API to train all the 🐸TTS models with all its perks like mixed-precision training,
# distributed training, etc.
trainer = Trainer(
TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
)
# AND... 3,2,1... 🚀
trainer.fit()

View File

@ -0,0 +1,21 @@
# create venv
python3 -m venv env
source env/bin/activate
pip install pip --upgrade
# download Thorsten_DE dataset
pip install gdown
gdown --id 1yKJM1LAOQpRVojKunD9r8WN_p5KzBxjc -O dataset.tgz
tar -xzf dataset.tgz
# create train-val splits
shuf LJSpeech-1.1/metadata.csv > LJSpeech-1.1/metadata_shuf.csv
head -n 20668 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_train.csv
tail -n 2000 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_val.csv
# rename dataset and remove archive
mv LJSpeech-1.1 thorsten-de
rm dataset.tgz
# destroy venv
rm -rf env

View File

@ -0,0 +1,97 @@
import os
# Trainer: Where the ✨️ happens.
# TrainingArgs: Defines the set of arguments of the Trainer.
from trainer import Trainer, TrainerArgs
# GlowTTSConfig: all model related values for training, validating and testing.
from TTS.tts.configs.glow_tts_config import GlowTTSConfig
# BaseDatasetConfig: defines name, formatter and path of the dataset.
from TTS.tts.configs.shared_configs import BaseDatasetConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.glow_tts import GlowTTS
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor
from TTS.utils.downloaders import download_thorsten_de
# we use the same path as this script as our training folder.
output_path = os.path.dirname(os.path.abspath(__file__))
# DEFINE DATASET CONFIG
# Set LJSpeech as our target dataset and define its path.
# You can also use a simple Dict to define the dataset and pass it to your custom formatter.
dataset_config = BaseDatasetConfig(
name="thorsten", meta_file_train="metadata.csv", path=os.path.join(output_path, "../thorsten-de/")
)
# download dataset if not already present
if not os.path.exists(dataset_config.path):
print("Downloading dataset")
download_thorsten_de(os.path.split(os.path.abspath(dataset_config.path))[0])
# INITIALIZE THE TRAINING CONFIGURATION
# Configure the model. Every config class inherits the BaseTTSConfig.
config = GlowTTSConfig(
batch_size=32,
eval_batch_size=16,
num_loader_workers=4,
num_eval_loader_workers=4,
run_eval=True,
test_delay_epochs=-1,
epochs=1000,
text_cleaner="phoneme_cleaners",
use_phonemes=True,
phoneme_language="de",
phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
print_step=25,
print_eval=False,
mixed_precision=True,
test_sentences=[
"Es hat mich viel Zeit gekostet ein Stimme zu entwickeln, jetzt wo ich sie habe werde ich nicht mehr schweigen.",
"Sei eine Stimme, kein Echo.",
"Es tut mir Leid David. Das kann ich leider nicht machen.",
"Dieser Kuchen ist großartig. Er ist so lecker und feucht.",
"Vor dem 22. November 1963.",
],
output_path=output_path,
datasets=[dataset_config],
)
# INITIALIZE THE AUDIO PROCESSOR
# Audio processor is used for feature extraction and audio I/O.
# It mainly serves to the dataloader and the training loggers.
ap = AudioProcessor.init_from_config(config)
# INITIALIZE THE TOKENIZER
# Tokenizer is used to convert text to sequences of token IDs.
# If characters are not defined in the config, default characters are passed to the config
tokenizer, config = TTSTokenizer.init_from_config(config)
# LOAD DATA SAMPLES
# Each sample is a list of ```[text, audio_file_path, speaker_name]```
# You can define your custom sample loader returning the list of samples.
# Or define your custom formatter and pass it to the `load_tts_samples`.
# Check `TTS.tts.datasets.load_tts_samples` for more details.
train_samples, eval_samples = load_tts_samples(
dataset_config,
eval_split=True,
eval_split_max_size=config.eval_split_max_size,
eval_split_size=config.eval_split_size,
)
# INITIALIZE THE MODEL
# Models take a config object and a speaker manager as input
# Config defines the details of the model like the number of layers, the size of the embedding, etc.
# Speaker manager is used by multi-speaker models.
model = GlowTTS(config, ap, tokenizer, speaker_manager=None)
# INITIALIZE THE TRAINER
# Trainer provides a generic API to train all the 🐸TTS models with all its perks like mixed-precision training,
# distributed training, etc.
trainer = Trainer(
TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
)
# AND... 3,2,1... 🚀
trainer.fit()

View File

@ -0,0 +1,53 @@
import os
from trainer import Trainer, TrainerArgs
from TTS.utils.audio import AudioProcessor
from TTS.utils.downloaders import download_thorsten_de
from TTS.vocoder.configs import HifiganConfig
from TTS.vocoder.datasets.preprocess import load_wav_data
from TTS.vocoder.models.gan import GAN
output_path = os.path.dirname(os.path.abspath(__file__))
config = HifiganConfig(
batch_size=32,
eval_batch_size=16,
num_loader_workers=4,
num_eval_loader_workers=4,
run_eval=True,
test_delay_epochs=5,
epochs=1000,
seq_len=8192,
pad_short=2000,
use_noise_augment=True,
eval_split_size=10,
print_step=25,
print_eval=False,
mixed_precision=False,
lr_gen=1e-4,
lr_disc=1e-4,
data_path=os.path.join(output_path, "../thorsten-de/wavs/"),
output_path=output_path,
)
# download dataset if not already present
if not os.path.exists(config.data_path):
print("Downloading dataset")
download_path = os.path.abspath(os.path.join(os.path.abspath(config.data_path), "../../"))
download_thorsten_de(download_path)
# init audio processor
ap = AudioProcessor(**config.audio.to_dict())
# load training samples
eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size)
# init model
model = GAN(config, ap)
# init the trainer and 🚀
trainer = Trainer(
TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
)
trainer.fit()

View File

@ -0,0 +1,53 @@
import os
from trainer import Trainer, TrainerArgs
from TTS.utils.audio import AudioProcessor
from TTS.utils.downloaders import download_thorsten_de
from TTS.vocoder.configs import MultibandMelganConfig
from TTS.vocoder.datasets.preprocess import load_wav_data
from TTS.vocoder.models.gan import GAN
output_path = os.path.dirname(os.path.abspath(__file__))
config = MultibandMelganConfig(
batch_size=32,
eval_batch_size=16,
num_loader_workers=4,
num_eval_loader_workers=4,
run_eval=True,
test_delay_epochs=5,
epochs=1000,
seq_len=8192,
pad_short=2000,
use_noise_augment=True,
eval_split_size=10,
print_step=25,
print_eval=False,
mixed_precision=False,
lr_gen=1e-4,
lr_disc=1e-4,
data_path=os.path.join(output_path, "../thorsten-de/wavs/"),
output_path=output_path,
)
# download dataset if not already present
if not os.path.exists(config.data_path):
print("Downloading dataset")
download_path = os.path.abspath(os.path.join(os.path.abspath(config.data_path), "../../"))
download_thorsten_de(download_path)
# init audio processor
ap = AudioProcessor(**config.audio.to_dict())
# load training samples
eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size)
# init model
model = GAN(config, ap)
# init the trainer and 🚀
trainer = Trainer(
TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
)
trainer.fit()

View File

@ -0,0 +1,102 @@
import os
from trainer import Trainer, TrainerArgs
from TTS.config import BaseAudioConfig, BaseDatasetConfig
from TTS.tts.configs.speedy_speech_config import SpeedySpeechConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.forward_tts import ForwardTTS
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor
from TTS.utils.downloaders import download_thorsten_de
output_path = os.path.dirname(os.path.abspath(__file__))
dataset_config = BaseDatasetConfig(
name="thorsten", meta_file_train="metadata.csv", path=os.path.join(output_path, "../thorsten-de/")
)
# download dataset if not already present
if not os.path.exists(dataset_config.path):
print("Downloading dataset")
download_thorsten_de(os.path.split(os.path.abspath(dataset_config.path))[0])
audio_config = BaseAudioConfig(
sample_rate=22050,
do_trim_silence=True,
trim_db=60.0,
signal_norm=False,
mel_fmin=0.0,
mel_fmax=8000,
spec_gain=1.0,
log_func="np.log",
ref_level_db=20,
preemphasis=0.0,
)
config = SpeedySpeechConfig(
run_name="speedy_speech_thorsten-de",
audio=audio_config,
batch_size=32,
eval_batch_size=16,
num_loader_workers=4,
num_eval_loader_workers=4,
compute_input_seq_cache=True,
run_eval=True,
test_delay_epochs=-1,
epochs=1000,
min_audio_len=11050,  # min_audio_len must be raised to avoid a SpeedySpeech error (see the note after this recipe)
text_cleaner="phoneme_cleaners",
use_phonemes=True,
phoneme_language="de",
phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
precompute_num_workers=4,
print_step=50,
print_eval=False,
mixed_precision=False,
test_sentences=[
"Es hat mich viel Zeit gekostet ein Stimme zu entwickeln, jetzt wo ich sie habe werde ich nicht mehr schweigen.",
"Sei eine Stimme, kein Echo.",
"Es tut mir Leid David. Das kann ich leider nicht machen.",
"Dieser Kuchen ist großartig. Er ist so lecker und feucht.",
"Vor dem 22. November 1963.",
],
sort_by_audio_len=True,
max_seq_len=500000,
output_path=output_path,
datasets=[dataset_config],
)
# INITIALIZE THE AUDIO PROCESSOR
# Audio processor is used for feature extraction and audio I/O.
# It mainly serves the dataloader and the training loggers.
ap = AudioProcessor.init_from_config(config)
# INITIALIZE THE TOKENIZER
# Tokenizer is used to convert text to sequences of token IDs.
# If characters are not defined in the config, default characters are added to the config.
tokenizer, config = TTSTokenizer.init_from_config(config)
# LOAD DATA SAMPLES
# Each sample is a list of `[text, audio_file_path, speaker_name]`.
# You can define a custom sample loader that returns the list of samples,
# or define a custom formatter and pass it to `load_tts_samples`.
# Check `TTS.tts.datasets.load_tts_samples` for more details.
train_samples, eval_samples = load_tts_samples(
dataset_config,
eval_split=True,
eval_split_max_size=config.eval_split_max_size,
eval_split_size=config.eval_split_size,
)
# init model
model = ForwardTTS(config, ap, tokenizer)
# INITIALIZE THE TRAINER
# Trainer provides a generic API to train all 🐸TTS models, with perks like mixed-precision training,
# distributed training, etc.
trainer = Trainer(
TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
)
# AND... 3,2,1... 🚀
trainer.fit()
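For orientation, `min_audio_len` above is measured in audio samples, so at the dataset's 22050 Hz sample rate it corresponds to roughly half a second of audio; a tiny illustration (not part of the recipe):
def samples_to_seconds(num_samples, sample_rate=22050):
    # Convert a length in audio samples to seconds at the given sample rate.
    return num_samples / sample_rate
print(samples_to_seconds(11050))  # ~0.50 s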

View File

@ -0,0 +1,108 @@
import os
from trainer import Trainer, TrainerArgs
from TTS.config.shared_configs import BaseAudioConfig
from TTS.tts.configs.shared_configs import BaseDatasetConfig
from TTS.tts.configs.tacotron2_config import Tacotron2Config
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.tacotron2 import Tacotron2
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor
from TTS.utils.downloaders import download_thorsten_de
# from TTS.tts.datasets.tokenizer import Tokenizer
output_path = os.path.dirname(os.path.abspath(__file__))
# init configs
dataset_config = BaseDatasetConfig(
name="thorsten", meta_file_train="metadata.csv", path=os.path.join(output_path, "../thorsten-de/")
)
# download dataset if not already present
if not os.path.exists(dataset_config.path):
print("Downloading dataset")
download_thorsten_de(os.path.split(os.path.abspath(dataset_config.path))[0])
audio_config = BaseAudioConfig(
sample_rate=22050,
do_trim_silence=True,
trim_db=60.0,
signal_norm=False,
mel_fmin=0.0,
mel_fmax=8000,
spec_gain=1.0,
log_func="np.log",
ref_level_db=20,
preemphasis=0.0,
)
config = Tacotron2Config(  # This is the config that is saved for future use
audio=audio_config,
batch_size=40, # BS of 40 and max length of 10s will use about 20GB of GPU memory
eval_batch_size=16,
num_loader_workers=4,
num_eval_loader_workers=4,
run_eval=True,
test_delay_epochs=-1,
r=6,
gradual_training=[[0, 6, 64], [10000, 4, 32], [50000, 3, 32], [100000, 2, 32]],
double_decoder_consistency=True,
epochs=1000,
text_cleaner="phoneme_cleaners",
use_phonemes=True,
phoneme_language="de",
phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
precompute_num_workers=8,
print_step=25,
print_eval=True,
mixed_precision=False,
test_sentences=[
"Es hat mich viel Zeit gekostet ein Stimme zu entwickeln, jetzt wo ich sie habe werde ich nicht mehr schweigen.",
"Sei eine Stimme, kein Echo.",
"Es tut mir Leid David. Das kann ich leider nicht machen.",
"Dieser Kuchen ist großartig. Er ist so lecker und feucht.",
"Vor dem 22. November 1963.",
],
# max audio length of 10 seconds; feel free to increase it if you have more than 20GB of GPU memory
max_audio_len=22050 * 10,
output_path=output_path,
datasets=[dataset_config],
)
# INITIALIZE THE AUDIO PROCESSOR
# Audio processor is used for feature extraction and audio I/O.
# It mainly serves the dataloader and the training loggers.
ap = AudioProcessor.init_from_config(config)
# INITIALIZE THE TOKENIZER
# Tokenizer is used to convert text to sequences of token IDs.
# If characters are not defined in the config, default characters are added to the config.
tokenizer, config = TTSTokenizer.init_from_config(config)
# LOAD DATA SAMPLES
# Each sample is a list of `[text, audio_file_path, speaker_name]`.
# You can define a custom sample loader that returns the list of samples,
# or define a custom formatter and pass it to `load_tts_samples`.
# Check `TTS.tts.datasets.load_tts_samples` for more details.
train_samples, eval_samples = load_tts_samples(
dataset_config,
eval_split=True,
eval_split_max_size=config.eval_split_max_size,
eval_split_size=config.eval_split_size,
)
# INITIALIZE THE MODEL
# Models take a config object and a speaker manager as input
# Config defines the details of the model like the number of layers, the size of the embedding, etc.
# Speaker manager is used by multi-speaker models.
model = Tacotron2(config, ap, tokenizer, speaker_manager=None)
# init the trainer and 🚀
trainer = Trainer(
TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
)
trainer.fit()
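The `gradual_training` schedule in the config above is a list of `[start_step, r, batch_size]` entries; the helper below is only an illustration of how such a schedule resolves to an active `(r, batch_size)` pair for a given global step, not the trainer's implementation:
def resolve_gradual_training(schedule, global_step):
    # The last entry whose start_step has been reached wins.
    r, batch_size = schedule[0][1], schedule[0][2]
    for start_step, sched_r, sched_bs in schedule:
        if global_step >= start_step:
            r, batch_size = sched_r, sched_bs
    return r, batch_size
schedule = [[0, 6, 64], [10000, 4, 32], [50000, 3, 32], [100000, 2, 32]]
print(resolve_gradual_training(schedule, 20000))  # (4, 32)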

View File

@ -0,0 +1,52 @@
import os
from trainer import Trainer, TrainerArgs
from TTS.utils.audio import AudioProcessor
from TTS.utils.downloaders import download_thorsten_de
from TTS.vocoder.configs import UnivnetConfig
from TTS.vocoder.datasets.preprocess import load_wav_data
from TTS.vocoder.models.gan import GAN
output_path = os.path.dirname(os.path.abspath(__file__))
config = UnivnetConfig(
batch_size=64,
eval_batch_size=16,
num_loader_workers=4,
num_eval_loader_workers=4,
run_eval=True,
test_delay_epochs=-1,
epochs=1000,
seq_len=8192,
pad_short=2000,
use_noise_augment=True,
eval_split_size=10,
print_step=25,
print_eval=False,
mixed_precision=False,
lr_gen=1e-4,
lr_disc=1e-4,
data_path=os.path.join(output_path, "../thorsten-de/wavs/"),
output_path=output_path,
)
# download dataset if not already present
if not os.path.exists(config.data_path):
print("Downloading dataset")
download_path = os.path.abspath(os.path.join(os.path.abspath(config.data_path), "../../"))
download_thorsten_de(download_path)
# init audio processor
ap = AudioProcessor(**config.audio.to_dict())
# load training samples
eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size)
# init model
model = GAN(config, ap)
# init the trainer and 🚀
trainer = Trainer(
TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
)
trainer.fit()

View File

@ -0,0 +1,105 @@
import os
from trainer import Trainer, TrainerArgs
from TTS.config.shared_configs import BaseAudioConfig
from TTS.tts.configs.shared_configs import BaseDatasetConfig
from TTS.tts.configs.vits_config import VitsConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.vits import Vits
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor
from TTS.utils.downloaders import download_thorsten_de
output_path = os.path.dirname(os.path.abspath(__file__))
dataset_config = BaseDatasetConfig(
name="thorsten", meta_file_train="metadata.csv", path=os.path.join(output_path, "../thorsten-de/")
)
# download dataset if not already present
if not os.path.exists(dataset_config.path):
print("Downloading dataset")
download_thorsten_de(os.path.split(os.path.abspath(dataset_config.path))[0])
audio_config = BaseAudioConfig(
sample_rate=22050,
win_length=1024,
hop_length=256,
num_mels=80,
preemphasis=0.0,
ref_level_db=20,
log_func="np.log",
do_trim_silence=True,
trim_db=45,
mel_fmin=0,
mel_fmax=None,
spec_gain=1.0,
signal_norm=False,
do_amp_to_db_linear=False,
)
config = VitsConfig(
audio=audio_config,
run_name="vits_thorsten-de",
batch_size=32,
eval_batch_size=16,
batch_group_size=5,
num_loader_workers=0,
num_eval_loader_workers=4,
run_eval=True,
test_delay_epochs=-1,
epochs=1000,
text_cleaner="phoneme_cleaners",
use_phonemes=True,
phoneme_language="de",
phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
compute_input_seq_cache=True,
print_step=25,
print_eval=True,
mixed_precision=True,
test_sentences=[
"Es hat mich viel Zeit gekostet ein Stimme zu entwickeln, jetzt wo ich sie habe werde ich nicht mehr schweigen.",
"Sei eine Stimme, kein Echo.",
"Es tut mir Leid David. Das kann ich leider nicht machen.",
"Dieser Kuchen ist großartig. Er ist so lecker und feucht.",
"Vor dem 22. November 1963.",
],
output_path=output_path,
datasets=[dataset_config],
)
# INITIALIZE THE AUDIO PROCESSOR
# Audio processor is used for feature extraction and audio I/O.
# It mainly serves the dataloader and the training loggers.
ap = AudioProcessor.init_from_config(config)
# INITIALIZE THE TOKENIZER
# Tokenizer is used to convert text to sequences of token IDs.
# config is updated with the default characters if not defined in the config.
tokenizer, config = TTSTokenizer.init_from_config(config)
# LOAD DATA SAMPLES
# Each sample is a list of `[text, audio_file_path, speaker_name]`.
# You can define a custom sample loader that returns the list of samples,
# or define a custom formatter and pass it to `load_tts_samples`.
# Check `TTS.tts.datasets.load_tts_samples` for more details.
train_samples, eval_samples = load_tts_samples(
dataset_config,
eval_split=True,
eval_split_max_size=config.eval_split_max_size,
eval_split_size=config.eval_split_size,
)
# init model
model = Vits(config, ap, tokenizer, speaker_manager=None)
# init the trainer and 🚀
trainer = Trainer(
TrainerArgs(),
config,
output_path,
model=model,
train_samples=train_samples,
eval_samples=eval_samples,
)
trainer.fit()

View File

@ -0,0 +1,56 @@
import os
from trainer import Trainer, TrainerArgs
from TTS.utils.audio import AudioProcessor
from TTS.utils.downloaders import download_thorsten_de
from TTS.vocoder.configs import WavegradConfig
from TTS.vocoder.datasets.preprocess import load_wav_data
from TTS.vocoder.models.wavegrad import Wavegrad
output_path = os.path.dirname(os.path.abspath(__file__))
config = WavegradConfig(
batch_size=32,
eval_batch_size=16,
num_loader_workers=4,
num_eval_loader_workers=4,
run_eval=True,
test_delay_epochs=-1,
epochs=1000,
seq_len=6144,
pad_short=2000,
use_noise_augment=True,
eval_split_size=50,
print_step=50,
print_eval=True,
mixed_precision=False,
data_path=os.path.join(output_path, "../thorsten-de/wavs/"),
output_path=output_path,
)
# download dataset if not already present
if not os.path.exists(config.data_path):
print("Downloading dataset")
download_path = os.path.abspath(os.path.join(os.path.abspath(config.data_path), "../../"))
download_thorsten_de(download_path)
# init audio processor
ap = AudioProcessor(**config.audio.to_dict())
# load training samples
eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size)
# init model
model = Wavegrad(config)
# init the trainer and 🚀
trainer = Trainer(
TrainerArgs(),
config,
output_path,
model=model,
train_samples=train_samples,
eval_samples=eval_samples,
training_assets={"audio_processor": ap},
)
trainer.fit()

View File

@ -0,0 +1,58 @@
import os
from trainer import Trainer, TrainerArgs
from TTS.utils.audio import AudioProcessor
from TTS.utils.downloaders import download_thorsten_de
from TTS.vocoder.configs import WavernnConfig
from TTS.vocoder.datasets.preprocess import load_wav_data
from TTS.vocoder.models.wavernn import Wavernn
output_path = os.path.dirname(os.path.abspath(__file__))
config = WavernnConfig(
batch_size=64,
eval_batch_size=16,
num_loader_workers=4,
num_eval_loader_workers=4,
run_eval=True,
test_delay_epochs=-1,
epochs=10000,
seq_len=1280,
pad_short=2000,
use_noise_augment=False,
eval_split_size=10,
print_step=25,
print_eval=True,
mixed_precision=False,
lr=1e-4,
grad_clip=4,
data_path=os.path.join(output_path, "../thorsten-de/wavs/"),
output_path=output_path,
)
# download dataset if not already present
if not os.path.exists(config.data_path):
print("Downloading dataset")
download_path = os.path.abspath(os.path.join(os.path.abspath(config.data_path), "../../"))
download_thorsten_de(download_path)
# init audio processor
ap = AudioProcessor(**config.audio.to_dict())
# load training samples
eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size)
# init model
model = Wavernn(config)
# init the trainer and 🚀
trainer = Trainer(
TrainerArgs(),
config,
output_path,
model=model,
train_samples=train_samples,
eval_samples=eval_samples,
training_assets={"audio_processor": ap},
)
trainer.fit()

View File

@ -1,5 +1,5 @@
black black
coverage coverage
isort isort
nose nose2
pylint==2.10.2 pylint==2.10.2

View File

@ -1,12 +1,12 @@
# core deps # core deps
numpy==1.19.5 numpy==1.21.6
cython cython==0.29.28
scipy>=1.4.0 scipy>=1.4.0
torch>=1.7 torch>=1.7
torchaudio torchaudio
soundfile soundfile
librosa==0.8.0 librosa==0.8.0
numba==0.53 numba==0.55.1
inflect inflect
tqdm tqdm
anyascii anyascii
@ -21,16 +21,16 @@ umap-learn==0.5.1
pandas pandas
# deps for training # deps for training
matplotlib matplotlib
tensorboardX pyworld==0.2.10 # > 0.2.10 is not p3.10.x compatible
pyworld
# coqui stack # coqui stack
trainer trainer
coqpit # config management # config management
coqpit>=0.0.16
# chinese g2p deps # chinese g2p deps
jieba jieba
pypinyin pypinyin
# japanese g2p deps # japanese g2p deps
mecab-python3==1.0.3 mecab-python3==1.0.5
unidic-lite==1.0.8 unidic-lite==1.0.8
# gruut+supported langs # gruut+supported langs
gruut[cs,de,es,fr,it,nl,pt,ru,sv]==2.2.3 gruut[cs,de,es,fr,it,nl,pt,ru,sv]==2.2.3

View File

@ -31,8 +31,8 @@ import setuptools.command.develop
from Cython.Build import cythonize from Cython.Build import cythonize
from setuptools import Extension, find_packages, setup from setuptools import Extension, find_packages, setup
if LooseVersion(sys.version) < LooseVersion("3.6") or LooseVersion(sys.version) > LooseVersion("3.10"): if LooseVersion(sys.version) < LooseVersion("3.7") or LooseVersion(sys.version) >= LooseVersion("3.11"):
raise RuntimeError("TTS requires python >= 3.6 and <=3.10 " "but your Python version is {}".format(sys.version)) raise RuntimeError("TTS requires python >= 3.7 and < 3.11 " "but your Python version is {}".format(sys.version))
cwd = os.path.dirname(os.path.abspath(__file__)) cwd = os.path.dirname(os.path.abspath(__file__))
@ -113,15 +113,15 @@ setup(
"dev": requirements_dev, "dev": requirements_dev,
"notebooks": requirements_notebooks, "notebooks": requirements_notebooks,
}, },
python_requires=">=3.6.0, <3.10", python_requires=">=3.7.0, <3.11",
entry_points={"console_scripts": ["tts=TTS.bin.synthesize:main", "tts-server = TTS.server.server:main"]}, entry_points={"console_scripts": ["tts=TTS.bin.synthesize:main", "tts-server = TTS.server.server:main"]},
classifiers=[ classifiers=[
"Programming Language :: Python", "Programming Language :: Python",
"Programming Language :: Python :: 3", "Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.6",
"Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.7",
"Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Development Status :: 3 - Alpha", "Development Status :: 3 - Alpha",
"Intended Audience :: Science/Research", "Intended Audience :: Science/Research",
"Intended Audience :: Developers", "Intended Audience :: Developers",

View File

@ -16,6 +16,7 @@ encoder_model_path = os.path.join(get_tests_input_path(), "checkpoint_0.pth")
sample_wav_path = os.path.join(get_tests_input_path(), "../data/ljspeech/wavs/LJ001-0001.wav") sample_wav_path = os.path.join(get_tests_input_path(), "../data/ljspeech/wavs/LJ001-0001.wav")
sample_wav_path2 = os.path.join(get_tests_input_path(), "../data/ljspeech/wavs/LJ001-0002.wav") sample_wav_path2 = os.path.join(get_tests_input_path(), "../data/ljspeech/wavs/LJ001-0002.wav")
d_vectors_file_path = os.path.join(get_tests_input_path(), "../data/dummy_speakers.json") d_vectors_file_path = os.path.join(get_tests_input_path(), "../data/dummy_speakers.json")
d_vectors_file_pth_path = os.path.join(get_tests_input_path(), "../data/dummy_speakers.pth")
class SpeakerManagerTest(unittest.TestCase): class SpeakerManagerTest(unittest.TestCase):
@ -58,12 +59,13 @@ class SpeakerManagerTest(unittest.TestCase):
# remove dummy model # remove dummy model
os.remove(encoder_model_path) os.remove(encoder_model_path)
@staticmethod def test_speakers_file_processing(self):
def test_speakers_file_processing():
manager = SpeakerManager(d_vectors_file_path=d_vectors_file_path) manager = SpeakerManager(d_vectors_file_path=d_vectors_file_path)
print(manager.num_speakers) self.assertEqual(manager.num_speakers, 1)
print(manager.embedding_dim) self.assertEqual(manager.embedding_dim, 256)
print(manager.clip_ids) manager = SpeakerManager(d_vectors_file_path=d_vectors_file_pth_path)
self.assertEqual(manager.num_speakers, 1)
self.assertEqual(manager.embedding_dim, 256)
d_vector = manager.get_embedding_by_clip(manager.clip_ids[0]) d_vector = manager.get_embedding_by_clip(manager.clip_ids[0])
assert len(d_vector) == 256 assert len(d_vector) == 256
d_vectors = manager.get_embeddings_by_name(manager.speaker_names[0]) d_vectors = manager.get_embeddings_by_name(manager.speaker_names[0])
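For context, the assertions above imply a d-vector file with a single speaker and 256-dimensional embeddings keyed by clip id; a rough sketch of such a layout is shown below. The key names and the speaker name are assumptions and should be checked against the actual `dummy_speakers` fixtures.
# Hedged sketch of a d-vector mapping consistent with the test's assertions.
dummy_d_vectors = {
    "LJ001-0001": {"name": "ljspeech", "embedding": [0.0] * 256},  # names are hypothetical
    "LJ001-0002": {"name": "ljspeech", "embedding": [0.1] * 256},
}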

Binary file not shown.

View File

@ -6,7 +6,7 @@ import numpy as np
import torch import torch
from torch.utils.data import DataLoader from torch.utils.data import DataLoader
from tests import get_tests_output_path from tests import get_tests_data_path, get_tests_output_path
from TTS.tts.configs.shared_configs import BaseDatasetConfig, BaseTTSConfig from TTS.tts.configs.shared_configs import BaseDatasetConfig, BaseTTSConfig
from TTS.tts.datasets import TTSDataset, load_tts_samples from TTS.tts.datasets import TTSDataset, load_tts_samples
from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.tts.utils.text.tokenizer import TTSTokenizer
@ -20,7 +20,7 @@ os.makedirs(OUTPATH, exist_ok=True)
# create a dummy config for testing data loaders. # create a dummy config for testing data loaders.
c = BaseTTSConfig(text_cleaner="english_cleaners", num_loader_workers=0, batch_size=2, use_noise_augment=False) c = BaseTTSConfig(text_cleaner="english_cleaners", num_loader_workers=0, batch_size=2, use_noise_augment=False)
c.r = 5 c.r = 5
c.data_path = "tests/data/ljspeech/" c.data_path = os.path.join(get_tests_data_path(), "ljspeech/")
ok_ljspeech = os.path.exists(c.data_path) ok_ljspeech = os.path.exists(c.data_path)
dataset_config = BaseDatasetConfig( dataset_config = BaseDatasetConfig(

View File

@ -1,4 +1,5 @@
import functools import functools
import random
import unittest import unittest
import torch import torch
@ -6,6 +7,7 @@ import torch
from TTS.config.shared_configs import BaseDatasetConfig from TTS.config.shared_configs import BaseDatasetConfig
from TTS.encoder.utils.samplers import PerfectBatchSampler from TTS.encoder.utils.samplers import PerfectBatchSampler
from TTS.tts.datasets import load_tts_samples from TTS.tts.datasets import load_tts_samples
from TTS.tts.utils.data import get_length_balancer_weights
from TTS.tts.utils.languages import get_language_balancer_weights from TTS.tts.utils.languages import get_language_balancer_weights
from TTS.tts.utils.speakers import get_speaker_balancer_weights from TTS.tts.utils.speakers import get_speaker_balancer_weights
@ -136,3 +138,28 @@ class TestSamplers(unittest.TestCase):
else: else:
spk2 += 1 spk2 += 1
assert spk1 == spk2, "PerfectBatchSampler is supposed to be perfectly balanced" assert spk1 == spk2, "PerfectBatchSampler is supposed to be perfectly balanced"
def test_length_weighted_random_sampler(self): # pylint: disable=no-self-use
for _ in range(1000):
# generate a length-unbalanced dataset with random max/min audio lengths
min_audio = random.randrange(1, 22050)
max_audio = random.randrange(44100, 220500)
for idx, item in enumerate(train_samples):
# increase the diversity of durations
random_increase = random.randrange(100, 1000)
if idx < 5:
item["audio_length"] = min_audio + random_increase
else:
item["audio_length"] = max_audio + random_increase
weighted_sampler = torch.utils.data.sampler.WeightedRandomSampler(
get_length_balancer_weights(train_samples, num_buckets=2), len(train_samples)
)
ids = functools.reduce(lambda a, b: a + b, [list(weighted_sampler) for i in range(100)])
len1, len2 = 0, 0
for index in ids:
if train_samples[index]["audio_length"] < max_audio:
len1 += 1
else:
len2 += 1
assert is_balanced(len1, len2), "Length Weighted sampler is supposed to be balanced"
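Conceptually, `get_length_balancer_weights` buckets samples by `audio_length` and weights each sample by the inverse frequency of its bucket, so the `WeightedRandomSampler` draws short and long clips about equally often. The function below is a rough sketch of that idea, not the library implementation:
import numpy as np
def length_balancer_weights_sketch(samples, num_buckets=2):
    # Assign each sample to a length bucket and weight it by 1 / bucket frequency.
    lengths = np.array([s["audio_length"] for s in samples], dtype=np.float64)
    edges = np.linspace(lengths.min(), lengths.max(), num_buckets + 1)
    bucket_ids = np.clip(np.digitize(lengths, edges[1:-1]), 0, num_buckets - 1)
    counts = np.bincount(bucket_ids, minlength=num_buckets)
    return 1.0 / counts[bucket_ids]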

View File

@ -6,7 +6,7 @@ import torch
from torch import nn, optim from torch import nn, optim
from tests import get_tests_input_path from tests import get_tests_input_path
from TTS.tts.configs.shared_configs import GSTConfig from TTS.tts.configs.shared_configs import CapacitronVAEConfig, GSTConfig
from TTS.tts.configs.tacotron2_config import Tacotron2Config from TTS.tts.configs.tacotron2_config import Tacotron2Config
from TTS.tts.layers.losses import MSELossMasked from TTS.tts.layers.losses import MSELossMasked
from TTS.tts.models.tacotron2 import Tacotron2 from TTS.tts.models.tacotron2 import Tacotron2
@ -260,6 +260,73 @@ class TacotronGSTTrainTest(unittest.TestCase):
count += 1 count += 1
class TacotronCapacitronTrainTest(unittest.TestCase):
@staticmethod
def test_train_step():
config = Tacotron2Config(
num_chars=32,
num_speakers=10,
use_speaker_embedding=True,
out_channels=80,
decoder_output_dim=80,
use_capacitron_vae=True,
capacitron_vae=CapacitronVAEConfig(),
optimizer="CapacitronOptimizer",
optimizer_params={
"RAdam": {"betas": [0.9, 0.998], "weight_decay": 1e-6},
"SGD": {"lr": 1e-5, "momentum": 0.9},
},
)
batch = dict({})
batch["text_input"] = torch.randint(0, 24, (8, 128)).long().to(device)
batch["text_lengths"] = torch.randint(100, 129, (8,)).long().to(device)
batch["text_lengths"] = torch.sort(batch["text_lengths"], descending=True)[0]
batch["text_lengths"][0] = 128
batch["mel_input"] = torch.rand(8, 120, config.audio["num_mels"]).to(device)
batch["mel_lengths"] = torch.randint(20, 120, (8,)).long().to(device)
batch["mel_lengths"] = torch.sort(batch["mel_lengths"], descending=True)[0]
batch["mel_lengths"][0] = 120
batch["stop_targets"] = torch.zeros(8, 120, 1).float().to(device)
batch["stop_target_lengths"] = torch.randint(0, 120, (8,)).to(device)
batch["speaker_ids"] = torch.randint(0, 5, (8,)).long().to(device)
batch["d_vectors"] = None
for idx in batch["mel_lengths"]:
batch["stop_targets"][:, int(idx.item()) :, 0] = 1.0
batch["stop_targets"] = batch["stop_targets"].view(
batch["text_input"].shape[0], batch["stop_targets"].size(1) // config.r, -1
)
batch["stop_targets"] = (batch["stop_targets"].sum(2) > 0.0).unsqueeze(2).float().squeeze()
model = Tacotron2(config).to(device)
criterion = model.get_criterion()
optimizer = model.get_optimizer()
model.train()
model_ref = copy.deepcopy(model)
count = 0
for param, param_ref in zip(model.parameters(), model_ref.parameters()):
assert (param - param_ref).sum() == 0, param
count += 1
for _ in range(10):
_, loss_dict = model.train_step(batch, criterion)
optimizer.zero_grad()
loss_dict["capacitron_vae_beta_loss"].backward()
optimizer.first_step()
loss_dict["loss"].backward()
optimizer.step()
# check parameter changes
count = 0
for param, param_ref in zip(model.parameters(), model_ref.parameters()):
# ignore the pre-highway layer since it is applied conditionally
assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format(
count, param.shape, param, param_ref
)
count += 1
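The loop above relies on `CapacitronOptimizer` exposing a two-phase update: `first_step()` after backpropagating the beta loss and `step()` after the main loss. The class below only sketches that interface over two plain `torch.optim` optimizers to make the test's call order readable; it is not the library's implementation:
class TwoPhaseOptimizerSketch:
    def __init__(self, primary, secondary):
        self.primary = primary      # e.g. RAdam over the model weights
        self.secondary = secondary  # e.g. SGD over the Capacitron beta parameter
    def zero_grad(self):
        self.primary.zero_grad()
        self.secondary.zero_grad()
    def first_step(self):
        # Advance only the secondary group after the beta loss has been backpropagated.
        self.secondary.step()
        self.secondary.zero_grad()
    def step(self):
        # Advance the primary group after the main loss has been backpropagated.
        self.primary.step()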
class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase): class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase):
"""Test multi-speaker Tacotron2 with Global Style Tokens and d-vector inputs.""" """Test multi-speaker Tacotron2 with Global Style Tokens and d-vector inputs."""

View File

@ -6,7 +6,7 @@ import torch
from torch import nn, optim from torch import nn, optim
from tests import get_tests_input_path from tests import get_tests_input_path
from TTS.tts.configs.shared_configs import GSTConfig from TTS.tts.configs.shared_configs import CapacitronVAEConfig, GSTConfig
from TTS.tts.configs.tacotron_config import TacotronConfig from TTS.tts.configs.tacotron_config import TacotronConfig
from TTS.tts.layers.losses import L1LossMasked from TTS.tts.layers.losses import L1LossMasked
from TTS.tts.models.tacotron import Tacotron from TTS.tts.models.tacotron import Tacotron
@ -248,6 +248,74 @@ class TacotronGSTTrainTest(unittest.TestCase):
count += 1 count += 1
class TacotronCapacitronTrainTest(unittest.TestCase):
@staticmethod
def test_train_step():
config = TacotronConfig(
num_chars=32,
num_speakers=10,
use_speaker_embedding=True,
out_channels=513,
decoder_output_dim=80,
use_capacitron_vae=True,
capacitron_vae=CapacitronVAEConfig(),
optimizer="CapacitronOptimizer",
optimizer_params={
"RAdam": {"betas": [0.9, 0.998], "weight_decay": 1e-6},
"SGD": {"lr": 1e-5, "momentum": 0.9},
},
)
batch = dict({})
batch["text_input"] = torch.randint(0, 24, (8, 128)).long().to(device)
batch["text_lengths"] = torch.randint(100, 129, (8,)).long().to(device)
batch["text_lengths"] = torch.sort(batch["text_lengths"], descending=True)[0]
batch["text_lengths"][0] = 128
batch["linear_input"] = torch.rand(8, 120, config.audio["fft_size"] // 2 + 1).to(device)
batch["mel_input"] = torch.rand(8, 120, config.audio["num_mels"]).to(device)
batch["mel_lengths"] = torch.randint(20, 120, (8,)).long().to(device)
batch["mel_lengths"] = torch.sort(batch["mel_lengths"], descending=True)[0]
batch["mel_lengths"][0] = 120
batch["stop_targets"] = torch.zeros(8, 120, 1).float().to(device)
batch["stop_target_lengths"] = torch.randint(0, 120, (8,)).to(device)
batch["speaker_ids"] = torch.randint(0, 5, (8,)).long().to(device)
batch["d_vectors"] = None
for idx in batch["mel_lengths"]:
batch["stop_targets"][:, int(idx.item()) :, 0] = 1.0
batch["stop_targets"] = batch["stop_targets"].view(
batch["text_input"].shape[0], batch["stop_targets"].size(1) // config.r, -1
)
batch["stop_targets"] = (batch["stop_targets"].sum(2) > 0.0).unsqueeze(2).float().squeeze()
model = Tacotron(config).to(device)
criterion = model.get_criterion()
optimizer = model.get_optimizer()
model.train()
print(" > Num parameters for Tacotron with Capacitron VAE model:%s" % (count_parameters(model)))
model_ref = copy.deepcopy(model)
count = 0
for param, param_ref in zip(model.parameters(), model_ref.parameters()):
assert (param - param_ref).sum() == 0, param
count += 1
for _ in range(10):
_, loss_dict = model.train_step(batch, criterion)
optimizer.zero_grad()
loss_dict["capacitron_vae_beta_loss"].backward()
optimizer.first_step()
loss_dict["loss"].backward()
optimizer.step()
# check parameter changes
count = 0
for param, param_ref in zip(model.parameters(), model_ref.parameters()):
# ignore the pre-highway layer since it is applied conditionally
assert (param != param_ref).any(), "param {} with shape {} not updated!! \n{}\n{}".format(
count, param.shape, param, param_ref
)
count += 1
class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase): class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase):
@staticmethod @staticmethod
def test_train_step(): def test_train_step():

View File

@ -122,7 +122,7 @@ class TestVits(unittest.TestCase):
args = VitsArgs(num_speakers=num_speakers, use_speaker_embedding=True) args = VitsArgs(num_speakers=num_speakers, use_speaker_embedding=True)
model = Vits(args) model = Vits(args)
ref_inp = torch.randn(1, spec_len, 513) ref_inp = torch.randn(1, 513, spec_len)
ref_inp_len = torch.randint(1, spec_effective_len, (1,)) ref_inp_len = torch.randint(1, spec_effective_len, (1,))
ref_spk_id = torch.randint(1, num_speakers, (1,)) ref_spk_id = torch.randint(1, num_speakers, (1,))
tgt_spk_id = torch.randint(1, num_speakers, (1,)) tgt_spk_id = torch.randint(1, num_speakers, (1,))
@ -420,6 +420,76 @@ class TestVits(unittest.TestCase):
# check parameter changes # check parameter changes
self._check_parameter_changes(model, model_ref) self._check_parameter_changes(model, model_ref)
def test_train_step_upsampling(self):
# setup the model
with torch.autograd.set_detect_anomaly(True):
model_args = VitsArgs(
num_chars=32,
spec_segment_size=10,
encoder_sample_rate=11025,
interpolate_z=False,
upsample_rates_decoder=[8, 8, 4, 2],
)
config = VitsConfig(model_args=model_args)
model = Vits(config).to(device)
model.train()
# model to train
optimizers = model.get_optimizer()
criterions = model.get_criterion()
criterions = [criterions[0].to(device), criterions[1].to(device)]
# reference model to compare model weights
model_ref = Vits(config).to(device)
# pass the state to the ref model
model_ref.load_state_dict(copy.deepcopy(model.state_dict()))
count = 0
for param, param_ref in zip(model.parameters(), model_ref.parameters()):
assert (param - param_ref).sum() == 0, param
count = count + 1
for _ in range(5):
batch = self._create_batch(config, 2)
for idx in [0, 1]:
outputs, loss_dict = model.train_step(batch, criterions, idx)
self.assertFalse(not outputs)
self.assertFalse(not loss_dict)
loss_dict["loss"].backward()
optimizers[idx].step()
optimizers[idx].zero_grad()
# check parameter changes
self._check_parameter_changes(model, model_ref)
def test_train_step_upsampling_interpolation(self):
# setup the model
with torch.autograd.set_detect_anomaly(True):
model_args = VitsArgs(num_chars=32, spec_segment_size=10, encoder_sample_rate=11025, interpolate_z=True)
config = VitsConfig(model_args=model_args)
model = Vits(config).to(device)
model.train()
# model to train
optimizers = model.get_optimizer()
criterions = model.get_criterion()
criterions = [criterions[0].to(device), criterions[1].to(device)]
# reference model to compare model weights
model_ref = Vits(config).to(device)
# pass the state to the ref model
model_ref.load_state_dict(copy.deepcopy(model.state_dict()))
count = 0
for param, param_ref in zip(model.parameters(), model_ref.parameters()):
assert (param - param_ref).sum() == 0, param
count = count + 1
for _ in range(5):
batch = self._create_batch(config, 2)
for idx in [0, 1]:
outputs, loss_dict = model.train_step(batch, criterions, idx)
self.assertFalse(not outputs)
self.assertFalse(not loss_dict)
loss_dict["loss"].backward()
optimizers[idx].step()
optimizers[idx].zero_grad()
# check parameter changes
self._check_parameter_changes(model, model_ref)
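A note on the numbers in these two new tests: the decoder's total upsampling factor is the product of `upsample_rates_decoder`. Assuming the default factors are `[8, 8, 2, 2]` (a 256x hop), running the encoder at 11025 Hz against 22050 Hz output with `interpolate_z=False` means the decoder must also do the 2x upsampling, hence `[8, 8, 4, 2]`; with `interpolate_z=True` the latent is interpolated instead and the factors are left at their defaults.
# Hedged arithmetic behind the two upsampling tests (default factors are assumed).
default_factors = [8, 8, 2, 2]      # assumed default: 256x total upsampling
upsampling_factors = [8, 8, 4, 2]   # used when the decoder also covers the 2x rate gap
def product(factors):
    total = 1
    for factor in factors:
        total *= factor
    return total
assert product(upsampling_factors) == 2 * product(default_factors)  # 512 == 2 * 256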
def test_train_eval_log(self): def test_train_eval_log(self):
batch_size = 2 batch_size = 2
config = VitsConfig(model_args=VitsArgs(num_chars=32, spec_segment_size=10)) config = VitsConfig(model_args=VitsArgs(num_chars=32, spec_segment_size=10))

View File

@ -3,7 +3,7 @@ import glob
import os import os
import shutil import shutil
from tests import get_tests_output_path, run_cli from tests import get_tests_data_path, get_tests_output_path, run_cli
from TTS.tts.utils.languages import LanguageManager from TTS.tts.utils.languages import LanguageManager
from TTS.tts.utils.speakers import SpeakerManager from TTS.tts.utils.speakers import SpeakerManager
from TTS.utils.generic_utils import get_user_data_dir from TTS.utils.generic_utils import get_user_data_dir
@ -56,3 +56,16 @@ def test_run_all_models():
folders = glob.glob(os.path.join(manager.output_prefix, "*")) folders = glob.glob(os.path.join(manager.output_prefix, "*"))
assert len(folders) == len(model_names) assert len(folders) == len(model_names)
shutil.rmtree(manager.output_prefix) shutil.rmtree(manager.output_prefix)
def test_voice_conversion():
print(" > Run voice conversion inference using YourTTS model.")
model_name = "tts_models/multilingual/multi-dataset/your_tts"
language_id = "en"
speaker_wav = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav")
reference_wav = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0032.wav")
output_path = os.path.join(get_tests_output_path(), "output.wav")
run_cli(
f"tts --model_name {model_name}"
f" --out_path {output_path} --speaker_wav {speaker_wav} --reference_wav {reference_wav} --language_idx {language_id} "
)