mirror of https://github.com/coqui-ai/TTS.git

commit ed1563b132: Merge branch 'dev' into fix-improvements/adjust-speech-rate-or-speed
@@ -1,5 +0,0 @@
-linters:
-  - pylint:
-      # pylintrc: pylintrc
-      filefilter: ['- test_*.py', '+ *.py', '- *.npy']
-      # exclude:
@@ -59,7 +59,7 @@ body:
 You can either run `TTS/bin/collect_env_info.py`

 ```bash
-wget https://raw.githubusercontent.com/coqui-ai/TTS/main/TTS/bin/collect_env_info.py
+wget https://raw.githubusercontent.com/idiap/coqui-ai-TTS/main/TTS/bin/collect_env_info.py
 python collect_env_info.py
 ```

@@ -1,8 +1,8 @@
 blank_issues_enabled: false
 contact_links:
   - name: CoquiTTS GitHub Discussions
-    url: https://github.com/coqui-ai/TTS/discussions
+    url: https://github.com/idiap/coqui-ai-TTS/discussions
     about: Please ask and answer questions here.
   - name: Coqui Security issue disclosure
-    url: mailto:info@coqui.ai
+    url: mailto:enno.hermann@gmail.com
     about: Please report security vulnerabilities here.
@@ -5,11 +5,3 @@ Welcome to the 🐸TTS project! We are excited to see your interest, and appreci
 This repository is governed by the Contributor Covenant Code of Conduct. For more details, see the [CODE_OF_CONDUCT.md](CODE_OF_CONDUCT.md) file.

 In order to make a good pull request, please see our [CONTRIBUTING.md](CONTRIBUTING.md) file.
-
-Before accepting your pull request, you will be asked to sign a [Contributor License Agreement](https://cla-assistant.io/coqui-ai/TTS).
-
-This [Contributor License Agreement](https://cla-assistant.io/coqui-ai/TTS):
-
-- Protects you, Coqui, and the users of the code.
-- Does not change your rights to use your contributions for any purpose.
-- Does not change the license of the 🐸TTS project. It just makes the terms of your contribution clearer and lets us know you are OK to contribute.
@@ -0,0 +1,11 @@
+name: Setup uv
+
+runs:
+  using: 'composite'
+  steps:
+    - name: Install uv
+      uses: astral-sh/setup-uv@v4
+      with:
+        version: "0.5.4"
+        enable-cache: true
+        cache-dependency-glob: "**/pyproject.toml"
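For context, the workflows updated later in this commit consume this local composite action with a step sequence like the minimal sketch below. The workflow and job names here are illustrative; the `actions/checkout@v4` and `uses: ./.github/actions/setup-uv` steps mirror the ones added elsewhere in this diff.

```yaml
# Hypothetical workflow fragment showing how the composite action is referenced.
jobs:
  example:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4         # the action lives in this repo, so check out first
      - name: Setup uv
        uses: ./.github/actions/setup-uv  # composite action added above
      - name: Set up Python
        run: uv python install 3.12
```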
@@ -15,4 +15,3 @@ markComment: >
   for your contributions. You might also look our discussion channels.
 # Comment to post when closing a stale issue. Set to `false` to disable
 closeComment: false
-
@@ -1,51 +0,0 @@
-name: aux-tests
-
-on:
-  push:
-    branches:
-      - main
-  pull_request:
-    types: [opened, synchronize, reopened]
-jobs:
-  check_skip:
-    runs-on: ubuntu-latest
-    if: "! contains(github.event.head_commit.message, '[ci skip]')"
-    steps:
-      - run: echo "${{ github.event.head_commit.message }}"
-
-  test:
-    runs-on: ubuntu-latest
-    strategy:
-      fail-fast: false
-      matrix:
-        python-version: [3.9, "3.10", "3.11"]
-        experimental: [false]
-    steps:
-      - uses: actions/checkout@v3
-      - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v4
-        with:
-          python-version: ${{ matrix.python-version }}
-          architecture: x64
-          cache: 'pip'
-          cache-dependency-path: 'requirements*'
-      - name: check OS
-        run: cat /etc/os-release
-      - name: set ENV
-        run: export TRAINER_TELEMETRY=0
-      - name: Install dependencies
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y git make gcc
-          make system-deps
-      - name: Install/upgrade Python setup deps
-        run: python3 -m pip install --upgrade pip setuptools wheel
-      - name: Replace scarf urls
-        run: |
-          sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json
-      - name: Install TTS
-        run: |
-          python3 -m pip install .[all]
-          python3 setup.py egg_info
-      - name: Unit tests
-        run: make test_aux
@@ -1,51 +0,0 @@
-name: data-tests
-
-on:
-  push:
-    branches:
-      - main
-  pull_request:
-    types: [opened, synchronize, reopened]
-jobs:
-  check_skip:
-    runs-on: ubuntu-latest
-    if: "! contains(github.event.head_commit.message, '[ci skip]')"
-    steps:
-      - run: echo "${{ github.event.head_commit.message }}"
-
-  test:
-    runs-on: ubuntu-latest
-    strategy:
-      fail-fast: false
-      matrix:
-        python-version: [3.9, "3.10", "3.11"]
-        experimental: [false]
-    steps:
-      - uses: actions/checkout@v3
-      - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v4
-        with:
-          python-version: ${{ matrix.python-version }}
-          architecture: x64
-          cache: 'pip'
-          cache-dependency-path: 'requirements*'
-      - name: check OS
-        run: cat /etc/os-release
-      - name: set ENV
-        run: export TRAINER_TELEMETRY=0
-      - name: Install dependencies
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y --no-install-recommends git make gcc
-          make system-deps
-      - name: Install/upgrade Python setup deps
-        run: python3 -m pip install --upgrade pip setuptools wheel
-      - name: Replace scarf urls
-        run: |
-          sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json
-      - name: Install TTS
-        run: |
-          python3 -m pip install .[all]
-          python3 setup.py egg_info
-      - name: Unit tests
-        run: make data_tests
@@ -10,7 +10,7 @@ on:
 jobs:
   docker-build:
     name: "Build and push Docker image"
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-latest
     strategy:
       matrix:
         arch: ["amd64"]
@@ -18,7 +18,7 @@ jobs:
          - "nvidia/cuda:11.8.0-base-ubuntu22.04" # GPU enabled
          - "python:3.10.8-slim" # CPU only
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
      - name: Log in to the Container registry
        uses: docker/login-action@v1
        with:
@@ -29,11 +29,11 @@ jobs:
        id: compute-tag
        run: |
          set -ex
-          base="ghcr.io/coqui-ai/tts"
+          base="ghcr.io/idiap/coqui-tts"
          tags="" # PR build

          if [[ ${{ matrix.base }} = "python:3.10.8-slim" ]]; then
-            base="ghcr.io/coqui-ai/tts-cpu"
+            base="ghcr.io/idiap/coqui-tts-cpu"
          fi

          if [[ "${{ startsWith(github.ref, 'refs/heads/') }}" = "true" ]]; then
@@ -42,7 +42,7 @@ jobs:
            branch=${github_ref#*refs/heads/} # strip prefix to get branch name
            tags="${base}:${branch},${base}:${{ github.sha }},"
          elif [[ "${{ startsWith(github.ref, 'refs/tags/') }}" = "true" ]]; then
-            VERSION="v$(cat TTS/VERSION)"
+            VERSION="v$(grep -m 1 version pyproject.toml | grep -P '\d+\.\d+\.\d+' -o)"
            if [[ "${{ github.ref }}" != "refs/tags/${VERSION}" ]]; then
              echo "Pushed tag does not match VERSION file. Aborting push."
              exit 1
@@ -63,3 +63,58 @@ jobs:
          push: ${{ github.event_name == 'push' }}
          build-args: "BASE=${{ matrix.base }}"
          tags: ${{ steps.compute-tag.outputs.tags }}
+  docker-dev-build:
+    name: "Build the development Docker image"
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        arch: ["amd64"]
+        base:
+          - "nvidia/cuda:11.8.0-base-ubuntu22.04" # GPU enabled
+    steps:
+      - uses: actions/checkout@v4
+      - name: Log in to the Container registry
+        uses: docker/login-action@v1
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+      - name: Compute Docker tags, check VERSION file matches tag
+        id: compute-tag
+        run: |
+          set -ex
+          base="ghcr.io/idiap/coqui-tts-dev"
+          tags="" # PR build
+
+          if [[ ${{ matrix.base }} = "python:3.10.8-slim" ]]; then
+            base="ghcr.io/idiap/coqui-tts-dev-cpu"
+          fi
+
+          if [[ "${{ startsWith(github.ref, 'refs/heads/') }}" = "true" ]]; then
+            # Push to branch
+            github_ref="${{ github.ref }}"
+            branch=${github_ref#*refs/heads/} # strip prefix to get branch name
+            tags="${base}:${branch},${base}:${{ github.sha }},"
+          elif [[ "${{ startsWith(github.ref, 'refs/tags/') }}" = "true" ]]; then
+            VERSION="v$(grep -m 1 version pyproject.toml | grep -P '\d+\.\d+\.\d+' -o)"
+            if [[ "${{ github.ref }}" != "refs/tags/${VERSION}" ]]; then
+              echo "Pushed tag does not match VERSION file. Aborting push."
+              exit 1
+            fi
+            tags="${base}:${VERSION},${base}:latest,${base}:${{ github.sha }}"
+          fi
+          echo "::set-output name=tags::${tags}"
+      - name: Set up QEMU
+        uses: docker/setup-qemu-action@v1
+      - name: Set up Docker Buildx
+        id: buildx
+        uses: docker/setup-buildx-action@v1
+      - name: Build and push
+        uses: docker/build-push-action@v2
+        with:
+          context: .
+          file: dockerfiles/Dockerfile.dev
+          platforms: linux/${{ matrix.arch }}
+          push: false
+          build-args: "BASE=${{ matrix.base }}"
+          tags: ${{ steps.compute-tag.outputs.tags }}
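To make the tag-computation script above concrete, here is a small standalone sketch of the same branch/tag logic with hypothetical values for the ref, SHA, and version; it is illustrative only and not part of the workflow itself.

```bash
#!/usr/bin/env bash
# Hypothetical inputs; in the workflow these come from the GitHub context.
base="ghcr.io/idiap/coqui-tts-cpu"
github_ref="refs/tags/v0.25.0"   # a made-up pushed tag
github_sha="abc1234"
version="v0.25.0"                # as the script reads it from pyproject.toml

if [[ "$github_ref" == refs/heads/* ]]; then
  branch=${github_ref#*refs/heads/}
  tags="${base}:${branch},${base}:${github_sha},"
elif [[ "$github_ref" == "refs/tags/${version}" ]]; then
  tags="${base}:${version},${base}:latest,${base}:${github_sha}"
fi

echo "$tags"
# -> ghcr.io/idiap/coqui-tts-cpu:v0.25.0,ghcr.io/idiap/coqui-tts-cpu:latest,ghcr.io/idiap/coqui-tts-cpu:abc1234
```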
@@ -1,53 +0,0 @@
-name: inference_tests
-
-on:
-  push:
-    branches:
-      - main
-  pull_request:
-    types: [opened, synchronize, reopened]
-jobs:
-  check_skip:
-    runs-on: ubuntu-latest
-    if: "! contains(github.event.head_commit.message, '[ci skip]')"
-    steps:
-      - run: echo "${{ github.event.head_commit.message }}"
-
-  test:
-    runs-on: ubuntu-latest
-    strategy:
-      fail-fast: false
-      matrix:
-        python-version: [3.9, "3.10", "3.11"]
-        experimental: [false]
-    steps:
-      - uses: actions/checkout@v3
-      - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v4
-        with:
-          python-version: ${{ matrix.python-version }}
-          architecture: x64
-          cache: 'pip'
-          cache-dependency-path: 'requirements*'
-      - name: check OS
-        run: cat /etc/os-release
-      - name: set ENV
-        run: |
-          export TRAINER_TELEMETRY=0
-      - name: Install dependencies
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y --no-install-recommends git make gcc
-          sudo apt-get install espeak-ng
-          make system-deps
-      - name: Install/upgrade Python setup deps
-        run: python3 -m pip install --upgrade pip setuptools wheel
-      - name: Replace scarf urls
-        run: |
-          sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json
-      - name: Install TTS
-        run: |
-          python3 -m pip install .[all]
-          python3 setup.py egg_info
-      - name: Unit tests
-        run: make inference_tests
@@ -7,88 +7,48 @@ defaults:
   shell:
     bash
 jobs:
-  build-sdist:
-    runs-on: ubuntu-20.04
+  build:
+    runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
+      - name: Setup uv
+        uses: ./.github/actions/setup-uv
       - name: Verify tag matches version
        run: |
          set -ex
-          version=$(cat TTS/VERSION)
+          version=$(grep -m 1 version pyproject.toml | grep -P '\d+\.\d+\.\d+' -o)
          tag="${GITHUB_REF/refs\/tags\/}"
          if [[ "v$version" != "$tag" ]]; then
            exit 1
          fi
-      - uses: actions/setup-python@v2
-        with:
-          python-version: 3.9
-      - run: |
-          python -m pip install -U pip setuptools wheel build
-      - run: |
-          python -m build
-      - run: |
-          pip install dist/*.tar.gz
-      - uses: actions/upload-artifact@v2
-        with:
-          name: sdist
-          path: dist/*.tar.gz
-  build-wheels:
-    runs-on: ubuntu-20.04
-    strategy:
-      matrix:
-        python-version: ["3.9", "3.10", "3.11"]
-    steps:
-      - uses: actions/checkout@v3
-      - uses: actions/setup-python@v2
-        with:
-          python-version: ${{ matrix.python-version }}
-      - name: Install pip requirements
-        run: |
-          python -m pip install -U pip setuptools wheel build
-          python -m pip install -r requirements.txt
-      - name: Setup and install manylinux1_x86_64 wheel
-        run: |
-          python setup.py bdist_wheel --plat-name=manylinux1_x86_64
-          python -m pip install dist/*-manylinux*.whl
-      - uses: actions/upload-artifact@v2
-        with:
-          name: wheel-${{ matrix.python-version }}
-          path: dist/*-manylinux*.whl
+      - name: Set up Python
+        run: uv python install 3.12
+      - name: Build sdist and wheel
+        run: uv build
+      - name: Test installation of sdist and wheel
+        run: |
+          uv venv --no-project
+          uv pip install dist/*.tar.gz
+          uv pip install dist/*.whl
+      - uses: actions/upload-artifact@v4
+        with:
+          name: build
+          path: dist/*
   publish-artifacts:
-    runs-on: ubuntu-20.04
-    needs: [build-sdist, build-wheels]
+    name: Publish to PyPI
+    runs-on: ubuntu-latest
+    needs: [build]
+    environment:
+      name: release
+      url: https://pypi.org/p/coqui-tts
+    permissions:
+      id-token: write
     steps:
-      - run: |
-          mkdir dist
-      - uses: actions/download-artifact@v2
-        with:
-          name: "sdist"
-          path: "dist/"
-      - uses: actions/download-artifact@v2
-        with:
-          name: "wheel-3.9"
-          path: "dist/"
-      - uses: actions/download-artifact@v2
-        with:
-          name: "wheel-3.10"
-          path: "dist/"
-      - uses: actions/download-artifact@v2
-        with:
-          name: "wheel-3.11"
-          path: "dist/"
+      - uses: actions/download-artifact@v4
+        with:
+          path: "dist/"
+          name: build
       - run: |
          ls -lh dist/
-      - name: Setup PyPI config
-        run: |
-          cat << EOF > ~/.pypirc
-          [pypi]
-          username=__token__
-          password=${{ secrets.PYPI_TOKEN }}
-          EOF
-      - uses: actions/setup-python@v2
-        with:
-          python-version: 3.9
-      - run: |
-          python -m pip install twine
-      - run: |
-          twine upload --repository pypi dist/*
+      - name: Publish package distributions to PyPI
+        uses: pypa/gh-action-pypi-publish@release/v1
@@ -7,40 +7,17 @@ on:
   pull_request:
     types: [opened, synchronize, reopened]
 jobs:
-  check_skip:
-    runs-on: ubuntu-latest
-    if: "! contains(github.event.head_commit.message, '[ci skip]')"
-    steps:
-      - run: echo "${{ github.event.head_commit.message }}"
-
-  test:
+  lint:
     runs-on: ubuntu-latest
     strategy:
       fail-fast: false
       matrix:
         python-version: [3.9]
-        experimental: [false]
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
+      - name: Setup uv
+        uses: ./.github/actions/setup-uv
       - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v4
-        with:
-          python-version: ${{ matrix.python-version }}
-          architecture: x64
-          cache: 'pip'
-          cache-dependency-path: 'requirements*'
-      - name: check OS
-        run: cat /etc/os-release
-      - name: Install dependencies
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y git make gcc
-          make system-deps
-      - name: Install/upgrade Python setup deps
-        run: python3 -m pip install --upgrade pip setuptools wheel
-      - name: Install TTS
-        run: |
-          python3 -m pip install .[all]
-          python3 setup.py egg_info
-      - name: Style check
-        run: make style
+        run: uv python install ${{ matrix.python-version }}
+      - name: Lint check
+        run: make lint
@@ -0,0 +1,127 @@
+name: test
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+    types: [opened, synchronize, reopened]
+  workflow_dispatch:
+    inputs:
+      trainer_branch:
+        description: "Branch of Trainer to test"
+        required: false
+        default: "main"
+      coqpit_branch:
+        description: "Branch of Coqpit to test"
+        required: false
+        default: "main"
+jobs:
+  unit:
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: [3.9, "3.10", "3.11", "3.12"]
+        subset: ["data_tests", "inference_tests", "test_aux", "test_text"]
+    steps:
+      - uses: actions/checkout@v4
+      - name: Setup uv
+        uses: ./.github/actions/setup-uv
+      - name: Set up Python ${{ matrix.python-version }}
+        run: uv python install ${{ matrix.python-version }}
+      - name: Install Espeak
+        if: contains(fromJSON('["inference_tests", "test_text"]'), matrix.subset)
+        run: |
+          sudo apt-get update
+          sudo apt-get install espeak espeak-ng
+      - name: Install dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y --no-install-recommends git make gcc
+          make system-deps
+      - name: Install custom Trainer and/or Coqpit if requested
+        run: |
+          if [[ -n "${{ github.event.inputs.trainer_branch }}" ]]; then
+            uv add git+https://github.com/idiap/coqui-ai-Trainer --branch ${{ github.event.inputs.trainer_branch }}
+          fi
+          if [[ -n "${{ github.event.inputs.coqpit_branch }}" ]]; then
+            uv add git+https://github.com/idiap/coqui-ai-coqpit --branch ${{ github.event.inputs.coqpit_branch }}
+          fi
+      - name: Unit tests
+        run: |
+          resolution=highest
+          if [ "${{ matrix.python-version }}" == "3.9" ]; then
+            resolution=lowest-direct
+          fi
+          uv run --resolution=$resolution --extra server --extra languages make ${{ matrix.subset }}
+      - name: Upload coverage data
+        uses: actions/upload-artifact@v4
+        with:
+          include-hidden-files: true
+          name: coverage-data-${{ matrix.subset }}-${{ matrix.python-version }}
+          path: .coverage.*
+          if-no-files-found: ignore
+  integration:
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: ["3.9", "3.12"]
+        subset: ["test_tts", "test_tts2", "test_vocoder", "test_xtts", "test_zoo0", "test_zoo1", "test_zoo2"]
+    steps:
+      - uses: actions/checkout@v4
+      - name: Setup uv
+        uses: ./.github/actions/setup-uv
+      - name: Set up Python ${{ matrix.python-version }}
+        run: uv python install ${{ matrix.python-version }}
+      - name: Install Espeak
+        if: contains(fromJSON('["test_tts", "test_tts2", "test_xtts", "test_zoo0", "test_zoo1", "test_zoo2"]'), matrix.subset)
+        run: |
+          sudo apt-get update
+          sudo apt-get install espeak espeak-ng
+      - name: Install dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y --no-install-recommends git make gcc
+          make system-deps
+      - name: Install custom Trainer and/or Coqpit if requested
+        run: |
+          if [[ -n "${{ github.event.inputs.trainer_branch }}" ]]; then
+            uv add git+https://github.com/idiap/coqui-ai-Trainer --branch ${{ github.event.inputs.trainer_branch }}
+          fi
+          if [[ -n "${{ github.event.inputs.coqpit_branch }}" ]]; then
+            uv add git+https://github.com/idiap/coqui-ai-coqpit --branch ${{ github.event.inputs.coqpit_branch }}
+          fi
+      - name: Integration tests
+        run: |
+          resolution=highest
+          if [ "${{ matrix.python-version }}" == "3.9" ]; then
+            resolution=lowest-direct
+          fi
+          uv run --resolution=$resolution --extra server --extra languages make ${{ matrix.subset }}
+      - name: Upload coverage data
+        uses: actions/upload-artifact@v4
+        with:
+          include-hidden-files: true
+          name: coverage-data-${{ matrix.subset }}-${{ matrix.python-version }}
+          path: .coverage.*
+          if-no-files-found: ignore
+  coverage:
+    if: always()
+    needs: [unit, integration]
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Setup uv
+        uses: ./.github/actions/setup-uv
+      - uses: actions/download-artifact@v4
+        with:
+          pattern: coverage-data-*
+          merge-multiple: true
+      - name: Combine coverage
+        run: |
+          uv python install
+          uvx coverage combine
+          uvx coverage html --skip-covered --skip-empty
+          uvx coverage report --format=markdown >> $GITHUB_STEP_SUMMARY
@@ -1,50 +0,0 @@
-name: text-tests
-
-on:
-  push:
-    branches:
-      - main
-  pull_request:
-    types: [opened, synchronize, reopened]
-jobs:
-  check_skip:
-    runs-on: ubuntu-latest
-    if: "! contains(github.event.head_commit.message, '[ci skip]')"
-    steps:
-      - run: echo "${{ github.event.head_commit.message }}"
-
-  test:
-    runs-on: ubuntu-latest
-    strategy:
-      fail-fast: false
-      matrix:
-        python-version: [3.9, "3.10", "3.11"]
-        experimental: [false]
-    steps:
-      - uses: actions/checkout@v3
-      - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v4
-        with:
-          python-version: ${{ matrix.python-version }}
-          architecture: x64
-          cache: 'pip'
-          cache-dependency-path: 'requirements*'
-      - name: check OS
-        run: cat /etc/os-release
-      - name: set ENV
-        run: export TRAINER_TELEMETRY=0
-      - name: Install dependencies
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y --no-install-recommends git make gcc
-          sudo apt-get install espeak
-          sudo apt-get install espeak-ng
-          make system-deps
-      - name: Install/upgrade Python setup deps
-        run: python3 -m pip install --upgrade pip setuptools wheel
-      - name: Install TTS
-        run: |
-          python3 -m pip install .[all]
-          python3 setup.py egg_info
-      - name: Unit tests
-        run: make test_text
@@ -1,53 +0,0 @@
-name: tts-tests
-
-on:
-  push:
-    branches:
-      - main
-  pull_request:
-    types: [opened, synchronize, reopened]
-jobs:
-  check_skip:
-    runs-on: ubuntu-latest
-    if: "! contains(github.event.head_commit.message, '[ci skip]')"
-    steps:
-      - run: echo "${{ github.event.head_commit.message }}"
-
-  test:
-    runs-on: ubuntu-latest
-    strategy:
-      fail-fast: false
-      matrix:
-        python-version: [3.9, "3.10", "3.11"]
-        experimental: [false]
-    steps:
-      - uses: actions/checkout@v3
-      - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v4
-        with:
-          python-version: ${{ matrix.python-version }}
-          architecture: x64
-          cache: 'pip'
-          cache-dependency-path: 'requirements*'
-      - name: check OS
-        run: cat /etc/os-release
-      - name: set ENV
-        run: export TRAINER_TELEMETRY=0
-      - name: Install dependencies
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y --no-install-recommends git make gcc
-          sudo apt-get install espeak
-          sudo apt-get install espeak-ng
-          make system-deps
-      - name: Install/upgrade Python setup deps
-        run: python3 -m pip install --upgrade pip setuptools wheel
-      - name: Replace scarf urls
-        run: |
-          sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json
-      - name: Install TTS
-        run: |
-          python3 -m pip install .[all]
-          python3 setup.py egg_info
-      - name: Unit tests
-        run: make test_tts
@@ -1,53 +0,0 @@
-name: tts-tests2
-
-on:
-  push:
-    branches:
-      - main
-  pull_request:
-    types: [opened, synchronize, reopened]
-jobs:
-  check_skip:
-    runs-on: ubuntu-latest
-    if: "! contains(github.event.head_commit.message, '[ci skip]')"
-    steps:
-      - run: echo "${{ github.event.head_commit.message }}"
-
-  test:
-    runs-on: ubuntu-latest
-    strategy:
-      fail-fast: false
-      matrix:
-        python-version: [3.9, "3.10", "3.11"]
-        experimental: [false]
-    steps:
-      - uses: actions/checkout@v3
-      - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v4
-        with:
-          python-version: ${{ matrix.python-version }}
-          architecture: x64
-          cache: 'pip'
-          cache-dependency-path: 'requirements*'
-      - name: check OS
-        run: cat /etc/os-release
-      - name: set ENV
-        run: export TRAINER_TELEMETRY=0
-      - name: Install dependencies
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y --no-install-recommends git make gcc
-          sudo apt-get install espeak
-          sudo apt-get install espeak-ng
-          make system-deps
-      - name: Install/upgrade Python setup deps
-        run: python3 -m pip install --upgrade pip setuptools wheel
-      - name: Replace scarf urls
-        run: |
-          sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json
-      - name: Install TTS
-        run: |
-          python3 -m pip install .[all]
-          python3 setup.py egg_info
-      - name: Unit tests
-        run: make test_tts2
@@ -1,48 +0,0 @@
-name: vocoder-tests
-
-on:
-  push:
-    branches:
-      - main
-  pull_request:
-    types: [opened, synchronize, reopened]
-jobs:
-  check_skip:
-    runs-on: ubuntu-latest
-    if: "! contains(github.event.head_commit.message, '[ci skip]')"
-    steps:
-      - run: echo "${{ github.event.head_commit.message }}"
-
-  test:
-    runs-on: ubuntu-latest
-    strategy:
-      fail-fast: false
-      matrix:
-        python-version: [3.9, "3.10", "3.11"]
-        experimental: [false]
-    steps:
-      - uses: actions/checkout@v3
-      - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v4
-        with:
-          python-version: ${{ matrix.python-version }}
-          architecture: x64
-          cache: 'pip'
-          cache-dependency-path: 'requirements*'
-      - name: check OS
-        run: cat /etc/os-release
-      - name: set ENV
-        run: export TRAINER_TELEMETRY=0
-      - name: Install dependencies
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y git make gcc
-          make system-deps
-      - name: Install/upgrade Python setup deps
-        run: python3 -m pip install --upgrade pip setuptools wheel
-      - name: Install TTS
-        run: |
-          python3 -m pip install .[all]
-          python3 setup.py egg_info
-      - name: Unit tests
-        run: make test_vocoder
@@ -1,53 +0,0 @@
-name: xtts-tests
-
-on:
-  push:
-    branches:
-      - main
-  pull_request:
-    types: [opened, synchronize, reopened]
-jobs:
-  check_skip:
-    runs-on: ubuntu-latest
-    if: "! contains(github.event.head_commit.message, '[ci skip]')"
-    steps:
-      - run: echo "${{ github.event.head_commit.message }}"
-
-  test:
-    runs-on: ubuntu-latest
-    strategy:
-      fail-fast: false
-      matrix:
-        python-version: [3.9, "3.10", "3.11"]
-        experimental: [false]
-    steps:
-      - uses: actions/checkout@v3
-      - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v4
-        with:
-          python-version: ${{ matrix.python-version }}
-          architecture: x64
-          cache: 'pip'
-          cache-dependency-path: 'requirements*'
-      - name: check OS
-        run: cat /etc/os-release
-      - name: set ENV
-        run: export TRAINER_TELEMETRY=0
-      - name: Install dependencies
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y --no-install-recommends git make gcc
-          sudo apt-get install espeak
-          sudo apt-get install espeak-ng
-          make system-deps
-      - name: Install/upgrade Python setup deps
-        run: python3 -m pip install --upgrade pip setuptools wheel
-      - name: Replace scarf urls
-        run: |
-          sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json
-      - name: Install TTS
-        run: |
-          python3 -m pip install .[all]
-          python3 setup.py egg_info
-      - name: Unit tests
-        run: make test_xtts
@@ -1,54 +0,0 @@
-name: zoo-tests-0
-
-on:
-  push:
-    branches:
-      - main
-  pull_request:
-    types: [opened, synchronize, reopened]
-jobs:
-  check_skip:
-    runs-on: ubuntu-latest
-    if: "! contains(github.event.head_commit.message, '[ci skip]')"
-    steps:
-      - run: echo "${{ github.event.head_commit.message }}"
-
-  test:
-    runs-on: ubuntu-latest
-    strategy:
-      fail-fast: false
-      matrix:
-        python-version: [3.9, "3.10", "3.11"]
-        experimental: [false]
-    steps:
-      - uses: actions/checkout@v3
-      - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v4
-        with:
-          python-version: ${{ matrix.python-version }}
-          architecture: x64
-          cache: 'pip'
-          cache-dependency-path: 'requirements*'
-      - name: check OS
-        run: cat /etc/os-release
-      - name: set ENV
-        run: export TRAINER_TELEMETRY=0
-      - name: Install dependencies
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y git make gcc
-          sudo apt-get install espeak espeak-ng
-          make system-deps
-      - name: Install/upgrade Python setup deps
-        run: python3 -m pip install --upgrade pip setuptools wheel
-      - name: Replace scarf urls
-        run: |
-          sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json
-      - name: Install TTS
-        run: |
-          python3 -m pip install .[all]
-          python3 setup.py egg_info
-      - name: Unit tests
-        run: |
-          nose2 -F -v -B TTS tests.zoo_tests.test_models.test_models_offset_0_step_3
-          nose2 -F -v -B TTS tests.zoo_tests.test_models.test_voice_conversion
@@ -1,53 +0,0 @@
-name: zoo-tests-1
-
-on:
-  push:
-    branches:
-      - main
-  pull_request:
-    types: [opened, synchronize, reopened]
-jobs:
-  check_skip:
-    runs-on: ubuntu-latest
-    if: "! contains(github.event.head_commit.message, '[ci skip]')"
-    steps:
-      - run: echo "${{ github.event.head_commit.message }}"
-
-  test:
-    runs-on: ubuntu-latest
-    strategy:
-      fail-fast: false
-      matrix:
-        python-version: [3.9, "3.10", "3.11"]
-        experimental: [false]
-    steps:
-      - uses: actions/checkout@v3
-      - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v4
-        with:
-          python-version: ${{ matrix.python-version }}
-          architecture: x64
-          cache: 'pip'
-          cache-dependency-path: 'requirements*'
-      - name: check OS
-        run: cat /etc/os-release
-      - name: set ENV
-        run: export TRAINER_TELEMETRY=0
-      - name: Install dependencies
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y git make gcc
-          sudo apt-get install espeak espeak-ng
-          make system-deps
-      - name: Install/upgrade Python setup deps
-        run: python3 -m pip install --upgrade pip setuptools wheel
-      - name: Replace scarf urls
-        run: |
-          sed -i 's/https:\/\/coqui.gateway.scarf.sh\/hf\/bark\//https:\/\/huggingface.co\/erogol\/bark\/resolve\/main\//g' TTS/.models.json
-          sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json
-      - name: Install TTS
-        run: |
-          python3 -m pip install .[all]
-          python3 setup.py egg_info
-      - name: Unit tests
-        run: nose2 -F -v -B --with-coverage --coverage TTS tests.zoo_tests.test_models.test_models_offset_1_step_3
@@ -1,52 +0,0 @@
-name: zoo-tests-2
-
-on:
-  push:
-    branches:
-      - main
-  pull_request:
-    types: [opened, synchronize, reopened]
-jobs:
-  check_skip:
-    runs-on: ubuntu-latest
-    if: "! contains(github.event.head_commit.message, '[ci skip]')"
-    steps:
-      - run: echo "${{ github.event.head_commit.message }}"
-
-  test:
-    runs-on: ubuntu-latest
-    strategy:
-      fail-fast: false
-      matrix:
-        python-version: [3.9, "3.10", "3.11"]
-        experimental: [false]
-    steps:
-      - uses: actions/checkout@v3
-      - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v4
-        with:
-          python-version: ${{ matrix.python-version }}
-          architecture: x64
-          cache: 'pip'
-          cache-dependency-path: 'requirements*'
-      - name: check OS
-        run: cat /etc/os-release
-      - name: set ENV
-        run: export TRAINER_TELEMETRY=0
-      - name: Install dependencies
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y git make gcc
-          sudo apt-get install espeak espeak-ng
-          make system-deps
-      - name: Install/upgrade Python setup deps
-        run: python3 -m pip install --upgrade pip setuptools wheel
-      - name: Replace scarf urls
-        run: |
-          sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json
-      - name: Install TTS
-        run: |
-          python3 -m pip install .[all]
-          python3 setup.py egg_info
-      - name: Unit tests
-        run: nose2 -F -v -B --with-coverage --coverage TTS tests.zoo_tests.test_models.test_models_offset_2_step_3
@@ -1,3 +1,5 @@
+uv.lock
+
 WadaSNR/
 .idea/
 *.pyc
@@ -1,27 +1,19 @@
 repos:
-  - repo: 'https://github.com/pre-commit/pre-commit-hooks'
-    rev: v2.3.0
+  - repo: "https://github.com/pre-commit/pre-commit-hooks"
+    rev: v5.0.0
     hooks:
+      - id: check-json
+        files: "TTS/.models.json"
       - id: check-yaml
       - id: end-of-file-fixer
       - id: trailing-whitespace
-  - repo: 'https://github.com/psf/black'
-    rev: 22.3.0
+  - repo: "https://github.com/psf/black"
+    rev: 24.2.0
     hooks:
       - id: black
         language_version: python3
-  - repo: https://github.com/pycqa/isort
-    rev: 5.8.0
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.7.0
     hooks:
-      - id: isort
-        name: isort (python)
-      - id: isort
-        name: isort (cython)
-        types: [cython]
-      - id: isort
-        name: isort (pyi)
-        types: [pyi]
-  - repo: https://github.com/pycqa/pylint
-    rev: v2.8.2
-    hooks:
-      - id: pylint
+      - id: ruff
+        args: [--fix, --exit-non-zero-on-fix]
|
|
599
.pylintrc
599
.pylintrc
|
@ -1,599 +0,0 @@
|
||||||
[MASTER]
|
|
||||||
|
|
||||||
# A comma-separated list of package or module names from where C extensions may
|
|
||||||
# be loaded. Extensions are loading into the active Python interpreter and may
|
|
||||||
# run arbitrary code.
|
|
||||||
extension-pkg-whitelist=
|
|
||||||
|
|
||||||
# Add files or directories to the blacklist. They should be base names, not
|
|
||||||
# paths.
|
|
||||||
ignore=CVS
|
|
||||||
|
|
||||||
# Add files or directories matching the regex patterns to the blacklist. The
|
|
||||||
# regex matches against base names, not paths.
|
|
||||||
ignore-patterns=
|
|
||||||
|
|
||||||
# Python code to execute, usually for sys.path manipulation such as
|
|
||||||
# pygtk.require().
|
|
||||||
#init-hook=
|
|
||||||
|
|
||||||
# Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the
|
|
||||||
# number of processors available to use.
|
|
||||||
jobs=1
|
|
||||||
|
|
||||||
# Control the amount of potential inferred values when inferring a single
|
|
||||||
# object. This can help the performance when dealing with large functions or
|
|
||||||
# complex, nested conditions.
|
|
||||||
limit-inference-results=100
|
|
||||||
|
|
||||||
# List of plugins (as comma separated values of python modules names) to load,
|
|
||||||
# usually to register additional checkers.
|
|
||||||
load-plugins=
|
|
||||||
|
|
||||||
# Pickle collected data for later comparisons.
|
|
||||||
persistent=yes
|
|
||||||
|
|
||||||
# Specify a configuration file.
|
|
||||||
#rcfile=
|
|
||||||
|
|
||||||
# When enabled, pylint would attempt to guess common misconfiguration and emit
|
|
||||||
# user-friendly hints instead of false-positive error messages.
|
|
||||||
suggestion-mode=yes
|
|
||||||
|
|
||||||
# Allow loading of arbitrary C extensions. Extensions are imported into the
|
|
||||||
# active Python interpreter and may run arbitrary code.
|
|
||||||
unsafe-load-any-extension=no
|
|
||||||
|
|
||||||
|
|
||||||
[MESSAGES CONTROL]
|
|
||||||
|
|
||||||
# Only show warnings with the listed confidence levels. Leave empty to show
|
|
||||||
# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED.
|
|
||||||
confidence=
|
|
||||||
|
|
||||||
# Disable the message, report, category or checker with the given id(s). You
|
|
||||||
# can either give multiple identifiers separated by comma (,) or put this
|
|
||||||
# option multiple times (only on the command line, not in the configuration
|
|
||||||
# file where it should appear only once). You can also use "--disable=all" to
|
|
||||||
# disable everything first and then reenable specific checks. For example, if
|
|
||||||
# you want to run only the similarities checker, you can use "--disable=all
|
|
||||||
# --enable=similarities". If you want to run only the classes checker, but have
|
|
||||||
# no Warning level messages displayed, use "--disable=all --enable=classes
|
|
||||||
# --disable=W".
|
|
||||||
disable=missing-docstring,
|
|
||||||
too-many-public-methods,
|
|
||||||
too-many-lines,
|
|
||||||
bare-except,
|
|
||||||
## for avoiding weird p3.6 CI linter error
|
|
||||||
## TODO: see later if we can remove this
|
|
||||||
assigning-non-slot,
|
|
||||||
unsupported-assignment-operation,
|
|
||||||
## end
|
|
||||||
line-too-long,
|
|
||||||
fixme,
|
|
||||||
wrong-import-order,
|
|
||||||
ungrouped-imports,
|
|
||||||
wrong-import-position,
|
|
||||||
import-error,
|
|
||||||
invalid-name,
|
|
||||||
too-many-instance-attributes,
|
|
||||||
arguments-differ,
|
|
||||||
arguments-renamed,
|
|
||||||
no-name-in-module,
|
|
||||||
no-member,
|
|
||||||
unsubscriptable-object,
|
|
||||||
print-statement,
|
|
||||||
parameter-unpacking,
|
|
||||||
unpacking-in-except,
|
|
||||||
old-raise-syntax,
|
|
||||||
backtick,
|
|
||||||
long-suffix,
|
|
||||||
old-ne-operator,
|
|
||||||
old-octal-literal,
|
|
||||||
import-star-module-level,
|
|
||||||
non-ascii-bytes-literal,
|
|
||||||
raw-checker-failed,
|
|
||||||
bad-inline-option,
|
|
||||||
locally-disabled,
|
|
||||||
file-ignored,
|
|
||||||
suppressed-message,
|
|
||||||
useless-suppression,
|
|
||||||
deprecated-pragma,
|
|
||||||
use-symbolic-message-instead,
|
|
||||||
useless-object-inheritance,
|
|
||||||
too-few-public-methods,
|
|
||||||
too-many-branches,
|
|
||||||
too-many-arguments,
|
|
||||||
too-many-locals,
|
|
||||||
too-many-statements,
|
|
||||||
apply-builtin,
|
|
||||||
basestring-builtin,
|
|
||||||
buffer-builtin,
|
|
||||||
cmp-builtin,
|
|
||||||
coerce-builtin,
|
|
||||||
execfile-builtin,
|
|
||||||
file-builtin,
|
|
||||||
long-builtin,
|
|
||||||
raw_input-builtin,
|
|
||||||
reduce-builtin,
|
|
||||||
standarderror-builtin,
|
|
||||||
unicode-builtin,
|
|
||||||
xrange-builtin,
|
|
||||||
coerce-method,
|
|
||||||
delslice-method,
|
|
||||||
getslice-method,
|
|
||||||
setslice-method,
|
|
||||||
no-absolute-import,
|
|
||||||
old-division,
|
|
||||||
dict-iter-method,
|
|
||||||
dict-view-method,
|
|
||||||
next-method-called,
|
|
||||||
metaclass-assignment,
|
|
||||||
indexing-exception,
|
|
||||||
raising-string,
|
|
||||||
reload-builtin,
|
|
||||||
oct-method,
|
|
||||||
hex-method,
|
|
||||||
nonzero-method,
|
|
||||||
cmp-method,
|
|
||||||
input-builtin,
|
|
||||||
round-builtin,
|
|
||||||
intern-builtin,
|
|
||||||
unichr-builtin,
|
|
||||||
map-builtin-not-iterating,
|
|
||||||
zip-builtin-not-iterating,
|
|
||||||
range-builtin-not-iterating,
|
|
||||||
filter-builtin-not-iterating,
|
|
||||||
using-cmp-argument,
|
|
||||||
eq-without-hash,
|
|
||||||
div-method,
|
|
||||||
idiv-method,
|
|
||||||
rdiv-method,
|
|
||||||
exception-message-attribute,
|
|
||||||
invalid-str-codec,
|
|
||||||
sys-max-int,
|
|
||||||
bad-python3-import,
|
|
||||||
deprecated-string-function,
|
|
||||||
deprecated-str-translate-call,
|
|
||||||
deprecated-itertools-function,
|
|
||||||
deprecated-types-field,
|
|
||||||
next-method-defined,
|
|
||||||
dict-items-not-iterating,
|
|
||||||
dict-keys-not-iterating,
|
|
||||||
dict-values-not-iterating,
|
|
||||||
deprecated-operator-function,
|
|
||||||
deprecated-urllib-function,
|
|
||||||
xreadlines-attribute,
|
|
||||||
deprecated-sys-function,
|
|
||||||
exception-escape,
|
|
||||||
comprehension-escape,
|
|
||||||
duplicate-code,
|
|
||||||
not-callable,
|
|
||||||
import-outside-toplevel,
|
|
||||||
logging-fstring-interpolation,
|
|
||||||
logging-not-lazy
|
|
||||||
|
|
||||||
# Enable the message, report, category or checker with the given id(s). You can
|
|
||||||
# either give multiple identifier separated by comma (,) or put this option
|
|
||||||
# multiple time (only on the command line, not in the configuration file where
|
|
||||||
# it should appear only once). See also the "--disable" option for examples.
|
|
||||||
enable=c-extension-no-member
|
|
||||||
|
|
||||||
|
|
||||||
[REPORTS]
|
|
||||||
|
|
||||||
# Python expression which should return a note less than 10 (10 is the highest
|
|
||||||
# note). You have access to the variables errors warning, statement which
|
|
||||||
# respectively contain the number of errors / warnings messages and the total
|
|
||||||
# number of statements analyzed. This is used by the global evaluation report
|
|
||||||
# (RP0004).
|
|
||||||
evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)
|
|
||||||
|
|
||||||
# Template used to display messages. This is a python new-style format string
|
|
||||||
# used to format the message information. See doc for all details.
|
|
||||||
#msg-template=
|
|
||||||
|
|
||||||
# Set the output format. Available formats are text, parseable, colorized, json
|
|
||||||
# and msvs (visual studio). You can also give a reporter class, e.g.
|
|
||||||
# mypackage.mymodule.MyReporterClass.
|
|
||||||
output-format=text
|
|
||||||
|
|
||||||
# Tells whether to display a full report or only the messages.
|
|
||||||
reports=no
|
|
||||||
|
|
||||||
# Activate the evaluation score.
|
|
||||||
score=yes
|
|
||||||
|
|
||||||
|
|
||||||
[REFACTORING]
|
|
||||||
|
|
||||||
# Maximum number of nested blocks for function / method body
|
|
||||||
max-nested-blocks=5
|
|
||||||
|
|
||||||
# Complete name of functions that never returns. When checking for
|
|
||||||
# inconsistent-return-statements if a never returning function is called then
|
|
||||||
# it will be considered as an explicit return statement and no message will be
|
|
||||||
# printed.
|
|
||||||
never-returning-functions=sys.exit
|
|
||||||
|
|
||||||
|
|
||||||
[LOGGING]
|
|
||||||
|
|
||||||
# Format style used to check logging format string. `old` means using %
|
|
||||||
# formatting, while `new` is for `{}` formatting.
|
|
||||||
logging-format-style=old
|
|
||||||
|
|
||||||
# Logging modules to check that the string format arguments are in logging
|
|
||||||
# function parameter format.
|
|
||||||
logging-modules=logging
|
|
||||||
|
|
||||||
|
|
||||||
[SPELLING]
|
|
||||||
|
|
||||||
# Limits count of emitted suggestions for spelling mistakes.
|
|
||||||
max-spelling-suggestions=4
|
|
||||||
|
|
||||||
# Spelling dictionary name. Available dictionaries: none. To make it working
|
|
||||||
# install python-enchant package..
|
|
||||||
spelling-dict=
|
|
||||||
|
|
||||||
# List of comma separated words that should not be checked.
|
|
||||||
spelling-ignore-words=
|
|
||||||
|
|
||||||
# A path to a file that contains private dictionary; one word per line.
|
|
||||||
spelling-private-dict-file=
|
|
||||||
|
|
||||||
# Tells whether to store unknown words to indicated private dictionary in
|
|
||||||
# --spelling-private-dict-file option instead of raising a message.
|
|
||||||
spelling-store-unknown-words=no
|
|
||||||
|
|
||||||
|
|
||||||
[MISCELLANEOUS]
|
|
||||||
|
|
||||||
# List of note tags to take in consideration, separated by a comma.
|
|
||||||
notes=FIXME,
|
|
||||||
XXX,
|
|
||||||
TODO
|
|
||||||
|
|
||||||
|
|
||||||
[TYPECHECK]
|
|
||||||
|
|
||||||
# List of decorators that produce context managers, such as
|
|
||||||
# contextlib.contextmanager. Add to this list to register other decorators that
|
|
||||||
# produce valid context managers.
|
|
||||||
contextmanager-decorators=contextlib.contextmanager
|
|
||||||
|
|
||||||
# List of members which are set dynamically and missed by pylint inference
|
|
||||||
# system, and so shouldn't trigger E1101 when accessed. Python regular
|
|
||||||
# expressions are accepted.
|
|
||||||
generated-members=numpy.*,torch.*
|
|
||||||
|
|
||||||
# Tells whether missing members accessed in mixin class should be ignored. A
|
|
||||||
# mixin class is detected if its name ends with "mixin" (case insensitive).
|
|
||||||
ignore-mixin-members=yes
|
|
||||||
|
|
||||||
# Tells whether to warn about missing members when the owner of the attribute
|
|
||||||
# is inferred to be None.
|
|
||||||
ignore-none=yes
|
|
||||||
|
|
||||||
# This flag controls whether pylint should warn about no-member and similar
|
|
||||||
# checks whenever an opaque object is returned when inferring. The inference
|
|
||||||
# can return multiple potential results while evaluating a Python object, but
|
|
||||||
# some branches might not be evaluated, which results in partial inference. In
|
|
||||||
# that case, it might be useful to still emit no-member and other checks for
|
|
||||||
# the rest of the inferred objects.
|
|
||||||
ignore-on-opaque-inference=yes
|
|
||||||
|
|
||||||
# List of class names for which member attributes should not be checked (useful
|
|
||||||
# for classes with dynamically set attributes). This supports the use of
|
|
||||||
# qualified names.
|
|
||||||
ignored-classes=optparse.Values,thread._local,_thread._local
|
|
||||||
|
|
||||||
# List of module names for which member attributes should not be checked
|
|
||||||
# (useful for modules/projects where namespaces are manipulated during runtime
|
|
||||||
# and thus existing member attributes cannot be deduced by static analysis. It
|
|
||||||
# supports qualified module names, as well as Unix pattern matching.
|
|
||||||
ignored-modules=
|
|
||||||
|
|
||||||
# Show a hint with possible names when a member name was not found. The aspect
|
|
||||||
# of finding the hint is based on edit distance.
|
|
||||||
missing-member-hint=yes

# The minimum edit distance a name should have in order to be considered a
# similar match for a missing member name.
missing-member-hint-distance=1

# The total number of similar names that should be taken in consideration when
# showing a hint for a missing member.
missing-member-max-choices=1


[VARIABLES]

# List of additional names supposed to be defined in builtins. Remember that
# you should avoid defining new builtins when possible.
additional-builtins=

# Tells whether unused global variables should be treated as a violation.
allow-global-unused-variables=yes

# List of strings which can identify a callback function by name. A callback
# name must start or end with one of those strings.
callbacks=cb_,
          _cb

# A regular expression matching the name of dummy variables (i.e. expected to
# not be used).
dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_

# Argument names that match this expression will be ignored. Default to name
# with leading underscore.
ignored-argument-names=_.*|^ignored_|^unused_

# Tells whether we should check for unused import in __init__ files.
init-import=no

# List of qualified module names which can have objects that can redefine
# builtins.
redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io


[FORMAT]

# Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
expected-line-ending-format=

# Regexp for a line that is allowed to be longer than the limit.
ignore-long-lines=^\s*(# )?<?https?://\S+>?$

# Number of spaces of indent required inside a hanging or continued line.
indent-after-paren=4

# String used as indentation unit. This is usually "    " (4 spaces) or "\t" (1
# tab).
indent-string='    '

# Maximum number of characters on a single line.
max-line-length=120

# Maximum number of lines in a module.
max-module-lines=1000

# List of optional constructs for which whitespace checking is disabled. `dict-
# separator` is used to allow tabulation in dicts, etc.: {1 : 1,\n222: 2}.
# `trailing-comma` allows a space between comma and closing bracket: (a, ).
# `empty-line` allows space-only lines.
no-space-check=trailing-comma,
               dict-separator

# Allow the body of a class to be on the same line as the declaration if body
# contains single statement.
single-line-class-stmt=no

# Allow the body of an if to be on the same line as the test if there is no
# else.
single-line-if-stmt=no


[SIMILARITIES]

# Ignore comments when computing similarities.
ignore-comments=yes

# Ignore docstrings when computing similarities.
ignore-docstrings=yes

# Ignore imports when computing similarities.
ignore-imports=no

# Minimum lines number of a similarity.
min-similarity-lines=4


[BASIC]

# Naming style matching correct argument names.
argument-naming-style=snake_case

# Regular expression matching correct argument names. Overrides argument-
# naming-style.
argument-rgx=[a-z_][a-z0-9_]{0,30}$

# Naming style matching correct attribute names.
attr-naming-style=snake_case

# Regular expression matching correct attribute names. Overrides attr-naming-
# style.
#attr-rgx=

# Bad variable names which should always be refused, separated by a comma.
bad-names=

# Naming style matching correct class attribute names.
class-attribute-naming-style=any

# Regular expression matching correct class attribute names. Overrides class-
# attribute-naming-style.
#class-attribute-rgx=

# Naming style matching correct class names.
class-naming-style=PascalCase

# Regular expression matching correct class names. Overrides class-naming-
# style.
#class-rgx=

# Naming style matching correct constant names.
const-naming-style=UPPER_CASE

# Regular expression matching correct constant names. Overrides const-naming-
# style.
#const-rgx=

# Minimum line length for functions/classes that require docstrings, shorter
# ones are exempt.
docstring-min-length=-1

# Naming style matching correct function names.
function-naming-style=snake_case

# Regular expression matching correct function names. Overrides function-
# naming-style.
#function-rgx=

# Good variable names which should always be accepted, separated by a comma.
good-names=i,
           j,
           k,
           x,
           ex,
           Run,
           _

# Include a hint for the correct naming format with invalid-name.
include-naming-hint=no

# Naming style matching correct inline iteration names.
inlinevar-naming-style=any

# Regular expression matching correct inline iteration names. Overrides
# inlinevar-naming-style.
#inlinevar-rgx=

# Naming style matching correct method names.
method-naming-style=snake_case

# Regular expression matching correct method names. Overrides method-naming-
# style.
#method-rgx=

# Naming style matching correct module names.
module-naming-style=snake_case

# Regular expression matching correct module names. Overrides module-naming-
# style.
#module-rgx=

# Colon-delimited sets of names that determine each other's naming style when
# the name regexes allow several styles.
name-group=

# Regular expression which should only match function or class names that do
# not require a docstring.
no-docstring-rgx=^_

# List of decorators that produce properties, such as abc.abstractproperty. Add
# to this list to register other decorators that produce valid properties.
# These decorators are taken in consideration only for invalid-name.
property-classes=abc.abstractproperty

# Naming style matching correct variable names.
variable-naming-style=snake_case

# Regular expression matching correct variable names. Overrides variable-
# naming-style.
variable-rgx=[a-z_][a-z0-9_]{0,30}$


[STRING]

# This flag controls whether the implicit-str-concat-in-sequence should
# generate a warning on implicit string concatenation in sequences defined over
# several lines.
check-str-concat-over-line-jumps=no


[IMPORTS]

# Allow wildcard imports from modules that define __all__.
allow-wildcard-with-all=no

# Analyse import fallback blocks. This can be used to support both Python 2 and
# 3 compatible code, which means that the block might have code that exists
# only in one or another interpreter, leading to false positives when analysed.
analyse-fallback-blocks=no

# Deprecated modules which should not be used, separated by a comma.
deprecated-modules=optparse,tkinter.tix

# Create a graph of external dependencies in the given file (report RP0402 must
# not be disabled).
ext-import-graph=

# Create a graph of every (i.e. internal and external) dependencies in the
# given file (report RP0402 must not be disabled).
import-graph=

# Create a graph of internal dependencies in the given file (report RP0402 must
# not be disabled).
int-import-graph=

# Force import order to recognize a module as part of the standard
# compatibility libraries.
known-standard-library=

# Force import order to recognize a module as part of a third party library.
known-third-party=enchant


[CLASSES]

# List of method names used to declare (i.e. assign) instance attributes.
defining-attr-methods=__init__,
                      __new__,
                      setUp

# List of member names, which should be excluded from the protected access
# warning.
exclude-protected=_asdict,
                  _fields,
                  _replace,
                  _source,
                  _make

# List of valid names for the first argument in a class method.
valid-classmethod-first-arg=cls

# List of valid names for the first argument in a metaclass class method.
valid-metaclass-classmethod-first-arg=cls


[DESIGN]

# Maximum number of arguments for function / method.
max-args=5

# Maximum number of attributes for a class (see R0902).
max-attributes=7

# Maximum number of boolean expressions in an if statement.
max-bool-expr=5

# Maximum number of branch for function / method body.
max-branches=12

# Maximum number of locals for function / method body.
max-locals=15

# Maximum number of parents for a class (see R0901).
max-parents=15

# Maximum number of public methods for a class (see R0904).
max-public-methods=20

# Maximum number of return / yield for function / method body.
max-returns=6

# Maximum number of statements in function / method body.
max-statements=50

# Minimum number of public methods for a class (see R0903).
min-public-methods=2


[EXCEPTIONS]

# Exceptions that will emit a warning when being caught. Defaults to
# "BaseException, Exception".
overgeneral-exceptions=BaseException,
                       Exception
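To make a few of these settings concrete, the snippet below is an illustrative sketch (not part of the repository): short snake_case names satisfy `argument-rgx`/`variable-rgx`, an argument matching `ignored-argument-names` is not reported as unused, and catching `Exception` directly would be flagged because of `overgeneral-exceptions`.

```python
# Illustrative only: how the pylint settings above apply to ordinary code.
def load_text(path, _unused_flag=None):  # `_unused_flag` matches ignored-argument-names
    """Short snake_case names such as `path` satisfy argument-rgx."""
    try:
        with open(path, encoding="utf-8") as handle:
            return handle.read()
    except OSError:  # catching a specific exception is fine
        return None
    # except Exception:  # would be flagged (broad-except) via overgeneral-exceptions
```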
@@ -9,13 +9,13 @@ version: 2
build:
  os: ubuntu-22.04
  tools:
-    python: "3.11"
-
-# Optionally set the version of Python and requirements required to build your docs
-python:
-  install:
-    - requirements: docs/requirements.txt
-    - requirements: requirements.txt
+    python: "3.12"
+  commands:
+    - asdf plugin add uv
+    - asdf install uv latest
+    - asdf global uv latest
+    - uv sync --group docs
+    - uv run -m sphinx -T -b html -d docs/_build/doctrees -D language=en docs/source $READTHEDOCS_OUTPUT/html

# Build documentation in the docs/ directory with Sphinx
sphinx:
@@ -10,8 +10,8 @@ authors:
version: 1.4
doi: 10.5281/zenodo.6334862
license: "MPL-2.0"
-url: "https://www.coqui.ai"
-repository-code: "https://github.com/coqui-ai/TTS"
+url: "https://github.com/idiap/coqui-ai-TTS"
+repository-code: "https://github.com/idiap/coqui-ai-TTS"
keywords:
  - machine learning
  - deep learning
@@ -1,75 +0,0 @@
TTS code owners / governance system
==========================================

TTS is run under a governance system inspired (and partially copied from) by the `Mozilla module ownership system <https://www.mozilla.org/about/governance/policies/module-ownership/>`_. The project is roughly divided into modules, and each module has its owners, which are responsible for reviewing pull requests and deciding on technical direction for their modules. Module ownership authority is given to people who have worked extensively on areas of the project.

Module owners also have the authority of naming other module owners or appointing module peers, which are people with authority to review pull requests in that module. They can also sub-divide their module into sub-modules with their owners.

Module owners are not tyrants. They are chartered to make decisions with input from the community and in the best interest of the community. Module owners are not required to make code changes or additions solely because the community wants them to do so. (Like anyone else, the module owners may write code because they want to, because their employers want them to, because the community wants them to, or for some other reason.) Module owners do need to pay attention to patches submitted to that module. However “pay attention” does not mean agreeing to every patch. Some patches may not make sense for the WebThings project; some may be poorly implemented. Module owners have the authority to decline a patch; this is a necessary part of the role. We ask the module owners to describe in the relevant issue their reasons for wanting changes to a patch, for declining it altogether, or for postponing review for some period. We don’t ask or expect them to rewrite patches to make them acceptable. Similarly, module owners may need to delay review of a promising patch due to an upcoming deadline. For example, a patch may be of interest, but not for the next milestone. In such a case it may make sense for the module owner to postpone review of a patch until after matters needed for a milestone have been finalized. Again, we expect this to be described in the relevant issue. And of course, it shouldn’t go on very often or for very long or escalation and review is likely.

The work of the various module owners and peers is overseen by the global owners, which are responsible for making final decisions in case there's conflict between owners as well as set the direction for the project as a whole.

This file describes module owners who are active on the project and which parts of the code they have expertise on (and interest in). If you're making changes to the code and are wondering who's an appropriate person to talk to, this list will tell you who to ping.

There's overlap in the areas of expertise of each owner, and in particular when looking at which files are covered by each area, there is a lot of overlap. Don't worry about getting it exactly right when requesting review, any code owner will be happy to redirect the request to a more appropriate person.

Global owners
----------------

These are people who have worked on the project extensively and are familiar with all or most parts of it. Their expertise and review guidance is trusted by other code owners to cover their own areas of expertise. In case of conflicting opinions from other owners, global owners will make a final decision.

- Eren Gölge (@erogol)
- Reuben Morais (@reuben)

Training, feeding
-----------------

- Eren Gölge (@erogol)

Model exporting
---------------

- Eren Gölge (@erogol)

Multi-Speaker TTS
-----------------

- Eren Gölge (@erogol)
- Edresson Casanova (@edresson)

TTS
---

- Eren Gölge (@erogol)

Vocoders
--------

- Eren Gölge (@erogol)

Speaker Encoder
---------------

- Eren Gölge (@erogol)

Testing & CI
------------

- Eren Gölge (@erogol)
- Reuben Morais (@reuben)

Python bindings
---------------

- Eren Gölge (@erogol)
- Reuben Morais (@reuben)

Documentation
-------------

- Eren Gölge (@erogol)

Third party bindings
--------------------

Owned by the author.
@@ -2,7 +2,7 @@

Welcome to the 🐸TTS!

-This repository is governed by [the Contributor Covenant Code of Conduct](https://github.com/coqui-ai/TTS/blob/main/CODE_OF_CONDUCT.md).
+This repository is governed by [the Contributor Covenant Code of Conduct](https://github.com/idiap/coqui-ai-TTS/blob/main/CODE_OF_CONDUCT.md).

## Where to start.
We welcome everyone who likes to contribute to 🐸TTS.
@@ -11,30 +11,25 @@ You can contribute not only with code but with bug reports, comments, questions,

If you like to contribute code, squash a bug but if you don't know where to start, here are some pointers.

-- [Development Road Map](https://github.com/coqui-ai/TTS/issues/378)
-
-    You can pick something out of our road map. We keep the progess of the project in this simple issue thread. It has new model proposals or developmental updates etc.
-
-- [Github Issues Tracker](https://github.com/coqui-ai/TTS/issues)
+- [Github Issues Tracker](https://github.com/idiap/coqui-ai-TTS/issues)

    This is a place to find feature requests, bugs.

-    Issues with the ```good first issue``` tag are good place for beginners to take on.
-
-- ✨**PR**✨ [pages](https://github.com/coqui-ai/TTS/pulls) with the ```🚀new version``` tag.
-
-    We list all the target improvements for the next version. You can pick one of them and start contributing.
+    Issues with the ```good first issue``` tag are good place for beginners to
+    take on. Issues tagged with `help wanted` are suited for more experienced
+    outside contributors.

- Also feel free to suggest new features, ideas and models. We're always open for new things.

-## Call for sharing language models
+## Call for sharing pretrained models
If possible, please consider sharing your pre-trained models in any language (if the licences allow for you to do so). We will include them in our model catalogue for public use and give the proper attribution, whether it be your name, company, website or any other source specified.

This model can be shared in two ways:
1. Share the model files with us and we serve them with the next 🐸 TTS release.
2. Upload your models on GDrive and share the link.

-Models are served under `.models.json` file and any model is available under TTS CLI or Server end points.
+Models are served under `.models.json` file and any model is available under TTS
+CLI and Python API end points.
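In practice, a model registered in `.models.json` becomes loadable by its catalogue name through the `tts` CLI and the Python API. A minimal sketch (the model name below is just one example from the public catalogue):

```python
from TTS.api import TTS

# Every model served through .models.json shows up in the catalogue listing...
print(TTS().list_models())

# ...and can then be loaded directly by its catalogue name.
tts = TTS("tts_models/en/ljspeech/tacotron2-DDC")
tts.tts_to_file(text="A model served from the catalogue.", file_path="catalogue_example.wav")
```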
Either way you choose, please make sure you send the models [here](https://github.com/coqui-ai/TTS/discussions/930).

@@ -44,29 +39,37 @@ If you have a new feature, a model to implement, or a bug to squash, go ahead an
Please use the following steps to send a ✨**PR**✨.
Let us know if you encounter a problem along the way.

-The following steps are tested on an Ubuntu system.
+The following steps are tested on an Ubuntu system and require
+[uv](https://docs.astral.sh/uv/) for virtual environment management. Choose your
+preferred [installation
+method](https://docs.astral.sh/uv/getting-started/installation/), e.g. the
+standalone installer:
+
+```bash
+curl -LsSf https://astral.sh/uv/install.sh | sh
+```

-1. Fork 🐸TTS[https://github.com/coqui-ai/TTS] by clicking the fork button at the top right corner of the project page.
+1. Fork 🐸TTS[https://github.com/idiap/coqui-ai-TTS] by clicking the fork button at the top right corner of the project page.

2. Clone 🐸TTS and add the main repo as a new remote named ```upstream```.

    ```bash
-    $ git clone git@github.com:<your Github name>/TTS.git
-    $ cd TTS
-    $ git remote add upstream https://github.com/coqui-ai/TTS.git
+    git clone git@github.com:<your Github name>/coqui-ai-TTS.git
+    cd coqui-ai-TTS
+    git remote add upstream https://github.com/idiap/coqui-ai-TTS.git
    ```

3. Install 🐸TTS for development.

    ```bash
-    $ make system-deps  # intended to be used on Ubuntu (Debian). Let us know if you have a different OS.
-    $ make install
+    make system-deps  # intended to be used on Ubuntu (Debian). Let us know if you have a different OS.
+    make install_dev
    ```

4. Create a new branch with an informative name for your goal.

    ```bash
-    $ git checkout -b an_informative_name_for_my_branch
+    git checkout -b an_informative_name_for_my_branch
    ```

5. Implement your changes on your new branch.

@@ -75,39 +78,42 @@ The following steps are tested on an Ubuntu system.

7. Add your tests to our test suite under ```tests``` folder. It is important to show that your code works, edge cases are considered, and inform others about the intended use.
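    If you are unsure what such a test can look like, here is a minimal, self-contained sketch in the `unittest` style the suite runs under nose2; the helper below is a stand-in for whatever your PR adds, not part of 🐸TTS:

    ```python
    # Hypothetical example of a self-contained test for the tests/ folder.
    import unittest


    def normalize_whitespace(text: str) -> str:
        """Stand-in for the helper your PR actually adds."""
        return " ".join(text.split())


    class TestNormalizeWhitespace(unittest.TestCase):
        def test_collapses_runs_of_spaces(self):
            self.assertEqual(normalize_whitespace("hello   world"), "hello world")

        def test_empty_input_is_an_edge_case_worth_covering(self):
            self.assertEqual(normalize_whitespace(""), "")


    if __name__ == "__main__":
        unittest.main()
    ```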
-8. Run the tests to see how your updates work with the rest of the project. You can repeat this step multiple times as you implement your changes to make sure you are on the right direction.
+8. Run the tests to see how your updates work with the rest of the project. You
+   can repeat this step multiple times as you implement your changes to make
+   sure you are on the right direction. **NB: running all tests takes a long time,
+   it is better to leave this to the CI.**

    ```bash
-    $ make test  # stop at the first error
-    $ make test_all  # run all the tests, report all the errors
+    uv run make test  # stop at the first error
+    uv run make test_all  # run all the tests, report all the errors
    ```

-9. Format your code. We use ```black``` for code and ```isort``` for ```import``` formatting.
+9. Format your code. We use ```black``` for code formatting.

    ```bash
-    $ make style
+    make style
    ```

-10. Run the linter and correct the issues raised. We use ```pylint``` for linting. It helps to enforce a coding standard, offers simple refactoring suggestions.
+10. Run the linter and correct the issues raised. We use ```ruff``` for linting. It helps to enforce a coding standard, offers simple refactoring suggestions.

    ```bash
-    $ make lint
+    make lint
    ```

11. When things are good, add new files and commit your changes.

    ```bash
-    $ git add my_file1.py my_file2.py ...
-    $ git commit
+    git add my_file1.py my_file2.py ...
+    git commit
    ```

    It's a good practice to regularly sync your local copy of the project with the upstream code to keep up with the recent updates.

    ```bash
-    $ git fetch upstream
-    $ git rebase upstream/master
+    git fetch upstream
+    git rebase upstream/main
    # or for the development version
-    $ git rebase upstream/dev
+    git rebase upstream/dev
    ```

12. Send a PR to ```dev``` branch.

@@ -115,7 +121,7 @@ The following steps are tested on an Ubuntu system.
    Push your branch to your fork.

    ```bash
-    $ git push -u origin an_informative_name_for_my_branch
+    git push -u origin an_informative_name_for_my_branch
    ```

    Then go to your fork's Github page and click on 'Pull request' to send your ✨**PR**✨.

@@ -124,7 +130,8 @@ The following steps are tested on an Ubuntu system.

13. Let's discuss until it is perfect. 💪

-    We might ask you for certain changes that would appear in the ✨**PR**✨'s page under 🐸TTS[https://github.com/coqui-ai/TTS/pulls].
+    We might ask you for certain changes that would appear in the
+    [Github ✨**PR**✨'s page](https://github.com/idiap/coqui-ai-TTS/pulls).

14. Once things look perfect, We merge it to the ```dev``` branch and make it ready for the next version.

@@ -132,14 +139,14 @@ The following steps are tested on an Ubuntu system.

If you prefer working within a Docker container as your development environment, you can do the following:

-1. Fork 🐸TTS[https://github.com/coqui-ai/TTS] by clicking the fork button at the top right corner of the project page.
+1. Fork the 🐸TTS [Github repository](https://github.com/idiap/coqui-ai-TTS) by clicking the fork button at the top right corner of the page.

-2. Clone 🐸TTS and add the main repo as a new remote named ```upsteam```.
+2. Clone 🐸TTS and add the main repo as a new remote named ```upstream```.

    ```bash
-    $ git clone git@github.com:<your Github name>/TTS.git
-    $ cd TTS
-    $ git remote add upstream https://github.com/coqui-ai/TTS.git
+    git clone git@github.com:<your Github name>/coqui-ai-TTS.git
+    cd coqui-ai-TTS
+    git remote add upstream https://github.com/idiap/coqui-ai-TTS.git
    ```

3. Build the Docker Image as your development environment (it installs all of the dependencies for you):
Dockerfile | 18

@@ -1,8 +1,20 @@
ARG BASE=nvidia/cuda:11.8.0-base-ubuntu22.04
FROM ${BASE}

-RUN apt-get update && apt-get upgrade -y
-RUN apt-get install -y --no-install-recommends gcc g++ make python3 python3-dev python3-pip python3-venv python3-wheel espeak-ng libsndfile1-dev && rm -rf /var/lib/apt/lists/*
+RUN apt-get update && \
+    apt-get upgrade -y
+RUN apt-get install -y --no-install-recommends \
+    gcc g++ make python3 python3-dev python3-pip \
+    python3-venv python3-wheel espeak-ng \
+    libsndfile1-dev libc-dev curl && \
+    rm -rf /var/lib/apt/lists/*
+
+# Install Rust compiler (to build sudachipy for Mac)
+RUN curl --proto '=https' --tlsv1.2 -sSf "https://sh.rustup.rs" | sh -s -- -y
+ENV PATH="/root/.cargo/bin:${PATH}"
+
+RUN pip3 install -U pip setuptools wheel
+RUN pip3 install -U "spacy[ja]<3.8"
RUN pip3 install llvmlite --ignore-installed

# Install Dependencies:
@@ -13,7 +25,7 @@ RUN rm -rf /root/.cache/pip
WORKDIR /root
COPY . /root

-RUN make install
+RUN pip3 install -e .[all]

ENTRYPOINT ["tts"]
CMD ["--help"]
MANIFEST.in | 15

@@ -1,15 +0,0 @@
include README.md
include LICENSE.txt
include requirements.*.txt
include *.cff
include requirements.txt
include TTS/VERSION
recursive-include TTS *.json
recursive-include TTS *.html
recursive-include TTS *.png
recursive-include TTS *.md
recursive-include TTS *.py
recursive-include TTS *.pyx
recursive-include images *.png
recursive-exclude tests *
prune tests*
Makefile | 64

@@ -1,5 +1,5 @@
.DEFAULT_GOAL := help
-.PHONY: test system-deps dev-deps deps style lint install help docs
+.PHONY: test system-deps style lint install install_dev help docs

help:
	@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}'

@@ -11,68 +11,60 @@ test_all: ## run tests and don't stop on an error.
	./run_bash_tests.sh

test: ## run tests.
-	nose2 -F -v -B --with-coverage --coverage TTS tests
+	coverage run -m nose2 -F -v -B tests

test_vocoder: ## run vocoder tests.
-	nose2 -F -v -B --with-coverage --coverage TTS tests.vocoder_tests
+	coverage run -m nose2 -F -v -B tests.vocoder_tests

test_tts: ## run tts tests.
-	nose2 -F -v -B --with-coverage --coverage TTS tests.tts_tests
+	coverage run -m nose2 -F -v -B tests.tts_tests

test_tts2: ## run tts tests.
-	nose2 -F -v -B --with-coverage --coverage TTS tests.tts_tests2
+	coverage run -m nose2 -F -v -B tests.tts_tests2

test_xtts:
-	nose2 -F -v -B --with-coverage --coverage TTS tests.xtts_tests
+	coverage run -m nose2 -F -v -B tests.xtts_tests

test_aux: ## run aux tests.
-	nose2 -F -v -B --with-coverage --coverage TTS tests.aux_tests
+	coverage run -m nose2 -F -v -B tests.aux_tests
	./run_bash_tests.sh

-test_zoo: ## run zoo tests.
-	nose2 -F -v -B --with-coverage --coverage TTS tests.zoo_tests
+test_zoo0: ## run zoo tests.
+	coverage run -m nose2 -F -v -B tests.zoo_tests.test_models.test_models_offset_0_step_3 \
+		tests.zoo_tests.test_models.test_voice_conversion
+test_zoo1: ## run zoo tests.
+	coverage run -m nose2 -F -v -B tests.zoo_tests.test_models.test_models_offset_1_step_3
+test_zoo2: ## run zoo tests.
+	coverage run -m nose2 -F -v -B tests.zoo_tests.test_models.test_models_offset_2_step_3

inference_tests: ## run inference tests.
-	nose2 -F -v -B --with-coverage --coverage TTS tests.inference_tests
+	coverage run -m nose2 -F -v -B tests.inference_tests

data_tests: ## run data tests.
-	nose2 -F -v -B --with-coverage --coverage TTS tests.data_tests
+	coverage run -m nose2 -F -v -B tests.data_tests

test_text: ## run text tests.
-	nose2 -F -v -B --with-coverage --coverage TTS tests.text_tests
+	coverage run -m nose2 -F -v -B tests.text_tests

test_failed: ## only run tests failed the last time.
-	nose2 -F -v -B --with-coverage --coverage TTS tests
+	coverage run -m nose2 -F -v -B tests

style: ## update code style.
-	black ${target_dirs}
-	isort ${target_dirs}
+	uv run --only-dev black ${target_dirs}

-lint: ## run pylint linter.
-	pylint ${target_dirs}
-	black ${target_dirs} --check
-	isort ${target_dirs} --check-only
+lint: ## run linters.
+	uv run --only-dev ruff check ${target_dirs}
+	uv run --only-dev black ${target_dirs} --check

system-deps: ## install linux system deps
	sudo apt-get install -y libsndfile1-dev

-dev-deps: ## install development deps
-	pip install -r requirements.dev.txt
-
-doc-deps: ## install docs dependencies
-	pip install -r docs/requirements.txt
-
-build-docs: ## build the docs
-	cd docs && make clean && make build
-
-hub-deps: ## install deps for torch hub use
-	pip install -r requirements.hub.txt
-
-deps: ## install 🐸 requirements.
-	pip install -r requirements.txt
-
-install: ## install 🐸 TTS for development.
-	pip install -e .[all]
+install: ## install 🐸 TTS
+	uv sync --all-extras
+
+install_dev: ## install 🐸 TTS for development.
+	uv sync --all-extras
+	uv run pre-commit install

docs: ## build the docs
-	$(MAKE) -C docs clean && $(MAKE) -C docs html
+	uv run --group docs $(MAKE) -C docs clean && uv run --group docs $(MAKE) -C docs html
README.md | 434

@@ -1,177 +1,173 @@
-## 🐸Coqui.ai News
-- 📣 ⓍTTSv2 is here with 16 languages and better performance across the board.
-- 📣 ⓍTTS fine-tuning code is out. Check the [example recipes](https://github.com/coqui-ai/TTS/tree/dev/recipes/ljspeech).
-- 📣 ⓍTTS can now stream with <200ms latency.
-- 📣 ⓍTTS, our production TTS model that can speak 13 languages, is released [Blog Post](https://coqui.ai/blog/tts/open_xtts), [Demo](https://huggingface.co/spaces/coqui/xtts), [Docs](https://tts.readthedocs.io/en/dev/models/xtts.html)
-- 📣 [🐶Bark](https://github.com/suno-ai/bark) is now available for inference with unconstrained voice cloning. [Docs](https://tts.readthedocs.io/en/dev/models/bark.html)
-- 📣 You can use [~1100 Fairseq models](https://github.com/facebookresearch/fairseq/tree/main/examples/mms) with 🐸TTS.
-- 📣 🐸TTS now supports 🐢Tortoise with faster inference. [Docs](https://tts.readthedocs.io/en/dev/models/tortoise.html)
-
-<div align="center">
-<img src="https://static.scarf.sh/a.png?x-pxid=cf317fe7-2188-4721-bc01-124bb5d5dbb2" />
-
-## <img src="https://raw.githubusercontent.com/coqui-ai/TTS/main/images/coqui-log-green-TTS.png" height="56"/>
+# <img src="https://raw.githubusercontent.com/idiap/coqui-ai-TTS/main/images/coqui-log-green-TTS.png" height="56"/>

-**🐸TTS is a library for advanced Text-to-Speech generation.**
+**🐸 Coqui TTS is a library for advanced Text-to-Speech generation.**

🚀 Pretrained models in +1100 languages.

🛠️ Tools for training new models and fine-tuning existing models in any language.

📚 Utilities for dataset analysis and curation.
-______________________________________________________________________

[](https://discord.gg/5eXr5seRrv)
+[](https://pypi.org/project/coqui-tts/)
[](https://opensource.org/licenses/MPL-2.0)
-[](https://badge.fury.io/py/TTS)
-[](https://github.com/coqui-ai/TTS/blob/master/CODE_OF_CONDUCT.md)
-[](https://pepy.tech/project/tts)
+[](https://pypi.org/project/coqui-tts/)
+[](https://pepy.tech/project/coqui-tts)
[](https://zenodo.org/badge/latestdoi/265612440)
+[](https://github.com/idiap/coqui-ai-TTS/actions/workflows/tests.yml)
-
-
-
-
-
-
-
-
-
-
-
-
-[](https://tts.readthedocs.io/en/latest/)
+[](https://github.com/idiap/coqui-ai-TTS/actions/workflows/docker.yaml)
+[](https://github.com/idiap/coqui-ai-TTS/actions/workflows/style_check.yml)
+[](https://coqui-tts.readthedocs.io/en/latest/)

</div>

-______________________________________________________________________
+## 📣 News
+- **Fork of the [original, unmaintained repository](https://github.com/coqui-ai/TTS). New PyPI package: [coqui-tts](https://pypi.org/project/coqui-tts)**
+- 0.25.0: [OpenVoice](https://github.com/myshell-ai/OpenVoice) models now available for voice conversion.
+- 0.24.2: Prebuilt wheels are now also published for Mac and Windows (in addition to Linux as before) for easier installation across platforms.
+- 0.20.0: XTTSv2 is here with 17 languages and better performance across the board. XTTS can stream with <200ms latency.
+- 0.19.0: XTTS fine-tuning code is out. Check the [example recipes](https://github.com/idiap/coqui-ai-TTS/tree/dev/recipes/ljspeech).
+- 0.14.1: You can use [Fairseq models in ~1100 languages](https://github.com/facebookresearch/fairseq/tree/main/examples/mms) with 🐸TTS.

## 💬 Where to ask questions
Please use our dedicated channels for questions and discussion. Help is much more valuable if it's shared publicly so that more people can benefit from it.

| Type | Platforms |
-| ------------------------------- | --------------------------------------- |
-| 🚨 **Bug Reports** | [GitHub Issue Tracker] |
-| 🎁 **Feature Requests & Ideas** | [GitHub Issue Tracker] |
+| -------------------------------------------- | ----------------------------------- |
+| 🚨 **Bug Reports, Feature Requests & Ideas** | [GitHub Issue Tracker] |
| 👩‍💻 **Usage Questions** | [GitHub Discussions] |
| 🗯 **General Discussion** | [GitHub Discussions] or [Discord] |

-[github issue tracker]: https://github.com/coqui-ai/tts/issues
-[github discussions]: https://github.com/coqui-ai/TTS/discussions
+[github issue tracker]: https://github.com/idiap/coqui-ai-TTS/issues
+[github discussions]: https://github.com/idiap/coqui-ai-TTS/discussions
[discord]: https://discord.gg/5eXr5seRrv
[Tutorials and Examples]: https://github.com/coqui-ai/TTS/wiki/TTS-Notebooks-and-Tutorials

+The [issues](https://github.com/coqui-ai/TTS/issues) and
+[discussions](https://github.com/coqui-ai/TTS/discussions) in the original
+repository are also still a useful source of information.

## 🔗 Links and Resources
| Type | Links |
| ------------------------------- | --------------------------------------- |
-| 💼 **Documentation** | [ReadTheDocs](https://tts.readthedocs.io/en/latest/)
-| 💾 **Installation** | [TTS/README.md](https://github.com/coqui-ai/TTS/tree/dev#installation)|
-| 👩‍💻 **Contributing** | [CONTRIBUTING.md](https://github.com/coqui-ai/TTS/blob/main/CONTRIBUTING.md)|
-| 📌 **Road Map** | [Main Development Plans](https://github.com/coqui-ai/TTS/issues/378)
-| 🚀 **Released Models** | [TTS Releases](https://github.com/coqui-ai/TTS/releases) and [Experimental Models](https://github.com/coqui-ai/TTS/wiki/Experimental-Released-Models)|
-| 📰 **Papers** | [TTS Papers](https://github.com/erogol/TTS-papers)|
+| 💼 **Documentation** | [ReadTheDocs](https://coqui-tts.readthedocs.io/en/latest/)
+| 💾 **Installation** | [TTS/README.md](https://github.com/idiap/coqui-ai-TTS/tree/dev#installation)|
+| 👩‍💻 **Contributing** | [CONTRIBUTING.md](https://github.com/idiap/coqui-ai-TTS/blob/main/CONTRIBUTING.md)|
+| 🚀 **Released Models** | [Standard models](https://github.com/idiap/coqui-ai-TTS/blob/dev/TTS/.models.json) and [Fairseq models in ~1100 languages](https://github.com/idiap/coqui-ai-TTS#example-text-to-speech-using-fairseq-models-in-1100-languages-)|

-## 🥇 TTS Performance
-<p align="center"><img src="https://raw.githubusercontent.com/coqui-ai/TTS/main/images/TTS-performance.png" width="800" /></p>
-
-Underlined "TTS*" and "Judy*" are **internal** 🐸TTS models that are not released open-source. They are here to show the potential. Models prefixed with a dot (.Jofish .Abe and .Janice) are real human voices.

## Features
-- High-performance Deep Learning models for Text2Speech tasks.
-- Text2Spec models (Tacotron, Tacotron2, Glow-TTS, SpeedySpeech).
-- Speaker Encoder to compute speaker embeddings efficiently.
-- Vocoder models (MelGAN, Multiband-MelGAN, GAN-TTS, ParallelWaveGAN, WaveGrad, WaveRNN)
-- Fast and efficient model training.
-- Detailed training logs on the terminal and Tensorboard.
-- Support for Multi-speaker TTS.
-- Efficient, flexible, lightweight but feature complete `Trainer API`.
+- High-performance text-to-speech and voice conversion models, see list below.
+- Fast and efficient model training with detailed training logs on the terminal and Tensorboard.
+- Support for multi-speaker and multilingual TTS.
- Released and ready-to-use models.
-- Tools to curate Text2Speech datasets under```dataset_analysis```.
-- Utilities to use and test your models.
+- Tools to curate TTS datasets under ```dataset_analysis/```.
+- Command line and Python APIs to use and test your models.
- Modular (but not too much) code base enabling easy implementation of new ideas.

## Model Implementations
### Spectrogram models
-- Tacotron: [paper](https://arxiv.org/abs/1703.10135)
-- Tacotron2: [paper](https://arxiv.org/abs/1712.05884)
-- Glow-TTS: [paper](https://arxiv.org/abs/2005.11129)
-- Speedy-Speech: [paper](https://arxiv.org/abs/2008.03802)
-- Align-TTS: [paper](https://arxiv.org/abs/2003.01950)
-- FastPitch: [paper](https://arxiv.org/pdf/2006.06873.pdf)
-- FastSpeech: [paper](https://arxiv.org/abs/1905.09263)
-- FastSpeech2: [paper](https://arxiv.org/abs/2006.04558)
-- SC-GlowTTS: [paper](https://arxiv.org/abs/2104.05557)
-- Capacitron: [paper](https://arxiv.org/abs/1906.03402)
-- OverFlow: [paper](https://arxiv.org/abs/2211.06892)
-- Neural HMM TTS: [paper](https://arxiv.org/abs/2108.13320)
-- Delightful TTS: [paper](https://arxiv.org/abs/2110.12612)
+- [Tacotron](https://arxiv.org/abs/1703.10135), [Tacotron2](https://arxiv.org/abs/1712.05884)
+- [Glow-TTS](https://arxiv.org/abs/2005.11129), [SC-GlowTTS](https://arxiv.org/abs/2104.05557)
+- [Speedy-Speech](https://arxiv.org/abs/2008.03802)
+- [Align-TTS](https://arxiv.org/abs/2003.01950)
+- [FastPitch](https://arxiv.org/pdf/2006.06873.pdf)
+- [FastSpeech](https://arxiv.org/abs/1905.09263), [FastSpeech2](https://arxiv.org/abs/2006.04558)
+- [Capacitron](https://arxiv.org/abs/1906.03402)
+- [OverFlow](https://arxiv.org/abs/2211.06892)
+- [Neural HMM TTS](https://arxiv.org/abs/2108.13320)
+- [Delightful TTS](https://arxiv.org/abs/2110.12612)

### End-to-End Models
-- ⓍTTS: [blog](https://coqui.ai/blog/tts/open_xtts)
-- VITS: [paper](https://arxiv.org/pdf/2106.06103)
-- 🐸 YourTTS: [paper](https://arxiv.org/abs/2112.02418)
-- 🐢 Tortoise: [orig. repo](https://github.com/neonbjb/tortoise-tts)
-- 🐶 Bark: [orig. repo](https://github.com/suno-ai/bark)
+- [XTTS](https://arxiv.org/abs/2406.04904)
+- [VITS](https://arxiv.org/pdf/2106.06103)
+- 🐸[YourTTS](https://arxiv.org/abs/2112.02418)
+- 🐢[Tortoise](https://github.com/neonbjb/tortoise-tts)
+- 🐶[Bark](https://github.com/suno-ai/bark)

-### Attention Methods
-- Guided Attention: [paper](https://arxiv.org/abs/1710.08969)
-- Forward Backward Decoding: [paper](https://arxiv.org/abs/1907.09006)
-- Graves Attention: [paper](https://arxiv.org/abs/1910.10288)
-- Double Decoder Consistency: [blog](https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency/)
-- Dynamic Convolutional Attention: [paper](https://arxiv.org/pdf/1910.10288.pdf)
-- Alignment Network: [paper](https://arxiv.org/abs/2108.10447)
-
-### Speaker Encoder
-- GE2E: [paper](https://arxiv.org/abs/1710.10467)
-- Angular Loss: [paper](https://arxiv.org/pdf/2003.11982.pdf)

### Vocoders
-- MelGAN: [paper](https://arxiv.org/abs/1910.06711)
-- MultiBandMelGAN: [paper](https://arxiv.org/abs/2005.05106)
-- ParallelWaveGAN: [paper](https://arxiv.org/abs/1910.11480)
-- GAN-TTS discriminators: [paper](https://arxiv.org/abs/1909.11646)
-- WaveRNN: [origin](https://github.com/fatchord/WaveRNN/)
-- WaveGrad: [paper](https://arxiv.org/abs/2009.00713)
-- HiFiGAN: [paper](https://arxiv.org/abs/2010.05646)
-- UnivNet: [paper](https://arxiv.org/abs/2106.07889)
+- [MelGAN](https://arxiv.org/abs/1910.06711)
+- [MultiBandMelGAN](https://arxiv.org/abs/2005.05106)
+- [ParallelWaveGAN](https://arxiv.org/abs/1910.11480)
+- [GAN-TTS discriminators](https://arxiv.org/abs/1909.11646)
+- [WaveRNN](https://github.com/fatchord/WaveRNN/)
+- [WaveGrad](https://arxiv.org/abs/2009.00713)
+- [HiFiGAN](https://arxiv.org/abs/2010.05646)
+- [UnivNet](https://arxiv.org/abs/2106.07889)

### Voice Conversion
-- FreeVC: [paper](https://arxiv.org/abs/2210.15418)
+- [FreeVC](https://arxiv.org/abs/2210.15418)
+- [OpenVoice](https://arxiv.org/abs/2312.01479)

+### Others
+- Attention methods: [Guided Attention](https://arxiv.org/abs/1710.08969),
+  [Forward Backward Decoding](https://arxiv.org/abs/1907.09006),
+  [Graves Attention](https://arxiv.org/abs/1910.10288),
+  [Double Decoder Consistency](https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency/),
+  [Dynamic Convolutional Attention](https://arxiv.org/pdf/1910.10288.pdf),
+  [Alignment Network](https://arxiv.org/abs/2108.10447)
+- Speaker encoders: [GE2E](https://arxiv.org/abs/1710.10467),
+  [Angular Loss](https://arxiv.org/pdf/2003.11982.pdf)

You can also help us implement more models.

+<!-- start installation -->
## Installation
-🐸TTS is tested on Ubuntu 18.04 with **python >= 3.9, < 3.12.**.
-
-If you are only interested in [synthesizing speech](https://tts.readthedocs.io/en/latest/inference.html) with the released 🐸TTS models, installing from PyPI is the easiest option.
+🐸TTS is tested on Ubuntu 24.04 with **python >= 3.9, < 3.13**, but should also
+work on Mac and Windows.
+
+If you are only interested in [synthesizing speech](https://coqui-tts.readthedocs.io/en/latest/inference.html) with the pretrained 🐸TTS models, installing from PyPI is the easiest option.

```bash
-pip install TTS
+pip install coqui-tts
```
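As a quick sanity check after installing from PyPI, the following minimal sketch synthesizes one sentence with a released model (the model name is taken from the CLI examples further below):

```python
from TTS.api import TTS

# Download (on first use) and run a released single-speaker English model.
tts = TTS("tts_models/en/ljspeech/tacotron2-DDC")
tts.tts_to_file(text="The installation works.", file_path="hello.wav")
```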
If you plan to code or train models, clone 🐸TTS and install it locally.

```bash
-git clone https://github.com/coqui-ai/TTS
-pip install -e .[all,dev,notebooks]  # Select the relevant extras
+git clone https://github.com/idiap/coqui-ai-TTS
+cd coqui-ai-TTS
+pip install -e .
```

-If you are on Ubuntu (Debian), you can also run following commands for installation.
+### Optional dependencies
+
+The following extras allow the installation of optional dependencies:
+
+| Name | Description |
+|------|-------------|
+| `all` | All optional dependencies |
+| `notebooks` | Dependencies only used in notebooks |
+| `server` | Dependencies to run the TTS server |
+| `bn` | Bangla G2P |
+| `ja` | Japanese G2P |
+| `ko` | Korean G2P |
+| `zh` | Chinese G2P |
+| `languages` | All language-specific dependencies |
+
+You can install extras with one of the following commands:

```bash
-$ make system-deps  # intended to be used on Ubuntu (Debian). Let us know if you have a different OS.
-$ make install
+pip install coqui-tts[server,ja]
+pip install -e .[server,ja]
```

-If you are on Windows, 👑@GuyPaddock wrote installation instructions [here](https://stackoverflow.com/questions/66726331/how-can-i-run-mozilla-tts-coqui-tts-training-with-cuda-on-a-windows-system).
+### Platforms
+
+If you are on Ubuntu (Debian), you can also run the following commands for installation.
+
+```bash
+make system-deps
+make install
+```
+
+<!-- end installation -->

## Docker Image
-You can also try TTS without install with the docker image.
-Simply run the following command and you will be able to run TTS without installing it.
+You can also try out Coqui TTS without installation with the docker image.
+Simply run the following command and you will be able to run TTS:

```bash
docker run --rm -it -p 5002:5002 --entrypoint /bin/bash ghcr.io/coqui-ai/tts-cpu
@@ -180,14 +176,15 @@ python3 TTS/server/server.py --model_name tts_models/en/vctk/vits # To start a s
```

You can then enjoy the TTS server [here](http://[::1]:5002/)
-More details about the docker images (like GPU support) can be found [here](https://tts.readthedocs.io/en/latest/docker_images.html)
+More details about the docker images (like GPU support) can be found
+[here](https://coqui-tts.readthedocs.io/en/latest/docker_images.html)


## Synthesizing speech by 🐸TTS
+<!-- start inference -->
### 🐍 Python API

-#### Running a multi-speaker and multi-lingual model
+#### Multi-speaker and multi-lingual model

```python
import torch
@@ -199,44 +196,63 @@ device = "cuda" if torch.cuda.is_available() else "cpu"
# List available 🐸TTS models
print(TTS().list_models())

-# Init TTS
+# Initialize TTS
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)

+# List speakers
+print(tts.speakers)

# Run TTS
-# ❗ Since this model is multi-lingual voice cloning model, we must set the target speaker_wav and language
-# Text to speech list of amplitude values as output
-wav = tts.tts(text="Hello world!", speaker_wav="my/cloning/audio.wav", language="en")
-# Text to speech to a file
-tts.tts_to_file(text="Hello world!", speaker_wav="my/cloning/audio.wav", language="en", file_path="output.wav")
+# ❗ XTTS supports both, but many models allow only one of the `speaker` and
+# `speaker_wav` arguments
+
+# TTS with list of amplitude values as output, clone the voice from `speaker_wav`
+wav = tts.tts(
+    text="Hello world!",
+    speaker_wav="my/cloning/audio.wav",
+    language="en"
+)
+
+# TTS to a file, use a preset speaker
+tts.tts_to_file(
+    text="Hello world!",
+    speaker="Craig Gutsy",
+    language="en",
+    file_path="output.wav"
+)
```

-#### Running a single speaker model
+#### Single speaker model

```python
-# Init TTS with the target model name
-tts = TTS(model_name="tts_models/de/thorsten/tacotron2-DDC", progress_bar=False).to(device)
+# Initialize TTS with the target model name
+tts = TTS("tts_models/de/thorsten/tacotron2-DDC").to(device)

# Run TTS
tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path=OUTPUT_PATH)

-# Example voice cloning with YourTTS in English, French and Portuguese
-tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False).to(device)
-tts.tts_to_file("This is voice cloning.", speaker_wav="my/cloning/audio.wav", language="en", file_path="output.wav")
-tts.tts_to_file("C'est le clonage de la voix.", speaker_wav="my/cloning/audio.wav", language="fr-fr", file_path="output.wav")
-tts.tts_to_file("Isso é clonagem de voz.", speaker_wav="my/cloning/audio.wav", language="pt-br", file_path="output.wav")
```

-#### Example voice conversion
+#### Voice conversion (VC)

Converting the voice in `source_wav` to the voice of `target_wav`

```python
-tts = TTS(model_name="voice_conversion_models/multilingual/vctk/freevc24", progress_bar=False).to("cuda")
-tts.voice_conversion_to_file(source_wav="my/source.wav", target_wav="my/target.wav", file_path="output.wav")
+tts = TTS("voice_conversion_models/multilingual/vctk/freevc24").to("cuda")
+tts.voice_conversion_to_file(
+    source_wav="my/source.wav",
+    target_wav="my/target.wav",
+    file_path="output.wav"
+)
```

-#### Example voice cloning together with the voice conversion model.
-This way, you can clone voices by using any model in 🐸TTS.
+Other available voice conversion models:
+- `voice_conversion_models/multilingual/multi-dataset/openvoice_v1`
+- `voice_conversion_models/multilingual/multi-dataset/openvoice_v2`
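For instance, the OpenVoice checkpoints listed above can be driven through the same `voice_conversion_to_file()` call shown for FreeVC; a minimal sketch with placeholder paths:

```python
from TTS.api import TTS

# Voice conversion with one of the OpenVoice models listed above.
vc = TTS("voice_conversion_models/multilingual/multi-dataset/openvoice_v2").to("cuda")
vc.voice_conversion_to_file(
    source_wav="my/source.wav",
    target_wav="my/target.wav",
    file_path="openvoice_output.wav"
)
```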
|
#### Voice cloning by combining single speaker TTS model with the default VC model
|
||||||
|
|
||||||
|
This way, you can clone voices by using any model in 🐸TTS. The FreeVC model is
|
||||||
|
used for voice conversion after synthesizing speech.
|
||||||
|
|
||||||
```python
|
```python
|
||||||
|
|
||||||
|
@ -248,160 +264,140 @@ tts.tts_with_vc_to_file(
|
||||||
)
|
)
|
||||||
```
|
```
|
||||||
|
|
||||||
#### Example text to speech using **Fairseq models in ~1100 languages** 🤯.
|
#### TTS using Fairseq models in ~1100 languages 🤯
|
||||||
For Fairseq models, use the following name format: `tts_models/<lang-iso_code>/fairseq/vits`.
|
For Fairseq models, use the following name format: `tts_models/<lang-iso_code>/fairseq/vits`.
|
||||||
You can find the language ISO codes [here](https://dl.fbaipublicfiles.com/mms/tts/all-tts-languages.html)
|
You can find the language ISO codes [here](https://dl.fbaipublicfiles.com/mms/tts/all-tts-languages.html)
|
||||||
and learn about the Fairseq models [here](https://github.com/facebookresearch/fairseq/tree/main/examples/mms).
|
and learn about the Fairseq models [here](https://github.com/facebookresearch/fairseq/tree/main/examples/mms).
|
||||||
|
|
||||||
```python
|
```python
|
||||||
# TTS with on the fly voice conversion
|
# TTS with fairseq models
|
||||||
api = TTS("tts_models/deu/fairseq/vits")
|
api = TTS("tts_models/deu/fairseq/vits")
|
||||||
api.tts_with_vc_to_file(
|
api.tts_to_file(
|
||||||
"Wie sage ich auf Italienisch, dass ich dich liebe?",
|
"Wie sage ich auf Italienisch, dass ich dich liebe?",
|
||||||
speaker_wav="target/speaker.wav",
|
|
||||||
file_path="output.wav"
|
file_path="output.wav"
|
||||||
)
|
)
|
||||||
```
|
```
|
||||||
|
|
||||||
### Command-line `tts`
|
### Command-line interface `tts`
|
||||||
|
|
||||||
<!-- begin-tts-readme -->
|
<!-- begin-tts-readme -->
|
||||||
|
|
||||||
Synthesize speech on command line.
|
Synthesize speech on the command line.
|
||||||
|
|
||||||
You can either use your trained model or choose a model from the provided list.
|
You can either use your trained model or choose a model from the provided list.
|
||||||
|
|
||||||
If you don't specify any model, it uses the LJSpeech-based English model.
|
|
||||||
|
|
||||||
#### Single Speaker Models
|
|
||||||
|
|
||||||
- List provided models:
|
- List provided models:
|
||||||
|
|
||||||
```
|
```sh
|
||||||
$ tts --list_models
|
tts --list_models
|
||||||
```
|
```
|
||||||
|
|
||||||
- Get model info (for both tts_models and vocoder_models):
|
- Get model information. Use the names obtained from `--list_models`.
|
||||||
|
```sh
|
||||||
- Query by type/name:
|
tts --model_info_by_name "<model_type>/<language>/<dataset>/<model_name>"
|
||||||
The model_info_by_name option uses the name as it appears in the --list_models output.
|
|
||||||
```
|
|
||||||
$ tts --model_info_by_name "<model_type>/<language>/<dataset>/<model_name>"
|
|
||||||
```
|
```
|
||||||
For example:
|
For example:
|
||||||
```
|
```sh
|
||||||
$ tts --model_info_by_name tts_models/tr/common-voice/glow-tts
|
tts --model_info_by_name tts_models/tr/common-voice/glow-tts
|
||||||
$ tts --model_info_by_name vocoder_models/en/ljspeech/hifigan_v2
|
tts --model_info_by_name vocoder_models/en/ljspeech/hifigan_v2
|
||||||
```
|
|
||||||
- Query by type/idx:
|
|
||||||
The model_query_idx uses the corresponding idx from --list_models.
|
|
||||||
|
|
||||||
```
|
|
||||||
$ tts --model_info_by_idx "<model_type>/<model_query_idx>"
|
|
||||||
```
|
```
|
||||||
|
|
||||||
For example:
|
#### Single speaker models
|
||||||
|
|
||||||
```
|
- Run TTS with the default model (`tts_models/en/ljspeech/tacotron2-DDC`):
|
||||||
$ tts --model_info_by_idx tts_models/3
|
|
||||||
```
|
|
||||||
|
|
||||||
- Query model info by full name:
|
```sh
|
||||||
```
|
tts --text "Text for TTS" --out_path output/path/speech.wav
|
||||||
$ tts --model_info_by_name "<model_type>/<language>/<dataset>/<model_name>"
|
|
||||||
```
|
|
||||||
|
|
||||||
- Run TTS with default models:
|
|
||||||
|
|
||||||
```
|
|
||||||
$ tts --text "Text for TTS" --out_path output/path/speech.wav
|
|
||||||
```
|
```
|
||||||
|
|
||||||
- Run TTS and pipe out the generated TTS wav file data:
|
- Run TTS and pipe out the generated TTS wav file data:
|
||||||
|
|
||||||
```
|
```sh
|
||||||
$ tts --text "Text for TTS" --pipe_out --out_path output/path/speech.wav | aplay
|
tts --text "Text for TTS" --pipe_out --out_path output/path/speech.wav | aplay
|
||||||
```
|
```
|
||||||
|
|
||||||
- Run a TTS model with its default vocoder model:
|
- Run a TTS model with its default vocoder model:
|
||||||
|
|
||||||
```
|
```sh
|
||||||
$ tts --text "Text for TTS" --model_name "<model_type>/<language>/<dataset>/<model_name>" --out_path output/path/speech.wav
|
tts --text "Text for TTS" \
|
||||||
|
--model_name "<model_type>/<language>/<dataset>/<model_name>" \
|
||||||
|
--out_path output/path/speech.wav
|
||||||
```
|
```
|
||||||
|
|
||||||
For example:
|
For example:
|
||||||
|
|
||||||
```
|
```sh
|
||||||
$ tts --text "Text for TTS" --model_name "tts_models/en/ljspeech/glow-tts" --out_path output/path/speech.wav
|
tts --text "Text for TTS" \
|
||||||
|
--model_name "tts_models/en/ljspeech/glow-tts" \
|
||||||
|
--out_path output/path/speech.wav
|
||||||
```
|
```
|
||||||
|
|
||||||
- Run with specific TTS and vocoder models from the list:
|
- Run with specific TTS and vocoder models from the list. Note that not every vocoder is compatible with every TTS model.
|
||||||
|
|
||||||
```
|
```sh
|
||||||
$ tts --text "Text for TTS" --model_name "<model_type>/<language>/<dataset>/<model_name>" --vocoder_name "<model_type>/<language>/<dataset>/<model_name>" --out_path output/path/speech.wav
|
tts --text "Text for TTS" \
|
||||||
|
--model_name "<model_type>/<language>/<dataset>/<model_name>" \
|
||||||
|
--vocoder_name "<model_type>/<language>/<dataset>/<model_name>" \
|
||||||
|
--out_path output/path/speech.wav
|
||||||
```
|
```
|
||||||
|
|
||||||
For example:
|
For example:
|
||||||
|
|
||||||
```
|
```sh
|
||||||
$ tts --text "Text for TTS" --model_name "tts_models/en/ljspeech/glow-tts" --vocoder_name "vocoder_models/en/ljspeech/univnet" --out_path output/path/speech.wav
|
tts --text "Text for TTS" \
|
||||||
|
--model_name "tts_models/en/ljspeech/glow-tts" \
|
||||||
|
--vocoder_name "vocoder_models/en/ljspeech/univnet" \
|
||||||
|
--out_path output/path/speech.wav
|
||||||
```
|
```
|
||||||
|
|
||||||
- Run your own TTS model (Using Griffin-Lim Vocoder):
|
- Run your own TTS model (using Griffin-Lim Vocoder):
|
||||||
|
|
||||||
```
|
```sh
|
||||||
$ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav
|
tts --text "Text for TTS" \
|
||||||
|
--model_path path/to/model.pth \
|
||||||
|
--config_path path/to/config.json \
|
||||||
|
--out_path output/path/speech.wav
|
||||||
```
|
```
|
||||||
|
|
||||||
- Run your own TTS and Vocoder models:
|
- Run your own TTS and Vocoder models:
|
||||||
|
|
||||||
```
|
```sh
|
||||||
$ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav
|
tts --text "Text for TTS" \
|
||||||
--vocoder_path path/to/vocoder.pth --vocoder_config_path path/to/vocoder_config.json
|
--model_path path/to/model.pth \
|
||||||
|
--config_path path/to/config.json \
|
||||||
|
--out_path output/path/speech.wav \
|
||||||
|
--vocoder_path path/to/vocoder.pth \
|
||||||
|
--vocoder_config_path path/to/vocoder_config.json
|
||||||
```
|
```
|
||||||
|
|
||||||
#### Multi-speaker Models
|
#### Multi-speaker models
|
||||||
|
|
||||||
- List the available speakers and choose a <speaker_id> among them:
|
- List the available speakers and choose a `<speaker_id>` among them:
|
||||||
|
|
||||||
```
|
```sh
|
||||||
$ tts --model_name "<language>/<dataset>/<model_name>" --list_speaker_idxs
|
tts --model_name "<language>/<dataset>/<model_name>" --list_speaker_idxs
|
||||||
```
|
```
|
||||||
|
|
||||||
- Run the multi-speaker TTS model with the target speaker ID:
|
- Run the multi-speaker TTS model with the target speaker ID:
|
||||||
|
|
||||||
```
|
```sh
|
||||||
$ tts --text "Text for TTS." --out_path output/path/speech.wav --model_name "<language>/<dataset>/<model_name>" --speaker_idx <speaker_id>
|
tts --text "Text for TTS." --out_path output/path/speech.wav \
|
||||||
|
--model_name "<language>/<dataset>/<model_name>" --speaker_idx <speaker_id>
|
||||||
```
|
```
|
||||||
|
|
||||||
- Run your own multi-speaker TTS model:
|
- Run your own multi-speaker TTS model:
|
||||||
|
|
||||||
```
|
```sh
|
||||||
$ tts --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/model.pth --config_path path/to/config.json --speakers_file_path path/to/speaker.json --speaker_idx <speaker_id>
|
tts --text "Text for TTS" --out_path output/path/speech.wav \
|
||||||
|
--model_path path/to/model.pth --config_path path/to/config.json \
|
||||||
|
--speakers_file_path path/to/speaker.json --speaker_idx <speaker_id>
|
||||||
```
|
```
|
||||||
|
|
||||||
### Voice Conversion Models
|
#### Voice conversion models
|
||||||
|
|
||||||
```
|
```sh
|
||||||
$ tts --out_path output/path/speech.wav --model_name "<language>/<dataset>/<model_name>" --source_wav <path/to/speaker/wav> --target_wav <path/to/reference/wav>
|
tts --out_path output/path/speech.wav --model_name "<language>/<dataset>/<model_name>" \
|
||||||
|
--source_wav <path/to/speaker/wav> --target_wav <path/to/reference/wav>
|
||||||
```
|
```
|
||||||
|
|
||||||
<!-- end-tts-readme -->
|
<!-- end-tts-readme -->
|
||||||
|
|
||||||
## Directory Structure
|
|
||||||
```
|
|
||||||
|- notebooks/ (Jupyter Notebooks for model evaluation, parameter selection and data analysis.)
|
|
||||||
|- utils/ (common utilities.)
|
|
||||||
|- TTS
|
|
||||||
|- bin/ (folder for all the executables.)
|
|
||||||
|- train*.py (train your target model.)
|
|
||||||
|- ...
|
|
||||||
|- tts/ (text to speech models)
|
|
||||||
|- layers/ (model layer definitions)
|
|
||||||
|- models/ (model definitions)
|
|
||||||
|- utils/ (model specific utilities.)
|
|
||||||
|- speaker_encoder/ (Speaker Encoder models.)
|
|
||||||
|- (same)
|
|
||||||
|- vocoder/ (Vocoder models.)
|
|
||||||
|- (same)
|
|
||||||
```
|
|
||||||
|
|
TTS/.models.json
|
@ -5,11 +5,11 @@
|
||||||
"xtts_v2": {
|
"xtts_v2": {
|
||||||
"description": "XTTS-v2.0.3 by Coqui with 17 languages.",
|
"description": "XTTS-v2.0.3 by Coqui with 17 languages.",
|
||||||
"hf_url": [
|
"hf_url": [
|
||||||
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/model.pth",
|
"https://huggingface.co/coqui/XTTS-v2/resolve/main/model.pth",
|
||||||
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/config.json",
|
"https://huggingface.co/coqui/XTTS-v2/resolve/main/config.json",
|
||||||
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/vocab.json",
|
"https://huggingface.co/coqui/XTTS-v2/resolve/main/vocab.json",
|
||||||
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/hash.md5",
|
"https://huggingface.co/coqui/XTTS-v2/resolve/main/hash.md5",
|
||||||
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/speakers_xtts.pth"
|
"https://huggingface.co/coqui/XTTS-v2/resolve/main/speakers_xtts.pth"
|
||||||
],
|
],
|
||||||
"model_hash": "10f92b55c512af7a8d39d650547a15a7",
|
"model_hash": "10f92b55c512af7a8d39d650547a15a7",
|
||||||
"default_vocoder": null,
|
"default_vocoder": null,
|
||||||
|
@ -21,10 +21,10 @@
|
||||||
"xtts_v1.1": {
|
"xtts_v1.1": {
|
||||||
"description": "XTTS-v1.1 by Coqui with 14 languages, cross-language voice cloning and reference leak fixed.",
|
"description": "XTTS-v1.1 by Coqui with 14 languages, cross-language voice cloning and reference leak fixed.",
|
||||||
"hf_url": [
|
"hf_url": [
|
||||||
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1.2/model.pth",
|
"https://huggingface.co/coqui/XTTS-v1/resolve/v1.1.2/model.pth",
|
||||||
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1.2/config.json",
|
"https://huggingface.co/coqui/XTTS-v1/resolve/v1.1.2/config.json",
|
||||||
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1.2/vocab.json",
|
"https://huggingface.co/coqui/XTTS-v1/resolve/v1.1.2/vocab.json",
|
||||||
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1.2/hash.md5"
|
"https://huggingface.co/coqui/XTTS-v1/resolve/v1.1.2/hash.md5"
|
||||||
],
|
],
|
||||||
"model_hash": "7c62beaf58d39b729de287330dc254e7b515677416839b649a50e7cf74c3df59",
|
"model_hash": "7c62beaf58d39b729de287330dc254e7b515677416839b649a50e7cf74c3df59",
|
||||||
"default_vocoder": null,
|
"default_vocoder": null,
|
||||||
|
@ -35,7 +35,7 @@
|
||||||
},
|
},
|
||||||
"your_tts": {
|
"your_tts": {
|
||||||
"description": "Your TTS model accompanying the paper https://arxiv.org/abs/2112.02418",
|
"description": "Your TTS model accompanying the paper https://arxiv.org/abs/2112.02418",
|
||||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.10.1_models/tts_models--multilingual--multi-dataset--your_tts.zip",
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.10.1_models/tts_models--multilingual--multi-dataset--your_tts.zip",
|
||||||
"default_vocoder": null,
|
"default_vocoder": null,
|
||||||
"commit": "e9a1953e",
|
"commit": "e9a1953e",
|
||||||
"license": "CC BY-NC-ND 4.0",
|
"license": "CC BY-NC-ND 4.0",
|
||||||
|
@ -44,12 +44,11 @@
|
||||||
"bark": {
|
"bark": {
|
||||||
"description": "🐶 Bark TTS model released by suno-ai. You can find the original implementation in https://github.com/suno-ai/bark.",
|
"description": "🐶 Bark TTS model released by suno-ai. You can find the original implementation in https://github.com/suno-ai/bark.",
|
||||||
"hf_url": [
|
"hf_url": [
|
||||||
"https://coqui.gateway.scarf.sh/hf/bark/coarse_2.pt",
|
"https://huggingface.co/erogol/bark/resolve/main/coarse_2.pt",
|
||||||
"https://coqui.gateway.scarf.sh/hf/bark/fine_2.pt",
|
"https://huggingface.co/erogol/bark/resolve/main/fine_2.pt",
|
||||||
"https://coqui.gateway.scarf.sh/hf/text_2.pt",
|
"https://huggingface.co/erogol/bark/resolve/main/text_2.pt",
|
||||||
"https://coqui.gateway.scarf.sh/hf/bark/config.json",
|
"https://huggingface.co/erogol/bark/resolve/main/config.json",
|
||||||
"https://coqui.gateway.scarf.sh/hf/bark/hubert.pt",
|
"https://huggingface.co/erogol/bark/resolve/main/tokenizer.pth"
|
||||||
"https://coqui.gateway.scarf.sh/hf/bark/tokenizer.pth"
|
|
||||||
],
|
],
|
||||||
"default_vocoder": null,
|
"default_vocoder": null,
|
||||||
"commit": "e9a1953e",
|
"commit": "e9a1953e",
|
||||||
|
@ -61,7 +60,7 @@
|
||||||
"bg": {
|
"bg": {
|
||||||
"cv": {
|
"cv": {
|
||||||
"vits": {
|
"vits": {
|
||||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--bg--cv--vits.zip",
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.8.0_models/tts_models--bg--cv--vits.zip",
|
||||||
"default_vocoder": null,
|
"default_vocoder": null,
|
||||||
"commit": null,
|
"commit": null,
|
||||||
"author": "@NeonGeckoCom",
|
"author": "@NeonGeckoCom",
|
||||||
|
@ -72,7 +71,7 @@
|
||||||
"cs": {
|
"cs": {
|
||||||
"cv": {
|
"cv": {
|
||||||
"vits": {
|
"vits": {
|
||||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--cs--cv--vits.zip",
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.8.0_models/tts_models--cs--cv--vits.zip",
|
||||||
"default_vocoder": null,
|
"default_vocoder": null,
|
||||||
"commit": null,
|
"commit": null,
|
||||||
"author": "@NeonGeckoCom",
|
"author": "@NeonGeckoCom",
|
||||||
|
@ -83,7 +82,7 @@
|
||||||
"da": {
|
"da": {
|
||||||
"cv": {
|
"cv": {
|
||||||
"vits": {
|
"vits": {
|
||||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--da--cv--vits.zip",
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.8.0_models/tts_models--da--cv--vits.zip",
|
||||||
"default_vocoder": null,
|
"default_vocoder": null,
|
||||||
"commit": null,
|
"commit": null,
|
||||||
"author": "@NeonGeckoCom",
|
"author": "@NeonGeckoCom",
|
||||||
|
@ -94,7 +93,7 @@
|
||||||
"et": {
|
"et": {
|
||||||
"cv": {
|
"cv": {
|
||||||
"vits": {
|
"vits": {
|
||||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--et--cv--vits.zip",
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.8.0_models/tts_models--et--cv--vits.zip",
|
||||||
"default_vocoder": null,
|
"default_vocoder": null,
|
||||||
"commit": null,
|
"commit": null,
|
||||||
"author": "@NeonGeckoCom",
|
"author": "@NeonGeckoCom",
|
||||||
|
@ -105,7 +104,7 @@
|
||||||
"ga": {
|
"ga": {
|
||||||
"cv": {
|
"cv": {
|
||||||
"vits": {
|
"vits": {
|
||||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--ga--cv--vits.zip",
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.8.0_models/tts_models--ga--cv--vits.zip",
|
||||||
"default_vocoder": null,
|
"default_vocoder": null,
|
||||||
"commit": null,
|
"commit": null,
|
||||||
"author": "@NeonGeckoCom",
|
"author": "@NeonGeckoCom",
|
||||||
|
@ -117,7 +116,7 @@
|
||||||
"ek1": {
|
"ek1": {
|
||||||
"tacotron2": {
|
"tacotron2": {
|
||||||
"description": "EK1 en-rp tacotron2 by NMStoker",
|
"description": "EK1 en-rp tacotron2 by NMStoker",
|
||||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ek1--tacotron2.zip",
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/tts_models--en--ek1--tacotron2.zip",
|
||||||
"default_vocoder": "vocoder_models/en/ek1/wavegrad",
|
"default_vocoder": "vocoder_models/en/ek1/wavegrad",
|
||||||
"commit": "c802255",
|
"commit": "c802255",
|
||||||
"license": "apache 2.0"
|
"license": "apache 2.0"
|
||||||
|
@ -126,7 +125,7 @@
|
||||||
"ljspeech": {
|
"ljspeech": {
|
||||||
"tacotron2-DDC": {
|
"tacotron2-DDC": {
|
||||||
"description": "Tacotron2 with Double Decoder Consistency.",
|
"description": "Tacotron2 with Double Decoder Consistency.",
|
||||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--tacotron2-DDC.zip",
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/tts_models--en--ljspeech--tacotron2-DDC.zip",
|
||||||
"default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
|
"default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
|
||||||
"commit": "bae2ad0f",
|
"commit": "bae2ad0f",
|
||||||
"author": "Eren Gölge @erogol",
|
"author": "Eren Gölge @erogol",
|
||||||
|
@ -135,7 +134,7 @@
|
||||||
},
|
},
|
||||||
"tacotron2-DDC_ph": {
|
"tacotron2-DDC_ph": {
|
||||||
"description": "Tacotron2 with Double Decoder Consistency with phonemes.",
|
"description": "Tacotron2 with Double Decoder Consistency with phonemes.",
|
||||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--tacotron2-DDC_ph.zip",
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/tts_models--en--ljspeech--tacotron2-DDC_ph.zip",
|
||||||
"default_vocoder": "vocoder_models/en/ljspeech/univnet",
|
"default_vocoder": "vocoder_models/en/ljspeech/univnet",
|
||||||
"commit": "3900448",
|
"commit": "3900448",
|
||||||
"author": "Eren Gölge @erogol",
|
"author": "Eren Gölge @erogol",
|
||||||
|
@ -144,7 +143,7 @@
|
||||||
},
|
},
|
||||||
"glow-tts": {
|
"glow-tts": {
|
||||||
"description": "",
|
"description": "",
|
||||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--glow-tts.zip",
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/tts_models--en--ljspeech--glow-tts.zip",
|
||||||
"stats_file": null,
|
"stats_file": null,
|
||||||
"default_vocoder": "vocoder_models/en/ljspeech/multiband-melgan",
|
"default_vocoder": "vocoder_models/en/ljspeech/multiband-melgan",
|
||||||
"commit": "",
|
"commit": "",
|
||||||
|
@ -154,7 +153,7 @@
|
||||||
},
|
},
|
||||||
"speedy-speech": {
|
"speedy-speech": {
|
||||||
"description": "Speedy Speech model trained on LJSpeech dataset using the Alignment Network for learning the durations.",
|
"description": "Speedy Speech model trained on LJSpeech dataset using the Alignment Network for learning the durations.",
|
||||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--speedy-speech.zip",
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/tts_models--en--ljspeech--speedy-speech.zip",
|
||||||
"stats_file": null,
|
"stats_file": null,
|
||||||
"default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
|
"default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
|
||||||
"commit": "4581e3d",
|
"commit": "4581e3d",
|
||||||
|
@ -164,7 +163,7 @@
|
||||||
},
|
},
|
||||||
"tacotron2-DCA": {
|
"tacotron2-DCA": {
|
||||||
"description": "",
|
"description": "",
|
||||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--tacotron2-DCA.zip",
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/tts_models--en--ljspeech--tacotron2-DCA.zip",
|
||||||
"default_vocoder": "vocoder_models/en/ljspeech/multiband-melgan",
|
"default_vocoder": "vocoder_models/en/ljspeech/multiband-melgan",
|
||||||
"commit": "",
|
"commit": "",
|
||||||
"author": "Eren Gölge @erogol",
|
"author": "Eren Gölge @erogol",
|
||||||
|
@ -173,7 +172,7 @@
|
||||||
},
|
},
|
||||||
"vits": {
|
"vits": {
|
||||||
"description": "VITS is an End2End TTS model trained on LJSpeech dataset with phonemes.",
|
"description": "VITS is an End2End TTS model trained on LJSpeech dataset with phonemes.",
|
||||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--vits.zip",
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/tts_models--en--ljspeech--vits.zip",
|
||||||
"default_vocoder": null,
|
"default_vocoder": null,
|
||||||
"commit": "3900448",
|
"commit": "3900448",
|
||||||
"author": "Eren Gölge @erogol",
|
"author": "Eren Gölge @erogol",
|
||||||
|
@ -181,7 +180,7 @@
|
||||||
"contact": "egolge@coqui.com"
|
"contact": "egolge@coqui.com"
|
||||||
},
|
},
|
||||||
"vits--neon": {
|
"vits--neon": {
|
||||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--en--ljspeech--vits.zip",
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.8.0_models/tts_models--en--ljspeech--vits.zip",
|
||||||
"default_vocoder": null,
|
"default_vocoder": null,
|
||||||
"author": "@NeonGeckoCom",
|
"author": "@NeonGeckoCom",
|
||||||
"license": "bsd-3-clause",
|
"license": "bsd-3-clause",
|
||||||
|
@ -190,7 +189,7 @@
|
||||||
},
|
},
|
||||||
"fast_pitch": {
|
"fast_pitch": {
|
||||||
"description": "FastPitch model trained on LJSpeech using the Aligner Network",
|
"description": "FastPitch model trained on LJSpeech using the Aligner Network",
|
||||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--fast_pitch.zip",
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/tts_models--en--ljspeech--fast_pitch.zip",
|
||||||
"default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
|
"default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
|
||||||
"commit": "b27b3ba",
|
"commit": "b27b3ba",
|
||||||
"author": "Eren Gölge @erogol",
|
"author": "Eren Gölge @erogol",
|
||||||
|
@ -199,7 +198,7 @@
|
||||||
},
|
},
|
||||||
"overflow": {
|
"overflow": {
|
||||||
"description": "Overflow model trained on LJSpeech",
|
"description": "Overflow model trained on LJSpeech",
|
||||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.10.0_models/tts_models--en--ljspeech--overflow.zip",
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.10.0_models/tts_models--en--ljspeech--overflow.zip",
|
||||||
"default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
|
"default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
|
||||||
"commit": "3b1a28f",
|
"commit": "3b1a28f",
|
||||||
"author": "Eren Gölge @erogol",
|
"author": "Eren Gölge @erogol",
|
||||||
|
@ -208,7 +207,7 @@
|
||||||
},
|
},
|
||||||
"neural_hmm": {
|
"neural_hmm": {
|
||||||
"description": "Neural HMM model trained on LJSpeech",
|
"description": "Neural HMM model trained on LJSpeech",
|
||||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.11.0_models/tts_models--en--ljspeech--neural_hmm.zip",
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.11.0_models/tts_models--en--ljspeech--neural_hmm.zip",
|
||||||
"default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
|
"default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
|
||||||
"commit": "3b1a28f",
|
"commit": "3b1a28f",
|
||||||
"author": "Shivam Metha @shivammehta25",
|
"author": "Shivam Metha @shivammehta25",
|
||||||
|
@ -219,7 +218,7 @@
|
||||||
"vctk": {
|
"vctk": {
|
||||||
"vits": {
|
"vits": {
|
||||||
"description": "VITS End2End TTS model trained on VCTK dataset with 109 different speakers with EN accent.",
|
"description": "VITS End2End TTS model trained on VCTK dataset with 109 different speakers with EN accent.",
|
||||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--vctk--vits.zip",
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/tts_models--en--vctk--vits.zip",
|
||||||
"default_vocoder": null,
|
"default_vocoder": null,
|
||||||
"commit": "3900448",
|
"commit": "3900448",
|
||||||
"author": "Eren @erogol",
|
"author": "Eren @erogol",
|
||||||
|
@ -228,7 +227,7 @@
|
||||||
},
|
},
|
||||||
"fast_pitch": {
|
"fast_pitch": {
|
||||||
"description": "FastPitch model trained on VCTK dataseset.",
|
"description": "FastPitch model trained on VCTK dataseset.",
|
||||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--vctk--fast_pitch.zip",
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/tts_models--en--vctk--fast_pitch.zip",
|
||||||
"default_vocoder": null,
|
"default_vocoder": null,
|
||||||
"commit": "bdab788d",
|
"commit": "bdab788d",
|
||||||
"author": "Eren @erogol",
|
"author": "Eren @erogol",
|
||||||
|
@ -239,7 +238,7 @@
|
||||||
"sam": {
|
"sam": {
|
||||||
"tacotron-DDC": {
|
"tacotron-DDC": {
|
||||||
"description": "Tacotron2 with Double Decoder Consistency trained with Aceenture's Sam dataset.",
|
"description": "Tacotron2 with Double Decoder Consistency trained with Aceenture's Sam dataset.",
|
||||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--sam--tacotron-DDC.zip",
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/tts_models--en--sam--tacotron-DDC.zip",
|
||||||
"default_vocoder": "vocoder_models/en/sam/hifigan_v2",
|
"default_vocoder": "vocoder_models/en/sam/hifigan_v2",
|
||||||
"commit": "bae2ad0f",
|
"commit": "bae2ad0f",
|
||||||
"author": "Eren Gölge @erogol",
|
"author": "Eren Gölge @erogol",
|
||||||
|
@ -250,7 +249,7 @@
|
||||||
"blizzard2013": {
|
"blizzard2013": {
|
||||||
"capacitron-t2-c50": {
|
"capacitron-t2-c50": {
|
||||||
"description": "Capacitron additions to Tacotron 2 with Capacity at 50 as in https://arxiv.org/pdf/1906.03402.pdf",
|
"description": "Capacitron additions to Tacotron 2 with Capacity at 50 as in https://arxiv.org/pdf/1906.03402.pdf",
|
||||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.0_models/tts_models--en--blizzard2013--capacitron-t2-c50.zip",
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.7.0_models/tts_models--en--blizzard2013--capacitron-t2-c50.zip",
|
||||||
"commit": "d6284e7",
|
"commit": "d6284e7",
|
||||||
"default_vocoder": "vocoder_models/en/blizzard2013/hifigan_v2",
|
"default_vocoder": "vocoder_models/en/blizzard2013/hifigan_v2",
|
||||||
"author": "Adam Froghyar @a-froghyar",
|
"author": "Adam Froghyar @a-froghyar",
|
||||||
|
@ -259,7 +258,7 @@
|
||||||
},
|
},
|
||||||
"capacitron-t2-c150_v2": {
|
"capacitron-t2-c150_v2": {
|
||||||
"description": "Capacitron additions to Tacotron 2 with Capacity at 150 as in https://arxiv.org/pdf/1906.03402.pdf",
|
"description": "Capacitron additions to Tacotron 2 with Capacity at 150 as in https://arxiv.org/pdf/1906.03402.pdf",
|
||||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.1_models/tts_models--en--blizzard2013--capacitron-t2-c150_v2.zip",
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.7.1_models/tts_models--en--blizzard2013--capacitron-t2-c150_v2.zip",
|
||||||
"commit": "a67039d",
|
"commit": "a67039d",
|
||||||
"default_vocoder": "vocoder_models/en/blizzard2013/hifigan_v2",
|
"default_vocoder": "vocoder_models/en/blizzard2013/hifigan_v2",
|
||||||
"author": "Adam Froghyar @a-froghyar",
|
"author": "Adam Froghyar @a-froghyar",
|
||||||
|
@ -271,15 +270,15 @@
|
||||||
"tortoise-v2": {
|
"tortoise-v2": {
|
||||||
"description": "Tortoise tts model https://github.com/neonbjb/tortoise-tts",
|
"description": "Tortoise tts model https://github.com/neonbjb/tortoise-tts",
|
||||||
"github_rls_url": [
|
"github_rls_url": [
|
||||||
"https://coqui.gateway.scarf.sh/v0.14.1_models/autoregressive.pth",
|
"https://github.com/coqui-ai/TTS/releases/download/v0.14.1_models/autoregressive.pth",
|
||||||
"https://coqui.gateway.scarf.sh/v0.14.1_models/clvp2.pth",
|
"https://github.com/coqui-ai/TTS/releases/download/v0.14.1_models/clvp2.pth",
|
||||||
"https://coqui.gateway.scarf.sh/v0.14.1_models/cvvp.pth",
|
"https://github.com/coqui-ai/TTS/releases/download/v0.14.1_models/cvvp.pth",
|
||||||
"https://coqui.gateway.scarf.sh/v0.14.1_models/diffusion_decoder.pth",
|
"https://github.com/coqui-ai/TTS/releases/download/v0.14.1_models/diffusion_decoder.pth",
|
||||||
"https://coqui.gateway.scarf.sh/v0.14.1_models/rlg_auto.pth",
|
"https://github.com/coqui-ai/TTS/releases/download/v0.14.1_models/rlg_auto.pth",
|
||||||
"https://coqui.gateway.scarf.sh/v0.14.1_models/rlg_diffuser.pth",
|
"https://github.com/coqui-ai/TTS/releases/download/v0.14.1_models/rlg_diffuser.pth",
|
||||||
"https://coqui.gateway.scarf.sh/v0.14.1_models/vocoder.pth",
|
"https://github.com/coqui-ai/TTS/releases/download/v0.14.1_models/vocoder.pth",
|
||||||
"https://coqui.gateway.scarf.sh/v0.14.1_models/mel_norms.pth",
|
"https://github.com/coqui-ai/TTS/releases/download/v0.14.1_models/mel_norms.pth",
|
||||||
"https://coqui.gateway.scarf.sh/v0.14.1_models/config.json"
|
"https://github.com/coqui-ai/TTS/releases/download/v0.14.1_models/config.json"
|
||||||
],
|
],
|
||||||
"commit": "c1875f6",
|
"commit": "c1875f6",
|
||||||
"default_vocoder": null,
|
"default_vocoder": null,
|
||||||
|
@ -290,7 +289,7 @@
|
||||||
"jenny": {
|
"jenny": {
|
||||||
"jenny": {
|
"jenny": {
|
||||||
"description": "VITS model trained with Jenny(Dioco) dataset. Named as Jenny as demanded by the license. Original URL for the model https://www.kaggle.com/datasets/noml4u/tts-models--en--jenny-dioco--vits",
|
"description": "VITS model trained with Jenny(Dioco) dataset. Named as Jenny as demanded by the license. Original URL for the model https://www.kaggle.com/datasets/noml4u/tts-models--en--jenny-dioco--vits",
|
||||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.14.0_models/tts_models--en--jenny--jenny.zip",
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.14.0_models/tts_models--en--jenny--jenny.zip",
|
||||||
"default_vocoder": null,
|
"default_vocoder": null,
|
||||||
"commit": "ba40a1c",
|
"commit": "ba40a1c",
|
||||||
"license": "custom - see https://github.com/dioco-group/jenny-tts-dataset#important",
|
"license": "custom - see https://github.com/dioco-group/jenny-tts-dataset#important",
|
||||||
|
@ -301,7 +300,7 @@
|
||||||
"es": {
|
"es": {
|
||||||
"mai": {
|
"mai": {
|
||||||
"tacotron2-DDC": {
|
"tacotron2-DDC": {
|
||||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--es--mai--tacotron2-DDC.zip",
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/tts_models--es--mai--tacotron2-DDC.zip",
|
||||||
"default_vocoder": "vocoder_models/universal/libri-tts/fullband-melgan",
|
"default_vocoder": "vocoder_models/universal/libri-tts/fullband-melgan",
|
||||||
"commit": "",
|
"commit": "",
|
||||||
"author": "Eren Gölge @erogol",
|
"author": "Eren Gölge @erogol",
|
||||||
|
@ -311,7 +310,7 @@
|
||||||
},
|
},
|
||||||
"css10": {
|
"css10": {
|
||||||
"vits": {
|
"vits": {
|
||||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--es--css10--vits.zip",
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.8.0_models/tts_models--es--css10--vits.zip",
|
||||||
"default_vocoder": null,
|
"default_vocoder": null,
|
||||||
"commit": null,
|
"commit": null,
|
||||||
"author": "@NeonGeckoCom",
|
"author": "@NeonGeckoCom",
|
||||||
|
@ -322,7 +321,7 @@
|
||||||
"fr": {
|
"fr": {
|
||||||
"mai": {
|
"mai": {
|
||||||
"tacotron2-DDC": {
|
"tacotron2-DDC": {
|
||||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--fr--mai--tacotron2-DDC.zip",
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/tts_models--fr--mai--tacotron2-DDC.zip",
|
||||||
"default_vocoder": "vocoder_models/universal/libri-tts/fullband-melgan",
|
"default_vocoder": "vocoder_models/universal/libri-tts/fullband-melgan",
|
||||||
"commit": null,
|
"commit": null,
|
||||||
"author": "Eren Gölge @erogol",
|
"author": "Eren Gölge @erogol",
|
||||||
|
@ -332,7 +331,7 @@
|
||||||
},
|
},
|
||||||
"css10": {
|
"css10": {
|
||||||
"vits": {
|
"vits": {
|
||||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--fr--css10--vits.zip",
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.8.0_models/tts_models--fr--css10--vits.zip",
|
||||||
"default_vocoder": null,
|
"default_vocoder": null,
|
||||||
"commit": null,
|
"commit": null,
|
||||||
"author": "@NeonGeckoCom",
|
"author": "@NeonGeckoCom",
|
||||||
|
@ -343,7 +342,7 @@
|
||||||
"uk": {
|
"uk": {
|
||||||
"mai": {
|
"mai": {
|
||||||
"glow-tts": {
|
"glow-tts": {
|
||||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--uk--mai--glow-tts.zip",
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/tts_models--uk--mai--glow-tts.zip",
|
||||||
"author": "@robinhad",
|
"author": "@robinhad",
|
||||||
"commit": "bdab788d",
|
"commit": "bdab788d",
|
||||||
"license": "MIT",
|
"license": "MIT",
|
||||||
|
@ -351,7 +350,7 @@
|
||||||
"default_vocoder": "vocoder_models/uk/mai/multiband-melgan"
|
"default_vocoder": "vocoder_models/uk/mai/multiband-melgan"
|
||||||
},
|
},
|
||||||
"vits": {
|
"vits": {
|
||||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--uk--mai--vits.zip",
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.8.0_models/tts_models--uk--mai--vits.zip",
|
||||||
"default_vocoder": null,
|
"default_vocoder": null,
|
||||||
"commit": null,
|
"commit": null,
|
||||||
"author": "@NeonGeckoCom",
|
"author": "@NeonGeckoCom",
|
||||||
|
@ -362,7 +361,7 @@
|
||||||
"zh-CN": {
|
"zh-CN": {
|
||||||
"baker": {
|
"baker": {
|
||||||
"tacotron2-DDC-GST": {
|
"tacotron2-DDC-GST": {
|
||||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--zh-CN--baker--tacotron2-DDC-GST.zip",
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/tts_models--zh-CN--baker--tacotron2-DDC-GST.zip",
|
||||||
"commit": "unknown",
|
"commit": "unknown",
|
||||||
"author": "@kirianguiller",
|
"author": "@kirianguiller",
|
||||||
"license": "apache 2.0",
|
"license": "apache 2.0",
|
||||||
|
@ -373,7 +372,7 @@
|
||||||
"nl": {
|
"nl": {
|
||||||
"mai": {
|
"mai": {
|
||||||
"tacotron2-DDC": {
|
"tacotron2-DDC": {
|
||||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--nl--mai--tacotron2-DDC.zip",
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/tts_models--nl--mai--tacotron2-DDC.zip",
|
||||||
"author": "@r-dh",
|
"author": "@r-dh",
|
||||||
"license": "apache 2.0",
|
"license": "apache 2.0",
|
||||||
"default_vocoder": "vocoder_models/nl/mai/parallel-wavegan",
|
"default_vocoder": "vocoder_models/nl/mai/parallel-wavegan",
|
||||||
|
@ -383,7 +382,7 @@
|
||||||
},
|
},
|
||||||
"css10": {
|
"css10": {
|
||||||
"vits": {
|
"vits": {
|
||||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--nl--css10--vits.zip",
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.8.0_models/tts_models--nl--css10--vits.zip",
|
||||||
"default_vocoder": null,
|
"default_vocoder": null,
|
||||||
"commit": null,
|
"commit": null,
|
||||||
"author": "@NeonGeckoCom",
|
"author": "@NeonGeckoCom",
|
||||||
|
@ -394,21 +393,21 @@
|
||||||
"de": {
|
"de": {
|
||||||
"thorsten": {
|
"thorsten": {
|
||||||
"tacotron2-DCA": {
|
"tacotron2-DCA": {
|
||||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--de--thorsten--tacotron2-DCA.zip",
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/tts_models--de--thorsten--tacotron2-DCA.zip",
|
||||||
"default_vocoder": "vocoder_models/de/thorsten/fullband-melgan",
|
"default_vocoder": "vocoder_models/de/thorsten/fullband-melgan",
|
||||||
"author": "@thorstenMueller",
|
"author": "@thorstenMueller",
|
||||||
"license": "apache 2.0",
|
"license": "apache 2.0",
|
||||||
"commit": "unknown"
|
"commit": "unknown"
|
||||||
},
|
},
|
||||||
"vits": {
|
"vits": {
|
||||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.0_models/tts_models--de--thorsten--vits.zip",
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.7.0_models/tts_models--de--thorsten--vits.zip",
|
||||||
"default_vocoder": null,
|
"default_vocoder": null,
|
||||||
"author": "@thorstenMueller",
|
"author": "@thorstenMueller",
|
||||||
"license": "apache 2.0",
|
"license": "apache 2.0",
|
||||||
"commit": "unknown"
|
"commit": "unknown"
|
||||||
},
|
},
|
||||||
"tacotron2-DDC": {
|
"tacotron2-DDC": {
|
||||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--de--thorsten--tacotron2-DDC.zip",
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.8.0_models/tts_models--de--thorsten--tacotron2-DDC.zip",
|
||||||
"default_vocoder": "vocoder_models/de/thorsten/hifigan_v1",
|
"default_vocoder": "vocoder_models/de/thorsten/hifigan_v1",
|
||||||
"description": "Thorsten-Dec2021-22k-DDC",
|
"description": "Thorsten-Dec2021-22k-DDC",
|
||||||
"author": "@thorstenMueller",
|
"author": "@thorstenMueller",
|
||||||
|
@ -418,7 +417,7 @@
|
||||||
},
|
},
|
||||||
"css10": {
|
"css10": {
|
||||||
"vits-neon": {
|
"vits-neon": {
|
||||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--de--css10--vits.zip",
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.8.0_models/tts_models--de--css10--vits.zip",
|
||||||
"default_vocoder": null,
|
"default_vocoder": null,
|
||||||
"author": "@NeonGeckoCom",
|
"author": "@NeonGeckoCom",
|
||||||
"license": "bsd-3-clause",
|
"license": "bsd-3-clause",
|
||||||
|
@ -429,7 +428,7 @@
|
||||||
"ja": {
|
"ja": {
|
||||||
"kokoro": {
|
"kokoro": {
|
||||||
"tacotron2-DDC": {
|
"tacotron2-DDC": {
|
||||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--ja--kokoro--tacotron2-DDC.zip",
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/tts_models--ja--kokoro--tacotron2-DDC.zip",
|
||||||
"default_vocoder": "vocoder_models/ja/kokoro/hifigan_v1",
|
"default_vocoder": "vocoder_models/ja/kokoro/hifigan_v1",
|
||||||
"description": "Tacotron2 with Double Decoder Consistency trained with Kokoro Speech Dataset.",
|
"description": "Tacotron2 with Double Decoder Consistency trained with Kokoro Speech Dataset.",
|
||||||
"author": "@kaiidams",
|
"author": "@kaiidams",
|
||||||
|
@ -441,7 +440,7 @@
|
||||||
"tr": {
|
"tr": {
|
||||||
"common-voice": {
|
"common-voice": {
|
||||||
"glow-tts": {
|
"glow-tts": {
|
||||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--tr--common-voice--glow-tts.zip",
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/tts_models--tr--common-voice--glow-tts.zip",
|
||||||
"default_vocoder": "vocoder_models/tr/common-voice/hifigan",
|
"default_vocoder": "vocoder_models/tr/common-voice/hifigan",
|
||||||
"license": "MIT",
|
"license": "MIT",
|
||||||
"description": "Turkish GlowTTS model using an unknown speaker from the Common-Voice dataset.",
|
"description": "Turkish GlowTTS model using an unknown speaker from the Common-Voice dataset.",
|
||||||
|
@ -453,7 +452,7 @@
|
||||||
"it": {
|
"it": {
|
||||||
"mai_female": {
|
"mai_female": {
|
||||||
"glow-tts": {
|
"glow-tts": {
|
||||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_female--glow-tts.zip",
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/tts_models--it--mai_female--glow-tts.zip",
|
||||||
"default_vocoder": null,
|
"default_vocoder": null,
|
||||||
"description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
|
"description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
|
||||||
"author": "@nicolalandro",
|
"author": "@nicolalandro",
|
||||||
|
@ -461,7 +460,7 @@
|
||||||
"commit": null
|
"commit": null
|
||||||
},
|
},
|
||||||
"vits": {
|
"vits": {
|
||||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_female--vits.zip",
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/tts_models--it--mai_female--vits.zip",
|
||||||
"default_vocoder": null,
|
"default_vocoder": null,
|
||||||
"description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
|
"description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
|
||||||
"author": "@nicolalandro",
|
"author": "@nicolalandro",
|
||||||
|
@ -471,7 +470,7 @@
|
||||||
},
|
},
|
||||||
"mai_male": {
|
"mai_male": {
|
||||||
"glow-tts": {
|
"glow-tts": {
|
||||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_male--glow-tts.zip",
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/tts_models--it--mai_male--glow-tts.zip",
|
||||||
"default_vocoder": null,
|
"default_vocoder": null,
|
||||||
"description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
|
"description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
|
||||||
"author": "@nicolalandro",
|
"author": "@nicolalandro",
|
||||||
|
@ -479,7 +478,7 @@
|
||||||
"commit": null
|
"commit": null
|
||||||
},
|
},
|
||||||
"vits": {
|
"vits": {
|
||||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_male--vits.zip",
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/tts_models--it--mai_male--vits.zip",
|
||||||
"default_vocoder": null,
|
"default_vocoder": null,
|
||||||
"description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
|
"description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
|
||||||
"author": "@nicolalandro",
|
"author": "@nicolalandro",
|
||||||
|
@ -491,7 +490,7 @@
|
||||||
"ewe": {
|
"ewe": {
|
||||||
"openbible": {
|
"openbible": {
|
||||||
"vits": {
|
"vits": {
|
||||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--ewe--openbible--vits.zip",
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.2_models/tts_models--ewe--openbible--vits.zip",
|
||||||
"default_vocoder": null,
|
"default_vocoder": null,
|
||||||
"license": "CC-BY-SA 4.0",
|
"license": "CC-BY-SA 4.0",
|
||||||
"description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
|
"description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
|
||||||
|
@ -503,7 +502,7 @@
|
||||||
"hau": {
|
"hau": {
|
||||||
"openbible": {
|
"openbible": {
|
||||||
"vits": {
|
"vits": {
|
||||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--hau--openbible--vits.zip",
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.2_models/tts_models--hau--openbible--vits.zip",
|
||||||
"default_vocoder": null,
|
"default_vocoder": null,
|
||||||
"license": "CC-BY-SA 4.0",
|
"license": "CC-BY-SA 4.0",
|
||||||
"description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
|
"description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
|
||||||
|
@ -515,7 +514,7 @@
|
||||||
"lin": {
|
"lin": {
|
||||||
"openbible": {
|
"openbible": {
|
||||||
"vits": {
|
"vits": {
|
||||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--lin--openbible--vits.zip",
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.2_models/tts_models--lin--openbible--vits.zip",
|
||||||
"default_vocoder": null,
|
"default_vocoder": null,
|
||||||
"license": "CC-BY-SA 4.0",
|
"license": "CC-BY-SA 4.0",
|
||||||
"description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
|
"description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
|
||||||
|
@ -527,7 +526,7 @@
|
||||||
"tw_akuapem": {
|
"tw_akuapem": {
|
||||||
"openbible": {
|
"openbible": {
|
||||||
"vits": {
|
"vits": {
|
||||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--tw_akuapem--openbible--vits.zip",
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.2_models/tts_models--tw_akuapem--openbible--vits.zip",
|
||||||
"default_vocoder": null,
|
"default_vocoder": null,
|
||||||
"license": "CC-BY-SA 4.0",
|
"license": "CC-BY-SA 4.0",
|
||||||
"description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
|
"description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
|
||||||
|
@ -539,7 +538,7 @@
|
||||||
"tw_asante": {
|
"tw_asante": {
|
||||||
"openbible": {
|
"openbible": {
|
||||||
"vits": {
|
"vits": {
|
||||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--tw_asante--openbible--vits.zip",
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.2_models/tts_models--tw_asante--openbible--vits.zip",
|
||||||
"default_vocoder": null,
|
"default_vocoder": null,
|
||||||
"license": "CC-BY-SA 4.0",
|
"license": "CC-BY-SA 4.0",
|
||||||
"description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
|
"description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
|
||||||
|
@ -551,7 +550,7 @@
|
||||||
"yor": {
|
"yor": {
|
||||||
"openbible": {
|
"openbible": {
|
||||||
"vits": {
|
"vits": {
|
||||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--yor--openbible--vits.zip",
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.2_models/tts_models--yor--openbible--vits.zip",
|
||||||
"default_vocoder": null,
|
"default_vocoder": null,
|
||||||
"license": "CC-BY-SA 4.0",
|
"license": "CC-BY-SA 4.0",
|
||||||
"description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
|
"description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
|
||||||
|
@ -563,7 +562,7 @@
|
||||||
"hu": {
|
"hu": {
|
||||||
"css10": {
|
"css10": {
|
||||||
"vits": {
|
"vits": {
|
||||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--hu--css10--vits.zip",
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.8.0_models/tts_models--hu--css10--vits.zip",
|
||||||
"default_vocoder": null,
|
"default_vocoder": null,
|
||||||
"commit": null,
|
"commit": null,
|
||||||
"author": "@NeonGeckoCom",
|
"author": "@NeonGeckoCom",
|
||||||
|
@ -574,7 +573,7 @@
|
||||||
"el": {
|
"el": {
|
||||||
"cv": {
|
"cv": {
|
||||||
"vits": {
|
"vits": {
|
||||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--el--cv--vits.zip",
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.8.0_models/tts_models--el--cv--vits.zip",
|
||||||
"default_vocoder": null,
|
"default_vocoder": null,
|
||||||
"commit": null,
|
"commit": null,
|
||||||
"author": "@NeonGeckoCom",
|
"author": "@NeonGeckoCom",
|
||||||
|
@ -585,7 +584,7 @@
|
||||||
"fi": {
|
"fi": {
|
||||||
"css10": {
|
"css10": {
|
||||||
"vits": {
|
"vits": {
|
||||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--fi--css10--vits.zip",
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.8.0_models/tts_models--fi--css10--vits.zip",
|
||||||
"default_vocoder": null,
|
"default_vocoder": null,
|
||||||
"commit": null,
|
"commit": null,
|
||||||
"author": "@NeonGeckoCom",
|
"author": "@NeonGeckoCom",
|
||||||
|
@ -596,7 +595,7 @@
|
||||||
"hr": {
|
"hr": {
|
||||||
"cv": {
|
"cv": {
|
||||||
"vits": {
|
"vits": {
|
||||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--hr--cv--vits.zip",
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.8.0_models/tts_models--hr--cv--vits.zip",
|
||||||
"default_vocoder": null,
|
"default_vocoder": null,
|
||||||
"commit": null,
|
"commit": null,
|
||||||
"author": "@NeonGeckoCom",
|
"author": "@NeonGeckoCom",
|
||||||
|
@ -607,7 +606,7 @@
|
||||||
"lt": {
|
"lt": {
|
||||||
"cv": {
|
"cv": {
|
||||||
"vits": {
|
"vits": {
|
||||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--lt--cv--vits.zip",
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.8.0_models/tts_models--lt--cv--vits.zip",
|
||||||
"default_vocoder": null,
|
"default_vocoder": null,
|
||||||
"commit": null,
|
"commit": null,
|
||||||
"author": "@NeonGeckoCom",
|
"author": "@NeonGeckoCom",
|
||||||
|
@ -618,7 +617,7 @@
|
||||||
"lv": {
|
"lv": {
|
||||||
"cv": {
|
"cv": {
|
||||||
"vits": {
|
"vits": {
|
||||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--lv--cv--vits.zip",
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.8.0_models/tts_models--lv--cv--vits.zip",
|
||||||
"default_vocoder": null,
|
"default_vocoder": null,
|
||||||
"commit": null,
|
"commit": null,
|
||||||
"author": "@NeonGeckoCom",
|
"author": "@NeonGeckoCom",
|
||||||
|
@ -629,7 +628,7 @@
|
||||||
"mt": {
|
"mt": {
|
||||||
"cv": {
|
"cv": {
|
||||||
"vits": {
|
"vits": {
|
||||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--mt--cv--vits.zip",
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.8.0_models/tts_models--mt--cv--vits.zip",
|
||||||
"default_vocoder": null,
|
"default_vocoder": null,
|
||||||
"commit": null,
|
"commit": null,
|
||||||
"author": "@NeonGeckoCom",
|
"author": "@NeonGeckoCom",
|
||||||
|
@ -640,7 +639,7 @@
|
||||||
"pl": {
|
"pl": {
|
||||||
"mai_female": {
|
"mai_female": {
|
||||||
"vits": {
|
"vits": {
|
||||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--pl--mai_female--vits.zip",
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.8.0_models/tts_models--pl--mai_female--vits.zip",
|
||||||
"default_vocoder": null,
|
"default_vocoder": null,
|
||||||
"commit": null,
|
"commit": null,
|
||||||
"author": "@NeonGeckoCom",
|
"author": "@NeonGeckoCom",
|
||||||
|
@ -651,7 +650,7 @@
|
||||||
"pt": {
|
"pt": {
|
||||||
"cv": {
|
"cv": {
|
||||||
"vits": {
|
"vits": {
|
||||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--pt--cv--vits.zip",
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.8.0_models/tts_models--pt--cv--vits.zip",
|
||||||
"default_vocoder": null,
|
"default_vocoder": null,
|
||||||
"commit": null,
|
"commit": null,
|
||||||
"author": "@NeonGeckoCom",
|
"author": "@NeonGeckoCom",
|
||||||
|
@ -662,7 +661,7 @@
|
||||||
"ro": {
|
"ro": {
|
||||||
"cv": {
|
"cv": {
|
||||||
"vits": {
|
"vits": {
|
||||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--ro--cv--vits.zip",
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.8.0_models/tts_models--ro--cv--vits.zip",
|
||||||
"default_vocoder": null,
|
"default_vocoder": null,
|
||||||
"commit": null,
|
"commit": null,
|
||||||
"author": "@NeonGeckoCom",
|
"author": "@NeonGeckoCom",
|
||||||
|
@ -673,7 +672,7 @@
|
||||||
"sk": {
|
"sk": {
|
||||||
"cv": {
|
"cv": {
|
||||||
"vits": {
|
"vits": {
|
||||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--sk--cv--vits.zip",
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.8.0_models/tts_models--sk--cv--vits.zip",
|
||||||
"default_vocoder": null,
|
"default_vocoder": null,
|
||||||
"commit": null,
|
"commit": null,
|
||||||
"author": "@NeonGeckoCom",
|
"author": "@NeonGeckoCom",
|
||||||
|
@ -684,7 +683,7 @@
|
||||||
"sl": {
|
"sl": {
|
||||||
"cv": {
|
"cv": {
|
||||||
"vits": {
|
"vits": {
|
||||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--sl--cv--vits.zip",
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.8.0_models/tts_models--sl--cv--vits.zip",
|
||||||
"default_vocoder": null,
|
"default_vocoder": null,
|
||||||
"commit": null,
|
"commit": null,
|
||||||
"author": "@NeonGeckoCom",
|
"author": "@NeonGeckoCom",
|
||||||
|
@ -695,7 +694,7 @@
|
||||||
"sv": {
|
"sv": {
|
||||||
"cv": {
|
"cv": {
|
||||||
"vits": {
|
"vits": {
|
||||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--sv--cv--vits.zip",
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.8.0_models/tts_models--sv--cv--vits.zip",
|
||||||
"default_vocoder": null,
|
"default_vocoder": null,
|
||||||
"commit": null,
|
"commit": null,
|
||||||
"author": "@NeonGeckoCom",
|
"author": "@NeonGeckoCom",
|
||||||
|
@ -706,7 +705,7 @@
|
||||||
"ca": {
|
"ca": {
|
||||||
"custom": {
|
"custom": {
|
||||||
"vits": {
|
"vits": {
|
||||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.10.1_models/tts_models--ca--custom--vits.zip",
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.10.1_models/tts_models--ca--custom--vits.zip",
|
||||||
"default_vocoder": null,
|
"default_vocoder": null,
|
||||||
"commit": null,
|
"commit": null,
|
||||||
"description": " It is trained from zero with 101460 utterances consisting of 257 speakers, approx 138 hours of speech. We used three datasets;\nFestcat and Google Catalan TTS (both TTS datasets) and also a part of Common Voice 8. It is trained with TTS v0.8.0.\nhttps://github.com/coqui-ai/TTS/discussions/930#discussioncomment-4466345",
|
"description": " It is trained from zero with 101460 utterances consisting of 257 speakers, approx 138 hours of speech. We used three datasets;\nFestcat and Google Catalan TTS (both TTS datasets) and also a part of Common Voice 8. It is trained with TTS v0.8.0.\nhttps://github.com/coqui-ai/TTS/discussions/930#discussioncomment-4466345",
|
||||||
|
@ -718,7 +717,7 @@
|
||||||
"fa": {
|
"fa": {
|
||||||
"custom": {
|
"custom": {
|
||||||
"glow-tts": {
|
"glow-tts": {
|
||||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.10.1_models/tts_models--fa--custom--glow-tts.zip",
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.10.1_models/tts_models--fa--custom--glow-tts.zip",
|
||||||
"default_vocoder": null,
|
"default_vocoder": null,
|
||||||
"commit": null,
|
"commit": null,
|
||||||
"description": "persian-tts-female-glow_tts model for text to speech purposes. Single-speaker female voice Trained on persian-tts-dataset-famale. \nThis model has no compatible vocoder thus the output quality is not very good. \nDataset: https://www.kaggle.com/datasets/magnoliasis/persian-tts-dataset-famale.",
|
"description": "persian-tts-female-glow_tts model for text to speech purposes. Single-speaker female voice Trained on persian-tts-dataset-famale. \nThis model has no compatible vocoder thus the output quality is not very good. \nDataset: https://www.kaggle.com/datasets/magnoliasis/persian-tts-dataset-famale.",
|
||||||
|
@ -730,7 +729,7 @@
|
||||||
"bn": {
|
"bn": {
|
||||||
"custom": {
|
"custom": {
|
||||||
"vits-male": {
|
"vits-male": {
|
||||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.13.3_models/tts_models--bn--custom--vits_male.zip",
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.13.3_models/tts_models--bn--custom--vits_male.zip",
|
||||||
"default_vocoder": null,
|
"default_vocoder": null,
|
||||||
"commit": null,
|
"commit": null,
|
||||||
"description": "Single speaker Bangla male model. For more information -> https://github.com/mobassir94/comprehensive-bangla-tts",
|
"description": "Single speaker Bangla male model. For more information -> https://github.com/mobassir94/comprehensive-bangla-tts",
|
||||||
|
@ -738,7 +737,7 @@
|
||||||
"license": "Apache 2.0"
|
"license": "Apache 2.0"
|
||||||
},
|
},
|
||||||
"vits-female": {
|
"vits-female": {
|
||||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.13.3_models/tts_models--bn--custom--vits_female.zip",
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.13.3_models/tts_models--bn--custom--vits_female.zip",
|
||||||
"default_vocoder": null,
|
"default_vocoder": null,
|
||||||
"commit": null,
|
"commit": null,
|
||||||
"description": "Single speaker Bangla female model. For more information -> https://github.com/mobassir94/comprehensive-bangla-tts",
|
"description": "Single speaker Bangla female model. For more information -> https://github.com/mobassir94/comprehensive-bangla-tts",
|
||||||
|
@ -751,7 +750,7 @@
|
||||||
"common-voice": {
|
"common-voice": {
|
||||||
"glow-tts":{
|
"glow-tts":{
|
||||||
"description": "Belarusian GlowTTS model created by @alex73 (Github).",
|
"description": "Belarusian GlowTTS model created by @alex73 (Github).",
|
||||||
"github_rls_url":"https://coqui.gateway.scarf.sh/v0.16.6/tts_models--be--common-voice--glow-tts.zip",
|
"github_rls_url":"https://github.com/coqui-ai/TTS/releases/download/v0.16.6/tts_models--be--common-voice--glow-tts.zip",
|
||||||
"default_vocoder": "vocoder_models/be/common-voice/hifigan",
|
"default_vocoder": "vocoder_models/be/common-voice/hifigan",
|
||||||
"commit": "c0aabb85",
|
"commit": "c0aabb85",
|
||||||
"license": "CC-BY-SA 4.0",
|
"license": "CC-BY-SA 4.0",
|
||||||
|
@ -764,14 +763,14 @@
|
||||||
"universal": {
|
"universal": {
|
||||||
"libri-tts": {
|
"libri-tts": {
|
||||||
"wavegrad": {
|
"wavegrad": {
|
||||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--universal--libri-tts--wavegrad.zip",
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/vocoder_models--universal--libri-tts--wavegrad.zip",
|
||||||
"commit": "ea976b0",
|
"commit": "ea976b0",
|
||||||
"author": "Eren Gölge @erogol",
|
"author": "Eren Gölge @erogol",
|
||||||
"license": "MPL",
|
"license": "MPL",
|
||||||
"contact": "egolge@coqui.com"
|
"contact": "egolge@coqui.com"
|
||||||
},
|
},
|
||||||
"fullband-melgan": {
|
"fullband-melgan": {
|
||||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--universal--libri-tts--fullband-melgan.zip",
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/vocoder_models--universal--libri-tts--fullband-melgan.zip",
|
||||||
"commit": "4132240",
|
"commit": "4132240",
|
||||||
"author": "Eren Gölge @erogol",
|
"author": "Eren Gölge @erogol",
|
||||||
"license": "MPL",
|
"license": "MPL",
|
||||||
|
@ -783,14 +782,14 @@
|
||||||
"ek1": {
|
"ek1": {
|
||||||
"wavegrad": {
|
"wavegrad": {
|
||||||
"description": "EK1 en-rp wavegrad by NMStoker",
|
"description": "EK1 en-rp wavegrad by NMStoker",
|
||||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ek1--wavegrad.zip",
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/vocoder_models--en--ek1--wavegrad.zip",
|
||||||
"commit": "c802255",
|
"commit": "c802255",
|
||||||
"license": "apache 2.0"
|
"license": "apache 2.0"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"ljspeech": {
|
"ljspeech": {
|
||||||
"multiband-melgan": {
|
"multiband-melgan": {
|
||||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ljspeech--multiband-melgan.zip",
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/vocoder_models--en--ljspeech--multiband-melgan.zip",
|
||||||
"commit": "ea976b0",
|
"commit": "ea976b0",
|
||||||
"author": "Eren Gölge @erogol",
|
"author": "Eren Gölge @erogol",
|
||||||
"license": "MPL",
|
"license": "MPL",
|
||||||
|
@ -798,7 +797,7 @@
|
||||||
},
|
},
|
||||||
"hifigan_v2": {
|
"hifigan_v2": {
|
||||||
"description": "HiFiGAN_v2 LJSpeech vocoder from https://arxiv.org/abs/2010.05646.",
|
"description": "HiFiGAN_v2 LJSpeech vocoder from https://arxiv.org/abs/2010.05646.",
|
||||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ljspeech--hifigan_v2.zip",
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/vocoder_models--en--ljspeech--hifigan_v2.zip",
|
||||||
"commit": "bae2ad0f",
|
"commit": "bae2ad0f",
|
||||||
"author": "@erogol",
|
"author": "@erogol",
|
||||||
"license": "apache 2.0",
|
"license": "apache 2.0",
|
||||||
|
@ -806,7 +805,7 @@
|
||||||
},
|
},
|
||||||
"univnet": {
|
"univnet": {
|
||||||
"description": "UnivNet model finetuned on TacotronDDC_ph spectrograms for better compatibility.",
|
"description": "UnivNet model finetuned on TacotronDDC_ph spectrograms for better compatibility.",
|
||||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ljspeech--univnet_v2.zip",
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/vocoder_models--en--ljspeech--univnet_v2.zip",
|
||||||
"commit": "4581e3d",
|
"commit": "4581e3d",
|
||||||
"author": "Eren @erogol",
|
"author": "Eren @erogol",
|
||||||
"license": "apache 2.0",
|
"license": "apache 2.0",
|
||||||
|
@ -816,7 +815,7 @@
|
||||||
"blizzard2013": {
|
"blizzard2013": {
|
||||||
"hifigan_v2": {
|
"hifigan_v2": {
|
||||||
"description": "HiFiGAN_v2 LJSpeech vocoder from https://arxiv.org/abs/2010.05646.",
|
"description": "HiFiGAN_v2 LJSpeech vocoder from https://arxiv.org/abs/2010.05646.",
|
||||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.0_models/vocoder_models--en--blizzard2013--hifigan_v2.zip",
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.7.0_models/vocoder_models--en--blizzard2013--hifigan_v2.zip",
|
||||||
"commit": "d6284e7",
|
"commit": "d6284e7",
|
||||||
"author": "Adam Froghyar @a-froghyar",
|
"author": "Adam Froghyar @a-froghyar",
|
||||||
"license": "apache 2.0",
|
"license": "apache 2.0",
|
||||||
|
@ -826,7 +825,7 @@
|
||||||
"vctk": {
|
"vctk": {
|
||||||
"hifigan_v2": {
|
"hifigan_v2": {
|
||||||
"description": "Finetuned and intended to be used with tts_models/en/vctk/sc-glow-tts",
|
"description": "Finetuned and intended to be used with tts_models/en/vctk/sc-glow-tts",
|
||||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--vctk--hifigan_v2.zip",
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/vocoder_models--en--vctk--hifigan_v2.zip",
|
||||||
"commit": "2f07160",
|
"commit": "2f07160",
|
||||||
"author": "Edresson Casanova",
|
"author": "Edresson Casanova",
|
||||||
"license": "apache 2.0",
|
"license": "apache 2.0",
|
||||||
|
@ -836,7 +835,7 @@
|
||||||
"sam": {
|
"sam": {
|
||||||
"hifigan_v2": {
|
"hifigan_v2": {
|
||||||
"description": "Finetuned and intended to be used with tts_models/en/sam/tacotron_DDC",
|
"description": "Finetuned and intended to be used with tts_models/en/sam/tacotron_DDC",
|
||||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--sam--hifigan_v2.zip",
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/vocoder_models--en--sam--hifigan_v2.zip",
|
||||||
"commit": "2f07160",
|
"commit": "2f07160",
|
||||||
"author": "Eren Gölge @erogol",
|
"author": "Eren Gölge @erogol",
|
||||||
"license": "apache 2.0",
|
"license": "apache 2.0",
|
||||||
|
@ -847,7 +846,7 @@
|
||||||
"nl": {
|
"nl": {
|
||||||
"mai": {
|
"mai": {
|
||||||
"parallel-wavegan": {
|
"parallel-wavegan": {
|
||||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--nl--mai--parallel-wavegan.zip",
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/vocoder_models--nl--mai--parallel-wavegan.zip",
|
||||||
"author": "@r-dh",
|
"author": "@r-dh",
|
||||||
"license": "apache 2.0",
|
"license": "apache 2.0",
|
||||||
"commit": "unknown"
|
"commit": "unknown"
|
||||||
|
@ -857,19 +856,19 @@
|
||||||
"de": {
|
"de": {
|
||||||
"thorsten": {
|
"thorsten": {
|
||||||
"wavegrad": {
|
"wavegrad": {
|
||||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--de--thorsten--wavegrad.zip",
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/vocoder_models--de--thorsten--wavegrad.zip",
|
||||||
"author": "@thorstenMueller",
|
"author": "@thorstenMueller",
|
||||||
"license": "apache 2.0",
|
"license": "apache 2.0",
|
||||||
"commit": "unknown"
|
"commit": "unknown"
|
||||||
},
|
},
|
||||||
"fullband-melgan": {
|
"fullband-melgan": {
|
||||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--de--thorsten--fullband-melgan.zip",
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/vocoder_models--de--thorsten--fullband-melgan.zip",
|
||||||
"author": "@thorstenMueller",
|
"author": "@thorstenMueller",
|
||||||
"license": "apache 2.0",
|
"license": "apache 2.0",
|
||||||
"commit": "unknown"
|
"commit": "unknown"
|
||||||
},
|
},
|
||||||
"hifigan_v1": {
|
"hifigan_v1": {
|
||||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/vocoder_models--de--thorsten--hifigan_v1.zip",
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.8.0_models/vocoder_models--de--thorsten--hifigan_v1.zip",
|
||||||
"description": "HifiGAN vocoder model for Thorsten Neutral Dec2021 22k Samplerate Tacotron2 DDC model",
|
"description": "HifiGAN vocoder model for Thorsten Neutral Dec2021 22k Samplerate Tacotron2 DDC model",
|
||||||
"author": "@thorstenMueller",
|
"author": "@thorstenMueller",
|
||||||
"license": "apache 2.0",
|
"license": "apache 2.0",
|
||||||
|
@ -880,7 +879,7 @@
|
||||||
"ja": {
|
"ja": {
|
||||||
"kokoro": {
|
"kokoro": {
|
||||||
"hifigan_v1": {
|
"hifigan_v1": {
|
||||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--ja--kokoro--hifigan_v1.zip",
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/vocoder_models--ja--kokoro--hifigan_v1.zip",
|
||||||
"description": "HifiGAN model trained for kokoro dataset by @kaiidams",
|
"description": "HifiGAN model trained for kokoro dataset by @kaiidams",
|
||||||
"author": "@kaiidams",
|
"author": "@kaiidams",
|
||||||
"license": "apache 2.0",
|
"license": "apache 2.0",
|
||||||
|
@ -891,7 +890,7 @@
|
||||||
"uk": {
|
"uk": {
|
||||||
"mai": {
|
"mai": {
|
||||||
"multiband-melgan": {
|
"multiband-melgan": {
|
||||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--uk--mai--multiband-melgan.zip",
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/vocoder_models--uk--mai--multiband-melgan.zip",
|
||||||
"author": "@robinhad",
|
"author": "@robinhad",
|
||||||
"commit": "bdab788d",
|
"commit": "bdab788d",
|
||||||
"license": "MIT",
|
"license": "MIT",
|
||||||
|
@ -902,7 +901,7 @@
|
||||||
"tr": {
|
"tr": {
|
||||||
"common-voice": {
|
"common-voice": {
|
||||||
"hifigan": {
|
"hifigan": {
|
||||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--tr--common-voice--hifigan.zip",
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/vocoder_models--tr--common-voice--hifigan.zip",
|
||||||
"description": "HifiGAN model using an unknown speaker from the Common-Voice dataset.",
|
"description": "HifiGAN model using an unknown speaker from the Common-Voice dataset.",
|
||||||
"author": "Fatih Akademi",
|
"author": "Fatih Akademi",
|
||||||
"license": "MIT",
|
"license": "MIT",
|
||||||
|
@ -913,7 +912,7 @@
|
||||||
"be": {
|
"be": {
|
||||||
"common-voice": {
|
"common-voice": {
|
||||||
"hifigan": {
|
"hifigan": {
|
||||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.16.6/vocoder_models--be--common-voice--hifigan.zip",
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.16.6/vocoder_models--be--common-voice--hifigan.zip",
|
||||||
"description": "Belarusian HiFiGAN model created by @alex73 (Github).",
|
"description": "Belarusian HiFiGAN model created by @alex73 (Github).",
|
||||||
"author": "@alex73",
|
"author": "@alex73",
|
||||||
"license": "CC-BY-SA 4.0",
|
"license": "CC-BY-SA 4.0",
|
||||||
|
@ -926,12 +925,34 @@
|
||||||
"multilingual": {
|
"multilingual": {
|
||||||
"vctk": {
|
"vctk": {
|
||||||
"freevc24": {
|
"freevc24": {
|
||||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.13.0_models/voice_conversion_models--multilingual--vctk--freevc24.zip",
|
"github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.13.0_models/voice_conversion_models--multilingual--vctk--freevc24.zip",
|
||||||
"description": "FreeVC model trained on VCTK dataset from https://github.com/OlaWod/FreeVC",
|
"description": "FreeVC model trained on VCTK dataset from https://github.com/OlaWod/FreeVC",
|
||||||
"author": "Jing-Yi Li @OlaWod",
|
"author": "Jing-Yi Li @OlaWod",
|
||||||
"license": "MIT",
|
"license": "MIT",
|
||||||
"commit": null
|
"commit": null
|
||||||
}
|
}
|
||||||
|
},
|
||||||
|
"multi-dataset": {
|
||||||
|
"openvoice_v1": {
|
||||||
|
"hf_url": [
|
||||||
|
"https://huggingface.co/myshell-ai/OpenVoice/resolve/main/checkpoints/converter/config.json",
|
||||||
|
"https://huggingface.co/myshell-ai/OpenVoice/resolve/main/checkpoints/converter/checkpoint.pth"
|
||||||
|
],
|
||||||
|
"description": "OpenVoice VC model from https://huggingface.co/myshell-ai/OpenVoiceV2",
|
||||||
|
"author": "MyShell.ai",
|
||||||
|
"license": "MIT",
|
||||||
|
"commit": null
|
||||||
|
},
|
||||||
|
"openvoice_v2": {
|
||||||
|
"hf_url": [
|
||||||
|
"https://huggingface.co/myshell-ai/OpenVoiceV2/resolve/main/converter/config.json",
|
||||||
|
"https://huggingface.co/myshell-ai/OpenVoiceV2/resolve/main/converter/checkpoint.pth"
|
||||||
|
],
|
||||||
|
"description": "OpenVoice VC model from https://huggingface.co/myshell-ai/OpenVoiceV2",
|
||||||
|
"author": "MyShell.ai",
|
||||||
|
"license": "MIT",
|
||||||
|
"commit": null
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
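The new `multi-dataset` entries above are fetched from Hugging Face via `hf_url` rather than from a GitHub release archive. A rough sketch of how such an entry is consumed through the Python API, assuming the registry name resolves through `ModelManager` as usual (illustration only, wav paths are placeholders):

```python
# Sketch only: model name taken from the "voice_conversion_models/multilingual/multi-dataset"
# block above; the wav paths are placeholders.
from TTS.api import TTS

vc = TTS("voice_conversion_models/multilingual/multi-dataset/openvoice_v2")
vc.voice_conversion_to_file(
    source_wav="speech.wav",      # content to keep
    target_wav="reference.wav",   # voice to clone
    file_path="converted.wav",
)
```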
@@ -1 +0,0 @@
-0.22.0
@@ -1,6 +1,33 @@
-import os
+import importlib.metadata

-with open(os.path.join(os.path.dirname(__file__), "VERSION"), "r", encoding="utf-8") as f:
-    version = f.read().strip()
+from TTS.utils.generic_utils import is_pytorch_at_least_2_4

-__version__ = version
+__version__ = importlib.metadata.version("coqui-tts")
+
+
+if is_pytorch_at_least_2_4():
+    import _codecs
+    from collections import defaultdict
+
+    import numpy as np
+    import torch
+
+    from TTS.config.shared_configs import BaseDatasetConfig
+    from TTS.tts.configs.xtts_config import XttsConfig
+    from TTS.tts.models.xtts import XttsArgs, XttsAudioConfig
+    from TTS.utils.radam import RAdam
+
+    torch.serialization.add_safe_globals([dict, defaultdict, RAdam])
+
+    # Bark
+    torch.serialization.add_safe_globals(
+        [
+            np.core.multiarray.scalar,
+            np.dtype,
+            np.dtypes.Float64DType,
+            _codecs.encode,  # TODO: safe by default from Pytorch 2.5
+        ]
+    )
+
+    # XTTS
+    torch.serialization.add_safe_globals([BaseDatasetConfig, XttsConfig, XttsAudioConfig, XttsArgs])
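The `add_safe_globals` registrations above matter because PyTorch 2.4+ moves `torch.load` towards `weights_only=True`, which only unpickles allowlisted classes; registering the XTTS and Bark config classes keeps released checkpoints loadable under that restriction. A minimal sketch of the effect, with a hypothetical checkpoint path:

```python
# Sketch: importing TTS has the side effect of registering XttsConfig & co. as safe
# globals (only on PyTorch >= 2.4), so a restricted weights_only load can succeed.
import torch

import TTS  # noqa: F401  (side effect: torch.serialization.add_safe_globals(...))

checkpoint = torch.load("path/to/xtts/model.pth", map_location="cpu", weights_only=True)
```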
197 TTS/api.py

@@ -1,15 +1,18 @@
+"""Coqui TTS Python API."""
+
+import logging
 import tempfile
 import warnings
 from pathlib import Path
-from typing import Union
+from typing import Optional

-import numpy as np
 from torch import nn

-from TTS.utils.audio.numpy_transforms import save_wav
+from TTS.config import load_config
 from TTS.utils.manage import ModelManager
 from TTS.utils.synthesizer import Synthesizer
-from TTS.config import load_config
+
+logger = logging.getLogger(__name__)


 class TTS(nn.Module):
@@ -18,13 +21,19 @@ class TTS(nn.Module):
     def __init__(
         self,
         model_name: str = "",
-        model_path: str = None,
-        config_path: str = None,
-        vocoder_path: str = None,
-        vocoder_config_path: str = None,
+        *,
+        model_path: Optional[str] = None,
+        config_path: Optional[str] = None,
+        vocoder_name: Optional[str] = None,
+        vocoder_path: Optional[str] = None,
+        vocoder_config_path: Optional[str] = None,
+        encoder_path: Optional[str] = None,
+        encoder_config_path: Optional[str] = None,
+        speakers_file_path: Optional[str] = None,
+        language_ids_file_path: Optional[str] = None,
         progress_bar: bool = True,
-        gpu=False,
-    ):
+        gpu: bool = False,
+    ) -> None:
         """🐸TTS python interface that allows to load and use the released models.

         Example with a multi-speaker model:
@@ -34,118 +43,147 @@ class TTS(nn.Module):
         >>> tts.tts_to_file(text="Hello world!", speaker=tts.speakers[0], language=tts.languages[0], file_path="output.wav")

         Example with a single-speaker model:
-        >>> tts = TTS(model_name="tts_models/de/thorsten/tacotron2-DDC", progress_bar=False, gpu=False)
+        >>> tts = TTS(model_name="tts_models/de/thorsten/tacotron2-DDC", progress_bar=False)
         >>> tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path="output.wav")

         Example loading a model from a path:
-        >>> tts = TTS(model_path="/path/to/checkpoint_100000.pth", config_path="/path/to/config.json", progress_bar=False, gpu=False)
+        >>> tts = TTS(model_path="/path/to/checkpoint_100000.pth", config_path="/path/to/config.json", progress_bar=False)
         >>> tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path="output.wav")

         Example voice cloning with YourTTS in English, French and Portuguese:
-        >>> tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False, gpu=True)
+        >>> tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False).to("cuda")
         >>> tts.tts_to_file("This is voice cloning.", speaker_wav="my/cloning/audio.wav", language="en", file_path="thisisit.wav")
         >>> tts.tts_to_file("C'est le clonage de la voix.", speaker_wav="my/cloning/audio.wav", language="fr", file_path="thisisit.wav")
         >>> tts.tts_to_file("Isso é clonagem de voz.", speaker_wav="my/cloning/audio.wav", language="pt", file_path="thisisit.wav")

         Example Fairseq TTS models (uses ISO language codes in https://dl.fbaipublicfiles.com/mms/tts/all-tts-languages.html):
-        >>> tts = TTS(model_name="tts_models/eng/fairseq/vits", progress_bar=False, gpu=True)
+        >>> tts = TTS(model_name="tts_models/eng/fairseq/vits", progress_bar=False).to("cuda")
         >>> tts.tts_to_file("This is a test.", file_path="output.wav")

         Args:
             model_name (str, optional): Model name to load. You can list models by ```tts.models```. Defaults to None.
             model_path (str, optional): Path to the model checkpoint. Defaults to None.
             config_path (str, optional): Path to the model config. Defaults to None.
+            vocoder_name (str, optional): Pre-trained vocoder to use. Defaults to None, i.e. using the default vocoder.
             vocoder_path (str, optional): Path to the vocoder checkpoint. Defaults to None.
             vocoder_config_path (str, optional): Path to the vocoder config. Defaults to None.
-            progress_bar (bool, optional): Whether to pring a progress bar while downloading a model. Defaults to True.
-            gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
+            encoder_path: Path to speaker encoder checkpoint. Default to None.
+            encoder_config_path: Path to speaker encoder config file. Defaults to None.
+            speakers_file_path: JSON file for multi-speaker model. Defaults to None.
+            language_ids_file_path: JSON file for multilingual model. Defaults to None
+            progress_bar (bool, optional): Whether to print a progress bar while downloading a model. Defaults to True.
+            gpu (bool, optional): Enable/disable GPU. Defaults to False. DEPRECATED, use TTS(...).to("cuda")
         """
         super().__init__()
-        self.manager = ModelManager(models_file=self.get_models_file_path(), progress_bar=progress_bar, verbose=False)
+        self.manager = ModelManager(models_file=self.get_models_file_path(), progress_bar=progress_bar)
         self.config = load_config(config_path) if config_path else None
         self.synthesizer = None
         self.voice_converter = None
         self.model_name = ""

+        self.vocoder_path = vocoder_path
+        self.vocoder_config_path = vocoder_config_path
+        self.encoder_path = encoder_path
+        self.encoder_config_path = encoder_config_path
+        self.speakers_file_path = speakers_file_path
+        self.language_ids_file_path = language_ids_file_path
+
         if gpu:
             warnings.warn("`gpu` will be deprecated. Please use `tts.to(device)` instead.")

         if model_name is not None and len(model_name) > 0:
             if "tts_models" in model_name:
-                self.load_tts_model_by_name(model_name, gpu)
+                self.load_tts_model_by_name(model_name, vocoder_name, gpu=gpu)
             elif "voice_conversion_models" in model_name:
-                self.load_vc_model_by_name(model_name, gpu)
+                self.load_vc_model_by_name(model_name, gpu=gpu)
+            # To allow just TTS("xtts")
             else:
-                self.load_model_by_name(model_name, gpu)
+                self.load_model_by_name(model_name, vocoder_name, gpu=gpu)

         if model_path:
-            self.load_tts_model_by_path(
-                model_path, config_path, vocoder_path=vocoder_path, vocoder_config=vocoder_config_path, gpu=gpu
-            )
+            self.load_tts_model_by_path(model_path, config_path, gpu=gpu)

     @property
-    def models(self):
+    def models(self) -> list[str]:
         return self.manager.list_tts_models()

     @property
-    def is_multi_speaker(self):
-        if hasattr(self.synthesizer.tts_model, "speaker_manager") and self.synthesizer.tts_model.speaker_manager:
+    def is_multi_speaker(self) -> bool:
+        if (
+            self.synthesizer is not None
+            and hasattr(self.synthesizer.tts_model, "speaker_manager")
+            and self.synthesizer.tts_model.speaker_manager
+        ):
             return self.synthesizer.tts_model.speaker_manager.num_speakers > 1
         return False

     @property
-    def is_multi_lingual(self):
+    def is_multi_lingual(self) -> bool:
         # Not sure what sets this to None, but applied a fix to prevent crashing.
         if (
             isinstance(self.model_name, str)
             and "xtts" in self.model_name
             or self.config
-            and ("xtts" in self.config.model or len(self.config.languages) > 1)
+            and ("xtts" in self.config.model or "languages" in self.config and len(self.config.languages) > 1)
         ):
             return True
-        if hasattr(self.synthesizer.tts_model, "language_manager") and self.synthesizer.tts_model.language_manager:
+        if (
+            self.synthesizer is not None
+            and hasattr(self.synthesizer.tts_model, "language_manager")
+            and self.synthesizer.tts_model.language_manager
+        ):
             return self.synthesizer.tts_model.language_manager.num_languages > 1
         return False

     @property
-    def speakers(self):
+    def speakers(self) -> list[str]:
         if not self.is_multi_speaker:
             return None
         return self.synthesizer.tts_model.speaker_manager.speaker_names

     @property
-    def languages(self):
+    def languages(self) -> list[str]:
         if not self.is_multi_lingual:
             return None
         return self.synthesizer.tts_model.language_manager.language_names

     @staticmethod
-    def get_models_file_path():
+    def get_models_file_path() -> Path:
         return Path(__file__).parent / ".models.json"

-    def list_models(self):
-        return ModelManager(models_file=TTS.get_models_file_path(), progress_bar=False, verbose=False)
+    @staticmethod
+    def list_models() -> list[str]:
+        return ModelManager(models_file=TTS.get_models_file_path(), progress_bar=False).list_models()

-    def download_model_by_name(self, model_name: str):
+    def download_model_by_name(
+        self, model_name: str, vocoder_name: Optional[str] = None
+    ) -> tuple[Optional[Path], Optional[Path], Optional[Path]]:
         model_path, config_path, model_item = self.manager.download_model(model_name)
         if "fairseq" in model_name or (model_item is not None and isinstance(model_item["model_url"], list)):
             # return model directory if there are multiple files
             # we assume that the model knows how to load itself
-            return None, None, None, None, model_path
+            return None, None, model_path
         if model_item.get("default_vocoder") is None:
-            return model_path, config_path, None, None, None
-        vocoder_path, vocoder_config_path, _ = self.manager.download_model(model_item["default_vocoder"])
-        return model_path, config_path, vocoder_path, vocoder_config_path, None
+            return model_path, config_path, None
+        if vocoder_name is None:
+            vocoder_name = model_item["default_vocoder"]
+        vocoder_path, vocoder_config_path, _ = self.manager.download_model(vocoder_name)
+        # A local vocoder model will take precedence if specified via vocoder_path
+        if self.vocoder_path is None or self.vocoder_config_path is None:
+            self.vocoder_path = vocoder_path
+            self.vocoder_config_path = vocoder_config_path
+        return model_path, config_path, None

-    def load_model_by_name(self, model_name: str, gpu: bool = False):
+    def load_model_by_name(self, model_name: str, vocoder_name: Optional[str] = None, *, gpu: bool = False) -> None:
         """Load one of the 🐸TTS models by name.

         Args:
             model_name (str): Model name to load. You can list models by ```tts.models```.
             gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
         """
-        self.load_tts_model_by_name(model_name, gpu)
+        self.load_tts_model_by_name(model_name, vocoder_name, gpu=gpu)

-    def load_vc_model_by_name(self, model_name: str, gpu: bool = False):
+    def load_vc_model_by_name(self, model_name: str, *, gpu: bool = False) -> None:
         """Load one of the voice conversion models by name.

         Args:
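Since `download_model_by_name` now accepts a `vocoder_name`, a non-default vocoder can be chosen at construction time through the new keyword-only argument instead of being wired up by hand. A hedged sketch; both registry names are assumptions based on the entries listed earlier:

```python
# Sketch: pair a TTS model with an explicit vocoder instead of its default one.
from TTS.api import TTS

tts = TTS(
    model_name="tts_models/en/ljspeech/tacotron2-DDC",
    vocoder_name="vocoder_models/en/ljspeech/hifigan_v2",
    progress_bar=False,
)
tts.tts_to_file(text="Testing a custom vocoder.", file_path="output.wav")
```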
@@ -153,10 +191,12 @@ class TTS(nn.Module):
             gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
         """
         self.model_name = model_name
-        model_path, config_path, _, _, _ = self.download_model_by_name(model_name)
-        self.voice_converter = Synthesizer(vc_checkpoint=model_path, vc_config=config_path, use_cuda=gpu)
+        model_path, config_path, model_dir = self.download_model_by_name(model_name)
+        self.voice_converter = Synthesizer(
+            vc_checkpoint=model_path, vc_config=config_path, model_dir=model_dir, use_cuda=gpu
+        )

-    def load_tts_model_by_name(self, model_name: str, gpu: bool = False):
+    def load_tts_model_by_name(self, model_name: str, vocoder_name: Optional[str] = None, *, gpu: bool = False) -> None:
         """Load one of 🐸TTS models by name.

         Args:
@@ -168,9 +208,7 @@ class TTS(nn.Module):
         self.synthesizer = None
         self.model_name = model_name

-        model_path, config_path, vocoder_path, vocoder_config_path, model_dir = self.download_model_by_name(
-            model_name
-        )
+        model_path, config_path, model_dir = self.download_model_by_name(model_name, vocoder_name)

         # init synthesizer
         # None values are fetch from the model
@@ -179,17 +217,15 @@ class TTS(nn.Module):
             tts_config_path=config_path,
             tts_speakers_file=None,
             tts_languages_file=None,
-            vocoder_checkpoint=vocoder_path,
-            vocoder_config=vocoder_config_path,
-            encoder_checkpoint=None,
-            encoder_config=None,
+            vocoder_checkpoint=self.vocoder_path,
+            vocoder_config=self.vocoder_config_path,
+            encoder_checkpoint=self.encoder_path,
+            encoder_config=self.encoder_config_path,
             model_dir=model_dir,
             use_cuda=gpu,
         )

-    def load_tts_model_by_path(
-        self, model_path: str, config_path: str, vocoder_path: str = None, vocoder_config: str = None, gpu: bool = False
-    ):
+    def load_tts_model_by_path(self, model_path: str, config_path: str, *, gpu: bool = False) -> None:
         """Load a model from a path.

         Args:
@@ -203,22 +239,22 @@ class TTS(nn.Module):
         self.synthesizer = Synthesizer(
             tts_checkpoint=model_path,
             tts_config_path=config_path,
-            tts_speakers_file=None,
-            tts_languages_file=None,
-            vocoder_checkpoint=vocoder_path,
-            vocoder_config=vocoder_config,
-            encoder_checkpoint=None,
-            encoder_config=None,
+            tts_speakers_file=self.speakers_file_path,
+            tts_languages_file=self.language_ids_file_path,
+            vocoder_checkpoint=self.vocoder_path,
+            vocoder_config=self.vocoder_config_path,
+            encoder_checkpoint=self.encoder_path,
+            encoder_config=self.encoder_config_path,
             use_cuda=gpu,
         )

     def _check_arguments(
         self,
-        speaker: str = None,
-        language: str = None,
-        speaker_wav: str = None,
-        emotion: str = None,
-        speed: float = None,
+        speaker: Optional[str] = None,
+        language: Optional[str] = None,
+        speaker_wav: Optional[str] = None,
+        emotion: Optional[str] = None,
+        speed: Optional[float] = None,
         **kwargs,
     ) -> None:
         """Check if the arguments are valid for the model."""
@@ -231,7 +267,7 @@ class TTS(nn.Module):
             raise ValueError("Model is not multi-speaker but `speaker` is provided.")
         if not self.is_multi_lingual and language is not None:
             raise ValueError("Model is not multi-lingual but `language` is provided.")
-        if not emotion is None and not speed is None:
+        if emotion is not None and speed is not None:
             raise ValueError("Emotion and speed can only be used with Coqui Studio models. Which is discontinued.")

     def tts(
@@ -278,10 +314,6 @@ class TTS(nn.Module):
             speaker_name=speaker,
             language_name=language,
             speaker_wav=speaker_wav,
-            reference_wav=None,
-            style_wav=None,
-            style_text=None,
-            reference_speaker_name=None,
             split_sentences=split_sentences,
             speed=speed,
             **kwargs,
@@ -300,7 +332,7 @@ class TTS(nn.Module):
         file_path: str = "output.wav",
         split_sentences: bool = True,
         **kwargs,
-    ):
+    ) -> str:
         """Convert text to speech.

         Args:
@@ -356,15 +388,18 @@ class TTS(nn.Module):
             target_wav (str):`
                 Path to the target wav file.
         """
-        wav = self.voice_converter.voice_conversion(source_wav=source_wav, target_wav=target_wav)
-        return wav
+        if self.voice_converter is None:
+            msg = "The selected model does not support voice conversion."
+            raise RuntimeError(msg)
+        return self.voice_converter.voice_conversion(source_wav=source_wav, target_wav=target_wav)

     def voice_conversion_to_file(
         self,
         source_wav: str,
         target_wav: str,
         file_path: str = "output.wav",
-    ):
+        pipe_out=None,
+    ) -> str:
         """Voice conversion with FreeVC. Convert source wav to target speaker.

         Args:
@@ -374,9 +409,11 @@ class TTS(nn.Module):
                 Path to the target wav file.
             file_path (str, optional):
                 Output file path. Defaults to "output.wav".
+            pipe_out (BytesIO, optional):
+                Flag to stdout the generated TTS wav file for shell pipe.
         """
         wav = self.voice_conversion(source_wav=source_wav, target_wav=target_wav)
-        save_wav(wav=wav, path=file_path, sample_rate=self.voice_converter.vc_config.audio.output_sample_rate)
+        self.voice_converter.save_wav(wav=wav, path=file_path, pipe_out=pipe_out)
         return file_path

     def tts_with_vc(
@@ -429,7 +466,8 @@ class TTS(nn.Module):
         file_path: str = "output.wav",
         speaker: str = None,
         split_sentences: bool = True,
-    ):
+        pipe_out=None,
+    ) -> str:
         """Convert text to speech with voice conversion and save to file.

         Check `tts_with_vc` for more details.
@@ -452,8 +490,11 @@ class TTS(nn.Module):
                 Split text into sentences, synthesize them separately and concatenate the file audio.
                 Setting it False uses more VRAM and possibly hit model specific text length or VRAM limits. Only
                 applicable to the 🐸TTS models. Defaults to True.
+            pipe_out (BytesIO, optional):
+                Flag to stdout the generated TTS wav file for shell pipe.
         """
         wav = self.tts_with_vc(
             text=text, language=language, speaker_wav=speaker_wav, speaker=speaker, split_sentences=split_sentences
         )
-        save_wav(wav=wav, path=file_path, sample_rate=self.voice_converter.vc_config.audio.output_sample_rate)
+        self.voice_converter.save_wav(wav=wav, path=file_path, pipe_out=pipe_out)
+        return file_path
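The new `pipe_out` argument on the `*_to_file` helpers lets callers stream the generated wav to a byte stream (typically stdout) in addition to the output file. A speculative usage sketch; the model name and wav paths are placeholders:

```python
# Sketch: synthesize with voice conversion and also write the wav to stdout so it
# can be piped, e.g. `python speak.py | aplay`.
import sys

from TTS.api import TTS

tts = TTS("tts_models/multilingual/multi-dataset/your_tts")
tts.tts_with_vc_to_file(
    "This sentence goes through TTS and then voice conversion.",
    language="en",
    speaker_wav="reference.wav",
    file_path="cloned.wav",
    pipe_out=sys.stdout.buffer,
)
```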
@@ -1,4 +1,6 @@
 """Get detailed info about the working environment."""

+import json
 import os
 import platform
 import sys
@@ -6,11 +8,10 @@ import sys
 import numpy
 import torch

-sys.path += [os.path.abspath(".."), os.path.abspath(".")]
-import json

 import TTS

+sys.path += [os.path.abspath(".."), os.path.abspath(".")]


 def system_info():
     return {
@@ -1,21 +1,26 @@
 import argparse
 import importlib
+import logging
 import os
+import sys
 from argparse import RawTextHelpFormatter

 import numpy as np
 import torch
 from torch.utils.data import DataLoader
 from tqdm import tqdm
+from trainer.io import load_checkpoint

 from TTS.config import load_config
 from TTS.tts.datasets.TTSDataset import TTSDataset
 from TTS.tts.models import setup_model
 from TTS.tts.utils.text.characters import make_symbols, phonemes, symbols
 from TTS.utils.audio import AudioProcessor
-from TTS.utils.io import load_checkpoint
+from TTS.utils.generic_utils import ConsoleFormatter, setup_logger

 if __name__ == "__main__":
+    setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter())
+
     # pylint: disable=bad-option-value
     parser = argparse.ArgumentParser(
         description="""Extract attention masks from trained Tacotron/Tacotron2 models.
@@ -31,7 +36,7 @@ Example run:
   --data_path /root/LJSpeech-1.1/
   --batch_size 32
   --dataset ljspeech
-  --use_cuda True
+  --use_cuda
 """,
         formatter_class=RawTextHelpFormatter,
     )
@@ -58,7 +63,7 @@ Example run:
         help="Dataset metafile inclusing file paths with transcripts.",
     )
     parser.add_argument("--data_path", type=str, default="", help="Defines the data path. It overwrites config.json.")
-    parser.add_argument("--use_cuda", type=bool, default=False, help="enable/disable cuda.")
+    parser.add_argument("--use_cuda", action=argparse.BooleanOptionalAction, default=False, help="enable/disable cuda.")

     parser.add_argument(
         "--batch_size", default=16, type=int, help="Batch size for the model. Use batch_size=1 if you have no CUDA."
@@ -70,13 +75,13 @@ Example run:

     # if the vocabulary was passed, replace the default
     if "characters" in C.keys():
-        symbols, phonemes = make_symbols(**C.characters)
+        symbols, phonemes = make_symbols(**C.characters)  # noqa: F811

     # load the model
     num_chars = len(phonemes) if C.use_phonemes else len(symbols)
     # TODO: handle multi-speaker
     model = setup_model(C)
-    model, _ = load_checkpoint(model, args.model_path, args.use_cuda, True)
+    model, _ = load_checkpoint(model, args.model_path, use_cuda=args.use_cuda, eval=True)

     # data loader
     preprocessor = importlib.import_module("TTS.tts.datasets.formatters")
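The recurring `type=bool` → `action=argparse.BooleanOptionalAction` change in these scripts fixes a classic argparse pitfall: `type=bool` calls `bool()` on the raw string, so `--use_cuda False` still evaluates to `True`. A small self-contained illustration (not part of the repository):

```python
# Demonstrates why type=bool is broken and what BooleanOptionalAction (Python >= 3.9) does.
import argparse

broken = argparse.ArgumentParser()
broken.add_argument("--use_cuda", type=bool, default=False)
print(broken.parse_args(["--use_cuda", "False"]).use_cuda)  # True: bool("False") is truthy

fixed = argparse.ArgumentParser()
fixed.add_argument("--use_cuda", action=argparse.BooleanOptionalAction, default=False)
print(fixed.parse_args(["--use_cuda"]).use_cuda)     # True
print(fixed.parse_args(["--no-use_cuda"]).use_cuda)  # False
```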
@@ -1,5 +1,7 @@
 import argparse
+import logging
 import os
+import sys
 from argparse import RawTextHelpFormatter

 import torch
@@ -10,6 +12,7 @@ from TTS.config.shared_configs import BaseDatasetConfig
 from TTS.tts.datasets import load_tts_samples
 from TTS.tts.utils.managers import save_file
 from TTS.tts.utils.speakers import SpeakerManager
+from TTS.utils.generic_utils import ConsoleFormatter, setup_logger


 def compute_embeddings(
@@ -100,6 +103,8 @@ def compute_embeddings(


 if __name__ == "__main__":
+    setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter())
+
     parser = argparse.ArgumentParser(
         description="""Compute embedding vectors for each audio file in a dataset and store them keyed by `{dataset_name}#{file_path}` in a .pth file\n\n"""
         """
@@ -146,7 +151,7 @@ if __name__ == "__main__":
         default=False,
         action="store_true",
     )
-    parser.add_argument("--disable_cuda", type=bool, help="Flag to disable cuda.", default=False)
+    parser.add_argument("--disable_cuda", action="store_true", help="Flag to disable cuda.", default=False)
     parser.add_argument("--no_eval", help="Do not compute eval?. Default False", default=False, action="store_true")
     parser.add_argument(
         "--formatter_name",
@@ -3,7 +3,9 @@

 import argparse
 import glob
+import logging
 import os
+import sys

 import numpy as np
 from tqdm import tqdm
@@ -12,10 +14,13 @@ from tqdm import tqdm
 from TTS.config import load_config
 from TTS.tts.datasets import load_tts_samples
 from TTS.utils.audio import AudioProcessor
+from TTS.utils.generic_utils import ConsoleFormatter, setup_logger


 def main():
     """Run preprocessing process."""
+    setup_logger("TTS", level=logging.INFO, stream=sys.stderr, formatter=ConsoleFormatter())
+
     parser = argparse.ArgumentParser(description="Compute mean and variance of spectrogtram features.")
     parser.add_argument("config_path", type=str, help="TTS config file path to define audio processin parameters.")
     parser.add_argument("out_path", type=str, help="save path (directory and filename).")
@@ -1,4 +1,6 @@
 import argparse
+import logging
+import sys
 from argparse import RawTextHelpFormatter

 import torch
@@ -7,6 +9,7 @@ from tqdm import tqdm
 from TTS.config import load_config
 from TTS.tts.datasets import load_tts_samples
 from TTS.tts.utils.speakers import SpeakerManager
+from TTS.utils.generic_utils import ConsoleFormatter, setup_logger


 def compute_encoder_accuracy(dataset_items, encoder_manager):
@@ -51,6 +54,8 @@ def compute_encoder_accuracy(dataset_items, encoder_manager):


 if __name__ == "__main__":
+    setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter())
+
     parser = argparse.ArgumentParser(
         description="""Compute the accuracy of the encoder.\n\n"""
         """
@@ -71,8 +76,8 @@ if __name__ == "__main__":
         type=str,
         help="Path to dataset config file.",
     )
-    parser.add_argument("--use_cuda", type=bool, help="flag to set cuda.", default=True)
-    parser.add_argument("--eval", type=bool, help="compute eval.", default=True)
+    parser.add_argument("--use_cuda", action=argparse.BooleanOptionalAction, help="flag to set cuda.", default=True)
+    parser.add_argument("--eval", action=argparse.BooleanOptionalAction, help="compute eval.", default=True)

     args = parser.parse_args()

@@ -2,12 +2,15 @@
 """Extract Mel spectrograms with teacher forcing."""

 import argparse
+import logging
 import os
+import sys

 import numpy as np
 import torch
 from torch.utils.data import DataLoader
 from tqdm import tqdm
+from trainer.generic_utils import count_parameters

 from TTS.config import load_config
 from TTS.tts.datasets import TTSDataset, load_tts_samples
@@ -16,12 +19,12 @@ from TTS.tts.utils.speakers import SpeakerManager
 from TTS.tts.utils.text.tokenizer import TTSTokenizer
 from TTS.utils.audio import AudioProcessor
 from TTS.utils.audio.numpy_transforms import quantize
-from TTS.utils.generic_utils import count_parameters
+from TTS.utils.generic_utils import ConsoleFormatter, setup_logger

 use_cuda = torch.cuda.is_available()


-def setup_loader(ap, r, verbose=False):
+def setup_loader(ap, r):
     tokenizer, _ = TTSTokenizer.init_from_config(c)
     dataset = TTSDataset(
         outputs_per_step=r,
@@ -37,7 +40,6 @@ def setup_loader(ap, r, verbose=False):
         phoneme_cache_path=c.phoneme_cache_path,
         precompute_num_workers=0,
         use_noise_augment=False,
-        verbose=verbose,
         speaker_id_mapping=speaker_manager.name_to_id if c.use_speaker_embedding else None,
         d_vector_mapping=speaker_manager.embeddings if c.use_d_vector_file else None,
     )
@@ -257,7 +259,7 @@ def main(args): # pylint: disable=redefined-outer-name
     print("\n > Model has {} parameters".format(num_params), flush=True)
     # set r
     r = 1 if c.model.lower() == "glow_tts" else model.decoder.r
-    own_loader = setup_loader(ap, r, verbose=True)
+    own_loader = setup_loader(ap, r)

     extract_spectrograms(
         own_loader,
@@ -272,6 +274,8 @@ def main(args): # pylint: disable=redefined-outer-name


 if __name__ == "__main__":
+    setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter())
+
     parser = argparse.ArgumentParser()
     parser.add_argument("--config_path", type=str, help="Path to config file for training.", required=True)
     parser.add_argument("--checkpoint_path", type=str, help="Model file to be restored.", required=True)
@@ -279,7 +283,7 @@ if __name__ == "__main__":
     parser.add_argument("--debug", default=False, action="store_true", help="Save audio files for debug")
     parser.add_argument("--save_audio", default=False, action="store_true", help="Save audio files")
     parser.add_argument("--quantize_bits", type=int, default=0, help="Save quantized audio files if non-zero")
-    parser.add_argument("--eval", type=bool, help="compute eval.", default=True)
+    parser.add_argument("--eval", action=argparse.BooleanOptionalAction, help="compute eval.", default=True)
     args = parser.parse_args()

     c = load_config(args.config_path)
@@ -1,12 +1,18 @@
 """Find all the unique characters in a dataset"""

 import argparse
+import logging
+import sys
 from argparse import RawTextHelpFormatter

 from TTS.config import load_config
-from TTS.tts.datasets import load_tts_samples
+from TTS.tts.datasets import find_unique_chars, load_tts_samples
+from TTS.utils.generic_utils import ConsoleFormatter, setup_logger


 def main():
+    setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter())
+
     # pylint: disable=bad-option-value
     parser = argparse.ArgumentParser(
         description="""Find all the unique characters or phonemes in a dataset.\n\n"""
@@ -28,17 +34,7 @@ def main():
     )

     items = train_items + eval_items
+    find_unique_chars(items)
-    texts = "".join(item["text"] for item in items)
-    chars = set(texts)
-    lower_chars = filter(lambda c: c.islower(), chars)
-    chars_force_lower = [c.lower() for c in chars]
-    chars_force_lower = set(chars_force_lower)
-
-    print(f" > Number of unique characters: {len(chars)}")
-    print(f" > Unique characters: {''.join(sorted(chars))}")
-    print(f" > Unique lower characters: {''.join(sorted(lower_chars))}")
-    print(f" > Unique all forced to lower characters: {''.join(sorted(chars_force_lower))}")


 if __name__ == "__main__":
@@ -1,6 +1,9 @@
 """Find all the unique characters in a dataset"""

 import argparse
+import logging
 import multiprocessing
+import sys
 from argparse import RawTextHelpFormatter

 from tqdm.contrib.concurrent import process_map
@@ -8,15 +11,18 @@ from tqdm.contrib.concurrent import process_map
 from TTS.config import load_config
 from TTS.tts.datasets import load_tts_samples
 from TTS.tts.utils.text.phonemizers import Gruut
+from TTS.utils.generic_utils import ConsoleFormatter, setup_logger


 def compute_phonemes(item):
     text = item["text"]
     ph = phonemizer.phonemize(text).replace("|", "")
-    return set(list(ph))
+    return set(ph)


 def main():
+    setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter())

     # pylint: disable=W0601
     global c, phonemizer
     # pylint: disable=bad-option-value
@@ -1,12 +1,15 @@
 import argparse
 import glob
+import logging
 import multiprocessing
 import os
 import pathlib
+import sys

 import torch
 from tqdm import tqdm

+from TTS.utils.generic_utils import ConsoleFormatter, setup_logger
 from TTS.utils.vad import get_vad_model_and_utils, remove_silence

 torch.set_num_threads(1)
@@ -75,8 +78,10 @@ def preprocess_audios():


 if __name__ == "__main__":
+    setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter())

     parser = argparse.ArgumentParser(
-        description="python TTS/bin/remove_silence_using_vad.py -i=VCTK-Corpus/ -o=VCTK-Corpus-removed-silence/ -g=wav48_silence_trimmed/*/*_mic1.flac --trim_just_beginning_and_end True"
+        description="python TTS/bin/remove_silence_using_vad.py -i=VCTK-Corpus/ -o=VCTK-Corpus-removed-silence/ -g=wav48_silence_trimmed/*/*_mic1.flac --trim_just_beginning_and_end"
     )
     parser.add_argument("-i", "--input_dir", type=str, help="Dataset root dir", required=True)
     parser.add_argument("-o", "--output_dir", type=str, help="Output Dataset dir", default="")
@@ -91,20 +96,20 @@ if __name__ == "__main__":
     parser.add_argument(
         "-t",
         "--trim_just_beginning_and_end",
-        type=bool,
+        action=argparse.BooleanOptionalAction,
         default=True,
-        help="If True this script will trim just the beginning and end nonspeech parts. If False all nonspeech parts will be trim. Default True",
+        help="If True this script will trim just the beginning and end nonspeech parts. If False all nonspeech parts will be trimmed.",
     )
     parser.add_argument(
         "-c",
         "--use_cuda",
-        type=bool,
+        action=argparse.BooleanOptionalAction,
         default=False,
         help="If True use cuda",
     )
     parser.add_argument(
         "--use_onnx",
-        type=bool,
+        action=argparse.BooleanOptionalAction,
         default=False,
         help="If True use onnx",
     )
@@ -1,147 +1,141 @@
 #!/usr/bin/env python3
-# -*- coding: utf-8 -*-
+"""Command line interface."""

 import argparse
 import contextlib
+import logging
 import sys
 from argparse import RawTextHelpFormatter

 # pylint: disable=redefined-outer-name, unused-argument
-from pathlib import Path
+from TTS.utils.generic_utils import ConsoleFormatter, setup_logger

+logger = logging.getLogger(__name__)

 description = """
-Synthesize speech on command line.
+Synthesize speech on the command line.

 You can either use your trained model or choose a model from the provided list.

-If you don't specify any models, then it uses LJSpeech based English model.
-
-#### Single Speaker Models

 - List provided models:

-```
+```sh
-$ tts --list_models
+tts --list_models
 ```

-- Get model info (for both tts_models and vocoder_models):
+- Get model information. Use the names obtained from `--list_models`.
+```sh
-- Query by type/name:
+tts --model_info_by_name "<model_type>/<language>/<dataset>/<model_name>"
-The model_info_by_name uses the name as it from the --list_models.
-```
-$ tts --model_info_by_name "<model_type>/<language>/<dataset>/<model_name>"
 ```
 For example:
-```
+```sh
-$ tts --model_info_by_name tts_models/tr/common-voice/glow-tts
+tts --model_info_by_name tts_models/tr/common-voice/glow-tts
-$ tts --model_info_by_name vocoder_models/en/ljspeech/hifigan_v2
+tts --model_info_by_name vocoder_models/en/ljspeech/hifigan_v2
-```
-- Query by type/idx:
-The model_query_idx uses the corresponding idx from --list_models.

-```
-$ tts --model_info_by_idx "<model_type>/<model_query_idx>"
 ```

-For example:
+#### Single speaker models

-```
+- Run TTS with the default model (`tts_models/en/ljspeech/tacotron2-DDC`):
-$ tts --model_info_by_idx tts_models/3
-```

-- Query info for model info by full name:
+```sh
-```
+tts --text "Text for TTS" --out_path output/path/speech.wav
-$ tts --model_info_by_name "<model_type>/<language>/<dataset>/<model_name>"
-```

-- Run TTS with default models:

-```
-$ tts --text "Text for TTS" --out_path output/path/speech.wav
 ```

 - Run TTS and pipe out the generated TTS wav file data:

-```
+```sh
-$ tts --text "Text for TTS" --pipe_out --out_path output/path/speech.wav | aplay
+tts --text "Text for TTS" --pipe_out --out_path output/path/speech.wav | aplay
 ```

 - Run a TTS model with its default vocoder model:

-```
+```sh
-$ tts --text "Text for TTS" --model_name "<model_type>/<language>/<dataset>/<model_name>" --out_path output/path/speech.wav
+tts --text "Text for TTS" \\
+    --model_name "<model_type>/<language>/<dataset>/<model_name>" \\
+    --out_path output/path/speech.wav
 ```

 For example:

-```
+```sh
-$ tts --text "Text for TTS" --model_name "tts_models/en/ljspeech/glow-tts" --out_path output/path/speech.wav
+tts --text "Text for TTS" \\
+    --model_name "tts_models/en/ljspeech/glow-tts" \\
+    --out_path output/path/speech.wav
 ```

-- Run with specific TTS and vocoder models from the list:
+- Run with specific TTS and vocoder models from the list. Note that not every vocoder is compatible with every TTS model.

-```
+```sh
-$ tts --text "Text for TTS" --model_name "<model_type>/<language>/<dataset>/<model_name>" --vocoder_name "<model_type>/<language>/<dataset>/<model_name>" --out_path output/path/speech.wav
+tts --text "Text for TTS" \\
+    --model_name "<model_type>/<language>/<dataset>/<model_name>" \\
+    --vocoder_name "<model_type>/<language>/<dataset>/<model_name>" \\
+    --out_path output/path/speech.wav
 ```

 For example:

-```
+```sh
-$ tts --text "Text for TTS" --model_name "tts_models/en/ljspeech/glow-tts" --vocoder_name "vocoder_models/en/ljspeech/univnet" --out_path output/path/speech.wav
+tts --text "Text for TTS" \\
+    --model_name "tts_models/en/ljspeech/glow-tts" \\
+    --vocoder_name "vocoder_models/en/ljspeech/univnet" \\
+    --out_path output/path/speech.wav
 ```

-- Run your own TTS model (Using Griffin-Lim Vocoder):
+- Run your own TTS model (using Griffin-Lim Vocoder):

-```
+```sh
-$ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav
+tts --text "Text for TTS" \\
+    --model_path path/to/model.pth \\
+    --config_path path/to/config.json \\
+    --out_path output/path/speech.wav
 ```

 - Run your own TTS and Vocoder models:

-```
+```sh
-$ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav
-    --vocoder_path path/to/vocoder.pth --vocoder_config_path path/to/vocoder_config.json
+tts --text "Text for TTS" \\
+    --model_path path/to/model.pth \\
+    --config_path path/to/config.json \\
+    --out_path output/path/speech.wav \\
+    --vocoder_path path/to/vocoder.pth \\
+    --vocoder_config_path path/to/vocoder_config.json
 ```

-#### Multi-speaker Models
+#### Multi-speaker models

-- List the available speakers and choose a <speaker_id> among them:
+- List the available speakers and choose a `<speaker_id>` among them:

-```
+```sh
-$ tts --model_name "<language>/<dataset>/<model_name>" --list_speaker_idxs
+tts --model_name "<language>/<dataset>/<model_name>" --list_speaker_idxs
 ```

 - Run the multi-speaker TTS model with the target speaker ID:

-```
+```sh
-$ tts --text "Text for TTS." --out_path output/path/speech.wav --model_name "<language>/<dataset>/<model_name>" --speaker_idx <speaker_id>
+tts --text "Text for TTS." --out_path output/path/speech.wav \\
+    --model_name "<language>/<dataset>/<model_name>" --speaker_idx <speaker_id>
 ```

 - Run your own multi-speaker TTS model:

-```
+```sh
-$ tts --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/model.pth --config_path path/to/config.json --speakers_file_path path/to/speaker.json --speaker_idx <speaker_id>
+tts --text "Text for TTS" --out_path output/path/speech.wav \\
+    --model_path path/to/model.pth --config_path path/to/config.json \\
+    --speakers_file_path path/to/speaker.json --speaker_idx <speaker_id>
 ```

-### Voice Conversion Models
+#### Voice conversion models

-```
+```sh
-$ tts --out_path output/path/speech.wav --model_name "<language>/<dataset>/<model_name>" --source_wav <path/to/speaker/wav> --target_wav <path/to/reference/wav>
+tts --out_path output/path/speech.wav --model_name "<language>/<dataset>/<model_name>" \\
+    --source_wav <path/to/speaker/wav> --target_wav <path/to/reference/wav>
 ```
 """


-def str2bool(v):
+def parse_args() -> argparse.Namespace:
-    if isinstance(v, bool):
+    """Parse arguments."""
-        return v
-    if v.lower() in ("yes", "true", "t", "y", "1"):
-        return True
-    if v.lower() in ("no", "false", "f", "n", "0"):
-        return False
-    raise argparse.ArgumentTypeError("Boolean value expected.")
-
-
-def main():
     parser = argparse.ArgumentParser(
         description=description.replace(" ```\n", ""),
         formatter_class=RawTextHelpFormatter,
@@ -149,10 +143,7 @@ def main():

     parser.add_argument(
         "--list_models",
-        type=str2bool,
+        action="store_true",
-        nargs="?",
-        const=True,
-        default=False,
         help="list available pre-trained TTS and vocoder models.",
     )

@@ -200,7 +191,7 @@ def main():
         default="tts_output.wav",
         help="Output wav file path.",
     )
-    parser.add_argument("--use_cuda", type=bool, help="Run model on CUDA.", default=False)
+    parser.add_argument("--use_cuda", action="store_true", help="Run model on CUDA.")
     parser.add_argument("--device", type=str, help="Device to run model on.", default="cpu")
     parser.add_argument(
         "--vocoder_path",
@@ -219,10 +210,7 @@ def main():
     parser.add_argument(
         "--pipe_out",
         help="stdout the generated TTS wav file for shell pipe.",
-        type=str2bool,
+        action="store_true",
-        nargs="?",
-        const=True,
-        default=False,
     )

     # args for multi-speaker synthesis
@@ -254,26 +242,14 @@ def main():
     parser.add_argument(
         "--list_speaker_idxs",
         help="List available speaker ids for the defined multi-speaker model.",
-        type=str2bool,
+        action="store_true",
-        nargs="?",
-        const=True,
-        default=False,
     )
     parser.add_argument(
         "--list_language_idxs",
         help="List available language ids for the defined multi-lingual model.",
-        type=str2bool,
+        action="store_true",
-        nargs="?",
-        const=True,
-        default=False,
     )
     # aux args
-    parser.add_argument(
-        "--save_spectogram",
-        type=bool,
-        help="If true save raw spectogram for further (vocoder) processing in out_path.",
-        default=False,
-    )
     parser.add_argument(
         "--reference_wav",
         type=str,
@@ -288,8 +264,8 @@ def main():
     )
     parser.add_argument(
         "--progress_bar",
-        type=str2bool,
+        action=argparse.BooleanOptionalAction,
-        help="If true shows a progress bar for the model download. Defaults to True",
+        help="Show a progress bar for the model download.",
         default=True,
     )

@@ -330,6 +306,14 @@ def main():
     ]
     if not any(check_args):
         parser.parse_args(["-h"])
+    return args
+
+
+def main() -> None:
+    """Entry point for `tts` command line interface."""
+    args = parse_args()
+    stream = sys.stderr if args.pipe_out else sys.stdout
+    setup_logger("TTS", level=logging.INFO, stream=stream, formatter=ConsoleFormatter())

     pipe_out = sys.stdout if args.pipe_out else None

@@ -337,12 +321,9 @@ def main():
     # Late-import to make things load faster
     from TTS.api import TTS
     from TTS.utils.manage import ModelManager
-    from TTS.utils.synthesizer import Synthesizer

     # load model manager
-    path = Path(__file__).parent / "../.models.json"
+    manager = ModelManager(models_file=TTS.get_models_file_path(), progress_bar=args.progress_bar)
-    manager = ModelManager(path, progress_bar=args.progress_bar)
+    api = TTS()

     tts_path = None
     tts_config_path = None
@@ -356,12 +337,12 @@ def main():
     vc_config_path = None
     model_dir = None

-    # CASE1 #list : list pre-trained TTS models
+    # 1) List pre-trained TTS models
     if args.list_models:
         manager.list_models()
         sys.exit()

-    # CASE2 #info : model info for pre-trained TTS models
+    # 2) Info about pre-trained TTS models (without loading a model)
     if args.model_info_by_idx:
         model_query = args.model_info_by_idx
         manager.model_info_by_idx(model_query)
@@ -372,122 +353,83 @@ def main():
         manager.model_info_by_full_name(model_query_full_name)
         sys.exit()

-    # CASE3: load pre-trained model paths
+    # 3) Load a model for further info or TTS/VC
-    if args.model_name is not None and not args.model_path:
-        model_path, config_path, model_item = manager.download_model(args.model_name)
-        # tts model
-        if model_item["model_type"] == "tts_models":
-            tts_path = model_path
-            tts_config_path = config_path
-            if "default_vocoder" in model_item:
-                args.vocoder_name = (
-                    model_item["default_vocoder"] if args.vocoder_name is None else args.vocoder_name
-                )
-
-        # voice conversion model
-        if model_item["model_type"] == "voice_conversion_models":
-            vc_path = model_path
-            vc_config_path = config_path
-
-        # tts model with multiple files to be loaded from the directory path
-        if model_item.get("author", None) == "fairseq" or isinstance(model_item["model_url"], list):
-            model_dir = model_path
-            tts_path = None
-            tts_config_path = None
-            args.vocoder_name = None
-
-    # load vocoder
-    if args.vocoder_name is not None and not args.vocoder_path:
-        vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name)
-
-    # CASE4: set custom model paths
-    if args.model_path is not None:
-        tts_path = args.model_path
-        tts_config_path = args.config_path
-        speakers_file_path = args.speakers_file_path
-        language_ids_file_path = args.language_ids_file_path
-
-    if args.vocoder_path is not None:
-        vocoder_path = args.vocoder_path
-        vocoder_config_path = args.vocoder_config_path
-
-    if args.encoder_path is not None:
-        encoder_path = args.encoder_path
-        encoder_config_path = args.encoder_config_path
-
     device = args.device
     if args.use_cuda:
         device = "cuda"
+    # A local model will take precedence if specified via modeL_path
-    # load models
+    model_name = args.model_name if args.model_path is None else None
-    synthesizer = Synthesizer(
+    api = TTS(
-        tts_path,
+        model_name=model_name,
-        tts_config_path,
+        model_path=args.model_path,
-        speakers_file_path,
+        config_path=args.config_path,
-        language_ids_file_path,
+        vocoder_name=args.vocoder_name,
-        vocoder_path,
+        vocoder_path=args.vocoder_path,
-        vocoder_config_path,
+        vocoder_config_path=args.vocoder_config_path,
-        encoder_path,
+        encoder_path=args.encoder_path,
-        encoder_config_path,
+        encoder_config_path=args.encoder_config_path,
-        vc_path,
+        speakers_file_path=args.speakers_file_path,
-        vc_config_path,
+        language_ids_file_path=args.language_ids_file_path,
-        model_dir,
+        progress_bar=args.progress_bar,
-        args.voice_dir,
     ).to(device)

     # query speaker ids of a multi-speaker model.
     if args.list_speaker_idxs:
-        print(
+        if not api.is_multi_speaker:
-            " > Available speaker ids: (Set --speaker_idx flag to one of these values to use the multi-speaker model."
+            logger.info("Model only has a single speaker.")
+            return
+        logger.info(
+            "Available speaker ids: (Set --speaker_idx flag to one of these values to use the multi-speaker model."
         )
-        print(synthesizer.tts_model.speaker_manager.name_to_id)
+        logger.info(api.speakers)
         return

     # query langauge ids of a multi-lingual model.
     if args.list_language_idxs:
-        print(
+        if not api.is_multi_lingual:
-            " > Available language ids: (Set --language_idx flag to one of these values to use the multi-lingual model."
+            logger.info("Monolingual model.")
+            return
+        logger.info(
+            "Available language ids: (Set --language_idx flag to one of these values to use the multi-lingual model."
         )
-        print(synthesizer.tts_model.language_manager.name_to_id)
+        logger.info(api.languages)
         return

     # check the arguments against a multi-speaker model.
-    if synthesizer.tts_speakers_file and (not args.speaker_idx and not args.speaker_wav):
+    if api.is_multi_speaker and (not args.speaker_idx and not args.speaker_wav):
-        print(
+        logger.error(
-            " [!] Looks like you use a multi-speaker model. Define `--speaker_idx` to "
+            "Looks like you use a multi-speaker model. Define `--speaker_idx` to "
             "select the target speaker. You can list the available speakers for this model by `--list_speaker_idxs`."
         )
         return

     # RUN THE SYNTHESIS
     if args.text:
-        print(" > Text: {}".format(args.text))
+        logger.info("Text: %s", args.text)

-    # kick it
+    if args.text is not None:
-    if tts_path is not None:
+        api.tts_to_file(
-        wav = synthesizer.tts(
+            text=args.text,
-            args.text,
+            speaker=args.speaker_idx,
-            speaker_name=args.speaker_idx,
+            language=args.language_idx,
-            language_name=args.language_idx,
             speaker_wav=args.speaker_wav,
+            pipe_out=pipe_out,
+            file_path=args.out_path,
             reference_wav=args.reference_wav,
             style_wav=args.capacitron_style_wav,
             style_text=args.capacitron_style_text,
             reference_speaker_name=args.reference_speaker_idx,
+            voice_dir=args.voice_dir,
         )
-    elif vc_path is not None:
+        logger.info("Saved TTS output to %s", args.out_path)
-        wav = synthesizer.voice_conversion(
+    elif args.source_wav is not None and args.target_wav is not None:
+        api.voice_conversion_to_file(
             source_wav=args.source_wav,
             target_wav=args.target_wav,
+            file_path=args.out_path,
+            pipe_out=pipe_out,
         )
-    elif model_dir is not None:
+        logger.info("Saved VC output to %s", args.out_path)
-        wav = synthesizer.tts(
-            args.text, speaker_name=args.speaker_idx, language_name=args.language_idx, speaker_wav=args.speaker_wav
-        )
-
-    # save the results
-    print(" > Saving output to {}".format(args.out_path))
-    synthesizer.save_wav(wav, args.out_path, pipe_out=pipe_out)


 if __name__ == "__main__":
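For context, the refactor above drives synthesis through the high-level `TTS.api.TTS` object instead of constructing a `Synthesizer` by hand. A rough sketch of the equivalent Python usage, using only constructor arguments and `tts_to_file` keywords that appear in this hunk (the model name is one of the examples from the CLI description, not a requirement):

```python
from TTS.api import TTS

# Mirrors the CLI path above: pick a released model by name, move it to a
# device, and write the synthesized audio to a file.
api = TTS(model_name="tts_models/en/ljspeech/glow-tts", progress_bar=True).to("cpu")
api.tts_to_file(text="Text for TTS", file_path="speech.wav")
```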
@@ -1,13 +1,16 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-

+import logging
 import os
 import sys
 import time
 import traceback
+import warnings

 import torch
 from torch.utils.data import DataLoader
+from trainer.generic_utils import count_parameters, remove_experiment_folder
 from trainer.io import copy_model_files, save_best_model, save_checkpoint
 from trainer.torch import NoamLR
 from trainer.trainer_utils import get_optimizer
@@ -18,7 +21,7 @@ from TTS.encoder.utils.training import init_training
 from TTS.encoder.utils.visual import plot_embeddings
 from TTS.tts.datasets import load_tts_samples
 from TTS.utils.audio import AudioProcessor
-from TTS.utils.generic_utils import count_parameters, remove_experiment_folder
+from TTS.utils.generic_utils import ConsoleFormatter, setup_logger
 from TTS.utils.samplers import PerfectBatchSampler
 from TTS.utils.training import check_update

@@ -31,7 +34,7 @@ print(" > Using CUDA: ", use_cuda)
 print(" > Number of GPUs: ", num_gpus)


-def setup_loader(ap: AudioProcessor, is_val: bool = False, verbose: bool = False):
+def setup_loader(ap: AudioProcessor, is_val: bool = False):
     num_utter_per_class = c.num_utter_per_class if not is_val else c.eval_num_utter_per_class
     num_classes_in_batch = c.num_classes_in_batch if not is_val else c.eval_num_classes_in_batch

@@ -42,7 +45,6 @@ def setup_loader(ap: AudioProcessor, is_val: bool = False, verbose: bool = False
         voice_len=c.voice_len,
         num_utter_per_class=num_utter_per_class,
         num_classes_in_batch=num_classes_in_batch,
-        verbose=verbose,
         augmentation_config=c.audio_augmentation if not is_val else None,
         use_torch_spec=c.model_params.get("use_torch_spec", False),
     )
@@ -115,11 +117,14 @@ def evaluation(model, criterion, data_loader, global_step):
     eval_avg_loss = eval_loss / len(data_loader)
     # save stats
     dashboard_logger.eval_stats(global_step, {"loss": eval_avg_loss})
+    try:
         # plot the last batch in the evaluation
         figures = {
             "UMAP Plot": plot_embeddings(outputs.detach().cpu().numpy(), c.num_classes_in_batch),
         }
         dashboard_logger.eval_figures(global_step, figures)
+    except ImportError:
+        warnings.warn("Install the `umap-learn` package to see embedding plots.")
     return eval_avg_loss


@@ -160,9 +165,6 @@ def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader,
         loader_time = time.time() - end_time
         global_step += 1

-        # setup lr
-        if c.lr_decay:
-            scheduler.step()
         optimizer.zero_grad()

         # dispatch data to GPU
@@ -181,6 +183,10 @@ def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader,
         grad_norm, _ = check_update(model, c.grad_clip)
         optimizer.step()

+        # setup lr
+        if c.lr_decay:
+            scheduler.step()
+
         step_time = time.time() - start_time
         epoch_time += step_time

@@ -278,9 +284,9 @@ def main(args):  # pylint: disable=redefined-outer-name
     # pylint: disable=redefined-outer-name
     meta_data_train, meta_data_eval = load_tts_samples(c.datasets, eval_split=True)

-    train_data_loader, train_classes, map_classid_to_classname = setup_loader(ap, is_val=False, verbose=True)
+    train_data_loader, train_classes, map_classid_to_classname = setup_loader(ap, is_val=False)
     if c.run_eval:
-        eval_data_loader, _, _ = setup_loader(ap, is_val=True, verbose=True)
+        eval_data_loader, _, _ = setup_loader(ap, is_val=True)
     else:
         eval_data_loader = None

@@ -316,6 +322,8 @@ def main(args):  # pylint: disable=redefined-outer-name


 if __name__ == "__main__":
+    setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter())
+
     args, c, OUT_PATH, AUDIO_PATH, c_logger, dashboard_logger = init_training()

     try:
@@ -1,4 +1,6 @@
+import logging
 import os
+import sys
 from dataclasses import dataclass, field

 from trainer import Trainer, TrainerArgs
@@ -6,6 +8,7 @@ from trainer import Trainer, TrainerArgs
 from TTS.config import load_config, register_config
 from TTS.tts.datasets import load_tts_samples
 from TTS.tts.models import setup_model
+from TTS.utils.generic_utils import ConsoleFormatter, setup_logger


 @dataclass
@@ -15,6 +18,8 @@ class TrainTTSArgs(TrainerArgs):

 def main():
     """Run `tts` model training directly by a `config.json` file."""
+    setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter())
+
     # init trainer args
     train_args = TrainTTSArgs()
     parser = train_args.init_argparse(arg_prefix="")
@@ -1,10 +1,13 @@
+import logging
 import os
+import sys
 from dataclasses import dataclass, field

 from trainer import Trainer, TrainerArgs

 from TTS.config import load_config, register_config
 from TTS.utils.audio import AudioProcessor
+from TTS.utils.generic_utils import ConsoleFormatter, setup_logger
 from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data
 from TTS.vocoder.models import setup_model

@@ -16,6 +19,8 @@ class TrainVocoderArgs(TrainerArgs):

 def main():
     """Run `tts` model training directly by a `config.json` file."""
+    setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter())
+
     # init trainer args
     train_args = TrainVocoderArgs()
     parser = train_args.init_argparse(arg_prefix="")
@@ -1,5 +1,8 @@
 """Search a good noise schedule for WaveGrad for a given number of inference iterations"""

 import argparse
+import logging
+import sys
 from itertools import product as cartesian_product

 import numpy as np
@@ -9,11 +12,14 @@ from tqdm import tqdm

 from TTS.config import load_config
 from TTS.utils.audio import AudioProcessor
+from TTS.utils.generic_utils import ConsoleFormatter, setup_logger
 from TTS.vocoder.datasets.preprocess import load_wav_data
 from TTS.vocoder.datasets.wavegrad_dataset import WaveGradDataset
 from TTS.vocoder.models import setup_model

 if __name__ == "__main__":
+    setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter())
+
     parser = argparse.ArgumentParser()
     parser.add_argument("--model_path", type=str, help="Path to model checkpoint.")
     parser.add_argument("--config_path", type=str, help="Path to model config file.")
@@ -54,7 +60,6 @@ if __name__ == "__main__":
         return_segments=False,
         use_noise_augment=False,
         use_cache=False,
-        verbose=True,
     )
     loader = DataLoader(
         dataset,
@@ -1,7 +1,7 @@
 import json
 import os
 import re
-from typing import Dict
+from typing import Any, Dict, Union

 import fsspec
 import yaml
@@ -17,9 +17,12 @@ def read_json_with_comments(json_path):
     with fsspec.open(json_path, "r", encoding="utf-8") as f:
         input_str = f.read()
     # handle comments but not urls with //
-    input_str = re.sub(r"(\"(?:[^\"\\]|\\.)*\")|(/\*(?:.|[\\n\\r])*?\*/)|(//.*)", lambda m: m.group(1) or m.group(2) or "", input_str)
+    input_str = re.sub(
+        r"(\"(?:[^\"\\]|\\.)*\")|(/\*(?:.|[\\n\\r])*?\*/)|(//.*)", lambda m: m.group(1) or m.group(2) or "", input_str
+    )
     return json.loads(input_str)


 def register_config(model_name: str) -> Coqpit:
     """Find the right config for the given model name.

@@ -65,7 +68,7 @@ def _process_model_name(config_dict: Dict) -> str:
     return model_name


-def load_config(config_path: str) -> Coqpit:
+def load_config(config_path: Union[str, os.PathLike[Any]]) -> Coqpit:
     """Import `json` or `yaml` files as TTS configs. First, load the input file as a `dict` and check the model name
     to find the corresponding Config class. Then initialize the Config.

@@ -78,6 +81,7 @@ def load_config(config_path: str) -> Coqpit:
     Returns:
         Coqpit: TTS config object.
     """
+    config_path = str(config_path)
     config_dict = {}
     ext = os.path.splitext(config_path)[1]
     if ext in (".yml", ".yaml"):
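As an aside, the comment-stripping substitution shown in this hunk can be exercised on its own. A minimal sketch using a made-up config snippet (the regex is copied verbatim from the code above):

```python
import json
import re

jsonc = """{
  "lr": 0.001,  // this line comment is dropped
  "url": "https://example.com//path"
}"""

# Group 1 keeps string literals (so the // inside the URL survives);
# only bare //-comments are replaced with an empty string here.
cleaned = re.sub(
    r"(\"(?:[^\"\\]|\\.)*\")|(/\*(?:.|[\\n\\r])*?\*/)|(//.*)",
    lambda m: m.group(1) or m.group(2) or "",
    jsonc,
)
print(json.loads(cleaned))  # {'lr': 0.001, 'url': 'https://example.com//path'}
```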
@@ -1,23 +1,17 @@
-import os
 import gc
-import torchaudio
+import os

 import pandas
-from faster_whisper import WhisperModel
-from glob import glob

-from tqdm import tqdm

 import torch
 import torchaudio
-# torch.set_num_threads(1)
+from faster_whisper import WhisperModel
+from tqdm import tqdm

+# torch.set_num_threads(1)
 from TTS.tts.layers.xtts.tokenizer import multilingual_cleaners

 torch.set_num_threads(16)


-import os

 audio_types = (".wav", ".mp3", ".flac")


@@ -25,9 +19,10 @@ def list_audios(basePath, contains=None):
     # return the set of files that are valid
     return list_files(basePath, validExts=audio_types, contains=contains)


 def list_files(basePath, validExts=None, contains=None):
     # loop over the directory structure
-    for (rootDir, dirNames, filenames) in os.walk(basePath):
+    for rootDir, dirNames, filenames in os.walk(basePath):
         # loop over the filenames in the current directory
         for filename in filenames:
             # if the contains string is not none and the filename does not contain
@@ -36,7 +31,7 @@ def list_files(basePath, validExts=None, contains=None):
                 continue

             # determine the file extension of the current file
-            ext = filename[filename.rfind("."):].lower()
+            ext = filename[filename.rfind(".") :].lower()

             # check to see if the file is an audio and should be processed
             if validExts is None or ext.endswith(validExts):
@@ -44,7 +39,16 @@ def list_files(basePath, validExts=None, contains=None):
                 audioPath = os.path.join(rootDir, filename)
                 yield audioPath

-def format_audio_list(audio_files, target_language="en", out_path=None, buffer=0.2, eval_percentage=0.15, speaker_name="coqui", gradio_progress=None):
+
+def format_audio_list(
+    audio_files,
+    target_language="en",
+    out_path=None,
+    buffer=0.2,
+    eval_percentage=0.15,
+    speaker_name="coqui",
+    gradio_progress=None,
+):
     audio_total_size = 0
     # make sure that ooutput file exists
     os.makedirs(out_path, exist_ok=True)
@@ -69,7 +73,7 @@ def format_audio_list(audio_files, target_language="en", out_path=None, buffer=0
         wav = torch.mean(wav, dim=0, keepdim=True)

         wav = wav.squeeze()
-        audio_total_size += (wav.size(-1) / sr)
+        audio_total_size += wav.size(-1) / sr

         segments, _ = asr_model.transcribe(audio_path, word_timestamps=True, language=target_language)
         segments = list(segments)
@@ -94,7 +98,7 @@ def format_audio_list(audio_files, target_language="en", out_path=None, buffer=0
                 # get previous sentence end
                 previous_word_end = words_list[word_idx - 1].end
                 # add buffer or get the silence midle between the previous sentence and the current one
-                sentence_start = max(sentence_start - buffer, (previous_word_end + sentence_start)/2)
+                sentence_start = max(sentence_start - buffer, (previous_word_end + sentence_start) / 2)

                 sentence = word.word
                 first_word = False
@@ -124,13 +128,10 @@ def format_audio_list(audio_files, target_language="en", out_path=None, buffer=0
                 i += 1
                 first_word = True

-                audio = wav[int(sr*sentence_start):int(sr*word_end)].unsqueeze(0)
+                audio = wav[int(sr * sentence_start) : int(sr * word_end)].unsqueeze(0)
                 # if the audio is too short ignore it (i.e < 0.33 seconds)
-                if audio.size(-1) >= sr/3:
+                if audio.size(-1) >= sr / 3:
-                    torchaudio.save(absoulte_path,
+                    torchaudio.save(absoulte_path, audio, sr)
-                        audio,
-                        sr
-                    )
                 else:
                     continue

@@ -140,17 +141,17 @@ def format_audio_list(audio_files, target_language="en", out_path=None, buffer=0

     df = pandas.DataFrame(metadata)
     df = df.sample(frac=1)
-    num_val_samples = int(len(df)*eval_percentage)
+    num_val_samples = int(len(df) * eval_percentage)

     df_eval = df[:num_val_samples]
     df_train = df[num_val_samples:]

-    df_train = df_train.sort_values('audio_file')
+    df_train = df_train.sort_values("audio_file")
     train_metadata_path = os.path.join(out_path, "metadata_train.csv")
     df_train.to_csv(train_metadata_path, sep="|", index=False)

     eval_metadata_path = os.path.join(out_path, "metadata_eval.csv")
-    df_eval = df_eval.sort_values('audio_file')
+    df_eval = df_eval.sort_values("audio_file")
     df_eval.to_csv(eval_metadata_path, sep="|", index=False)

     # deallocate VRAM and RAM
@@ -1,11 +1,12 @@
-import os
 import gc
+import os

 from trainer import Trainer, TrainerArgs

 from TTS.config.shared_configs import BaseDatasetConfig
 from TTS.tts.datasets import load_tts_samples
-from TTS.tts.layers.xtts.trainer.gpt_trainer import GPTArgs, GPTTrainer, GPTTrainerConfig, XttsAudioConfig
+from TTS.tts.layers.xtts.trainer.gpt_trainer import GPTArgs, GPTTrainer, GPTTrainerConfig
+from TTS.tts.models.xtts import XttsAudioConfig
 from TTS.utils.manage import ModelManager


@@ -25,7 +26,6 @@ def train_gpt(language, num_epochs, batch_size, grad_acumm, train_csv, eval_csv,
     BATCH_SIZE = batch_size  # set here the batch size
     GRAD_ACUMM_STEPS = grad_acumm  # set here the grad accumulation steps

-
     # Define here the dataset that you want to use for the fine-tuning on.
     config_dataset = BaseDatasetConfig(
         formatter="coqui",
@@ -43,10 +43,9 @@ def train_gpt(language, num_epochs, batch_size, grad_acumm, train_csv, eval_csv,
     CHECKPOINTS_OUT_PATH = os.path.join(OUT_PATH, "XTTS_v2.0_original_model_files/")
     os.makedirs(CHECKPOINTS_OUT_PATH, exist_ok=True)


     # DVAE files
-    DVAE_CHECKPOINT_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/dvae.pth"
+    DVAE_CHECKPOINT_LINK = "https://huggingface.co/coqui/XTTS-v2/resolve/main/dvae.pth"
-    MEL_NORM_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/mel_stats.pth"
+    MEL_NORM_LINK = "https://huggingface.co/coqui/XTTS-v2/resolve/main/mel_stats.pth"

     # Set the path to the downloaded files
     DVAE_CHECKPOINT = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(DVAE_CHECKPOINT_LINK))
@@ -55,13 +54,14 @@ def train_gpt(language, num_epochs, batch_size, grad_acumm, train_csv, eval_csv,
     # download DVAE files if needed
     if not os.path.isfile(DVAE_CHECKPOINT) or not os.path.isfile(MEL_NORM_FILE):
         print(" > Downloading DVAE files!")
-        ModelManager._download_model_files([MEL_NORM_LINK, DVAE_CHECKPOINT_LINK], CHECKPOINTS_OUT_PATH, progress_bar=True)
+        ModelManager._download_model_files(
+            [MEL_NORM_LINK, DVAE_CHECKPOINT_LINK], CHECKPOINTS_OUT_PATH, progress_bar=True
+        )

     # Download XTTS v2.0 checkpoint if needed
-    TOKENIZER_FILE_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/vocab.json"
+    TOKENIZER_FILE_LINK = "https://huggingface.co/coqui/XTTS-v2/resolve/main/vocab.json"
-    XTTS_CHECKPOINT_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/model.pth"
+    XTTS_CHECKPOINT_LINK = "https://huggingface.co/coqui/XTTS-v2/resolve/main/model.pth"
-    XTTS_CONFIG_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/config.json"
+    XTTS_CONFIG_LINK = "https://huggingface.co/coqui/XTTS-v2/resolve/main/config.json"

     # XTTS transfer learning parameters: You we need to provide the paths of XTTS model checkpoint that you want to do the fine tuning.
     TOKENIZER_FILE = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(TOKENIZER_FILE_LINK))  # vocab.json file
@@ -1,19 +1,16 @@
 import argparse
+import logging
 import os
 import sys
 import tempfile
+import traceback

 import gradio as gr
-import librosa.display
-import numpy as np

-import os
 import torch
 import torchaudio
-import traceback
 from TTS.demos.xtts_ft_demo.utils.formatter import format_audio_list
 from TTS.demos.xtts_ft_demo.utils.gpt_train import train_gpt

 from TTS.tts.configs.xtts_config import XttsConfig
 from TTS.tts.models.xtts import Xtts

@@ -23,7 +20,10 @@ def clear_gpu_cache():
     if torch.cuda.is_available():
         torch.cuda.empty_cache()


 XTTS_MODEL = None


 def load_model(xtts_checkpoint, xtts_config, xtts_vocab):
     global XTTS_MODEL
     clear_gpu_cache()
@@ -40,11 +40,17 @@ def load_model(xtts_checkpoint, xtts_config, xtts_vocab):
     print("Model Loaded!")
     return "Model Loaded!"


 def run_tts(lang, tts_text, speaker_audio_file):
     if XTTS_MODEL is None or not speaker_audio_file:
         return "You need to run the previous step to load the model !!", None, None

-    gpt_cond_latent, speaker_embedding = XTTS_MODEL.get_conditioning_latents(audio_path=speaker_audio_file, gpt_cond_len=XTTS_MODEL.config.gpt_cond_len, max_ref_length=XTTS_MODEL.config.max_ref_len, sound_norm_refs=XTTS_MODEL.config.sound_norm_refs)
+    gpt_cond_latent, speaker_embedding = XTTS_MODEL.get_conditioning_latents(
+        audio_path=speaker_audio_file,
+        gpt_cond_len=XTTS_MODEL.config.gpt_cond_len,
+        max_ref_length=XTTS_MODEL.config.max_ref_len,
+        sound_norm_refs=XTTS_MODEL.config.sound_norm_refs,
+    )
     out = XTTS_MODEL.inference(
         text=tts_text,
         language=lang,
@@ -65,8 +71,6 @@ def run_tts(lang, tts_text, speaker_audio_file):
     return "Speech generated !", out_path, speaker_audio_file


-
-
 # define a logger to redirect
 class Logger:
     def __init__(self, filename="log.out"):
@@ -85,21 +89,19 @@ class Logger:
     def isatty(self):
         return False


 # redirect stdout and stderr to a file
 sys.stdout = Logger()
 sys.stderr = sys.stdout


 # logging.basicConfig(stream=sys.stdout, level=logging.INFO)
-import logging
 logging.basicConfig(
-    level=logging.INFO,
+    level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s", handlers=[logging.StreamHandler(sys.stdout)]
-    format="%(asctime)s [%(levelname)s] %(message)s",
-    handlers=[
-        logging.StreamHandler(sys.stdout)
-    ]
 )


 def read_logs():
     sys.stdout.flush()
     with open(sys.stdout.log_file, "r") as f:
@@ -107,7 +109,6 @@ def read_logs():


 if __name__ == "__main__":
-
     parser = argparse.ArgumentParser(
         description="""XTTS fine-tuning demo\n\n"""
         """
@@ -190,12 +191,11 @@ if __name__ == "__main__":
                     "zh",
                     "hu",
                     "ko",
-                    "ja"
+                    "ja",
+                    "hi",
                 ],
             )
-            progress_data = gr.Label(
-                label="Progress:"
-            )
+            progress_data = gr.Label(label="Progress:")
             logs = gr.Textbox(
                 label="Logs:",
                 interactive=False,
@@ -209,14 +209,24 @@ if __name__ == "__main__":
             out_path = os.path.join(out_path, "dataset")
             os.makedirs(out_path, exist_ok=True)
             if audio_path is None:
-                return "You should provide one or multiple audio files! If you provided it, probably the upload of the files is not finished yet!", "", ""
+                return (
+                    "You should provide one or multiple audio files! If you provided it, probably the upload of the files is not finished yet!",
+                    "",
+                    "",
+                )
             else:
                 try:
-                    train_meta, eval_meta, audio_total_size = format_audio_list(audio_path, target_language=language, out_path=out_path, gradio_progress=progress)
+                    train_meta, eval_meta, audio_total_size = format_audio_list(
+                        audio_path, target_language=language, out_path=out_path, gradio_progress=progress
+                    )
                 except:
                     traceback.print_exc()
                     error = traceback.format_exc()
-                    return f"The data processing was interrupted due an error !! Please check the console to verify the full error message! \n Error summary: {error}", "", ""
+                    return (
+                        f"The data processing was interrupted due an error !! Please check the console to verify the full error message! \n Error summary: {error}",
+                        "",
+                        "",
+                    )

             clear_gpu_cache()

@@ -264,9 +274,7 @@ if __name__ == "__main__":
                 step=1,
                 value=args.max_audio_length,
             )
-            progress_train = gr.Label(
-                label="Progress:"
-            )
+            progress_train = gr.Label(label="Progress:")
             logs_tts_train = gr.Textbox(
                 label="Logs:",
                 interactive=False,
@@ -274,18 +282,41 @@ if __name__ == "__main__":
         demo.load(read_logs, None, logs_tts_train, every=1)
         train_btn = gr.Button(value="Step 2 - Run the training")

-        def train_model(language, train_csv, eval_csv, num_epochs, batch_size, grad_acumm, output_path, max_audio_length):
+        def train_model(
+            language, train_csv, eval_csv, num_epochs, batch_size, grad_acumm, output_path, max_audio_length
+        ):
             clear_gpu_cache()
             if not train_csv or not eval_csv:
-                return "You need to run the data processing step or manually set `Train CSV` and `Eval CSV` fields !", "", "", "", ""
+                return (
+                    "You need to run the data processing step or manually set `Train CSV` and `Eval CSV` fields !",
+                    "",
+                    "",
+                    "",
+                    "",
+                )
try:
|
try:
|
||||||
# convert seconds to waveform frames
|
# convert seconds to waveform frames
|
||||||
max_audio_length = int(max_audio_length * 22050)
|
max_audio_length = int(max_audio_length * 22050)
|
||||||
config_path, original_xtts_checkpoint, vocab_file, exp_path, speaker_wav = train_gpt(language, num_epochs, batch_size, grad_acumm, train_csv, eval_csv, output_path=output_path, max_audio_length=max_audio_length)
|
config_path, original_xtts_checkpoint, vocab_file, exp_path, speaker_wav = train_gpt(
|
||||||
|
language,
|
||||||
|
num_epochs,
|
||||||
|
batch_size,
|
||||||
|
grad_acumm,
|
||||||
|
train_csv,
|
||||||
|
eval_csv,
|
||||||
|
output_path=output_path,
|
||||||
|
max_audio_length=max_audio_length,
|
||||||
|
)
|
||||||
except:
|
except:
|
||||||
traceback.print_exc()
|
traceback.print_exc()
|
||||||
error = traceback.format_exc()
|
error = traceback.format_exc()
|
||||||
return f"The training was interrupted due an error !! Please check the console to check the full error message! \n Error summary: {error}", "", "", "", ""
|
return (
|
||||||
|
f"The training was interrupted due an error !! Please check the console to check the full error message! \n Error summary: {error}",
|
||||||
|
"",
|
||||||
|
"",
|
||||||
|
"",
|
||||||
|
"",
|
||||||
|
)
|
||||||
|
|
||||||
# copy original files to avoid parameters changes issues
|
# copy original files to avoid parameters changes issues
|
||||||
os.system(f"cp {config_path} {exp_path}")
|
os.system(f"cp {config_path} {exp_path}")
|
||||||
|
@ -312,9 +343,7 @@ if __name__ == "__main__":
|
||||||
label="XTTS vocab path:",
|
label="XTTS vocab path:",
|
||||||
value="",
|
value="",
|
||||||
)
|
)
|
||||||
progress_load = gr.Label(
|
progress_load = gr.Label(label="Progress:")
|
||||||
label="Progress:"
|
|
||||||
)
|
|
||||||
load_btn = gr.Button(value="Step 3 - Load Fine-tuned XTTS model")
|
load_btn = gr.Button(value="Step 3 - Load Fine-tuned XTTS model")
|
||||||
|
|
||||||
with gr.Column() as col2:
|
with gr.Column() as col2:
|
||||||
|
@ -342,7 +371,8 @@ if __name__ == "__main__":
|
||||||
"hu",
|
"hu",
|
||||||
"ko",
|
"ko",
|
||||||
"ja",
|
"ja",
|
||||||
]
|
"hi",
|
||||||
|
],
|
||||||
)
|
)
|
||||||
tts_text = gr.Textbox(
|
tts_text = gr.Textbox(
|
||||||
label="Input Text.",
|
label="Input Text.",
|
||||||
|
@ -351,9 +381,7 @@ if __name__ == "__main__":
|
||||||
tts_btn = gr.Button(value="Step 4 - Inference")
|
tts_btn = gr.Button(value="Step 4 - Inference")
|
||||||
|
|
||||||
with gr.Column() as col3:
|
with gr.Column() as col3:
|
||||||
progress_gen = gr.Label(
|
progress_gen = gr.Label(label="Progress:")
|
||||||
label="Progress:"
|
|
||||||
)
|
|
||||||
tts_output_audio = gr.Audio(label="Generated Audio.")
|
tts_output_audio = gr.Audio(label="Generated Audio.")
|
||||||
reference_audio = gr.Audio(label="Reference audio used.")
|
reference_audio = gr.Audio(label="Reference audio used.")
|
||||||
|
|
||||||
|
@ -371,7 +399,6 @@ if __name__ == "__main__":
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
train_btn.click(
|
train_btn.click(
|
||||||
fn=train_model,
|
fn=train_model,
|
||||||
inputs=[
|
inputs=[
|
||||||
|
@ -389,11 +416,7 @@ if __name__ == "__main__":
|
||||||
|
|
||||||
load_btn.click(
|
load_btn.click(
|
||||||
fn=load_model,
|
fn=load_model,
|
||||||
inputs=[
|
inputs=[xtts_checkpoint, xtts_config, xtts_vocab],
|
||||||
xtts_checkpoint,
|
|
||||||
xtts_config,
|
|
||||||
xtts_vocab
|
|
||||||
],
|
|
||||||
outputs=[progress_load],
|
outputs=[progress_load],
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -407,9 +430,4 @@ if __name__ == "__main__":
|
||||||
outputs=[progress_gen, tts_output_audio, reference_audio],
|
outputs=[progress_gen, tts_output_audio, reference_audio],
|
||||||
)
|
)
|
||||||
|
|
||||||
demo.launch(
|
demo.launch(share=True, debug=False, server_port=args.port, server_name="0.0.0.0")
|
||||||
share=True,
|
|
||||||
debug=False,
|
|
||||||
server_port=args.port,
|
|
||||||
server_name="0.0.0.0"
|
|
||||||
)
|
|
||||||
|
|
|
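For readers following the demo changes above, the reformatted `get_conditioning_latents` / `inference` calls can also be exercised outside Gradio. The sketch below is a minimal, hypothetical standalone script: the checkpoint, vocab, and config paths are placeholders, and the `XttsConfig` / `Xtts` import locations and keyword arguments follow the public XTTS API rather than this diff, so verify them against the repository docs.

```python
# Hedged sketch: standalone XTTS inference with a fine-tuned checkpoint (paths are placeholders).
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts

config = XttsConfig()
config.load_json("run/training/config.json")  # assumed fine-tuned config path
model = Xtts.init_from_config(config)
model.load_checkpoint(config, checkpoint_path="best_model.pth", vocab_path="vocab.json", eval=True)

# same call shape as in the demo above
gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(
    audio_path=["reference.wav"],
    gpt_cond_len=config.gpt_cond_len,
    max_ref_length=config.max_ref_len,
    sound_norm_refs=config.sound_norm_refs,
)
out = model.inference(
    text="Hello world.",
    language="en",
    gpt_cond_latent=gpt_cond_latent,
    speaker_embedding=speaker_embedding,
)
```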
@@ -14,5 +14,5 @@ To run the code, you need to follow the same flow as in TTS.

 - Define 'config.json' for your needs. Note that, audio parameters should match your TTS model.
 - Example training call ```python speaker_encoder/train.py --config_path speaker_encoder/config.json --data_path ~/Data/Libri-TTS/train-clean-360```
-- Generate embedding vectors ```python speaker_encoder/compute_embeddings.py --use_cuda true /model/path/best_model.pth model/config/path/config.json dataset/path/ output_path``` . This code parses all .wav files at the given dataset path and generates the same folder structure under the output path with the generated embedding files.
+- Generate embedding vectors ```python speaker_encoder/compute_embeddings.py --use_cuda /model/path/best_model.pth model/config/path/config.json dataset/path/ output_path``` . This code parses all .wav files at the given dataset path and generates the same folder structure under the output path with the generated embedding files.
 - Watch training on Tensorboard as in TTS
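A small sketch of consuming the generated embeddings, for example to compare two speakers by cosine similarity. The `.npy` file format and the mirrored folder layout used here are assumptions inferred from the README above, not something this diff guarantees; adjust the paths to whatever `compute_embeddings.py` actually wrote.

```python
# Hypothetical: compare two speaker embeddings produced by compute_embeddings.py.
import numpy as np

emb_a = np.load("output_path/speaker_a/utt1.npy").flatten()  # assumed .npy output files
emb_b = np.load("output_path/speaker_b/utt7.npy").flatten()
cos_sim = np.dot(emb_a, emb_b) / (np.linalg.norm(emb_a) * np.linalg.norm(emb_b))
print(f"cosine similarity: {cos_sim:.3f}")
```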
@@ -1,4 +1,4 @@
-from dataclasses import asdict, dataclass
+from dataclasses import dataclass

from TTS.encoder.configs.base_encoder_config import BaseEncoderConfig

@@ -1,4 +1,4 @@
-from dataclasses import asdict, dataclass
+from dataclasses import dataclass

from TTS.encoder.configs.base_encoder_config import BaseEncoderConfig
@@ -1,3 +1,4 @@
+import logging
import random

import torch
@@ -5,6 +6,8 @@ from torch.utils.data import Dataset

from TTS.encoder.utils.generic_utils import AugmentWAV

+logger = logging.getLogger(__name__)


class EncoderDataset(Dataset):
def __init__(
@@ -15,7 +18,6 @@ class EncoderDataset(Dataset):
voice_len=1.6,
num_classes_in_batch=64,
num_utter_per_class=10,
-verbose=False,
augmentation_config=None,
use_torch_spec=None,
):
@@ -24,7 +26,6 @@ class EncoderDataset(Dataset):
ap (TTS.tts.utils.AudioProcessor): audio processor object.
meta_data (list): list of dataset instances.
seq_len (int): voice segment length in seconds.
-verbose (bool): print diagnostic information.
"""
super().__init__()
self.config = config
@@ -33,7 +34,6 @@ class EncoderDataset(Dataset):
self.seq_len = int(voice_len * self.sample_rate)
self.num_utter_per_class = num_utter_per_class
self.ap = ap
-self.verbose = verbose
self.use_torch_spec = use_torch_spec
self.classes, self.items = self.__parse_items()

@@ -50,13 +50,12 @@ class EncoderDataset(Dataset):
if "gaussian" in augmentation_config.keys():
self.gaussian_augmentation_config = augmentation_config["gaussian"]

-if self.verbose:
-print("\n > DataLoader initialization")
-print(f" | > Classes per Batch: {num_classes_in_batch}")
-print(f" | > Number of instances : {len(self.items)}")
-print(f" | > Sequence length: {self.seq_len}")
-print(f" | > Num Classes: {len(self.classes)}")
-print(f" | > Classes: {self.classes}")
+logger.info("DataLoader initialization")
+logger.info(" | Classes per batch: %d", num_classes_in_batch)
+logger.info(" | Number of instances: %d", len(self.items))
+logger.info(" | Sequence length: %d", self.seq_len)
+logger.info(" | Number of classes: %d", len(self.classes))
+logger.info(" | Classes: %s", self.classes)

def load_wav(self, filename):
audio = self.ap.load_wav(filename, sr=self.ap.sample_rate)
@@ -1,7 +1,11 @@
+import logging
+
import torch
import torch.nn.functional as F
from torch import nn

+logger = logging.getLogger(__name__)


# adapted from https://github.com/cvqluu/GE2E-Loss
class GE2ELoss(nn.Module):
@@ -23,7 +27,7 @@ class GE2ELoss(nn.Module):
self.b = nn.Parameter(torch.tensor(init_b))
self.loss_method = loss_method

-print(" > Initialized Generalized End-to-End loss")
+logger.info("Initialized Generalized End-to-End loss")

assert self.loss_method in ["softmax", "contrast"]

@@ -139,7 +143,7 @@ class AngleProtoLoss(nn.Module):
self.b = nn.Parameter(torch.tensor(init_b))
self.criterion = torch.nn.CrossEntropyLoss()

-print(" > Initialized Angular Prototypical loss")
+logger.info("Initialized Angular Prototypical loss")

def forward(self, x, _label=None):
"""
@@ -177,7 +181,7 @@ class SoftmaxLoss(nn.Module):
self.criterion = torch.nn.CrossEntropyLoss()
self.fc = nn.Linear(embedding_dim, n_speakers)

-print("Initialised Softmax Loss")
+logger.info("Initialised Softmax Loss")

def forward(self, x, label=None):
# reshape for compatibility
@@ -212,7 +216,7 @@ class SoftmaxAngleProtoLoss(nn.Module):
self.softmax = SoftmaxLoss(embedding_dim, n_speakers)
self.angleproto = AngleProtoLoss(init_w, init_b)

-print("Initialised SoftmaxAnglePrototypical Loss")
+logger.info("Initialised SoftmaxAnglePrototypical Loss")

def forward(self, x, label=None):
"""
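The losses above all consume a batch of embeddings grouped by class. A minimal usage sketch follows, assuming the conventional `[classes_per_batch, utterances_per_class, embedding_dim]` layout used by GE2E-style losses; the batch shape mirrors the dataset defaults above (64 classes, 10 utterances per class) and the 256-dim embedding size is illustrative only.

```python
# Hedged sketch: feeding randomly initialised embeddings through GE2ELoss.
import torch

from TTS.encoder.losses import GE2ELoss

criterion = GE2ELoss(loss_method="softmax")
# 64 classes x 10 utterances x 256-dim embeddings (shape is an assumption)
embeddings = torch.randn(64, 10, 256, requires_grad=True)
loss = criterion(embeddings)
loss.backward()
```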
@@ -1,12 +1,16 @@
+import logging
+
import numpy as np
import torch
import torchaudio
from coqpit import Coqpit
from torch import nn
+from trainer.generic_utils import set_partial_state_dict
+from trainer.io import load_fsspec

from TTS.encoder.losses import AngleProtoLoss, GE2ELoss, SoftmaxAngleProtoLoss
-from TTS.utils.generic_utils import set_init_dict
-from TTS.utils.io import load_fsspec
+
+logger = logging.getLogger(__name__)


class PreEmphasis(nn.Module):
@@ -118,15 +122,15 @@ class BaseEncoder(nn.Module):
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
try:
self.load_state_dict(state["model"])
-print(" > Model fully restored. ")
+logger.info("Model fully restored. ")
except (KeyError, RuntimeError) as error:
# If eval raise the error
if eval:
raise error

-print(" > Partial model initialization.")
+logger.info("Partial model initialization.")
model_dict = self.state_dict()
-model_dict = set_init_dict(model_dict, state["model"], c)
+model_dict = set_partial_state_dict(model_dict, state["model"], config)
self.load_state_dict(model_dict)
del model_dict

@@ -135,7 +139,7 @@ class BaseEncoder(nn.Module):
try:
criterion.load_state_dict(state["criterion"])
except (KeyError, RuntimeError) as error:
-print(" > Criterion load ignored because of:", error)
+logger.exception("Criterion load ignored because of: %s", error)

# instance and load the criterion for the encoder classifier in inference time
if (
@@ -86,7 +86,7 @@ class LSTMSpeakerEncoder(BaseEncoder):
 - x: :math:`(N, 1, T_{in})` or :math:`(N, D_{spec}, T_{in})`
"""
with torch.no_grad():
-with torch.cuda.amp.autocast(enabled=False):
+with torch.autocast("cuda", enabled=False):
if self.use_torch_spec:
x.squeeze_(1)
x = self.torch_spec(x)
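The change above swaps the older `torch.cuda.amp.autocast` context manager for the device-agnostic `torch.autocast`, which is the form recent PyTorch releases recommend; in both cases autocast is explicitly disabled around the spectrogram computation. A minimal equivalence sketch, with the tensor shape chosen only for illustration:

```python
# Hedged sketch: new-style autocast context, disabled so the block runs in full precision.
import torch

x = torch.randn(8, 16000)
# old spelling (still works, but deprecated on newer PyTorch):
#   with torch.cuda.amp.autocast(enabled=False): ...
with torch.autocast("cuda", enabled=False):
    y = x.float().pow(2).mean()
```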
@@ -1,4 +1,5 @@
import glob
+import logging
import os
import random

@@ -8,6 +9,8 @@ from scipy import signal
from TTS.encoder.models.lstm import LSTMSpeakerEncoder
from TTS.encoder.models.resnet import ResNetSpeakerEncoder

+logger = logging.getLogger(__name__)


class AugmentWAV(object):
def __init__(self, ap, augmentation_config):
@@ -34,12 +37,14 @@ class AugmentWAV(object):
# ignore not listed directories
if noise_dir not in self.additive_noise_types:
continue
-if not noise_dir in self.noise_list:
+if noise_dir not in self.noise_list:
self.noise_list[noise_dir] = []
self.noise_list[noise_dir].append(wav_file)

-print(
-f" | > Using Additive Noise Augmentation: with {len(additive_files)} audios instances from {self.additive_noise_types}"
+logger.info(
+"Using Additive Noise Augmentation: with %d audios instances from %s",
+len(additive_files),
+self.additive_noise_types,
)

self.use_rir = False
@@ -50,7 +55,7 @@ class AugmentWAV(object):
self.rir_files = glob.glob(os.path.join(self.rir_config["rir_path"], "**/*.wav"), recursive=True)
self.use_rir = True

-print(f" | > Using RIR Noise Augmentation: with {len(self.rir_files)} audios instances")
+logger.info("Using RIR Noise Augmentation: with %d audios instances", len(self.rir_files))

self.create_augmentation_global_list()
@@ -19,15 +19,19 @@
# pylint: disable=too-many-locals, too-many-statements, too-many-arguments, too-many-instance-attributes
""" voxceleb 1 & 2 """

+import csv
import hashlib
+import logging
import os
import subprocess
import sys
import zipfile

-import pandas
import soundfile as sf
-from absl import logging
+
+from TTS.utils.generic_utils import ConsoleFormatter, setup_logger
+
+logger = logging.getLogger(__name__)

SUBSETS = {
"vox1_dev_wav": [
@@ -77,14 +81,14 @@ def download_and_extract(directory, subset, urls):
zip_filepath = os.path.join(directory, url.split("/")[-1])
if os.path.exists(zip_filepath):
continue
-logging.info("Downloading %s to %s" % (url, zip_filepath))
+logger.info("Downloading %s to %s" % (url, zip_filepath))
subprocess.call(
"wget %s --user %s --password %s -O %s" % (url, USER["user"], USER["password"], zip_filepath),
shell=True,
)

statinfo = os.stat(zip_filepath)
-logging.info("Successfully downloaded %s, size(bytes): %d" % (url, statinfo.st_size))
+logger.info("Successfully downloaded %s, size(bytes): %d" % (url, statinfo.st_size))

# concatenate all parts into zip files
if ".zip" not in zip_filepath:
@@ -118,9 +122,9 @@ def exec_cmd(cmd):
try:
retcode = subprocess.call(cmd, shell=True)
if retcode < 0:
-logging.info(f"Child was terminated by signal {retcode}")
+logger.info(f"Child was terminated by signal {retcode}")
except OSError as e:
-logging.info(f"Execution failed: {e}")
+logger.info(f"Execution failed: {e}")
retcode = -999
return retcode

@@ -134,11 +138,11 @@ def decode_aac_with_ffmpeg(aac_file, wav_file):
bool, True if success.
"""
cmd = f"ffmpeg -i {aac_file} {wav_file}"
-logging.info(f"Decoding aac file using command line: {cmd}")
+logger.info(f"Decoding aac file using command line: {cmd}")
ret = exec_cmd(cmd)
if ret != 0:
-logging.error(f"Failed to decode aac file with retcode {ret}")
-logging.error("Please check your ffmpeg installation.")
+logger.error(f"Failed to decode aac file with retcode {ret}")
+logger.error("Please check your ffmpeg installation.")
return False
return True

@@ -152,7 +156,7 @@ def convert_audio_and_make_label(input_dir, subset, output_dir, output_file):
output_file: the name of the newly generated csv file. e.g. vox1_dev_wav.csv
"""

-logging.info("Preprocessing audio and label for subset %s" % subset)
+logger.info("Preprocessing audio and label for subset %s" % subset)
source_dir = os.path.join(input_dir, subset)

files = []
@@ -185,9 +189,12 @@ def convert_audio_and_make_label(input_dir, subset, output_dir, output_file):
# Write to CSV file which contains four columns:
# "wav_filename", "wav_length_ms", "speaker_id", "speaker_name".
csv_file_path = os.path.join(output_dir, output_file)
-df = pandas.DataFrame(data=files, columns=["wav_filename", "wav_length_ms", "speaker_id", "speaker_name"])
-df.to_csv(csv_file_path, index=False, sep="\t")
-logging.info("Successfully generated csv file {}".format(csv_file_path))
+with open(csv_file_path, "w", newline="", encoding="utf-8") as f:
+writer = csv.writer(f, delimiter="\t")
+writer.writerow(["wav_filename", "wav_length_ms", "speaker_id", "speaker_name"])
+for wav_file in files:
+writer.writerow(wav_file)
+logger.info("Successfully generated csv file {}".format(csv_file_path))


def processor(directory, subset, force_process):
@@ -200,16 +207,16 @@ def processor(directory, subset, force_process):
if not force_process and os.path.exists(subset_csv):
return subset_csv

-logging.info("Downloading and process the voxceleb in %s", directory)
-logging.info("Preparing subset %s", subset)
+logger.info("Downloading and process the voxceleb in %s", directory)
+logger.info("Preparing subset %s", subset)
download_and_extract(directory, subset, urls[subset])
convert_audio_and_make_label(directory, subset, directory, subset + ".csv")
-logging.info("Finished downloading and processing")
+logger.info("Finished downloading and processing")
return subset_csv


if __name__ == "__main__":
-logging.set_verbosity(logging.INFO)
+setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter())
if len(sys.argv) != 4:
print("Usage: python prepare_data.py save_directory user password")
sys.exit()
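The pandas dependency is replaced above by the standard-library csv writer with a tab delimiter, so reading the generated file back can likewise stay in the stdlib. A small sketch (the filename is a placeholder; the column names match the header written above):

```python
# Read the tab-separated metadata file produced by convert_audio_and_make_label().
import csv

with open("vox1_dev_wav.csv", newline="", encoding="utf-8") as f:
    reader = csv.DictReader(f, delimiter="\t")
    for row in reader:
        print(row["wav_filename"], row["speaker_id"])
```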
@@ -2,14 +2,14 @@ import os
from dataclasses import dataclass, field

from coqpit import Coqpit
-from trainer import TrainerArgs, get_last_checkpoint
-from trainer.io import copy_model_files
+from trainer import TrainerArgs
+from trainer.generic_utils import get_experiment_folder_path, get_git_branch
+from trainer.io import copy_model_files, get_last_checkpoint
from trainer.logging import logger_factory
from trainer.logging.console_logger import ConsoleLogger

from TTS.config import load_config, register_config
from TTS.tts.utils.text.characters import parse_symbols
-from TTS.utils.generic_utils import get_experiment_folder_path, get_git_branch


@dataclass
@@ -29,7 +29,7 @@ def process_args(args, config=None):
args (argparse.Namespace or dict like): Parsed input arguments.
config (Coqpit): Model config. If none, it is generated from `args`. Defaults to None.
Returns:
-c (TTS.utils.io.AttrDict): Config paramaters.
+c (Coqpit): Config paramaters.
out_path (str): Path to save models and logging.
audio_path (str): Path to save generated test audios.
c_logger (TTS.utils.console_logger.ConsoleLogger): Class that does
@@ -1,7 +1,6 @@
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
-import umap

matplotlib.use("Agg")

@@ -30,6 +29,10 @@ colormap = (


def plot_embeddings(embeddings, num_classes_in_batch):
+try:
+import umap
+except ImportError as e:
+raise ImportError("Package not installed: umap-learn") from e
num_utter_per_class = embeddings.shape[0] // num_classes_in_batch

# if necessary get just the first 10 classes

TTS/model.py
@@ -1,5 +1,6 @@
+import os
from abc import abstractmethod
-from typing import Dict
+from typing import Any, Union

import torch
from coqpit import Coqpit
@@ -11,12 +12,12 @@ from trainer import TrainerModel
class BaseTrainerModel(TrainerModel):
"""BaseTrainerModel model expanding TrainerModel with required functions by 🐸TTS.

-Every new 🐸TTS model must inherit it.
+Every new Coqui model must inherit it.
"""

@staticmethod
@abstractmethod
-def init_from_config(config: Coqpit):
+def init_from_config(config: Coqpit) -> "BaseTrainerModel":
"""Init the model and all its attributes from the given config.

Override this depending on your model.
@@ -24,7 +25,7 @@ class BaseTrainerModel(TrainerModel):
...

@abstractmethod
-def inference(self, input: torch.Tensor, aux_input={}) -> Dict:
+def inference(self, input: torch.Tensor, aux_input: dict[str, Any] = {}) -> dict[str, Any]:
"""Forward pass for inference.

It must return a dictionary with the main model output and all the auxiliary outputs. The key ```model_outputs```
@@ -45,15 +46,21 @@ class BaseTrainerModel(TrainerModel):

@abstractmethod
def load_checkpoint(
-self, config: Coqpit, checkpoint_path: str, eval: bool = False, strict: bool = True, cache=False
+self,
+config: Coqpit,
+checkpoint_path: Union[str, os.PathLike[Any]],
+eval: bool = False,
+strict: bool = True,
+cache: bool = False,
) -> None:
-"""Load a model checkpoint gile and get ready for training or inference.
+"""Load a model checkpoint file and get ready for training or inference.

Args:
config (Coqpit): Model configuration.
-checkpoint_path (str): Path to the model checkpoint file.
+checkpoint_path (str | os.PathLike): Path to the model checkpoint file.
eval (bool, optional): If true, init model for inference else for training. Defaults to False.
strict (bool, optional): Match all checkpoint keys to model's keys. Defaults to True.
-cache (bool, optional): If True, cache the file locally for subsequent calls. It is cached under `get_user_data_dir()/tts_cache`. Defaults to False.
+cache (bool, optional): If True, cache the file locally for subsequent calls.
+It is cached under `trainer.io.get_user_data_dir()/tts_cache`. Defaults to False.
"""
...
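The tightened abstract signatures above can be illustrated with a skeletal subclass. This is a hypothetical sketch showing only the three overrides touched by this diff; a real model must also implement the remaining `TrainerModel` hooks (forward, train/eval steps, data loaders), and the checkpoint key `"model"` is an assumption borrowed from the encoder code elsewhere in this commit.

```python
# Hedged sketch: a minimal subclass matching the new BaseTrainerModel signatures.
import os
from typing import Any, Union

import torch
from coqpit import Coqpit

from TTS.model import BaseTrainerModel


class MyModel(BaseTrainerModel):
    @staticmethod
    def init_from_config(config: Coqpit) -> "MyModel":
        return MyModel()

    def inference(self, input: torch.Tensor, aux_input: dict[str, Any] = {}) -> dict[str, Any]:
        return {"model_outputs": input}

    def load_checkpoint(
        self,
        config: Coqpit,
        checkpoint_path: Union[str, os.PathLike[Any]],
        eval: bool = False,
        strict: bool = True,
        cache: bool = False,
    ) -> None:
        state = torch.load(checkpoint_path, map_location="cpu")
        self.load_state_dict(state["model"], strict=strict)  # "model" key is assumed
```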
@@ -1,5 +1,8 @@
 # :frog: TTS demo server
-Before you use the server, make sure you [install](https://github.com/coqui-ai/TTS/tree/dev#install-tts)) :frog: TTS properly. Then, you can follow the steps below.
+Before you use the server, make sure you
+[install](https://github.com/idiap/coqui-ai-TTS/tree/dev#install-tts)) :frog: TTS
+properly and install the additional dependencies with `pip install
+coqui-tts[server]`. Then, you can follow the steps below.

 **Note:** If you install :frog:TTS using ```pip```, you can also use the ```tts-server``` end point on the terminal.

@@ -12,7 +15,7 @@ Run the server with the official models.
 ```python TTS/server/server.py --model_name tts_models/en/ljspeech/tacotron2-DCA --vocoder_name vocoder_models/en/ljspeech/multiband-melgan```

 Run the server with the official models on a GPU.
-```CUDA_VISIBLE_DEVICES="0" python TTS/server/server.py --model_name tts_models/en/ljspeech/tacotron2-DCA --vocoder_name vocoder_models/en/ljspeech/multiband-melgan --use_cuda True```
+```CUDA_VISIBLE_DEVICES="0" python TTS/server/server.py --model_name tts_models/en/ljspeech/tacotron2-DCA --vocoder_name vocoder_models/en/ljspeech/multiband-melgan --use_cuda```

 Run the server with a custom models.
 ```python TTS/server/server.py --tts_checkpoint /path/to/tts/model.pth --tts_config /path/to/tts/config.json --vocoder_checkpoint /path/to/vocoder/model.pth --vocoder_config /path/to/vocoder/config.json```
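Once the server is running, it can be queried over HTTP. A minimal sketch using only the standard library follows; the `/api/tts` endpoint name and its `text` query parameter are assumptions based on the server code in the diff further below, so check `server.py` for the exact routes. Port 5002 is the server's default from its argument parser.

```python
# Hedged sketch: fetch synthesized audio from a locally running demo server.
import urllib.parse
import urllib.request

params = urllib.parse.urlencode({"text": "Hello from the demo server."})
url = f"http://localhost:5002/api/tts?{params}"  # endpoint path is an assumption
with urllib.request.urlopen(url) as resp, open("out.wav", "wb") as f:
    f.write(resp.read())
```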
@@ -1,7 +1,11 @@
#!flask/bin/python

+"""TTS demo server."""
+
import argparse
import io
import json
+import logging
import os
import sys
from pathlib import Path
@@ -9,24 +13,26 @@ from threading import Lock
from typing import Union
from urllib.parse import parse_qs

-from flask import Flask, render_template, render_template_string, request, send_file
+try:
+from flask import Flask, render_template, render_template_string, request, send_file
+except ImportError as e:
+msg = "Server requires requires flask, use `pip install coqui-tts[server]`"
+raise ImportError(msg) from e

from TTS.config import load_config
+from TTS.utils.generic_utils import ConsoleFormatter, setup_logger
from TTS.utils.manage import ModelManager
from TTS.utils.synthesizer import Synthesizer

+logger = logging.getLogger(__name__)
+setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter())

-def create_argparser():
-def convert_boolean(x):
-return x.lower() in ["true", "1", "yes"]

+def create_argparser() -> argparse.ArgumentParser:
parser = argparse.ArgumentParser()
parser.add_argument(
"--list_models",
-type=convert_boolean,
-nargs="?",
-const=True,
-default=False,
+action="store_true",
help="list available pre-trained tts and vocoder models.",
)
parser.add_argument(
@@ -54,9 +60,13 @@ def create_argparser():
parser.add_argument("--vocoder_config_path", type=str, help="Path to vocoder model config file.", default=None)
parser.add_argument("--speakers_file_path", type=str, help="JSON file for multi-speaker model.", default=None)
parser.add_argument("--port", type=int, default=5002, help="port to listen on.")
-parser.add_argument("--use_cuda", type=convert_boolean, default=False, help="true to use CUDA.")
-parser.add_argument("--debug", type=convert_boolean, default=False, help="true to enable Flask debug mode.")
-parser.add_argument("--show_details", type=convert_boolean, default=False, help="Generate model detail page.")
+parser.add_argument("--use_cuda", action=argparse.BooleanOptionalAction, default=False, help="true to use CUDA.")
+parser.add_argument(
+"--debug", action=argparse.BooleanOptionalAction, default=False, help="true to enable Flask debug mode."
+)
+parser.add_argument(
+"--show_details", action=argparse.BooleanOptionalAction, default=False, help="Generate model detail page."
+)
return parser


@@ -66,10 +76,6 @@ args = create_argparser().parse_args()
path = Path(__file__).parent / "../.models.json"
manager = ModelManager(path)

-if args.list_models:
-manager.list_models()
-sys.exit()
-
# update in-use models to the specified released models.
model_path = None
config_path = None
@@ -164,14 +170,12 @@ def index():
def details():
if args.config_path is not None and os.path.isfile(args.config_path):
model_config = load_config(args.config_path)
-else:
-if args.model_name is not None:
+elif args.model_name is not None:
model_config = load_config(config_path)

if args.vocoder_config_path is not None and os.path.isfile(args.vocoder_config_path):
vocoder_config = load_config(args.vocoder_config_path)
-else:
-if args.vocoder_name is not None:
+elif args.vocoder_name is not None:
vocoder_config = load_config(vocoder_config_path)
else:
vocoder_config = None
@@ -197,9 +201,9 @@ def tts():
style_wav = request.headers.get("style-wav") or request.values.get("style_wav", "")
style_wav = style_wav_uri_to_dict(style_wav)

-print(f" > Model input: {text}")
-print(f" > Speaker Idx: {speaker_idx}")
-print(f" > Language Idx: {language_idx}")
+logger.info("Model input: %s", text)
+logger.info("Speaker idx: %s", speaker_idx)
+logger.info("Language idx: %s", language_idx)
wavs = synthesizer.tts(text, speaker_name=speaker_idx, language_name=language_idx, style_wav=style_wav)
out = io.BytesIO()
synthesizer.save_wav(wavs, out)
@@ -243,7 +247,7 @@ def mary_tts_api_process():
text = data.get("INPUT_TEXT", [""])[0]
else:
text = request.args.get("INPUT_TEXT", "")
-print(f" > Model input: {text}")
+logger.info("Model input: %s", text)
wavs = synthesizer.tts(text)
out = io.BytesIO()
synthesizer.save_wav(wavs, out)
@@ -30,7 +30,7 @@
</head>

<body>
-<a href="https://github.com/coqui-ai/TTS"><img style="position: absolute; z-index:1000; top: 0; left: 0; border: 0;"
+<a href="https://github.com/idiap/coqui-ai-TTS"><img style="position: absolute; z-index:1000; top: 0; left: 0; border: 0;"
src="https://s3.amazonaws.com/github/ribbons/forkme_left_darkblue_121621.png" alt="Fork me on GitHub"></a>

<!-- Navigation -->
@@ -2,11 +2,12 @@ import os
from dataclasses import dataclass, field
from typing import Dict

+from trainer.io import get_user_data_dir
+
from TTS.tts.configs.shared_configs import BaseTTSConfig
from TTS.tts.layers.bark.model import GPTConfig
from TTS.tts.layers.bark.model_fine import FineGPTConfig
from TTS.tts.models.bark import BarkAudioConfig
-from TTS.utils.generic_utils import get_user_data_dir


@dataclass
@@ -95,7 +96,6 @@ class BarkConfig(BaseTTSConfig):
"coarse": os.path.join(self.CACHE_DIR, "coarse_2.pt"),
"fine": os.path.join(self.CACHE_DIR, "fine_2.pt"),
"hubert_tokenizer": os.path.join(self.CACHE_DIR, "tokenizer.pth"),
-"hubert": os.path.join(self.CACHE_DIR, "hubert.pt"),
}
self.SMALL_REMOTE_MODEL_PATHS = {
"text": {"path": os.path.join(self.REMOTE_BASE_URL, "text.pt")},
@@ -1,3 +1,4 @@
+import logging
import os
import sys
from collections import Counter
@@ -9,6 +10,8 @@ import numpy as np
from TTS.tts.datasets.dataset import *
from TTS.tts.datasets.formatters import *

+logger = logging.getLogger(__name__)


def split_dataset(items, eval_split_max_size=None, eval_split_size=0.01):
"""Split a dataset into train and eval. Consider speaker distribution in multi-speaker training.
@@ -122,7 +125,7 @@ def load_tts_samples(

meta_data_train = add_extra_keys(meta_data_train, language, dataset_name)

-print(f" | > Found {len(meta_data_train)} files in {Path(root_path).resolve()}")
+logger.info("Found %d files in %s", len(meta_data_train), Path(root_path).resolve())
# load evaluation split if set
if eval_split:
if meta_file_val:
@@ -163,19 +166,23 @@ def load_attention_mask_meta_data(metafile_path):
def _get_formatter_by_name(name):
"""Returns the respective preprocessing function."""
thismodule = sys.modules[__name__]
+if not hasattr(thismodule, name.lower()):
+msg = (
+f"{name} formatter not found. If it is a custom formatter, pass the function to load_tts_samples() instead."
+)
+raise ValueError(msg)
return getattr(thismodule, name.lower())


-def find_unique_chars(data_samples, verbose=True):
-texts = "".join(item[0] for item in data_samples)
+def find_unique_chars(data_samples):
+texts = "".join(item["text"] for item in data_samples)
chars = set(texts)
lower_chars = filter(lambda c: c.islower(), chars)
chars_force_lower = [c.lower() for c in chars]
chars_force_lower = set(chars_force_lower)

-if verbose:
-print(f" > Number of unique characters: {len(chars)}")
-print(f" > Unique characters: {''.join(sorted(chars))}")
-print(f" > Unique lower characters: {''.join(sorted(lower_chars))}")
-print(f" > Unique all forced to lower characters: {''.join(sorted(chars_force_lower))}")
+logger.info("Number of unique characters: %d", len(chars))
+logger.info("Unique characters: %s", "".join(sorted(chars)))
+logger.info("Unique lower characters: %s", "".join(sorted(lower_chars)))
+logger.info("Unique all forced to lower characters: %s", "".join(sorted(chars_force_lower)))
return chars_force_lower
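The new error message above points users to passing a custom formatter directly instead of relying on name lookup. A hedged sketch of what that looks like: the `formatter=` keyword and the sample-dict keys follow the public `load_tts_samples` API as used elsewhere in the codebase, while the `audio_path|text` CSV layout and file paths here are purely illustrative.

```python
# Hedged sketch: custom formatter passed to load_tts_samples (metadata layout is hypothetical).
import csv
import os

from TTS.config.shared_configs import BaseDatasetConfig
from TTS.tts.datasets import load_tts_samples


def my_formatter(root_path, meta_file, **kwargs):
    """Parse a hypothetical `audio_path|text` metadata file into TTS samples."""
    items = []
    with open(os.path.join(root_path, meta_file), encoding="utf-8") as f:
        for row in csv.reader(f, delimiter="|"):
            items.append(
                {
                    "text": row[1],
                    "audio_file": os.path.join(root_path, row[0]),
                    "speaker_name": "speaker0",
                    "root_path": root_path,
                }
            )
    return items


config = BaseDatasetConfig(meta_file_train="metadata.csv", path="data/")
train_samples, eval_samples = load_tts_samples(config, eval_split=True, formatter=my_formatter)
```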
@@ -1,11 +1,14 @@
import base64
import collections
+import logging
import os
import random
-from typing import Dict, List, Union
+from typing import Any, Optional, Union

import numpy as np
+import numpy.typing as npt
import torch
+import torchaudio
import tqdm
from torch.utils.data import Dataset

@@ -13,7 +16,7 @@ from TTS.tts.utils.data import prepare_data, prepare_stop_target, prepare_tensor
from TTS.utils.audio import AudioProcessor
from TTS.utils.audio.numpy_transforms import compute_energy as calculate_energy

-import mutagen
+logger = logging.getLogger(__name__)

# to prevent too many open files error as suggested here
# https://github.com/pytorch/pytorch/issues/11201#issuecomment-421146936
@@ -30,27 +33,59 @@ def _parse_sample(item):
elif len(item) == 3:
text, wav_file, speaker_name = item
else:
-raise ValueError(" [!] Dataset cannot parse the sample.")
+msg = "Dataset cannot parse the sample."
+raise ValueError(msg)
return text, wav_file, speaker_name, language_name, attn_file


-def noise_augment_audio(wav):
+def noise_augment_audio(wav: npt.NDArray) -> npt.NDArray:
return wav + (1.0 / 32768.0) * np.random.rand(*wav.shape)


-def string2filename(string):
+def string2filename(string: str) -> str:
# generate a safe and reversible filename based on a string
-filename = base64.urlsafe_b64encode(string.encode("utf-8")).decode("utf-8", "ignore")
-return filename
+return base64.urlsafe_b64encode(string.encode("utf-8")).decode("utf-8", "ignore")


-def get_audio_size(audiopath):
+def get_audio_size(audiopath: Union[str, os.PathLike[Any]]) -> int:
+"""Return the number of samples in the audio file."""
+if not isinstance(audiopath, str):
+audiopath = str(audiopath)
extension = audiopath.rpartition(".")[-1].lower()
if extension not in {"mp3", "wav", "flac"}:
-raise RuntimeError(f"The audio format {extension} is not supported, please convert the audio files to mp3, flac, or wav format!")
+msg = f"The audio format {extension} is not supported, please convert the audio files to mp3, flac, or wav format!"
+raise RuntimeError(msg)

-audio_info = mutagen.File(audiopath).info
-return int(audio_info.length * audio_info.sample_rate)
+try:
+return torchaudio.info(audiopath).num_frames
+except RuntimeError as e:
+msg = f"Failed to decode {audiopath}"
+raise RuntimeError(msg) from e
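`get_audio_size` now relies on `torchaudio.info` instead of mutagen and raises a `RuntimeError` for undecodable files. A small usage sketch (the file path is a placeholder):

```python
# Query the sample count of a wav file and skip files that fail to decode.
from TTS.tts.datasets.dataset import get_audio_size

try:
    num_samples = get_audio_size("data/wavs/utt1.wav")
    print(f"{num_samples} samples")
except RuntimeError as err:
    print(f"skipping corrupt file: {err}")
```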
+def get_attribute_balancer_weights(items: list, attr_name: str, multi_dict: Optional[dict] = None):
+"""Create inverse frequency weights for balancing the dataset.
+
+Use `multi_dict` to scale relative weights."""
+attr_names_samples = np.array([item[attr_name] for item in items])
+unique_attr_names = np.unique(attr_names_samples).tolist()
+attr_idx = [unique_attr_names.index(l) for l in attr_names_samples]
+attr_count = np.array([len(np.where(attr_names_samples == l)[0]) for l in unique_attr_names])
+weight_attr = 1.0 / attr_count
+dataset_samples_weight = np.array([weight_attr[l] for l in attr_idx])
+dataset_samples_weight = dataset_samples_weight / np.linalg.norm(dataset_samples_weight)
+if multi_dict is not None:
+# check if all keys are in the multi_dict
+for k in multi_dict:
+assert k in unique_attr_names, f"{k} not in {unique_attr_names}"
+# scale weights
+multiplier_samples = np.array([multi_dict.get(item[attr_name], 1.0) for item in items])
+dataset_samples_weight *= multiplier_samples
+return (
+torch.from_numpy(dataset_samples_weight).float(),
+unique_attr_names,
+np.unique(dataset_samples_weight).tolist(),
+)
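The new `get_attribute_balancer_weights` helper returns per-sample weights suited for a `WeightedRandomSampler`. A hedged usage sketch with toy samples follows; the `speaker_name` key mirrors the sample dicts used elsewhere in this file, and the sample list itself is illustrative only.

```python
# Balance a toy sample list so the rare speaker is drawn more often.
import torch
from torch.utils.data import WeightedRandomSampler

from TTS.tts.datasets.dataset import get_attribute_balancer_weights

samples = [
    {"speaker_name": "spk_a"},
    {"speaker_name": "spk_a"},
    {"speaker_name": "spk_a"},
    {"speaker_name": "spk_b"},
]
weights, names, unique_weights = get_attribute_balancer_weights(samples, "speaker_name")
# "spk_b" receives a proportionally larger weight than "spk_a"
sampler = WeightedRandomSampler(weights, num_samples=len(samples), replacement=True)
```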
class TTSDataset(Dataset):
@@ -59,32 +94,32 @@ class TTSDataset(Dataset):
outputs_per_step: int = 1,
compute_linear_spec: bool = False,
ap: AudioProcessor = None,
-samples: List[Dict] = None,
+samples: Optional[list[dict]] = None,
tokenizer: "TTSTokenizer" = None,
compute_f0: bool = False,
compute_energy: bool = False,
-f0_cache_path: str = None,
-energy_cache_path: str = None,
+f0_cache_path: Optional[str] = None,
+energy_cache_path: Optional[str] = None,
return_wav: bool = False,
batch_group_size: int = 0,
min_text_len: int = 0,
max_text_len: int = float("inf"),
min_audio_len: int = 0,
max_audio_len: int = float("inf"),
-phoneme_cache_path: str = None,
+phoneme_cache_path: Optional[str] = None,
precompute_num_workers: int = 0,
-speaker_id_mapping: Dict = None,
-d_vector_mapping: Dict = None,
-language_id_mapping: Dict = None,
+speaker_id_mapping: Optional[dict] = None,
+d_vector_mapping: Optional[dict] = None,
+language_id_mapping: Optional[dict] = None,
use_noise_augment: bool = False,
start_by_longest: bool = False,
-verbose: bool = False,
-):
+) -> None:
"""Generic 📂 data loader for `tts` models. It is configurable for different outputs and needs.

If you need something different, you can subclass and override.

Args:
+----
outputs_per_step (int): Number of time frames predicted per step.

compute_linear_spec (bool): compute linear spectrogram if True.
@@ -137,7 +172,6 @@ class TTSDataset(Dataset):

start_by_longest (bool): Start by longest sequence. It is especially useful to check OOM. Defaults to False.

-verbose (bool): Print diagnostic information. Defaults to false.
"""
super().__init__()
self.batch_group_size = batch_group_size
@@ -161,33 +195,44 @@ class TTSDataset(Dataset):
self.use_noise_augment = use_noise_augment
self.start_by_longest = start_by_longest

-self.verbose = verbose
self.rescue_item_idx = 1
self.pitch_computed = False
self.tokenizer = tokenizer

if self.tokenizer.use_phonemes:
self.phoneme_dataset = PhonemeDataset(
-self.samples, self.tokenizer, phoneme_cache_path, precompute_num_workers=precompute_num_workers
+self.samples,
+self.tokenizer,
+phoneme_cache_path,
+precompute_num_workers=precompute_num_workers,
)

if compute_f0:
self.f0_dataset = F0Dataset(
-self.samples, self.ap, cache_path=f0_cache_path, precompute_num_workers=precompute_num_workers
+self.samples,
+self.ap,
+cache_path=f0_cache_path,
+precompute_num_workers=precompute_num_workers,
)
if compute_energy:
self.energy_dataset = EnergyDataset(
-self.samples, self.ap, cache_path=energy_cache_path, precompute_num_workers=precompute_num_workers
+self.samples,
+self.ap,
+cache_path=energy_cache_path,
+precompute_num_workers=precompute_num_workers,
)
-if self.verbose:
self.print_logs()

@property
-def lengths(self):
+def lengths(self) -> list[int]:
lens = []
for item in self.samples:
_, wav_file, *_ = _parse_sample(item)
+try:
audio_len = get_audio_size(wav_file)
+except RuntimeError:
+logger.warning(f"Failed to compute length for {item['audio_file']}")
+audio_len = 0
lens.append(audio_len)
return lens

@@ -196,7 +241,7 @@ class TTSDataset(Dataset):
return self._samples

@samples.setter
-def samples(self, new_samples):
+def samples(self, new_samples) -> None:
self._samples = new_samples
if hasattr(self, "f0_dataset"):
self.f0_dataset.samples = new_samples
@@ -205,7 +250,7 @@ class TTSDataset(Dataset):
if hasattr(self, "phoneme_dataset"):
self.phoneme_dataset.samples = new_samples

-def __len__(self):
+def __len__(self) -> int:
return len(self.samples)

def __getitem__(self, idx):
@@ -213,11 +258,10 @@ class TTSDataset(Dataset):

def print_logs(self, level: int = 0) -> None:
indent = "\t" * level
-print("\n")
-print(f"{indent}> DataLoader initialization")
-print(f"{indent}| > Tokenizer:")
+logger.info("%sDataLoader initialization", indent)
+logger.info("%s| Tokenizer:", indent)
self.tokenizer.print_logs(level + 1)
self.tokenizer.print_logs(level + 1)
|
||||||
print(f"{indent}| > Number of instances : {len(self.samples)}")
|
logger.info("%s| Number of instances : %d", indent, len(self.samples))
|
||||||
|
|
||||||
def load_wav(self, filename):
|
def load_wav(self, filename):
|
||||||
waveform = self.ap.load_wav(filename)
|
waveform = self.ap.load_wav(filename)
|
||||||
|
@ -253,7 +297,7 @@ class TTSDataset(Dataset):
|
||||||
token_ids = self.tokenizer.text_to_ids(text)
|
token_ids = self.tokenizer.text_to_ids(text)
|
||||||
return np.array(token_ids, dtype=np.int32)
|
return np.array(token_ids, dtype=np.int32)
|
||||||
|
|
||||||
def load_data(self, idx):
|
def load_data(self, idx) -> dict[str, Any]:
|
||||||
item = self.samples[idx]
|
item = self.samples[idx]
|
||||||
|
|
||||||
raw_text = item["text"]
|
raw_text = item["text"]
|
||||||
|
@ -287,7 +331,7 @@ class TTSDataset(Dataset):
|
||||||
if self.compute_energy:
|
if self.compute_energy:
|
||||||
energy = self.get_energy(idx)["energy"]
|
energy = self.get_energy(idx)["energy"]
|
||||||
|
|
||||||
sample = {
|
return {
|
||||||
"raw_text": raw_text,
|
"raw_text": raw_text,
|
||||||
"token_ids": token_ids,
|
"token_ids": token_ids,
|
||||||
"wav": wav,
|
"wav": wav,
|
||||||
|
@ -300,13 +344,16 @@ class TTSDataset(Dataset):
|
||||||
"wav_file_name": os.path.basename(item["audio_file"]),
|
"wav_file_name": os.path.basename(item["audio_file"]),
|
||||||
"audio_unique_name": item["audio_unique_name"],
|
"audio_unique_name": item["audio_unique_name"],
|
||||||
}
|
}
|
||||||
return sample
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _compute_lengths(samples):
|
def _compute_lengths(samples):
|
||||||
new_samples = []
|
new_samples = []
|
||||||
for item in samples:
|
for item in samples:
|
||||||
|
try:
|
||||||
audio_length = get_audio_size(item["audio_file"])
|
audio_length = get_audio_size(item["audio_file"])
|
||||||
|
except RuntimeError:
|
||||||
|
logger.warning(f"Failed to compute length, skipping {item['audio_file']}")
|
||||||
|
continue
|
||||||
text_lenght = len(item["text"])
|
text_lenght = len(item["text"])
|
||||||
item["audio_length"] = audio_length
|
item["audio_length"] = audio_length
|
||||||
item["text_length"] = text_lenght
|
item["text_length"] = text_lenght
|
||||||
|
@ -314,7 +361,7 @@ class TTSDataset(Dataset):
|
||||||
return new_samples
|
return new_samples
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def filter_by_length(lengths: List[int], min_len: int, max_len: int):
|
def filter_by_length(lengths: list[int], min_len: int, max_len: int):
|
||||||
idxs = np.argsort(lengths) # ascending order
|
idxs = np.argsort(lengths) # ascending order
|
||||||
ignore_idx = []
|
ignore_idx = []
|
||||||
keep_idx = []
|
keep_idx = []
|
||||||
|
@ -327,10 +374,9 @@ class TTSDataset(Dataset):
|
||||||
return ignore_idx, keep_idx
|
return ignore_idx, keep_idx
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def sort_by_length(samples: List[List]):
|
def sort_by_length(samples: list[list]):
|
||||||
audio_lengths = [s["audio_length"] for s in samples]
|
audio_lengths = [s["audio_length"] for s in samples]
|
||||||
idxs = np.argsort(audio_lengths) # ascending order
|
return np.argsort(audio_lengths) # ascending order
|
||||||
return idxs
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def create_buckets(samples, batch_group_size: int):
|
def create_buckets(samples, batch_group_size: int):
|
||||||
|
@ -350,7 +396,7 @@ class TTSDataset(Dataset):
|
||||||
samples_new.append(samples[idx])
|
samples_new.append(samples[idx])
|
||||||
return samples_new
|
return samples_new
|
||||||
|
|
||||||
def preprocess_samples(self):
|
def preprocess_samples(self) -> None:
|
||||||
r"""Sort `items` based on text length or audio length in ascending order. Filter out samples out or the length
|
r"""Sort `items` based on text length or audio length in ascending order. Filter out samples out or the length
|
||||||
range.
|
range.
|
||||||
"""
|
"""
|
||||||
|
@ -376,7 +422,8 @@ class TTSDataset(Dataset):
|
||||||
samples = self._select_samples_by_idx(sorted_idxs, samples)
|
samples = self._select_samples_by_idx(sorted_idxs, samples)
|
||||||
|
|
||||||
if len(samples) == 0:
|
if len(samples) == 0:
|
||||||
raise RuntimeError(" [!] No samples left")
|
msg = "No samples left."
|
||||||
|
raise RuntimeError(msg)
|
||||||
|
|
||||||
# shuffle batch groups
|
# shuffle batch groups
|
||||||
# create batches with similar length items
|
# create batches with similar length items
|
||||||
|
@ -389,39 +436,38 @@ class TTSDataset(Dataset):
|
||||||
text_lengths = [s["text_length"] for s in samples]
|
text_lengths = [s["text_length"] for s in samples]
|
||||||
self.samples = samples
|
self.samples = samples
|
||||||
|
|
||||||
if self.verbose:
|
logger.info("Preprocessing samples")
|
||||||
print(" | > Preprocessing samples")
|
logger.info(f"Max text length: {np.max(text_lengths)}")
|
||||||
print(" | > Max text length: {}".format(np.max(text_lengths)))
|
logger.info(f"Min text length: {np.min(text_lengths)}")
|
||||||
print(" | > Min text length: {}".format(np.min(text_lengths)))
|
logger.info(f"Avg text length: {np.mean(text_lengths)}")
|
||||||
print(" | > Avg text length: {}".format(np.mean(text_lengths)))
|
logger.info(f"Max audio length: {np.max(audio_lengths)}")
|
||||||
print(" | ")
|
logger.info(f"Min audio length: {np.min(audio_lengths)}")
|
||||||
print(" | > Max audio length: {}".format(np.max(audio_lengths)))
|
logger.info(f"Avg audio length: {np.mean(audio_lengths)}")
|
||||||
print(" | > Min audio length: {}".format(np.min(audio_lengths)))
|
logger.info("Num. instances discarded samples: %d", len(ignore_idx))
|
||||||
print(" | > Avg audio length: {}".format(np.mean(audio_lengths)))
|
logger.info(f"Batch group size: {self.batch_group_size}.")
|
||||||
print(f" | > Num. instances discarded samples: {len(ignore_idx)}")
|
|
||||||
print(" | > Batch group size: {}.".format(self.batch_group_size))
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _sort_batch(batch, text_lengths):
|
def _sort_batch(batch, text_lengths):
|
||||||
"""Sort the batch by the input text length for RNN efficiency.
|
"""Sort the batch by the input text length for RNN efficiency.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
|
----
|
||||||
batch (Dict): Batch returned by `__getitem__`.
|
batch (Dict): Batch returned by `__getitem__`.
|
||||||
text_lengths (List[int]): Lengths of the input character sequences.
|
text_lengths (List[int]): Lengths of the input character sequences.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
text_lengths, ids_sorted_decreasing = torch.sort(torch.LongTensor(text_lengths), dim=0, descending=True)
|
text_lengths, ids_sorted_decreasing = torch.sort(torch.LongTensor(text_lengths), dim=0, descending=True)
|
||||||
batch = [batch[idx] for idx in ids_sorted_decreasing]
|
batch = [batch[idx] for idx in ids_sorted_decreasing]
|
||||||
return batch, text_lengths, ids_sorted_decreasing
|
return batch, text_lengths, ids_sorted_decreasing
|
||||||
|
|
||||||
def collate_fn(self, batch):
|
def collate_fn(self, batch):
|
||||||
r"""
|
"""Perform preprocessing and create a final data batch.
|
||||||
Perform preprocessing and create a final data batch:
|
|
||||||
1. Sort batch instances by text-length
|
1. Sort batch instances by text-length
|
||||||
2. Convert Audio signal to features.
|
2. Convert Audio signal to features.
|
||||||
3. PAD sequences wrt r.
|
3. PAD sequences wrt r.
|
||||||
4. Load to Torch.
|
4. Load to Torch.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# Puts each data field into a tensor with outer dimension batch size
|
# Puts each data field into a tensor with outer dimension batch size
|
||||||
if isinstance(batch[0], collections.abc.Mapping):
|
if isinstance(batch[0], collections.abc.Mapping):
|
||||||
token_ids_lengths = np.array([len(d["token_ids"]) for d in batch])
|
token_ids_lengths = np.array([len(d["token_ids"]) for d in batch])
|
||||||
|
@ -456,9 +502,11 @@ class TTSDataset(Dataset):
|
||||||
|
|
||||||
# lengths adjusted by the reduction factor
|
# lengths adjusted by the reduction factor
|
||||||
mel_lengths_adjusted = [
|
mel_lengths_adjusted = [
|
||||||
|
(
|
||||||
m.shape[1] + (self.outputs_per_step - (m.shape[1] % self.outputs_per_step))
|
m.shape[1] + (self.outputs_per_step - (m.shape[1] % self.outputs_per_step))
|
||||||
if m.shape[1] % self.outputs_per_step
|
if m.shape[1] % self.outputs_per_step
|
||||||
else m.shape[1]
|
else m.shape[1]
|
||||||
|
)
|
||||||
for m in mel
|
for m in mel
|
||||||
]
|
]
|
||||||
|
|
||||||
|
@ -564,23 +612,18 @@ class TTSDataset(Dataset):
|
||||||
"audio_unique_names": batch["audio_unique_name"],
|
"audio_unique_names": batch["audio_unique_name"],
|
||||||
}
|
}
|
||||||
|
|
||||||
raise TypeError(
|
msg = f"batch must contain tensors, numbers, dicts or lists; found {type(batch[0])}"
|
||||||
(
|
raise TypeError(msg)
|
||||||
"batch must contain tensors, numbers, dicts or lists;\
|
|
||||||
found {}".format(
|
|
||||||
type(batch[0])
|
|
||||||
)
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class PhonemeDataset(Dataset):
|
class PhonemeDataset(Dataset):
|
||||||
"""Phoneme Dataset for converting input text to phonemes and then token IDs
|
"""Phoneme Dataset for converting input text to phonemes and then token IDs.
|
||||||
|
|
||||||
At initialization, it pre-computes the phonemes under `cache_path` and loads them in training to reduce data
|
At initialization, it pre-computes the phonemes under `cache_path` and loads them in training to reduce data
|
||||||
loading latency. If `cache_path` is already present, it skips the pre-computation.
|
loading latency. If `cache_path` is already present, it skips the pre-computation.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
|
----
|
||||||
samples (Union[List[List], List[Dict]]):
|
samples (Union[List[List], List[Dict]]):
|
||||||
List of samples. Each sample is a list or a dict.
|
List of samples. Each sample is a list or a dict.
|
||||||
|
|
||||||
|
@ -592,15 +635,16 @@ class PhonemeDataset(Dataset):
|
||||||
|
|
||||||
precompute_num_workers (int):
|
precompute_num_workers (int):
|
||||||
Number of workers used for pre-computing the phonemes. Defaults to 0.
|
Number of workers used for pre-computing the phonemes. Defaults to 0.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
samples: Union[List[Dict], List[List]],
|
samples: Union[list[dict], list[list]],
|
||||||
tokenizer: "TTSTokenizer",
|
tokenizer: "TTSTokenizer",
|
||||||
cache_path: str,
|
cache_path: str,
|
||||||
precompute_num_workers=0,
|
precompute_num_workers: int = 0,
|
||||||
):
|
) -> None:
|
||||||
self.samples = samples
|
self.samples = samples
|
||||||
self.tokenizer = tokenizer
|
self.tokenizer = tokenizer
|
||||||
self.cache_path = cache_path
|
self.cache_path = cache_path
|
||||||
|
@ -608,16 +652,16 @@ class PhonemeDataset(Dataset):
|
||||||
os.makedirs(cache_path)
|
os.makedirs(cache_path)
|
||||||
self.precompute(precompute_num_workers)
|
self.precompute(precompute_num_workers)
|
||||||
|
|
||||||
def __getitem__(self, index):
|
def __getitem__(self, index) -> dict[str, Any]:
|
||||||
item = self.samples[index]
|
item = self.samples[index]
|
||||||
ids = self.compute_or_load(string2filename(item["audio_unique_name"]), item["text"], item["language"])
|
ids = self.compute_or_load(string2filename(item["audio_unique_name"]), item["text"], item["language"])
|
||||||
ph_hat = self.tokenizer.ids_to_text(ids)
|
ph_hat = self.tokenizer.ids_to_text(ids)
|
||||||
return {"text": item["text"], "ph_hat": ph_hat, "token_ids": ids, "token_ids_len": len(ids)}
|
return {"text": item["text"], "ph_hat": ph_hat, "token_ids": ids, "token_ids_len": len(ids)}
|
||||||
|
|
||||||
def __len__(self):
|
def __len__(self) -> int:
|
||||||
return len(self.samples)
|
return len(self.samples)
|
||||||
|
|
||||||
def compute_or_load(self, file_name, text, language):
|
def compute_or_load(self, file_name: str, text: str, language: str) -> list[int]:
|
||||||
"""Compute phonemes for the given text.
|
"""Compute phonemes for the given text.
|
||||||
|
|
||||||
If the phonemes are already cached, load them from cache.
|
If the phonemes are already cached, load them from cache.
|
||||||
|
@ -631,20 +675,24 @@ class PhonemeDataset(Dataset):
|
||||||
np.save(cache_path, ids)
|
np.save(cache_path, ids)
|
||||||
return ids
|
return ids
|
||||||
|
|
||||||
def get_pad_id(self):
|
def get_pad_id(self) -> int:
|
||||||
"""Get pad token ID for sequence padding"""
|
"""Get pad token ID for sequence padding."""
|
||||||
return self.tokenizer.pad_id
|
return self.tokenizer.pad_id
|
||||||
|
|
||||||
def precompute(self, num_workers=1):
|
def precompute(self, num_workers: int = 1) -> None:
|
||||||
"""Precompute phonemes for all samples.
|
"""Precompute phonemes for all samples.
|
||||||
|
|
||||||
We use pytorch dataloader because we are lazy.
|
We use pytorch dataloader because we are lazy.
|
||||||
"""
|
"""
|
||||||
print("[*] Pre-computing phonemes...")
|
logger.info("Pre-computing phonemes...")
|
||||||
with tqdm.tqdm(total=len(self)) as pbar:
|
with tqdm.tqdm(total=len(self)) as pbar:
|
||||||
batch_size = num_workers if num_workers > 0 else 1
|
batch_size = num_workers if num_workers > 0 else 1
|
||||||
dataloder = torch.utils.data.DataLoader(
|
dataloder = torch.utils.data.DataLoader(
|
||||||
batch_size=batch_size, dataset=self, shuffle=False, num_workers=num_workers, collate_fn=self.collate_fn
|
batch_size=batch_size,
|
||||||
|
dataset=self,
|
||||||
|
shuffle=False,
|
||||||
|
num_workers=num_workers,
|
||||||
|
collate_fn=self.collate_fn,
|
||||||
)
|
)
|
||||||
for _ in dataloder:
|
for _ in dataloder:
|
||||||
pbar.update(batch_size)
|
pbar.update(batch_size)
|
||||||
|
@ -662,20 +710,20 @@ class PhonemeDataset(Dataset):
|
||||||
|
|
||||||
def print_logs(self, level: int = 0) -> None:
|
def print_logs(self, level: int = 0) -> None:
|
||||||
indent = "\t" * level
|
indent = "\t" * level
|
||||||
print("\n")
|
logger.info("%sPhonemeDataset", indent)
|
||||||
print(f"{indent}> PhonemeDataset ")
|
logger.info("%s| Tokenizer:", indent)
|
||||||
print(f"{indent}| > Tokenizer:")
|
|
||||||
self.tokenizer.print_logs(level + 1)
|
self.tokenizer.print_logs(level + 1)
|
||||||
print(f"{indent}| > Number of instances : {len(self.samples)}")
|
logger.info("%s| Number of instances : %d", indent, len(self.samples))
|
||||||
|
|
||||||
|
|
||||||
class F0Dataset:
|
class F0Dataset:
|
||||||
"""F0 Dataset for computing F0 from wav files in CPU
|
"""F0 Dataset for computing F0 from wav files in CPU.
|
||||||
|
|
||||||
Pre-compute F0 values for all the samples at initialization if `cache_path` is not None or already present. It
|
Pre-compute F0 values for all the samples at initialization if `cache_path` is not None or already present. It
|
||||||
also computes the mean and std of F0 values if `normalize_f0` is True.
|
also computes the mean and std of F0 values if `normalize_f0` is True.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
|
----
|
||||||
samples (Union[List[List], List[Dict]]):
|
samples (Union[List[List], List[Dict]]):
|
||||||
List of samples. Each sample is a list or a dict.
|
List of samples. Each sample is a list or a dict.
|
||||||
|
|
||||||
|
@ -691,21 +739,20 @@ class F0Dataset:
|
||||||
|
|
||||||
normalize_f0 (bool):
|
normalize_f0 (bool):
|
||||||
Whether to normalize F0 values by mean and std. Defaults to True.
|
Whether to normalize F0 values by mean and std. Defaults to True.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
samples: Union[List[List], List[Dict]],
|
samples: Union[list[list], list[dict]],
|
||||||
ap: "AudioProcessor",
|
ap: "AudioProcessor",
|
||||||
audio_config=None, # pylint: disable=unused-argument
|
audio_config=None, # pylint: disable=unused-argument
|
||||||
verbose=False,
|
cache_path: Optional[str] = None,
|
||||||
cache_path: str = None,
|
precompute_num_workers: int = 0,
|
||||||
precompute_num_workers=0,
|
normalize_f0: bool = True,
|
||||||
normalize_f0=True,
|
) -> None:
|
||||||
):
|
|
||||||
self.samples = samples
|
self.samples = samples
|
||||||
self.ap = ap
|
self.ap = ap
|
||||||
self.verbose = verbose
|
|
||||||
self.cache_path = cache_path
|
self.cache_path = cache_path
|
||||||
self.normalize_f0 = normalize_f0
|
self.normalize_f0 = normalize_f0
|
||||||
self.pad_id = 0.0
|
self.pad_id = 0.0
|
||||||
|
@ -725,18 +772,22 @@ class F0Dataset:
|
||||||
f0 = self.normalize(f0)
|
f0 = self.normalize(f0)
|
||||||
return {"audio_unique_name": item["audio_unique_name"], "f0": f0}
|
return {"audio_unique_name": item["audio_unique_name"], "f0": f0}
|
||||||
|
|
||||||
def __len__(self):
|
def __len__(self) -> int:
|
||||||
return len(self.samples)
|
return len(self.samples)
|
||||||
|
|
||||||
def precompute(self, num_workers=0):
|
def precompute(self, num_workers: int = 0) -> None:
|
||||||
print("[*] Pre-computing F0s...")
|
logger.info("Pre-computing F0s...")
|
||||||
with tqdm.tqdm(total=len(self)) as pbar:
|
with tqdm.tqdm(total=len(self)) as pbar:
|
||||||
batch_size = num_workers if num_workers > 0 else 1
|
batch_size = num_workers if num_workers > 0 else 1
|
||||||
# we do not normalize at preproessing
|
# we do not normalize at preproessing
|
||||||
normalize_f0 = self.normalize_f0
|
normalize_f0 = self.normalize_f0
|
||||||
self.normalize_f0 = False
|
self.normalize_f0 = False
|
||||||
dataloder = torch.utils.data.DataLoader(
|
dataloder = torch.utils.data.DataLoader(
|
||||||
batch_size=batch_size, dataset=self, shuffle=False, num_workers=num_workers, collate_fn=self.collate_fn
|
batch_size=batch_size,
|
||||||
|
dataset=self,
|
||||||
|
shuffle=False,
|
||||||
|
num_workers=num_workers,
|
||||||
|
collate_fn=self.collate_fn,
|
||||||
)
|
)
|
||||||
computed_data = []
|
computed_data = []
|
||||||
for batch in dataloder:
|
for batch in dataloder:
|
||||||
|
@ -755,9 +806,8 @@ class F0Dataset:
|
||||||
return self.pad_id
|
return self.pad_id
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def create_pitch_file_path(file_name, cache_path):
|
def create_pitch_file_path(file_name: str, cache_path: str) -> str:
|
||||||
pitch_file = os.path.join(cache_path, file_name + "_pitch.npy")
|
return os.path.join(cache_path, file_name + "_pitch.npy")
|
||||||
return pitch_file
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _compute_and_save_pitch(ap, wav_file, pitch_file=None):
|
def _compute_and_save_pitch(ap, wav_file, pitch_file=None):
|
||||||
|
@ -773,7 +823,7 @@ class F0Dataset:
|
||||||
mean, std = np.mean(nonzeros), np.std(nonzeros)
|
mean, std = np.mean(nonzeros), np.std(nonzeros)
|
||||||
return mean, std
|
return mean, std
|
||||||
|
|
||||||
def load_stats(self, cache_path):
|
def load_stats(self, cache_path) -> None:
|
||||||
stats_path = os.path.join(cache_path, "pitch_stats.npy")
|
stats_path = os.path.join(cache_path, "pitch_stats.npy")
|
||||||
stats = np.load(stats_path, allow_pickle=True).item()
|
stats = np.load(stats_path, allow_pickle=True).item()
|
||||||
self.mean = stats["mean"].astype(np.float32)
|
self.mean = stats["mean"].astype(np.float32)
|
||||||
|
@ -794,9 +844,7 @@ class F0Dataset:
|
||||||
return pitch
|
return pitch
|
||||||
|
|
||||||
def compute_or_load(self, wav_file, audio_unique_name):
|
def compute_or_load(self, wav_file, audio_unique_name):
|
||||||
"""
|
"""Compute pitch and return a numpy array of pitch values."""
|
||||||
compute pitch and return a numpy array of pitch values
|
|
||||||
"""
|
|
||||||
pitch_file = self.create_pitch_file_path(audio_unique_name, self.cache_path)
|
pitch_file = self.create_pitch_file_path(audio_unique_name, self.cache_path)
|
||||||
if not os.path.exists(pitch_file):
|
if not os.path.exists(pitch_file):
|
||||||
pitch = self._compute_and_save_pitch(self.ap, wav_file, pitch_file)
|
pitch = self._compute_and_save_pitch(self.ap, wav_file, pitch_file)
|
||||||
|
@ -816,18 +864,18 @@ class F0Dataset:
|
||||||
|
|
||||||
def print_logs(self, level: int = 0) -> None:
|
def print_logs(self, level: int = 0) -> None:
|
||||||
indent = "\t" * level
|
indent = "\t" * level
|
||||||
print("\n")
|
logger.info("%sF0Dataset", indent)
|
||||||
print(f"{indent}> F0Dataset ")
|
logger.info("%s| Number of instances : %d", indent, len(self.samples))
|
||||||
print(f"{indent}| > Number of instances : {len(self.samples)}")
|
|
||||||
|
|
||||||
|
|
||||||
class EnergyDataset:
|
class EnergyDataset:
|
||||||
"""Energy Dataset for computing Energy from wav files in CPU
|
"""Energy Dataset for computing Energy from wav files in CPU.
|
||||||
|
|
||||||
Pre-compute Energy values for all the samples at initialization if `cache_path` is not None or already present. It
|
Pre-compute Energy values for all the samples at initialization if `cache_path` is not None or already present. It
|
||||||
also computes the mean and std of Energy values if `normalize_Energy` is True.
|
also computes the mean and std of Energy values if `normalize_Energy` is True.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
|
----
|
||||||
samples (Union[List[List], List[Dict]]):
|
samples (Union[List[List], List[Dict]]):
|
||||||
List of samples. Each sample is a list or a dict.
|
List of samples. Each sample is a list or a dict.
|
||||||
|
|
||||||
|
@ -843,20 +891,19 @@ class EnergyDataset:
|
||||||
|
|
||||||
normalize_Energy (bool):
|
normalize_Energy (bool):
|
||||||
Whether to normalize Energy values by mean and std. Defaults to True.
|
Whether to normalize Energy values by mean and std. Defaults to True.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
samples: Union[List[List], List[Dict]],
|
samples: Union[list[list], list[dict]],
|
||||||
ap: "AudioProcessor",
|
ap: "AudioProcessor",
|
||||||
verbose=False,
|
cache_path: Optional[str] = None,
|
||||||
cache_path: str = None,
|
|
||||||
precompute_num_workers=0,
|
precompute_num_workers=0,
|
||||||
normalize_energy=True,
|
normalize_energy=True,
|
||||||
):
|
) -> None:
|
||||||
self.samples = samples
|
self.samples = samples
|
||||||
self.ap = ap
|
self.ap = ap
|
||||||
self.verbose = verbose
|
|
||||||
self.cache_path = cache_path
|
self.cache_path = cache_path
|
||||||
self.normalize_energy = normalize_energy
|
self.normalize_energy = normalize_energy
|
||||||
self.pad_id = 0.0
|
self.pad_id = 0.0
|
||||||
|
@ -876,18 +923,22 @@ class EnergyDataset:
|
||||||
energy = self.normalize(energy)
|
energy = self.normalize(energy)
|
||||||
return {"audio_unique_name": item["audio_unique_name"], "energy": energy}
|
return {"audio_unique_name": item["audio_unique_name"], "energy": energy}
|
||||||
|
|
||||||
def __len__(self):
|
def __len__(self) -> int:
|
||||||
return len(self.samples)
|
return len(self.samples)
|
||||||
|
|
||||||
def precompute(self, num_workers=0):
|
def precompute(self, num_workers=0) -> None:
|
||||||
print("[*] Pre-computing energys...")
|
logger.info("Pre-computing energys...")
|
||||||
with tqdm.tqdm(total=len(self)) as pbar:
|
with tqdm.tqdm(total=len(self)) as pbar:
|
||||||
batch_size = num_workers if num_workers > 0 else 1
|
batch_size = num_workers if num_workers > 0 else 1
|
||||||
# we do not normalize at preproessing
|
# we do not normalize at preproessing
|
||||||
normalize_energy = self.normalize_energy
|
normalize_energy = self.normalize_energy
|
||||||
self.normalize_energy = False
|
self.normalize_energy = False
|
||||||
dataloder = torch.utils.data.DataLoader(
|
dataloder = torch.utils.data.DataLoader(
|
||||||
batch_size=batch_size, dataset=self, shuffle=False, num_workers=num_workers, collate_fn=self.collate_fn
|
batch_size=batch_size,
|
||||||
|
dataset=self,
|
||||||
|
shuffle=False,
|
||||||
|
num_workers=num_workers,
|
||||||
|
collate_fn=self.collate_fn,
|
||||||
)
|
)
|
||||||
computed_data = []
|
computed_data = []
|
||||||
for batch in dataloder:
|
for batch in dataloder:
|
||||||
|
@ -908,8 +959,7 @@ class EnergyDataset:
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def create_energy_file_path(wav_file, cache_path):
|
def create_energy_file_path(wav_file, cache_path):
|
||||||
file_name = os.path.splitext(os.path.basename(wav_file))[0]
|
file_name = os.path.splitext(os.path.basename(wav_file))[0]
|
||||||
energy_file = os.path.join(cache_path, file_name + "_energy.npy")
|
return os.path.join(cache_path, file_name + "_energy.npy")
|
||||||
return energy_file
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _compute_and_save_energy(ap, wav_file, energy_file=None):
|
def _compute_and_save_energy(ap, wav_file, energy_file=None):
|
||||||
|
@ -925,7 +975,7 @@ class EnergyDataset:
|
||||||
mean, std = np.mean(nonzeros), np.std(nonzeros)
|
mean, std = np.mean(nonzeros), np.std(nonzeros)
|
||||||
return mean, std
|
return mean, std
|
||||||
|
|
||||||
def load_stats(self, cache_path):
|
def load_stats(self, cache_path) -> None:
|
||||||
stats_path = os.path.join(cache_path, "energy_stats.npy")
|
stats_path = os.path.join(cache_path, "energy_stats.npy")
|
||||||
stats = np.load(stats_path, allow_pickle=True).item()
|
stats = np.load(stats_path, allow_pickle=True).item()
|
||||||
self.mean = stats["mean"].astype(np.float32)
|
self.mean = stats["mean"].astype(np.float32)
|
||||||
|
@ -946,9 +996,7 @@ class EnergyDataset:
|
||||||
return energy
|
return energy
|
||||||
|
|
||||||
def compute_or_load(self, wav_file, audio_unique_name):
|
def compute_or_load(self, wav_file, audio_unique_name):
|
||||||
"""
|
"""Compute energy and return a numpy array of energy values."""
|
||||||
compute energy and return a numpy array of energy values
|
|
||||||
"""
|
|
||||||
energy_file = self.create_energy_file_path(audio_unique_name, self.cache_path)
|
energy_file = self.create_energy_file_path(audio_unique_name, self.cache_path)
|
||||||
if not os.path.exists(energy_file):
|
if not os.path.exists(energy_file):
|
||||||
energy = self._compute_and_save_energy(self.ap, wav_file, energy_file)
|
energy = self._compute_and_save_energy(self.ap, wav_file, energy_file)
|
||||||
|
@ -968,6 +1016,5 @@ class EnergyDataset:
|
||||||
|
|
||||||
def print_logs(self, level: int = 0) -> None:
|
def print_logs(self, level: int = 0) -> None:
|
||||||
indent = "\t" * level
|
indent = "\t" * level
|
||||||
print("\n")
|
logger.info("%senergyDataset")
|
||||||
-        print(f"{indent}> energyDataset ")
+        logger.info("%s| Number of instances : %d", indent, len(self.samples))
-        print(f"{indent}| > Number of instances : {len(self.samples)}")

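The hunks above modernize the dataset loader: `Optional[...]`/built-in generic hints, multi-line constructor calls, and logging through a module-level logger instead of a `verbose` flag and `print`. For orientation, a minimal usage sketch under stated assumptions — `ap`, `tokenizer`, and `train_samples` are assumed to be an initialized `AudioProcessor`, `TTSTokenizer`, and a formatter output, and the cache path is a placeholder; none of these values come from the commit itself:

```python
# Hypothetical sketch of constructing the refactored TTSDataset.
import logging

logging.basicConfig(level=logging.INFO)  # diagnostics now flow through `logging`

dataset = TTSDataset(
    outputs_per_step=1,
    ap=ap,                       # assumed AudioProcessor instance
    samples=train_samples,       # assumed list[dict] from a dataset formatter
    tokenizer=tokenizer,         # assumed TTSTokenizer instance
    compute_f0=True,
    f0_cache_path="cache/f0",    # placeholder path
    min_audio_len=1,
    max_audio_len=10 * 22050,
    start_by_longest=True,       # surface OOM issues with the longest batch first
)
dataset.preprocess_samples()     # filters/sorts samples and logs the statistics
```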
@@ -1,3 +1,5 @@
+import csv
+import logging
 import os
 import re
 import xml.etree.ElementTree as ET
@@ -5,9 +7,10 @@ from glob import glob
 from pathlib import Path
 from typing import List

-import pandas as pd
 from tqdm import tqdm

+logger = logging.getLogger(__name__)

 ########################
 # DATASETS
 ########################
@@ -23,32 +26,34 @@ def cml_tts(root_path, meta_file, ignored_speakers=None):
     num_cols = len(lines[0].split("|"))  # take the first row as reference
     for idx, line in enumerate(lines[1:]):
         if len(line.split("|")) != num_cols:
-            print(f" > Missing column in line {idx + 1} -> {line.strip()}")
+            logger.warning("Missing column in line %d -> %s", idx + 1, line.strip())
     # load metadata
-    metadata = pd.read_csv(os.path.join(root_path, meta_file), sep="|")
+    with open(Path(root_path) / meta_file, newline="", encoding="utf-8") as f:
-    assert all(x in metadata.columns for x in ["wav_filename", "transcript"])
+        reader = csv.DictReader(f, delimiter="|")
-    client_id = None if "client_id" in metadata.columns else "default"
+        metadata = list(reader)
-    emotion_name = None if "emotion_name" in metadata.columns else "neutral"
+    assert all(x in metadata[0] for x in ["wav_filename", "transcript"])
+    client_id = None if "client_id" in metadata[0] else "default"
+    emotion_name = None if "emotion_name" in metadata[0] else "neutral"
     items = []
     not_found_counter = 0
-    for row in metadata.itertuples():
+    for row in metadata:
-        if client_id is None and ignored_speakers is not None and row.client_id in ignored_speakers:
+        if client_id is None and ignored_speakers is not None and row["client_id"] in ignored_speakers:
             continue
-        audio_path = os.path.join(root_path, row.wav_filename)
+        audio_path = os.path.join(root_path, row["wav_filename"])
         if not os.path.exists(audio_path):
             not_found_counter += 1
             continue
         items.append(
             {
-                "text": row.transcript,
+                "text": row["transcript"],
                 "audio_file": audio_path,
-                "speaker_name": client_id if client_id is not None else row.client_id,
+                "speaker_name": client_id if client_id is not None else row["client_id"],
-                "emotion_name": emotion_name if emotion_name is not None else row.emotion_name,
+                "emotion_name": emotion_name if emotion_name is not None else row["emotion_name"],
                 "root_path": root_path,
             }
         )
     if not_found_counter > 0:
-        print(f" | > [!] {not_found_counter} files not found")
+        logger.warning("%d files not found", not_found_counter)
     return items


@@ -61,32 +66,34 @@ def coqui(root_path, meta_file, ignored_speakers=None):
     num_cols = len(lines[0].split("|"))  # take the first row as reference
     for idx, line in enumerate(lines[1:]):
         if len(line.split("|")) != num_cols:
-            print(f" > Missing column in line {idx + 1} -> {line.strip()}")
+            logger.warning("Missing column in line %d -> %s", idx + 1, line.strip())
     # load metadata
-    metadata = pd.read_csv(os.path.join(root_path, meta_file), sep="|")
+    with open(Path(root_path) / meta_file, newline="", encoding="utf-8") as f:
-    assert all(x in metadata.columns for x in ["audio_file", "text"])
+        reader = csv.DictReader(f, delimiter="|")
-    speaker_name = None if "speaker_name" in metadata.columns else "coqui"
+        metadata = list(reader)
-    emotion_name = None if "emotion_name" in metadata.columns else "neutral"
+    assert all(x in metadata[0] for x in ["audio_file", "text"])
+    speaker_name = None if "speaker_name" in metadata[0] else "coqui"
+    emotion_name = None if "emotion_name" in metadata[0] else "neutral"
     items = []
     not_found_counter = 0
-    for row in metadata.itertuples():
+    for row in metadata:
-        if speaker_name is None and ignored_speakers is not None and row.speaker_name in ignored_speakers:
+        if speaker_name is None and ignored_speakers is not None and row["speaker_name"] in ignored_speakers:
             continue
-        audio_path = os.path.join(root_path, row.audio_file)
+        audio_path = os.path.join(root_path, row["audio_file"])
         if not os.path.exists(audio_path):
             not_found_counter += 1
             continue
         items.append(
             {
-                "text": row.text,
+                "text": row["text"],
                 "audio_file": audio_path,
-                "speaker_name": speaker_name if speaker_name is not None else row.speaker_name,
+                "speaker_name": speaker_name if speaker_name is not None else row["speaker_name"],
-                "emotion_name": emotion_name if emotion_name is not None else row.emotion_name,
+                "emotion_name": emotion_name if emotion_name is not None else row["emotion_name"],
                 "root_path": root_path,
             }
         )
     if not_found_counter > 0:
-        print(f" | > [!] {not_found_counter} files not found")
+        logger.warning("%d files not found", not_found_counter)
     return items


@@ -169,7 +176,7 @@ def mailabs(root_path, meta_files=None, ignored_speakers=None):
         if isinstance(ignored_speakers, list):
             if speaker_name in ignored_speakers:
                 continue
-        print(" | > {}".format(csv_file))
+        logger.info(csv_file)
         with open(txt_file, "r", encoding="utf-8") as ttf:
             for line in ttf:
                 cols = line.split("|")
@@ -184,7 +191,7 @@ def mailabs(root_path, meta_files=None, ignored_speakers=None):
                         )
                     else:
                         # M-AI-Labs have some missing samples, so just print the warning
-                        print("> File %s does not exist!" % (wav_file))
+                        logger.warning("File %s does not exist!", wav_file)
     return items


@@ -249,7 +256,7 @@ def sam_accenture(root_path, meta_file, **kwargs):  # pylint: disable=unused-arg
         text = item.text
         wav_file = os.path.join(root_path, "vo_voice_quality_transformation", item.get("id") + ".wav")
         if not os.path.exists(wav_file):
-            print(f" [!] {wav_file} in metafile does not exist. Skipping...")
+            logger.warning("%s in metafile does not exist. Skipping...", wav_file)
             continue
         items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path})
     return items
@@ -370,7 +377,7 @@ def custom_turkish(root_path, meta_file, **kwargs):  # pylint: disable=unused-ar
            continue
         text = cols[1].strip()
         items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path})
-    print(f" [!] {len(skipped_files)} files skipped. They don't exist...")
+    logger.warning("%d files skipped. They don't exist...", len(skipped_files))
     return items


@@ -438,7 +445,7 @@ def vctk(root_path, meta_files=None, wavs_path="wav48_silence_trimmed", mic="mic
             items.append(
                 {"text": text, "audio_file": wav_file, "speaker_name": "VCTK_" + speaker_id, "root_path": root_path}
             )
         else:
-            print(f" [!] wav files don't exist - {wav_file}")
+            logger.warning("Wav file doesn't exist - %s", wav_file)
     return items

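The formatter hunks above drop the pandas dependency and read the pipe-separated metadata with the standard library instead. A self-contained sketch of that pattern follows; the file name and the two-line sample content are invented, while the column names mirror the `coqui` reader shown above:

```python
import csv
from pathlib import Path

# Invented metadata file in the "|"-separated layout the formatters expect.
meta = Path("metadata.csv")
meta.write_text("audio_file|text\nwavs/0001.wav|Hello world.\n", encoding="utf-8")

# Same reading pattern as the refactored cml_tts()/coqui() formatters.
with open(meta, newline="", encoding="utf-8") as f:
    rows = list(csv.DictReader(f, delimiter="|"))

assert all(x in rows[0] for x in ["audio_file", "text"])
print(rows[0]["text"])  # -> "Hello world."
```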
@@ -1,11 +1,14 @@
 # From https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer

+import logging
 import os.path
 import shutil
 import urllib.request

 import huggingface_hub

+logger = logging.getLogger(__name__)


 class HubertManager:
     @staticmethod
@@ -13,9 +16,9 @@ class HubertManager:
         download_url: str = "https://dl.fbaipublicfiles.com/hubert/hubert_base_ls960.pt", model_path: str = ""
     ):
         if not os.path.isfile(model_path):
-            print("Downloading HuBERT base model")
+            logger.info("Downloading HuBERT base model")
             urllib.request.urlretrieve(download_url, model_path)
-            print("Downloaded HuBERT")
+            logger.info("Downloaded HuBERT")
             return model_path
         return None

@@ -27,9 +30,9 @@ class HubertManager:
     ):
         model_dir = os.path.dirname(model_path)
         if not os.path.isfile(model_path):
-            print("Downloading HuBERT custom tokenizer")
+            logger.info("Downloading HuBERT custom tokenizer")
             huggingface_hub.hf_hub_download(repo, model, local_dir=model_dir, local_dir_use_symlinks=False)
             shutil.move(os.path.join(model_dir, model), model_path)
-            print("Downloaded tokenizer")
+            logger.info("Downloaded tokenizer")
             return model_path
         return None
@@ -7,8 +7,6 @@ License: MIT

 # Modified code from https://github.com/lucidrains/audiolm-pytorch/blob/main/audiolm_pytorch/hubert_kmeans.py

-import logging
-from pathlib import Path

 import torch
 from einops import pack, unpack
@@ -16,6 +14,8 @@ from torch import nn
 from torchaudio.functional import resample
 from transformers import HubertModel

+from TTS.utils.generic_utils import exists


 def round_down_nearest_multiple(num, divisor):
     return num // divisor * divisor
@@ -28,21 +28,13 @@ def curtail_to_multiple(t, mult, from_left=False):
     return t[..., seq_slice]


-def exists(val):
-    return val is not None


-def default(val, d):
-    return val if exists(val) else d


 class CustomHubert(nn.Module):
     """
     checkpoint and kmeans can be downloaded at https://github.com/facebookresearch/fairseq/tree/main/examples/hubert
     or you can train your own
     """

-    def __init__(self, checkpoint_path, target_sample_hz=16000, seq_len_multiple_of=None, output_layer=9, device=None):
+    def __init__(self, target_sample_hz=16000, seq_len_multiple_of=None, output_layer=9, device=None):
         super().__init__()
         self.target_sample_hz = target_sample_hz
         self.seq_len_multiple_of = seq_len_multiple_of
@@ -5,6 +5,7 @@
 """

 import json
+import logging
 import os.path
 from zipfile import ZipFile

@@ -12,6 +13,8 @@ import numpy
 import torch
 from torch import nn, optim

+logger = logging.getLogger(__name__)


 class HubertTokenizer(nn.Module):
     def __init__(self, hidden_size=1024, input_size=768, output_size=10000, version=0):
@@ -85,7 +88,7 @@ class HubertTokenizer(nn.Module):

         # Print loss
         if log_loss:
-            print("Loss", loss.item())
+            logger.info("Loss %.3f", loss.item())

         # Backward pass
         loss.backward()
@@ -157,10 +160,10 @@ def auto_train(data_path, save_path="model.pth", load_model: str = None, save_ep
     data_x, data_y = [], []

     if load_model and os.path.isfile(load_model):
-        print("Loading model from", load_model)
+        logger.info("Loading model from %s", load_model)
         model_training = HubertTokenizer.load_from_checkpoint(load_model, "cuda")
     else:
-        print("Creating new model.")
+        logger.info("Creating new model.")
         model_training = HubertTokenizer(version=1).to("cuda")  # Settings for the model to run without lstm
     save_path = os.path.join(data_path, save_path)
     base_save_path = ".".join(save_path.split(".")[:-1])
@@ -191,5 +194,5 @@ def auto_train(data_path, save_path="model.pth", load_model: str = None, save_ep
         save_p_2 = f"{base_save_path}_epoch_{epoch}.pth"
         model_training.save(save_p)
         model_training.save(save_p_2)
-        print(f"Epoch {epoch} completed")
+        logger.info("Epoch %d completed", epoch)
         epoch += 1
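These modules now emit messages through a module-level `logger = logging.getLogger(__name__)` rather than `print`, so they only become visible once the application configures logging. A generic sketch of that wiring, not code from the repository:

```python
import logging

# Route library loggers to the console; adjust the level and format to taste.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(name)s %(levelname)s: %(message)s",
)

# Child loggers such as the ones defined in these modules inherit this setting;
# raise only the TTS package to DEBUG if more detail is wanted.
logging.getLogger("TTS").setLevel(logging.DEBUG)
```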
@@ -2,10 +2,11 @@ import logging
 import os
 import re
 from glob import glob
-from typing import Dict, List
+from typing import Dict, List, Optional, Tuple

 import librosa
 import numpy as np
+import numpy.typing as npt
 import torch
 import torchaudio
 import tqdm
@@ -48,7 +49,7 @@ def get_voices(extra_voice_dirs: List[str] = []):  # pylint: disable=dangerous-d
     return voices


-def load_npz(npz_file):
+def load_npz(npz_file: str) -> Tuple[npt.NDArray[np.int64], npt.NDArray[np.int64], npt.NDArray[np.int64]]:
     x_history = np.load(npz_file)
     semantic = x_history["semantic_prompt"]
     coarse = x_history["coarse_prompt"]
@@ -56,7 +57,11 @@ def load_npz(npz_file):
     return semantic, coarse, fine


-def load_voice(model, voice: str, extra_voice_dirs: List[str] = []):  # pylint: disable=dangerous-default-value
+def load_voice(
+    model, voice: str, extra_voice_dirs: List[str] = []
+) -> Tuple[
+    Optional[npt.NDArray[np.int64]], Optional[npt.NDArray[np.int64]], Optional[npt.NDArray[np.int64]]
+]:  # pylint: disable=dangerous-default-value
     if voice == "random":
         return None, None, None

@@ -107,11 +112,10 @@ def generate_voice(
     model,
     output_path,
 ):
-    """Generate a new voice from a given audio and text prompt.
+    """Generate a new voice from a given audio.

     Args:
         audio (np.ndarray): The audio to use as a base for the new voice.
-        text (str): Transcription of the audio you are clonning.
         model (BarkModel): The BarkModel to use for generating the new voice.
         output_path (str): The path to save the generated voice to.
     """
@@ -130,10 +134,9 @@ def generate_voice(
     # generate semantic tokens
     # Load the HuBERT model
     hubert_manager = HubertManager()
-    # hubert_manager.make_sure_hubert_installed(model_path=model.config.LOCAL_MODEL_PATHS["hubert"])
     hubert_manager.make_sure_tokenizer_installed(model_path=model.config.LOCAL_MODEL_PATHS["hubert_tokenizer"])

-    hubert_model = CustomHubert(checkpoint_path=model.config.LOCAL_MODEL_PATHS["hubert"]).to(model.device)
+    hubert_model = CustomHubert().to(model.device)

     # Load the CustomTokenizer model
     tokenizer = HubertTokenizer.load_from_checkpoint(
@@ -10,14 +10,10 @@ import tqdm

 from TTS.tts.layers.bark.model import GPT, GPTConfig
 from TTS.tts.layers.bark.model_fine import FineGPT, FineGPTConfig
+from TTS.utils.generic_utils import is_pytorch_at_least_2_4

-if (
+if torch.cuda.is_available() and torch.cuda.is_bf16_supported():
-    torch.cuda.is_available()
+    autocast = functools.partial(torch.autocast, device_type="cuda", dtype=torch.bfloat16)
-    and hasattr(torch.cuda, "amp")
-    and hasattr(torch.cuda.amp, "autocast")
-    and torch.cuda.is_bf16_supported()
-):
-    autocast = functools.partial(torch.cuda.amp.autocast, dtype=torch.bfloat16)
 else:

     @contextlib.contextmanager
@@ -118,7 +114,7 @@ def load_model(ckpt_path, device, config, model_type="text"):
         logger.info(f"{model_type} model not found, downloading...")
         _download(config.REMOTE_MODEL_PATHS[model_type]["path"], ckpt_path, config.CACHE_DIR)

-    checkpoint = torch.load(ckpt_path, map_location=device)
+    checkpoint = torch.load(ckpt_path, map_location=device, weights_only=is_pytorch_at_least_2_4())
     # this is a hack
     model_args = checkpoint["model_args"]
     if "input_vocab_size" not in model_args:
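Two things change in the Bark model loader above: the bfloat16 autocast context now uses the device-agnostic `torch.autocast` API instead of `torch.cuda.amp.autocast`, and checkpoints are loaded with `weights_only` when the helper reports a recent enough PyTorch. A small sketch of both patterns; the checkpoint path is a placeholder, and the version gate shown in the diff (`is_pytorch_at_least_2_4()`) is replaced here by a plain boolean:

```python
import functools

import torch

# Device-agnostic autocast, as in the new branch above.
autocast = functools.partial(torch.autocast, device_type="cuda", dtype=torch.bfloat16)

with autocast():
    pass  # bf16-eligible ops would run here

# Restrict unpickling to tensors and plain containers when loading a checkpoint.
checkpoint = torch.load(
    "checkpoint.pt",   # placeholder path
    map_location="cpu",
    weights_only=True,
)
```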
@@ -2,6 +2,7 @@
 Much of this code is adapted from Andrej Karpathy's NanoGPT
 (https://github.com/karpathy/nanoGPT)
 """

 import math
 from dataclasses import dataclass

@@ -11,18 +12,6 @@ from torch import nn
 from torch.nn import functional as F


-class LayerNorm(nn.Module):
-    """LayerNorm but with an optional bias. PyTorch doesn't support simply bias=False"""
-
-    def __init__(self, ndim, bias):
-        super().__init__()
-        self.weight = nn.Parameter(torch.ones(ndim))
-        self.bias = nn.Parameter(torch.zeros(ndim)) if bias else None
-
-    def forward(self, x):
-        return F.layer_norm(x, self.weight.shape, self.weight, self.bias, 1e-5)


 class CausalSelfAttention(nn.Module):
     def __init__(self, config):
         super().__init__()
@@ -118,9 +107,9 @@ class MLP(nn.Module):
 class Block(nn.Module):
     def __init__(self, config, layer_idx):
         super().__init__()
-        self.ln_1 = LayerNorm(config.n_embd, bias=config.bias)
+        self.ln_1 = nn.LayerNorm(config.n_embd, bias=config.bias)
         self.attn = CausalSelfAttention(config)
-        self.ln_2 = LayerNorm(config.n_embd, bias=config.bias)
+        self.ln_2 = nn.LayerNorm(config.n_embd, bias=config.bias)
         self.mlp = MLP(config)
         self.layer_idx = layer_idx

@@ -157,7 +146,7 @@ class GPT(nn.Module):
                 wpe=nn.Embedding(config.block_size, config.n_embd),
                 drop=nn.Dropout(config.dropout),
                 h=nn.ModuleList([Block(config, idx) for idx in range(config.n_layer)]),
-                ln_f=LayerNorm(config.n_embd, bias=config.bias),
+                ln_f=nn.LayerNorm(config.n_embd, bias=config.bias),
             )
         )
         self.lm_head = nn.Linear(config.n_embd, config.output_vocab_size, bias=False)
@@ -2,6 +2,7 @@
 Much of this code is adapted from Andrej Karpathy's NanoGPT
 (https://github.com/karpathy/nanoGPT)
 """

 import math
 from dataclasses import dataclass

@ -1,16 +1,17 @@
|
||||||
### credit: https://github.com/dunky11/voicesmith
|
### credit: https://github.com/dunky11/voicesmith
|
||||||
|
import logging
|
||||||
from typing import Callable, Dict, Tuple
|
from typing import Callable, Dict, Tuple
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
import torch.nn.functional as F
|
import torch.nn.functional as F
|
||||||
from coqpit import Coqpit
|
from coqpit import Coqpit
|
||||||
|
from monotonic_alignment_search import maximum_path
|
||||||
from torch import nn
|
from torch import nn
|
||||||
|
|
||||||
from TTS.tts.layers.delightful_tts.conformer import Conformer
|
from TTS.tts.layers.delightful_tts.conformer import Conformer
|
||||||
from TTS.tts.layers.delightful_tts.encoders import (
|
from TTS.tts.layers.delightful_tts.encoders import (
|
||||||
PhonemeLevelProsodyEncoder,
|
PhonemeLevelProsodyEncoder,
|
||||||
UtteranceLevelProsodyEncoder,
|
UtteranceLevelProsodyEncoder,
|
||||||
get_mask_from_lengths,
|
|
||||||
)
|
)
|
||||||
from TTS.tts.layers.delightful_tts.energy_adaptor import EnergyAdaptor
|
from TTS.tts.layers.delightful_tts.energy_adaptor import EnergyAdaptor
|
||||||
from TTS.tts.layers.delightful_tts.networks import EmbeddingPadded, positional_encoding
|
from TTS.tts.layers.delightful_tts.networks import EmbeddingPadded, positional_encoding
|
||||||
|
@ -18,7 +19,9 @@ from TTS.tts.layers.delightful_tts.phoneme_prosody_predictor import PhonemeProso
|
||||||
from TTS.tts.layers.delightful_tts.pitch_adaptor import PitchAdaptor
|
from TTS.tts.layers.delightful_tts.pitch_adaptor import PitchAdaptor
|
||||||
from TTS.tts.layers.delightful_tts.variance_predictor import VariancePredictor
|
from TTS.tts.layers.delightful_tts.variance_predictor import VariancePredictor
|
||||||
from TTS.tts.layers.generic.aligner import AlignmentNetwork
|
from TTS.tts.layers.generic.aligner import AlignmentNetwork
|
||||||
from TTS.tts.utils.helpers import generate_path, maximum_path, sequence_mask
|
from TTS.tts.utils.helpers import expand_encoder_outputs, generate_attention, sequence_mask
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class AcousticModel(torch.nn.Module):
|
class AcousticModel(torch.nn.Module):
|
||||||
|
@ -217,7 +220,7 @@ class AcousticModel(torch.nn.Module):
|
||||||
def _init_speaker_embedding(self):
|
def _init_speaker_embedding(self):
|
||||||
# pylint: disable=attribute-defined-outside-init
|
# pylint: disable=attribute-defined-outside-init
|
||||||
if self.num_speakers > 0:
|
if self.num_speakers > 0:
|
||||||
print(" > initialization of speaker-embedding layers.")
|
logger.info("Initialization of speaker-embedding layers.")
|
||||||
self.embedded_speaker_dim = self.args.speaker_embedding_channels
|
self.embedded_speaker_dim = self.args.speaker_embedding_channels
|
||||||
self.emb_g = nn.Embedding(self.num_speakers, self.embedded_speaker_dim)
|
self.emb_g = nn.Embedding(self.num_speakers, self.embedded_speaker_dim)
|
||||||
|
|
||||||
|
@ -227,42 +230,6 @@ class AcousticModel(torch.nn.Module):
|
||||||
raise ValueError("[!] Speaker embedding layer already initialized before d_vector settings.")
|
raise ValueError("[!] Speaker embedding layer already initialized before d_vector settings.")
|
||||||
self.embedded_speaker_dim = self.args.d_vector_dim
|
self.embedded_speaker_dim = self.args.d_vector_dim
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def generate_attn(dr, x_mask, y_mask=None):
|
|
||||||
"""Generate an attention mask from the linear scale durations.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
dr (Tensor): Linear scale durations.
|
|
||||||
x_mask (Tensor): Mask for the input (character) sequence.
|
|
||||||
y_mask (Tensor): Mask for the output (spectrogram) sequence. Compute it from the predicted durations
|
|
||||||
if None. Defaults to None.
|
|
||||||
|
|
||||||
Shapes
|
|
||||||
- dr: :math:`(B, T_{en})`
|
|
||||||
- x_mask: :math:`(B, T_{en})`
|
|
||||||
- y_mask: :math:`(B, T_{de})`
|
|
||||||
"""
|
|
||||||
# compute decode mask from the durations
|
|
||||||
if y_mask is None:
|
|
||||||
y_lengths = dr.sum(1).long()
|
|
||||||
y_lengths[y_lengths < 1] = 1
|
|
||||||
y_mask = torch.unsqueeze(sequence_mask(y_lengths, None), 1).to(dr.dtype)
|
|
||||||
attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(y_mask, 2)
|
|
||||||
attn = generate_path(dr, attn_mask.squeeze(1)).to(dr.dtype)
|
|
||||||
return attn
|
|
||||||
|
|
||||||
def _expand_encoder_with_durations(
|
|
||||||
self,
|
|
||||||
o_en: torch.FloatTensor,
|
|
||||||
dr: torch.IntTensor,
|
|
||||||
x_mask: torch.IntTensor,
|
|
||||||
y_lengths: torch.IntTensor,
|
|
||||||
):
|
|
||||||
y_mask = torch.unsqueeze(sequence_mask(y_lengths, None), 1).to(o_en.dtype)
|
|
||||||
attn = self.generate_attn(dr, x_mask, y_mask)
|
|
||||||
o_en_ex = torch.einsum("kmn, kjm -> kjn", [attn.float(), o_en])
|
|
||||||
return y_mask, o_en_ex, attn.transpose(1, 2)
|
|
||||||
|
|
||||||
def _forward_aligner(
|
def _forward_aligner(
|
||||||
self,
|
self,
|
||||||
x: torch.FloatTensor,
|
x: torch.FloatTensor,
|
||||||
|
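The `_expand_encoder_with_durations` helper removed above repeats each encoder frame according to its predicted duration before decoding. Below is a minimal standalone sketch of that expansion in plain PyTorch; `expand_with_durations` is a hypothetical name and the project's replacement helper (`expand_encoder_outputs`) additionally returns the attention map and decoder mask.

```python
import torch


def expand_with_durations(o_en: torch.Tensor, durations: torch.Tensor) -> torch.Tensor:
    """Repeat each encoder frame by its integer duration: [B, C, T_en] -> [B, C, T_de]."""
    expanded = [
        torch.repeat_interleave(feats, dur.long(), dim=1)  # expand along time, per token
        for feats, dur in zip(o_en, durations)
    ]
    max_len = max(e.shape[1] for e in expanded)
    out = o_en.new_zeros(o_en.shape[0], o_en.shape[1], max_len)  # pad to the longest item in the batch
    for i, e in enumerate(expanded):
        out[i, :, : e.shape[1]] = e
    return out


enc = torch.randn(2, 4, 3)                    # [B, C, T_en]
dur = torch.tensor([[1, 2, 3], [2, 2, 0]])    # per-token durations
print(expand_with_durations(enc, dur).shape)  # torch.Size([2, 4, 6])
```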
@ -336,8 +303,8 @@ class AcousticModel(torch.nn.Module):
|
||||||
{"d_vectors": d_vectors, "speaker_ids": speaker_idx}
|
{"d_vectors": d_vectors, "speaker_ids": speaker_idx}
|
||||||
) # pylint: disable=unused-variable
|
) # pylint: disable=unused-variable
|
||||||
|
|
||||||
src_mask = get_mask_from_lengths(src_lens) # [B, T_src]
|
src_mask = ~sequence_mask(src_lens) # [B, T_src]
|
||||||
mel_mask = get_mask_from_lengths(mel_lens) # [B, T_mel]
|
mel_mask = ~sequence_mask(mel_lens) # [B, T_mel]
|
||||||
|
|
||||||
# Token embeddings
|
# Token embeddings
|
||||||
token_embeddings = self.src_word_emb(tokens) # [B, T_src, C_hidden]
|
token_embeddings = self.src_word_emb(tokens) # [B, T_src, C_hidden]
|
||||||
|
@ -362,7 +329,7 @@ class AcousticModel(torch.nn.Module):
|
||||||
|
|
||||||
pos_encoding = positional_encoding(
|
pos_encoding = positional_encoding(
|
||||||
self.emb_dim,
|
self.emb_dim,
|
||||||
max(token_embeddings.shape[1], max(mel_lens)),
|
max(token_embeddings.shape[1], *mel_lens),
|
||||||
device=token_embeddings.device,
|
device=token_embeddings.device,
|
||||||
)
|
)
|
||||||
encoder_outputs = self.encoder(
|
encoder_outputs = self.encoder(
|
||||||
|
@ -416,8 +383,8 @@ class AcousticModel(torch.nn.Module):
|
||||||
encoder_outputs = encoder_outputs.transpose(1, 2) + pitch_emb + energy_emb
|
encoder_outputs = encoder_outputs.transpose(1, 2) + pitch_emb + energy_emb
|
||||||
log_duration_prediction = self.duration_predictor(x=encoder_outputs_res.detach(), mask=src_mask)
|
log_duration_prediction = self.duration_predictor(x=encoder_outputs_res.detach(), mask=src_mask)
|
||||||
|
|
||||||
mel_pred_mask, encoder_outputs_ex, alignments = self._expand_encoder_with_durations(
|
encoder_outputs_ex, alignments, mel_pred_mask = expand_encoder_outputs(
|
||||||
o_en=encoder_outputs, y_lengths=mel_lens, dr=dr, x_mask=~src_mask[:, None]
|
encoder_outputs, y_lengths=mel_lens, duration=dr, x_mask=~src_mask[:, None]
|
||||||
)
|
)
|
||||||
|
|
||||||
x = self.decoder(
|
x = self.decoder(
|
||||||
|
@ -431,7 +398,7 @@ class AcousticModel(torch.nn.Module):
|
||||||
dr = torch.log(dr + 1)
|
dr = torch.log(dr + 1)
|
||||||
|
|
||||||
dr_pred = torch.exp(log_duration_prediction) - 1
|
dr_pred = torch.exp(log_duration_prediction) - 1
|
||||||
alignments_dp = self.generate_attn(dr_pred, src_mask.unsqueeze(1), mel_pred_mask) # [B, T_max, T_max2']
|
alignments_dp = generate_attention(dr_pred, src_mask.unsqueeze(1), mel_pred_mask) # [B, T_max, T_max2']
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"model_outputs": x,
|
"model_outputs": x,
|
||||||
|
@ -444,7 +411,7 @@ class AcousticModel(torch.nn.Module):
|
||||||
"p_prosody_pred": p_prosody_pred,
|
"p_prosody_pred": p_prosody_pred,
|
||||||
"p_prosody_ref": p_prosody_ref,
|
"p_prosody_ref": p_prosody_ref,
|
||||||
"alignments_dp": alignments_dp,
|
"alignments_dp": alignments_dp,
|
||||||
"alignments": alignments, # [B, T_de, T_en]
|
"alignments": alignments.transpose(1, 2), # [B, T_de, T_en]
|
||||||
"aligner_soft": aligner_soft,
|
"aligner_soft": aligner_soft,
|
||||||
"aligner_mas": aligner_mas,
|
"aligner_mas": aligner_mas,
|
||||||
"aligner_durations": aligner_durations,
|
"aligner_durations": aligner_durations,
|
||||||
|
@ -465,7 +432,7 @@ class AcousticModel(torch.nn.Module):
|
||||||
pitch_transform: Callable = None,
|
pitch_transform: Callable = None,
|
||||||
energy_transform: Callable = None,
|
energy_transform: Callable = None,
|
||||||
) -> torch.Tensor:
|
) -> torch.Tensor:
|
||||||
src_mask = get_mask_from_lengths(torch.tensor([tokens.shape[1]], dtype=torch.int64, device=tokens.device))
|
src_mask = ~sequence_mask(torch.tensor([tokens.shape[1]], dtype=torch.int64, device=tokens.device))
|
||||||
src_lens = torch.tensor(tokens.shape[1:2]).to(tokens.device) # pylint: disable=unused-variable
|
src_lens = torch.tensor(tokens.shape[1:2]).to(tokens.device) # pylint: disable=unused-variable
|
||||||
sid, g, lid, _ = self._set_cond_input( # pylint: disable=unused-variable
|
sid, g, lid, _ = self._set_cond_input( # pylint: disable=unused-variable
|
||||||
{"d_vectors": d_vectors, "speaker_ids": speaker_idx}
|
{"d_vectors": d_vectors, "speaker_ids": speaker_idx}
|
||||||
|
@ -532,11 +499,11 @@ class AcousticModel(torch.nn.Module):
|
||||||
duration_pred = torch.round(duration_pred) # -> [B, T_src]
|
duration_pred = torch.round(duration_pred) # -> [B, T_src]
|
||||||
mel_lens = duration_pred.sum(1) # -> [B,]
|
mel_lens = duration_pred.sum(1) # -> [B,]
|
||||||
|
|
||||||
_, encoder_outputs_ex, alignments = self._expand_encoder_with_durations(
|
encoder_outputs_ex, alignments, _ = expand_encoder_outputs(
|
||||||
o_en=encoder_outputs, y_lengths=mel_lens, dr=duration_pred.squeeze(1), x_mask=~src_mask[:, None]
|
encoder_outputs, y_lengths=mel_lens, duration=duration_pred.squeeze(1), x_mask=~src_mask[:, None]
|
||||||
)
|
)
|
||||||
|
|
||||||
mel_mask = get_mask_from_lengths(
|
mel_mask = ~sequence_mask(
|
||||||
torch.tensor([encoder_outputs_ex.shape[2]], dtype=torch.int64, device=encoder_outputs_ex.device)
|
torch.tensor([encoder_outputs_ex.shape[2]], dtype=torch.int64, device=encoder_outputs_ex.device)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -553,7 +520,7 @@ class AcousticModel(torch.nn.Module):
|
||||||
x = self.to_mel(x)
|
x = self.to_mel(x)
|
||||||
outputs = {
|
outputs = {
|
||||||
"model_outputs": x,
|
"model_outputs": x,
|
||||||
"alignments": alignments,
|
"alignments": alignments.transpose(1, 2),
|
||||||
# "pitch": pitch_emb_pred,
|
# "pitch": pitch_emb_pred,
|
||||||
"durations": duration_pred,
|
"durations": duration_pred,
|
||||||
"pitch": pitch_pred,
|
"pitch": pitch_pred,
|
||||||
|
|
|
@ -1,20 +1,14 @@
|
||||||
### credit: https://github.com/dunky11/voicesmith
|
### credit: https://github.com/dunky11/voicesmith
|
||||||
import math
|
import math
|
||||||
from typing import Tuple
|
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
import torch.nn as nn # pylint: disable=consider-using-from-import
|
import torch.nn as nn # pylint: disable=consider-using-from-import
|
||||||
import torch.nn.functional as F
|
import torch.nn.functional as F
|
||||||
|
|
||||||
from TTS.tts.layers.delightful_tts.conv_layers import Conv1dGLU, DepthWiseConv1d, PointwiseConv1d
|
from TTS.tts.layers.delightful_tts.conv_layers import Conv1dGLU, DepthWiseConv1d, PointwiseConv1d, calc_same_padding
|
||||||
from TTS.tts.layers.delightful_tts.networks import GLUActivation
|
from TTS.tts.layers.delightful_tts.networks import GLUActivation
|
||||||
|
|
||||||
|
|
||||||
def calc_same_padding(kernel_size: int) -> Tuple[int, int]:
|
|
||||||
pad = kernel_size // 2
|
|
||||||
return (pad, pad - (kernel_size + 1) % 2)
|
|
||||||
|
|
||||||
|
|
||||||
class Conformer(nn.Module):
|
class Conformer(nn.Module):
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
|
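For reference, `calc_same_padding`, shown above being moved out of `conformer.py` and imported from `conv_layers.py`, returns the left/right padding that keeps a stride-1 Conv1d length-preserving. A quick check of its behaviour, using the same function body as in the diff:

```python
from typing import Tuple


def calc_same_padding(kernel_size: int) -> Tuple[int, int]:
    # Left/right padding that keeps output length == input length for a stride-1 Conv1d.
    pad = kernel_size // 2
    return (pad, pad - (kernel_size + 1) % 2)


assert calc_same_padding(3) == (1, 1)  # odd kernel: symmetric padding
assert calc_same_padding(4) == (2, 1)  # even kernel: one extra position on the left
```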
@ -322,7 +316,7 @@ class ConformerMultiHeadedSelfAttention(nn.Module):
|
||||||
value: torch.Tensor,
|
value: torch.Tensor,
|
||||||
mask: torch.Tensor,
|
mask: torch.Tensor,
|
||||||
encoding: torch.Tensor,
|
encoding: torch.Tensor,
|
||||||
) -> Tuple[torch.Tensor, torch.Tensor]:
|
) -> tuple[torch.Tensor, torch.Tensor]:
|
||||||
batch_size, seq_length, _ = key.size() # pylint: disable=unused-variable
|
batch_size, seq_length, _ = key.size() # pylint: disable=unused-variable
|
||||||
encoding = encoding[:, : key.shape[1]]
|
encoding = encoding[:, : key.shape[1]]
|
||||||
encoding = encoding.repeat(batch_size, 1, 1)
|
encoding = encoding.repeat(batch_size, 1, 1)
|
||||||
|
@ -378,7 +372,7 @@ class RelativeMultiHeadAttention(nn.Module):
|
||||||
value: torch.Tensor,
|
value: torch.Tensor,
|
||||||
pos_embedding: torch.Tensor,
|
pos_embedding: torch.Tensor,
|
||||||
mask: torch.Tensor,
|
mask: torch.Tensor,
|
||||||
) -> Tuple[torch.Tensor, torch.Tensor]:
|
) -> tuple[torch.Tensor, torch.Tensor]:
|
||||||
batch_size = query.shape[0]
|
batch_size = query.shape[0]
|
||||||
query = self.query_proj(query).view(batch_size, -1, self.num_heads, self.d_head)
|
query = self.query_proj(query).view(batch_size, -1, self.num_heads, self.d_head)
|
||||||
key = self.key_proj(key).view(batch_size, -1, self.num_heads, self.d_head).permute(0, 2, 1, 3)
|
key = self.key_proj(key).view(batch_size, -1, self.num_heads, self.d_head).permute(0, 2, 1, 3)
|
||||||
|
@ -411,40 +405,3 @@ class RelativeMultiHeadAttention(nn.Module):
|
||||||
padded_pos_score = padded_pos_score.view(batch_size, num_heads, seq_length2 + 1, seq_length1)
|
padded_pos_score = padded_pos_score.view(batch_size, num_heads, seq_length2 + 1, seq_length1)
|
||||||
pos_score = padded_pos_score[:, :, 1:].view_as(pos_score)
|
pos_score = padded_pos_score[:, :, 1:].view_as(pos_score)
|
||||||
return pos_score
|
return pos_score
|
||||||
|
|
||||||
|
|
||||||
class MultiHeadAttention(nn.Module):
|
|
||||||
"""
|
|
||||||
input:
|
|
||||||
query --- [N, T_q, query_dim]
|
|
||||||
key --- [N, T_k, key_dim]
|
|
||||||
output:
|
|
||||||
out --- [N, T_q, num_units]
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self, query_dim: int, key_dim: int, num_units: int, num_heads: int):
|
|
||||||
super().__init__()
|
|
||||||
self.num_units = num_units
|
|
||||||
self.num_heads = num_heads
|
|
||||||
self.key_dim = key_dim
|
|
||||||
|
|
||||||
self.W_query = nn.Linear(in_features=query_dim, out_features=num_units, bias=False)
|
|
||||||
self.W_key = nn.Linear(in_features=key_dim, out_features=num_units, bias=False)
|
|
||||||
self.W_value = nn.Linear(in_features=key_dim, out_features=num_units, bias=False)
|
|
||||||
|
|
||||||
def forward(self, query: torch.Tensor, key: torch.Tensor) -> torch.Tensor:
|
|
||||||
querys = self.W_query(query) # [N, T_q, num_units]
|
|
||||||
keys = self.W_key(key) # [N, T_k, num_units]
|
|
||||||
values = self.W_value(key)
|
|
||||||
split_size = self.num_units // self.num_heads
|
|
||||||
querys = torch.stack(torch.split(querys, split_size, dim=2), dim=0) # [h, N, T_q, num_units/h]
|
|
||||||
keys = torch.stack(torch.split(keys, split_size, dim=2), dim=0) # [h, N, T_k, num_units/h]
|
|
||||||
values = torch.stack(torch.split(values, split_size, dim=2), dim=0) # [h, N, T_k, num_units/h]
|
|
||||||
# score = softmax(QK^T / (d_k ** 0.5))
|
|
||||||
scores = torch.matmul(querys, keys.transpose(2, 3)) # [h, N, T_q, T_k]
|
|
||||||
scores = scores / (self.key_dim**0.5)
|
|
||||||
scores = F.softmax(scores, dim=3)
|
|
||||||
# out = score * V
|
|
||||||
out = torch.matmul(scores, values) # [h, N, T_q, num_units/h]
|
|
||||||
out = torch.cat(torch.split(out, 1, dim=0), dim=3).squeeze(0) # [N, T_q, num_units]
|
|
||||||
return out
|
|
||||||
|
|
|
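The `MultiHeadAttention` class removed above (it duplicates the one kept in the GST layers) boils down to scaled dot-product attention, `softmax(QK^T / sqrt(d_k)) V`, applied per head. A minimal single-head sketch of that scoring rule; `scaled_dot_attention` is a hypothetical helper, not the class itself:

```python
import torch
import torch.nn.functional as F


def scaled_dot_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor) -> torch.Tensor:
    # score = softmax(Q K^T / sqrt(d_k)); out = score @ V
    scores = torch.matmul(q, k.transpose(-2, -1)) / (k.shape[-1] ** 0.5)
    return torch.matmul(F.softmax(scores, dim=-1), v)


q = torch.randn(2, 5, 16)    # [N, T_q, d]
kv = torch.randn(2, 7, 16)   # [N, T_k, d]
print(scaled_dot_attention(q, kv, kv).shape)  # torch.Size([2, 5, 16])
```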
@ -3,9 +3,6 @@ from typing import Tuple
|
||||||
import torch
|
import torch
|
||||||
import torch.nn as nn # pylint: disable=consider-using-from-import
|
import torch.nn as nn # pylint: disable=consider-using-from-import
|
||||||
import torch.nn.functional as F
|
import torch.nn.functional as F
|
||||||
from torch.nn.utils import parametrize
|
|
||||||
|
|
||||||
from TTS.tts.layers.delightful_tts.kernel_predictor import KernelPredictor
|
|
||||||
|
|
||||||
|
|
||||||
def calc_same_padding(kernel_size: int) -> Tuple[int, int]:
|
def calc_same_padding(kernel_size: int) -> Tuple[int, int]:
|
||||||
|
@ -530,142 +527,3 @@ class CoordConv2d(nn.modules.conv.Conv2d):
|
||||||
x = self.addcoords(x)
|
x = self.addcoords(x)
|
||||||
x = self.conv(x)
|
x = self.conv(x)
|
||||||
return x
|
return x
|
||||||
|
|
||||||
|
|
||||||
class LVCBlock(torch.nn.Module):
|
|
||||||
"""the location-variable convolutions"""
|
|
||||||
|
|
||||||
def __init__( # pylint: disable=dangerous-default-value
|
|
||||||
self,
|
|
||||||
in_channels,
|
|
||||||
cond_channels,
|
|
||||||
stride,
|
|
||||||
dilations=[1, 3, 9, 27],
|
|
||||||
lReLU_slope=0.2,
|
|
||||||
conv_kernel_size=3,
|
|
||||||
cond_hop_length=256,
|
|
||||||
kpnet_hidden_channels=64,
|
|
||||||
kpnet_conv_size=3,
|
|
||||||
kpnet_dropout=0.0,
|
|
||||||
):
|
|
||||||
super().__init__()
|
|
||||||
|
|
||||||
self.cond_hop_length = cond_hop_length
|
|
||||||
self.conv_layers = len(dilations)
|
|
||||||
self.conv_kernel_size = conv_kernel_size
|
|
||||||
|
|
||||||
self.kernel_predictor = KernelPredictor(
|
|
||||||
cond_channels=cond_channels,
|
|
||||||
conv_in_channels=in_channels,
|
|
||||||
conv_out_channels=2 * in_channels,
|
|
||||||
conv_layers=len(dilations),
|
|
||||||
conv_kernel_size=conv_kernel_size,
|
|
||||||
kpnet_hidden_channels=kpnet_hidden_channels,
|
|
||||||
kpnet_conv_size=kpnet_conv_size,
|
|
||||||
kpnet_dropout=kpnet_dropout,
|
|
||||||
kpnet_nonlinear_activation_params={"negative_slope": lReLU_slope},
|
|
||||||
)
|
|
||||||
|
|
||||||
self.convt_pre = nn.Sequential(
|
|
||||||
nn.LeakyReLU(lReLU_slope),
|
|
||||||
nn.utils.parametrizations.weight_norm(
|
|
||||||
nn.ConvTranspose1d(
|
|
||||||
in_channels,
|
|
||||||
in_channels,
|
|
||||||
2 * stride,
|
|
||||||
stride=stride,
|
|
||||||
padding=stride // 2 + stride % 2,
|
|
||||||
output_padding=stride % 2,
|
|
||||||
)
|
|
||||||
),
|
|
||||||
)
|
|
||||||
|
|
||||||
self.conv_blocks = nn.ModuleList()
|
|
||||||
for dilation in dilations:
|
|
||||||
self.conv_blocks.append(
|
|
||||||
nn.Sequential(
|
|
||||||
nn.LeakyReLU(lReLU_slope),
|
|
||||||
nn.utils.parametrizations.weight_norm(
|
|
||||||
nn.Conv1d(
|
|
||||||
in_channels,
|
|
||||||
in_channels,
|
|
||||||
conv_kernel_size,
|
|
||||||
padding=dilation * (conv_kernel_size - 1) // 2,
|
|
||||||
dilation=dilation,
|
|
||||||
)
|
|
||||||
),
|
|
||||||
nn.LeakyReLU(lReLU_slope),
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
def forward(self, x, c):
|
|
||||||
"""forward propagation of the location-variable convolutions.
|
|
||||||
Args:
|
|
||||||
x (Tensor): the input sequence (batch, in_channels, in_length)
|
|
||||||
c (Tensor): the conditioning sequence (batch, cond_channels, cond_length)
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Tensor: the output sequence (batch, in_channels, in_length)
|
|
||||||
"""
|
|
||||||
_, in_channels, _ = x.shape # (B, c_g, L')
|
|
||||||
|
|
||||||
x = self.convt_pre(x) # (B, c_g, stride * L')
|
|
||||||
kernels, bias = self.kernel_predictor(c)
|
|
||||||
|
|
||||||
for i, conv in enumerate(self.conv_blocks):
|
|
||||||
output = conv(x) # (B, c_g, stride * L')
|
|
||||||
|
|
||||||
k = kernels[:, i, :, :, :, :] # (B, 2 * c_g, c_g, kernel_size, cond_length)
|
|
||||||
b = bias[:, i, :, :] # (B, 2 * c_g, cond_length)
|
|
||||||
|
|
||||||
output = self.location_variable_convolution(
|
|
||||||
output, k, b, hop_size=self.cond_hop_length
|
|
||||||
) # (B, 2 * c_g, stride * L'): LVC
|
|
||||||
x = x + torch.sigmoid(output[:, :in_channels, :]) * torch.tanh(
|
|
||||||
output[:, in_channels:, :]
|
|
||||||
) # (B, c_g, stride * L'): GAU
|
|
||||||
|
|
||||||
return x
|
|
||||||
|
|
||||||
def location_variable_convolution(self, x, kernel, bias, dilation=1, hop_size=256): # pylint: disable=no-self-use
|
|
||||||
"""perform location-variable convolution operation on the input sequence (x) using the local convolution kernl.
|
|
||||||
Time: 414 μs ± 309 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each), tested on an NVIDIA V100.
|
|
||||||
Args:
|
|
||||||
x (Tensor): the input sequence (batch, in_channels, in_length).
|
|
||||||
kernel (Tensor): the local convolution kernel (batch, in_channel, out_channels, kernel_size, kernel_length)
|
|
||||||
bias (Tensor): the bias for the local convolution (batch, out_channels, kernel_length)
|
|
||||||
dilation (int): the dilation of convolution.
|
|
||||||
hop_size (int): the hop_size of the conditioning sequence.
|
|
||||||
Returns:
|
|
||||||
(Tensor): the output sequence after performing local convolution. (batch, out_channels, in_length).
|
|
||||||
"""
|
|
||||||
batch, _, in_length = x.shape
|
|
||||||
batch, _, out_channels, kernel_size, kernel_length = kernel.shape
|
|
||||||
assert in_length == (kernel_length * hop_size), "length of (x, kernel) is not matched"
|
|
||||||
|
|
||||||
padding = dilation * int((kernel_size - 1) / 2)
|
|
||||||
x = F.pad(x, (padding, padding), "constant", 0) # (batch, in_channels, in_length + 2*padding)
|
|
||||||
x = x.unfold(2, hop_size + 2 * padding, hop_size) # (batch, in_channels, kernel_length, hop_size + 2*padding)
|
|
||||||
|
|
||||||
if hop_size < dilation:
|
|
||||||
x = F.pad(x, (0, dilation), "constant", 0)
|
|
||||||
x = x.unfold(
|
|
||||||
3, dilation, dilation
|
|
||||||
) # (batch, in_channels, kernel_length, (hop_size + 2*padding)/dilation, dilation)
|
|
||||||
x = x[:, :, :, :, :hop_size]
|
|
||||||
x = x.transpose(3, 4) # (batch, in_channels, kernel_length, dilation, (hop_size + 2*padding)/dilation)
|
|
||||||
x = x.unfold(4, kernel_size, 1) # (batch, in_channels, kernel_length, dilation, _, kernel_size)
|
|
||||||
|
|
||||||
o = torch.einsum("bildsk,biokl->bolsd", x, kernel)
|
|
||||||
o = o.to(memory_format=torch.channels_last_3d)
|
|
||||||
bias = bias.unsqueeze(-1).unsqueeze(-1).to(memory_format=torch.channels_last_3d)
|
|
||||||
o = o + bias
|
|
||||||
o = o.contiguous().view(batch, out_channels, -1)
|
|
||||||
|
|
||||||
return o
|
|
||||||
|
|
||||||
def remove_weight_norm(self):
|
|
||||||
self.kernel_predictor.remove_weight_norm()
|
|
||||||
parametrize.remove_parametrizations(self.convt_pre[1], "weight")
|
|
||||||
for block in self.conv_blocks:
|
|
||||||
parametrize.remove_parametrizations(block[1], "weight")
|
|
||||||
|
|
|
@ -7,14 +7,7 @@ import torch.nn.functional as F
|
||||||
from TTS.tts.layers.delightful_tts.conformer import ConformerMultiHeadedSelfAttention
|
from TTS.tts.layers.delightful_tts.conformer import ConformerMultiHeadedSelfAttention
|
||||||
from TTS.tts.layers.delightful_tts.conv_layers import CoordConv1d
|
from TTS.tts.layers.delightful_tts.conv_layers import CoordConv1d
|
||||||
from TTS.tts.layers.delightful_tts.networks import STL
|
from TTS.tts.layers.delightful_tts.networks import STL
|
||||||
|
from TTS.tts.utils.helpers import sequence_mask
|
||||||
|
|
||||||
def get_mask_from_lengths(lengths: torch.Tensor) -> torch.Tensor:
|
|
||||||
batch_size = lengths.shape[0]
|
|
||||||
max_len = torch.max(lengths).item()
|
|
||||||
ids = torch.arange(0, max_len, device=lengths.device).unsqueeze(0).expand(batch_size, -1)
|
|
||||||
mask = ids >= lengths.unsqueeze(1).expand(-1, max_len)
|
|
||||||
return mask
|
|
||||||
|
|
||||||
|
|
||||||
def stride_lens(lens: torch.Tensor, stride: int = 2) -> torch.Tensor:
|
def stride_lens(lens: torch.Tensor, stride: int = 2) -> torch.Tensor:
|
||||||
|
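As the hunks above and below show, the local `get_mask_from_lengths` (True on padding positions) is replaced by the inverse of the shared `sequence_mask` helper (True on valid positions). A small sketch of the equivalence, with a minimal stand-in assumed to behave like `TTS.tts.utils.helpers.sequence_mask`:

```python
import torch


def sequence_mask(lengths: torch.Tensor, max_len: int = None) -> torch.Tensor:
    # True for valid positions, False for padding (stand-in for the shared helper).
    if max_len is None:
        max_len = int(lengths.max())
    ids = torch.arange(max_len, device=lengths.device)
    return ids.unsqueeze(0) < lengths.unsqueeze(1)


lens = torch.tensor([2, 4])
pad_mask = ~sequence_mask(lens)  # what get_mask_from_lengths() used to return
print(pad_mask)
# tensor([[False, False,  True,  True],
#         [False, False, False, False]])
```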
@ -93,7 +86,7 @@ class ReferenceEncoder(nn.Module):
|
||||||
outputs --- [N, E//2]
|
outputs --- [N, E//2]
|
||||||
"""
|
"""
|
||||||
|
|
||||||
mel_masks = get_mask_from_lengths(mel_lens).unsqueeze(1)
|
mel_masks = ~sequence_mask(mel_lens).unsqueeze(1)
|
||||||
x = x.masked_fill(mel_masks, 0)
|
x = x.masked_fill(mel_masks, 0)
|
||||||
for conv, norm in zip(self.convs, self.norms):
|
for conv, norm in zip(self.convs, self.norms):
|
||||||
x = conv(x)
|
x = conv(x)
|
||||||
|
@ -103,7 +96,7 @@ class ReferenceEncoder(nn.Module):
|
||||||
for _ in range(2):
|
for _ in range(2):
|
||||||
mel_lens = stride_lens(mel_lens)
|
mel_lens = stride_lens(mel_lens)
|
||||||
|
|
||||||
mel_masks = get_mask_from_lengths(mel_lens)
|
mel_masks = ~sequence_mask(mel_lens)
|
||||||
|
|
||||||
x = x.masked_fill(mel_masks.unsqueeze(1), 0)
|
x = x.masked_fill(mel_masks.unsqueeze(1), 0)
|
||||||
x = x.permute((0, 2, 1))
|
x = x.permute((0, 2, 1))
|
||||||
|
|
|
@ -1,128 +0,0 @@
|
||||||
import torch.nn as nn # pylint: disable=consider-using-from-import
|
|
||||||
from torch.nn.utils import parametrize
|
|
||||||
|
|
||||||
|
|
||||||
class KernelPredictor(nn.Module):
|
|
||||||
"""Kernel predictor for the location-variable convolutions
|
|
||||||
|
|
||||||
Args:
|
|
||||||
cond_channels (int): number of channels for the conditioning sequence,
|
|
||||||
conv_in_channels (int): number of channels for the input sequence,
|
|
||||||
conv_out_channels (int): number of channels for the output sequence,
|
|
||||||
conv_layers (int): number of layers
|
|
||||||
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__( # pylint: disable=dangerous-default-value
|
|
||||||
self,
|
|
||||||
cond_channels,
|
|
||||||
conv_in_channels,
|
|
||||||
conv_out_channels,
|
|
||||||
conv_layers,
|
|
||||||
conv_kernel_size=3,
|
|
||||||
kpnet_hidden_channels=64,
|
|
||||||
kpnet_conv_size=3,
|
|
||||||
kpnet_dropout=0.0,
|
|
||||||
kpnet_nonlinear_activation="LeakyReLU",
|
|
||||||
kpnet_nonlinear_activation_params={"negative_slope": 0.1},
|
|
||||||
):
|
|
||||||
super().__init__()
|
|
||||||
|
|
||||||
self.conv_in_channels = conv_in_channels
|
|
||||||
self.conv_out_channels = conv_out_channels
|
|
||||||
self.conv_kernel_size = conv_kernel_size
|
|
||||||
self.conv_layers = conv_layers
|
|
||||||
|
|
||||||
kpnet_kernel_channels = conv_in_channels * conv_out_channels * conv_kernel_size * conv_layers # l_w
|
|
||||||
kpnet_bias_channels = conv_out_channels * conv_layers # l_b
|
|
||||||
|
|
||||||
self.input_conv = nn.Sequential(
|
|
||||||
nn.utils.parametrizations.weight_norm(
|
|
||||||
nn.Conv1d(cond_channels, kpnet_hidden_channels, 5, padding=2, bias=True)
|
|
||||||
),
|
|
||||||
getattr(nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params),
|
|
||||||
)
|
|
||||||
|
|
||||||
self.residual_convs = nn.ModuleList()
|
|
||||||
padding = (kpnet_conv_size - 1) // 2
|
|
||||||
for _ in range(3):
|
|
||||||
self.residual_convs.append(
|
|
||||||
nn.Sequential(
|
|
||||||
nn.Dropout(kpnet_dropout),
|
|
||||||
nn.utils.parametrizations.weight_norm(
|
|
||||||
nn.Conv1d(
|
|
||||||
kpnet_hidden_channels,
|
|
||||||
kpnet_hidden_channels,
|
|
||||||
kpnet_conv_size,
|
|
||||||
padding=padding,
|
|
||||||
bias=True,
|
|
||||||
)
|
|
||||||
),
|
|
||||||
getattr(nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params),
|
|
||||||
nn.utils.parametrizations.weight_norm(
|
|
||||||
nn.Conv1d(
|
|
||||||
kpnet_hidden_channels,
|
|
||||||
kpnet_hidden_channels,
|
|
||||||
kpnet_conv_size,
|
|
||||||
padding=padding,
|
|
||||||
bias=True,
|
|
||||||
)
|
|
||||||
),
|
|
||||||
getattr(nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params),
|
|
||||||
)
|
|
||||||
)
|
|
||||||
self.kernel_conv = nn.utils.parametrizations.weight_norm(
|
|
||||||
nn.Conv1d(
|
|
||||||
kpnet_hidden_channels,
|
|
||||||
kpnet_kernel_channels,
|
|
||||||
kpnet_conv_size,
|
|
||||||
padding=padding,
|
|
||||||
bias=True,
|
|
||||||
)
|
|
||||||
)
|
|
||||||
self.bias_conv = nn.utils.parametrizations.weight_norm(
|
|
||||||
nn.Conv1d(
|
|
||||||
kpnet_hidden_channels,
|
|
||||||
kpnet_bias_channels,
|
|
||||||
kpnet_conv_size,
|
|
||||||
padding=padding,
|
|
||||||
bias=True,
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
def forward(self, c):
|
|
||||||
"""
|
|
||||||
Args:
|
|
||||||
c (Tensor): the conditioning sequence (batch, cond_channels, cond_length)
|
|
||||||
"""
|
|
||||||
batch, _, cond_length = c.shape
|
|
||||||
c = self.input_conv(c)
|
|
||||||
for residual_conv in self.residual_convs:
|
|
||||||
residual_conv.to(c.device)
|
|
||||||
c = c + residual_conv(c)
|
|
||||||
k = self.kernel_conv(c)
|
|
||||||
b = self.bias_conv(c)
|
|
||||||
kernels = k.contiguous().view(
|
|
||||||
batch,
|
|
||||||
self.conv_layers,
|
|
||||||
self.conv_in_channels,
|
|
||||||
self.conv_out_channels,
|
|
||||||
self.conv_kernel_size,
|
|
||||||
cond_length,
|
|
||||||
)
|
|
||||||
bias = b.contiguous().view(
|
|
||||||
batch,
|
|
||||||
self.conv_layers,
|
|
||||||
self.conv_out_channels,
|
|
||||||
cond_length,
|
|
||||||
)
|
|
||||||
|
|
||||||
return kernels, bias
|
|
||||||
|
|
||||||
def remove_weight_norm(self):
|
|
||||||
parametrize.remove_parametrizations(self.input_conv[0], "weight")
|
|
||||||
parametrize.remove_parametrizations(self.kernel_conv, "weight")
|
|
||||||
parametrize.remove_parametrizations(self.bias_conv, "weight")
|
|
||||||
for block in self.residual_convs:
|
|
||||||
parametrize.remove_parametrizations(block[1], "weight")
|
|
||||||
parametrize.remove_parametrizations(block[3], "weight")
|
|
|
@ -1,5 +1,4 @@
|
||||||
import torch
|
import torch
|
||||||
from packaging.version import Version
|
|
||||||
from torch import nn
|
from torch import nn
|
||||||
from torch.nn import functional as F
|
from torch.nn import functional as F
|
||||||
|
|
||||||
|
@ -90,9 +89,6 @@ class InvConvNear(nn.Module):
|
||||||
self.no_jacobian = no_jacobian
|
self.no_jacobian = no_jacobian
|
||||||
self.weight_inv = None
|
self.weight_inv = None
|
||||||
|
|
||||||
if Version(torch.__version__) < Version("1.9"):
|
|
||||||
w_init = torch.qr(torch.FloatTensor(self.num_splits, self.num_splits).normal_())[0]
|
|
||||||
else:
|
|
||||||
w_init = torch.linalg.qr(torch.FloatTensor(self.num_splits, self.num_splits).normal_(), "complete")[0]
|
w_init = torch.linalg.qr(torch.FloatTensor(self.num_splits, self.num_splits).normal_(), "complete")[0]
|
||||||
|
|
||||||
if torch.det(w_init) < 0:
|
if torch.det(w_init) < 0:
|
||||||
|
|
|
@ -5,6 +5,7 @@ from torch import nn
|
||||||
from torch.nn import functional as F
|
from torch.nn import functional as F
|
||||||
|
|
||||||
from TTS.tts.layers.generic.normalization import LayerNorm, LayerNorm2
|
from TTS.tts.layers.generic.normalization import LayerNorm, LayerNorm2
|
||||||
|
from TTS.tts.utils.helpers import convert_pad_shape
|
||||||
|
|
||||||
|
|
||||||
class RelativePositionMultiHeadAttention(nn.Module):
|
class RelativePositionMultiHeadAttention(nn.Module):
|
||||||
|
@ -300,7 +301,7 @@ class FeedForwardNetwork(nn.Module):
|
||||||
pad_l = self.kernel_size - 1
|
pad_l = self.kernel_size - 1
|
||||||
pad_r = 0
|
pad_r = 0
|
||||||
padding = [[0, 0], [0, 0], [pad_l, pad_r]]
|
padding = [[0, 0], [0, 0], [pad_l, pad_r]]
|
||||||
x = F.pad(x, self._pad_shape(padding))
|
x = F.pad(x, convert_pad_shape(padding))
|
||||||
return x
|
return x
|
||||||
|
|
||||||
def _same_padding(self, x):
|
def _same_padding(self, x):
|
||||||
|
@ -309,15 +310,9 @@ class FeedForwardNetwork(nn.Module):
|
||||||
pad_l = (self.kernel_size - 1) // 2
|
pad_l = (self.kernel_size - 1) // 2
|
||||||
pad_r = self.kernel_size // 2
|
pad_r = self.kernel_size // 2
|
||||||
padding = [[0, 0], [0, 0], [pad_l, pad_r]]
|
padding = [[0, 0], [0, 0], [pad_l, pad_r]]
|
||||||
x = F.pad(x, self._pad_shape(padding))
|
x = F.pad(x, convert_pad_shape(padding))
|
||||||
return x
|
return x
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _pad_shape(padding):
|
|
||||||
l = padding[::-1]
|
|
||||||
pad_shape = [item for sublist in l for item in sublist]
|
|
||||||
return pad_shape
|
|
||||||
|
|
||||||
|
|
||||||
class RelativePositionTransformer(nn.Module):
|
class RelativePositionTransformer(nn.Module):
|
||||||
"""Transformer with Relative Potional Encoding.
|
"""Transformer with Relative Potional Encoding.
|
||||||
|
|
|
@ -1,3 +1,4 @@
|
||||||
|
import logging
|
||||||
import math
|
import math
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
@ -10,6 +11,8 @@ from TTS.tts.utils.helpers import sequence_mask
|
||||||
from TTS.tts.utils.ssim import SSIMLoss as _SSIMLoss
|
from TTS.tts.utils.ssim import SSIMLoss as _SSIMLoss
|
||||||
from TTS.utils.audio.torch_transforms import TorchSTFT
|
from TTS.utils.audio.torch_transforms import TorchSTFT
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
# pylint: disable=abstract-method
|
# pylint: disable=abstract-method
|
||||||
# relates https://github.com/pytorch/pytorch/issues/42305
|
# relates https://github.com/pytorch/pytorch/issues/42305
|
||||||
|
@ -132,11 +135,11 @@ class SSIMLoss(torch.nn.Module):
|
||||||
ssim_loss = self.loss_func((y_norm * mask).unsqueeze(1), (y_hat_norm * mask).unsqueeze(1))
|
ssim_loss = self.loss_func((y_norm * mask).unsqueeze(1), (y_hat_norm * mask).unsqueeze(1))
|
||||||
|
|
||||||
if ssim_loss.item() > 1.0:
|
if ssim_loss.item() > 1.0:
|
||||||
print(f" > SSIM loss is out-of-range {ssim_loss.item()}, setting it 1.0")
|
logger.info("SSIM loss is out-of-range (%.2f), setting it to 1.0", ssim_loss.item())
|
||||||
ssim_loss = torch.tensor(1.0, device=ssim_loss.device)
|
ssim_loss = torch.tensor(1.0, device=ssim_loss.device)
|
||||||
|
|
||||||
if ssim_loss.item() < 0.0:
|
if ssim_loss.item() < 0.0:
|
||||||
print(f" > SSIM loss is out-of-range {ssim_loss.item()}, setting it 0.0")
|
logger.info("SSIM loss is out-of-range (%.2f), setting it to 0.0", ssim_loss.item())
|
||||||
ssim_loss = torch.tensor(0.0, device=ssim_loss.device)
|
ssim_loss = torch.tensor(0.0, device=ssim_loss.device)
|
||||||
|
|
||||||
return ssim_loss
|
return ssim_loss
|
||||||
|
@ -252,7 +255,7 @@ class GuidedAttentionLoss(torch.nn.Module):
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _make_ga_mask(ilen, olen, sigma):
|
def _make_ga_mask(ilen, olen, sigma):
|
||||||
grid_x, grid_y = torch.meshgrid(torch.arange(olen).to(olen), torch.arange(ilen).to(ilen))
|
grid_x, grid_y = torch.meshgrid(torch.arange(olen).to(olen), torch.arange(ilen).to(ilen), indexing="ij")
|
||||||
grid_x, grid_y = grid_x.float(), grid_y.float()
|
grid_x, grid_y = grid_x.float(), grid_y.float()
|
||||||
return 1.0 - torch.exp(-((grid_y / ilen - grid_x / olen) ** 2) / (2 * (sigma**2)))
|
return 1.0 - torch.exp(-((grid_y / ilen - grid_x / olen) ** 2) / (2 * (sigma**2)))
|
||||||
|
|
||||||
|
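The guided-attention mask built by `_make_ga_mask` above penalises attention far from the input/output diagonal. A quick numeric check of the formula, with small example values chosen purely for illustration:

```python
import torch

ilen, olen, sigma = 4, 4, 0.4
grid_x, grid_y = torch.meshgrid(torch.arange(olen), torch.arange(ilen), indexing="ij")
mask = 1.0 - torch.exp(-((grid_y / ilen - grid_x / olen) ** 2) / (2 * sigma**2))
print(mask.diagonal())  # all zeros: attention on the diagonal is not penalised
```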
@ -306,6 +309,24 @@ class ForwardSumLoss(nn.Module):
|
||||||
return total_loss
|
return total_loss
|
||||||
|
|
||||||
|
|
||||||
|
class NLLLoss(nn.Module):
|
||||||
|
"""Negative log likelihood loss."""
|
||||||
|
|
||||||
|
def forward(self, log_prob: torch.Tensor) -> dict: # pylint: disable=no-self-use
|
||||||
|
"""Compute the loss.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
logits (Tensor): [B, T, D]
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Tensor: [1]
|
||||||
|
|
||||||
|
"""
|
||||||
|
return_dict = {}
|
||||||
|
return_dict["loss"] = -log_prob.mean()
|
||||||
|
return return_dict
|
||||||
|
|
||||||
|
|
||||||
########################
|
########################
|
||||||
# MODEL LOSS LAYERS
|
# MODEL LOSS LAYERS
|
||||||
########################
|
########################
|
||||||
|
@ -616,6 +637,28 @@ class AlignTTSLoss(nn.Module):
|
||||||
return {"loss": loss, "loss_l1": spec_loss, "loss_ssim": ssim_loss, "loss_dur": dur_loss, "mdn_loss": mdn_loss}
|
return {"loss": loss, "loss_l1": spec_loss, "loss_ssim": ssim_loss, "loss_dur": dur_loss, "mdn_loss": mdn_loss}
|
||||||
|
|
||||||
|
|
||||||
|
def feature_loss(feats_real, feats_generated):
|
||||||
|
loss = 0
|
||||||
|
for dr, dg in zip(feats_real, feats_generated):
|
||||||
|
for rl, gl in zip(dr, dg):
|
||||||
|
rl = rl.float().detach()
|
||||||
|
gl = gl.float()
|
||||||
|
loss += torch.mean(torch.abs(rl - gl))
|
||||||
|
return loss * 2
|
||||||
|
|
||||||
|
|
||||||
|
def generator_loss(scores_fake):
|
||||||
|
loss = 0
|
||||||
|
gen_losses = []
|
||||||
|
for dg in scores_fake:
|
||||||
|
dg = dg.float()
|
||||||
|
l = torch.mean((1 - dg) ** 2)
|
||||||
|
gen_losses.append(l)
|
||||||
|
loss += l
|
||||||
|
|
||||||
|
return loss, gen_losses
|
||||||
|
|
||||||
|
|
||||||
class VitsGeneratorLoss(nn.Module):
|
class VitsGeneratorLoss(nn.Module):
|
||||||
def __init__(self, c: Coqpit):
|
def __init__(self, c: Coqpit):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
|
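The module-level `generator_loss` added above is the least-squares GAN objective, pushing each discriminator's fake scores toward 1. A small sanity check with the same function body and made-up score tensors:

```python
import torch


def generator_loss(scores_fake):
    # LSGAN generator objective: mean (1 - D(G(z)))^2, summed over discriminators.
    loss = 0
    gen_losses = []
    for dg in scores_fake:
        l = torch.mean((1 - dg.float()) ** 2)
        gen_losses.append(l)
        loss += l
    return loss, gen_losses


# scores from two discriminators: all-zero (not fooled) and all-one (fooled)
scores = [torch.zeros(2, 10), torch.ones(2, 10)]
total, per_disc = generator_loss(scores)
print(total.item(), [l.item() for l in per_disc])  # 1.0 [1.0, 0.0]
```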
@ -637,28 +680,6 @@ class VitsGeneratorLoss(nn.Module):
|
||||||
do_amp_to_db=True,
|
do_amp_to_db=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def feature_loss(feats_real, feats_generated):
|
|
||||||
loss = 0
|
|
||||||
for dr, dg in zip(feats_real, feats_generated):
|
|
||||||
for rl, gl in zip(dr, dg):
|
|
||||||
rl = rl.float().detach()
|
|
||||||
gl = gl.float()
|
|
||||||
loss += torch.mean(torch.abs(rl - gl))
|
|
||||||
return loss * 2
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def generator_loss(scores_fake):
|
|
||||||
loss = 0
|
|
||||||
gen_losses = []
|
|
||||||
for dg in scores_fake:
|
|
||||||
dg = dg.float()
|
|
||||||
l = torch.mean((1 - dg) ** 2)
|
|
||||||
gen_losses.append(l)
|
|
||||||
loss += l
|
|
||||||
|
|
||||||
return loss, gen_losses
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def kl_loss(z_p, logs_q, m_p, logs_p, z_mask):
|
def kl_loss(z_p, logs_q, m_p, logs_p, z_mask):
|
||||||
"""
|
"""
|
||||||
|
@ -719,10 +740,8 @@ class VitsGeneratorLoss(nn.Module):
|
||||||
self.kl_loss(z_p=z_p, logs_q=logs_q, m_p=m_p, logs_p=logs_p, z_mask=z_mask.unsqueeze(1))
|
self.kl_loss(z_p=z_p, logs_q=logs_q, m_p=m_p, logs_p=logs_p, z_mask=z_mask.unsqueeze(1))
|
||||||
* self.kl_loss_alpha
|
* self.kl_loss_alpha
|
||||||
)
|
)
|
||||||
loss_feat = (
|
loss_feat = feature_loss(feats_real=feats_disc_real, feats_generated=feats_disc_fake) * self.feat_loss_alpha
|
||||||
self.feature_loss(feats_real=feats_disc_real, feats_generated=feats_disc_fake) * self.feat_loss_alpha
|
loss_gen = generator_loss(scores_fake=scores_disc_fake)[0] * self.gen_loss_alpha
|
||||||
)
|
|
||||||
loss_gen = self.generator_loss(scores_fake=scores_disc_fake)[0] * self.gen_loss_alpha
|
|
||||||
loss_mel = torch.nn.functional.l1_loss(mel_slice, mel_slice_hat) * self.mel_loss_alpha
|
loss_mel = torch.nn.functional.l1_loss(mel_slice, mel_slice_hat) * self.mel_loss_alpha
|
||||||
loss_duration = torch.sum(loss_duration.float()) * self.dur_loss_alpha
|
loss_duration = torch.sum(loss_duration.float()) * self.dur_loss_alpha
|
||||||
loss = loss_kl + loss_feat + loss_mel + loss_gen + loss_duration
|
loss = loss_kl + loss_feat + loss_mel + loss_gen + loss_duration
|
||||||
|
@ -776,6 +795,15 @@ class VitsDiscriminatorLoss(nn.Module):
|
||||||
return return_dict
|
return return_dict
|
||||||
|
|
||||||
|
|
||||||
|
def _binary_alignment_loss(alignment_hard, alignment_soft):
|
||||||
|
"""Binary loss that forces soft alignments to match the hard alignments.
|
||||||
|
|
||||||
|
Explained in `https://arxiv.org/pdf/2108.10447.pdf`.
|
||||||
|
"""
|
||||||
|
log_sum = torch.log(torch.clamp(alignment_soft[alignment_hard == 1], min=1e-12)).sum()
|
||||||
|
return -log_sum / alignment_hard.sum()
|
||||||
|
|
||||||
|
|
||||||
class ForwardTTSLoss(nn.Module):
|
class ForwardTTSLoss(nn.Module):
|
||||||
"""Generic configurable ForwardTTS loss."""
|
"""Generic configurable ForwardTTS loss."""
|
||||||
|
|
||||||
|
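The module-level `_binary_alignment_loss` added above is the negative mean log soft-alignment probability over the cells the hard (MAS) alignment marks as 1. A quick numeric check with toy tensors:

```python
import torch


def binary_alignment_loss(alignment_hard: torch.Tensor, alignment_soft: torch.Tensor) -> torch.Tensor:
    # -mean log p(soft) over the cells selected by the hard alignment.
    log_sum = torch.log(torch.clamp(alignment_soft[alignment_hard == 1], min=1e-12)).sum()
    return -log_sum / alignment_hard.sum()


hard = torch.eye(4).unsqueeze(0)          # a perfect diagonal alignment, [1, 4, 4]
soft = torch.full((1, 4, 4), 0.25)        # uniform soft alignment
print(binary_alignment_loss(hard, soft))  # tensor(1.3863) == -log(0.25)
```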
@ -817,14 +845,6 @@ class ForwardTTSLoss(nn.Module):
|
||||||
self.dur_loss_alpha = c.dur_loss_alpha
|
self.dur_loss_alpha = c.dur_loss_alpha
|
||||||
self.binary_alignment_loss_alpha = c.binary_align_loss_alpha
|
self.binary_alignment_loss_alpha = c.binary_align_loss_alpha
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _binary_alignment_loss(alignment_hard, alignment_soft):
|
|
||||||
"""Binary loss that forces soft alignments to match the hard alignments as
|
|
||||||
explained in `https://arxiv.org/pdf/2108.10447.pdf`.
|
|
||||||
"""
|
|
||||||
log_sum = torch.log(torch.clamp(alignment_soft[alignment_hard == 1], min=1e-12)).sum()
|
|
||||||
return -log_sum / alignment_hard.sum()
|
|
||||||
|
|
||||||
def forward(
|
def forward(
|
||||||
self,
|
self,
|
||||||
decoder_output,
|
decoder_output,
|
||||||
|
@ -876,7 +896,7 @@ class ForwardTTSLoss(nn.Module):
|
||||||
return_dict["loss_aligner"] = self.aligner_loss_alpha * aligner_loss
|
return_dict["loss_aligner"] = self.aligner_loss_alpha * aligner_loss
|
||||||
|
|
||||||
if self.binary_alignment_loss_alpha > 0 and alignment_hard is not None:
|
if self.binary_alignment_loss_alpha > 0 and alignment_hard is not None:
|
||||||
binary_alignment_loss = self._binary_alignment_loss(alignment_hard, alignment_soft)
|
binary_alignment_loss = _binary_alignment_loss(alignment_hard, alignment_soft)
|
||||||
loss = loss + self.binary_alignment_loss_alpha * binary_alignment_loss
|
loss = loss + self.binary_alignment_loss_alpha * binary_alignment_loss
|
||||||
if binary_loss_weight:
|
if binary_loss_weight:
|
||||||
return_dict["loss_binary_alignment"] = (
|
return_dict["loss_binary_alignment"] = (
|
||||||
|
|
|
@ -1,3 +1,4 @@
|
||||||
|
import logging
|
||||||
from typing import List, Tuple
|
from typing import List, Tuple
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
|
@ -8,6 +9,8 @@ from tqdm.auto import tqdm
|
||||||
from TTS.tts.layers.tacotron.common_layers import Linear
|
from TTS.tts.layers.tacotron.common_layers import Linear
|
||||||
from TTS.tts.layers.tacotron.tacotron2 import ConvBNBlock
|
from TTS.tts.layers.tacotron.tacotron2 import ConvBNBlock
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class Encoder(nn.Module):
|
class Encoder(nn.Module):
|
||||||
r"""Neural HMM Encoder
|
r"""Neural HMM Encoder
|
||||||
|
@ -213,8 +216,8 @@ class Outputnet(nn.Module):
|
||||||
original_tensor = std.clone().detach()
|
original_tensor = std.clone().detach()
|
||||||
std = torch.clamp(std, min=self.std_floor)
|
std = torch.clamp(std, min=self.std_floor)
|
||||||
if torch.any(original_tensor != std):
|
if torch.any(original_tensor != std):
|
||||||
print(
|
logger.info(
|
||||||
"[*] Standard deviation was floored! The model is preventing overfitting, nothing serious to worry about"
|
"Standard deviation was floored! The model is preventing overfitting, nothing serious to worry about"
|
||||||
)
|
)
|
||||||
return std
|
return std
|
||||||
|
|
||||||
|
|
|
@ -128,7 +128,8 @@ class NeuralHMM(nn.Module):
|
||||||
# Get mean, std and transition vector from decoder for this timestep
|
# Get mean, std and transition vector from decoder for this timestep
|
||||||
# Note: Gradient checkpointing currently doesn't work with multiple GPUs inside a loop
|
# Note: Gradient checkpointing currently doesn't work with multiple GPUs inside a loop
|
||||||
if self.use_grad_checkpointing and self.training:
|
if self.use_grad_checkpointing and self.training:
|
||||||
mean, std, transition_vector = checkpoint(self.output_net, h_memory, inputs)
|
# TODO: use_reentrant=False is recommended
|
||||||
|
mean, std, transition_vector = checkpoint(self.output_net, h_memory, inputs, use_reentrant=True)
|
||||||
else:
|
else:
|
||||||
mean, std, transition_vector = self.output_net(h_memory, inputs)
|
mean, std, transition_vector = self.output_net(h_memory, inputs)
|
||||||
|
|
||||||
|
|
|
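The `use_reentrant=True` call above keeps the old checkpointing behaviour, while the TODO notes that `use_reentrant=False` is what recent PyTorch versions recommend. A minimal sketch of gradient checkpointing with the non-reentrant variant, using a toy module for illustration only:

```python
import torch
from torch import nn
from torch.utils.checkpoint import checkpoint

# Checkpointing trades compute for memory: activations inside `net` are
# recomputed during the backward pass instead of being stored.
net = nn.Sequential(nn.Linear(8, 8), nn.ReLU(), nn.Linear(8, 8))
x = torch.randn(4, 8, requires_grad=True)
y = checkpoint(net, x, use_reentrant=False)
y.sum().backward()
print(x.grad.shape)  # torch.Size([4, 8])
```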
@ -71,7 +71,7 @@ def plot_transition_probabilities_to_numpy(states, transition_probabilities, out
|
||||||
ax.set_title("Transition probability of state")
|
ax.set_title("Transition probability of state")
|
||||||
ax.set_xlabel("hidden state")
|
ax.set_xlabel("hidden state")
|
||||||
ax.set_ylabel("probability")
|
ax.set_ylabel("probability")
|
||||||
ax.set_xticks([i for i in range(len(transition_probabilities))]) # pylint: disable=unnecessary-comprehension
|
ax.set_xticks(list(range(len(transition_probabilities))))
|
||||||
ax.set_xticklabels([int(x) for x in states], rotation=90)
|
ax.set_xticklabels([int(x) for x in states], rotation=90)
|
||||||
plt.tight_layout()
|
plt.tight_layout()
|
||||||
if not output_fig:
|
if not output_fig:
|
||||||
|
|
|
@ -3,6 +3,8 @@ from torch import nn
|
||||||
from torch.distributions.multivariate_normal import MultivariateNormal as MVN
|
from torch.distributions.multivariate_normal import MultivariateNormal as MVN
|
||||||
from torch.nn import functional as F
|
from torch.nn import functional as F
|
||||||
|
|
||||||
|
from TTS.tts.layers.tacotron.common_layers import calculate_post_conv_height
|
||||||
|
|
||||||
|
|
||||||
class CapacitronVAE(nn.Module):
|
class CapacitronVAE(nn.Module):
|
||||||
"""Effective Use of Variational Embedding Capacity for prosody transfer.
|
"""Effective Use of Variational Embedding Capacity for prosody transfer.
|
||||||
|
@ -97,7 +99,7 @@ class ReferenceEncoder(nn.Module):
|
||||||
self.training = False
|
self.training = False
|
||||||
self.bns = nn.ModuleList([nn.BatchNorm2d(num_features=filter_size) for filter_size in filters[1:]])
|
self.bns = nn.ModuleList([nn.BatchNorm2d(num_features=filter_size) for filter_size in filters[1:]])
|
||||||
|
|
||||||
post_conv_height = self.calculate_post_conv_height(num_mel, 3, 2, 2, num_layers)
|
post_conv_height = calculate_post_conv_height(num_mel, 3, 2, 2, num_layers)
|
||||||
self.recurrence = nn.LSTM(
|
self.recurrence = nn.LSTM(
|
||||||
input_size=filters[-1] * post_conv_height, hidden_size=out_dim, batch_first=True, bidirectional=False
|
input_size=filters[-1] * post_conv_height, hidden_size=out_dim, batch_first=True, bidirectional=False
|
||||||
)
|
)
|
||||||
|
@ -155,13 +157,6 @@ class ReferenceEncoder(nn.Module):
|
||||||
|
|
||||||
return last_output.to(inputs.device) # [B, 128]
|
return last_output.to(inputs.device) # [B, 128]
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def calculate_post_conv_height(height, kernel_size, stride, pad, n_convs):
|
|
||||||
"""Height of spec after n convolutions with fixed kernel/stride/pad."""
|
|
||||||
for _ in range(n_convs):
|
|
||||||
height = (height - kernel_size + 2 * pad) // stride + 1
|
|
||||||
return height
|
|
||||||
|
|
||||||
|
|
||||||
class TextSummary(nn.Module):
|
class TextSummary(nn.Module):
|
||||||
def __init__(self, embedding_dim, encoder_output_dim):
|
def __init__(self, embedding_dim, encoder_output_dim):
|
||||||
|
|
|
@ -3,6 +3,13 @@ from torch import nn
|
||||||
from torch.nn import functional as F
|
from torch.nn import functional as F
|
||||||
|
|
||||||
|
|
||||||
|
def calculate_post_conv_height(height: int, kernel_size: int, stride: int, pad: int, n_convs: int) -> int:
|
||||||
|
"""Height of spec after n convolutions with fixed kernel/stride/pad."""
|
||||||
|
for _ in range(n_convs):
|
||||||
|
height = (height - kernel_size + 2 * pad) // stride + 1
|
||||||
|
return height
|
||||||
|
|
||||||
|
|
||||||
class Linear(nn.Module):
|
class Linear(nn.Module):
|
||||||
"""Linear layer with a specific initialization.
|
"""Linear layer with a specific initialization.
|
||||||
|
|
||||||
|
|
|
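The shared `calculate_post_conv_height` added above applies the standard convolution output-size formula once per layer. For example, assuming 80 mel bins and six conv layers with kernel 3, stride 2, and padding 1 (the settings passed by the GST reference encoder below):

```python
def calculate_post_conv_height(height: int, kernel_size: int, stride: int, pad: int, n_convs: int) -> int:
    # Output height after n convolutions: h -> (h - k + 2p) // s + 1, applied n times.
    for _ in range(n_convs):
        height = (height - kernel_size + 2 * pad) // stride + 1
    return height


print(calculate_post_conv_height(80, 3, 2, 1, 6))  # 2
```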
@ -2,6 +2,8 @@ import torch
|
||||||
import torch.nn.functional as F
|
import torch.nn.functional as F
|
||||||
from torch import nn
|
from torch import nn
|
||||||
|
|
||||||
|
from TTS.tts.layers.tacotron.common_layers import calculate_post_conv_height
|
||||||
|
|
||||||
|
|
||||||
class GST(nn.Module):
|
class GST(nn.Module):
|
||||||
"""Global Style Token Module for factorizing prosody in speech.
|
"""Global Style Token Module for factorizing prosody in speech.
|
||||||
|
@ -44,7 +46,7 @@ class ReferenceEncoder(nn.Module):
|
||||||
self.convs = nn.ModuleList(convs)
|
self.convs = nn.ModuleList(convs)
|
||||||
self.bns = nn.ModuleList([nn.BatchNorm2d(num_features=filter_size) for filter_size in filters[1:]])
|
self.bns = nn.ModuleList([nn.BatchNorm2d(num_features=filter_size) for filter_size in filters[1:]])
|
||||||
|
|
||||||
post_conv_height = self.calculate_post_conv_height(num_mel, 3, 2, 1, num_layers)
|
post_conv_height = calculate_post_conv_height(num_mel, 3, 2, 1, num_layers)
|
||||||
self.recurrence = nn.GRU(
|
self.recurrence = nn.GRU(
|
||||||
input_size=filters[-1] * post_conv_height, hidden_size=embedding_dim // 2, batch_first=True
|
input_size=filters[-1] * post_conv_height, hidden_size=embedding_dim // 2, batch_first=True
|
||||||
)
|
)
|
||||||
|
@ -71,13 +73,6 @@ class ReferenceEncoder(nn.Module):
|
||||||
|
|
||||||
return out.squeeze(0)
|
return out.squeeze(0)
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def calculate_post_conv_height(height, kernel_size, stride, pad, n_convs):
|
|
||||||
"""Height of spec after n convolutions with fixed kernel/stride/pad."""
|
|
||||||
for _ in range(n_convs):
|
|
||||||
height = (height - kernel_size + 2 * pad) // stride + 1
|
|
||||||
return height
|
|
||||||
|
|
||||||
|
|
||||||
class StyleTokenLayer(nn.Module):
|
class StyleTokenLayer(nn.Module):
|
||||||
"""NN Module attending to style tokens based on prosody encodings."""
|
"""NN Module attending to style tokens based on prosody encodings."""
|
||||||
|
@ -117,7 +112,7 @@ class MultiHeadAttention(nn.Module):
|
||||||
out --- [N, T_q, num_units]
|
out --- [N, T_q, num_units]
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, query_dim, key_dim, num_units, num_heads):
|
def __init__(self, query_dim: int, key_dim: int, num_units: int, num_heads: int):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.num_units = num_units
|
self.num_units = num_units
|
||||||
self.num_heads = num_heads
|
self.num_heads = num_heads
|
||||||
|
@ -127,7 +122,7 @@ class MultiHeadAttention(nn.Module):
|
||||||
self.W_key = nn.Linear(in_features=key_dim, out_features=num_units, bias=False)
|
self.W_key = nn.Linear(in_features=key_dim, out_features=num_units, bias=False)
|
||||||
self.W_value = nn.Linear(in_features=key_dim, out_features=num_units, bias=False)
|
self.W_value = nn.Linear(in_features=key_dim, out_features=num_units, bias=False)
|
||||||
|
|
||||||
def forward(self, query, key):
|
def forward(self, query: torch.Tensor, key: torch.Tensor) -> torch.Tensor:
|
||||||
queries = self.W_query(query) # [N, T_q, num_units]
|
queries = self.W_query(query) # [N, T_q, num_units]
|
||||||
keys = self.W_key(key) # [N, T_k, num_units]
|
keys = self.W_key(key) # [N, T_k, num_units]
|
||||||
values = self.W_value(key)
|
values = self.W_value(key)
|
||||||
|
@ -137,13 +132,11 @@ class MultiHeadAttention(nn.Module):
|
||||||
keys = torch.stack(torch.split(keys, split_size, dim=2), dim=0) # [h, N, T_k, num_units/h]
|
keys = torch.stack(torch.split(keys, split_size, dim=2), dim=0) # [h, N, T_k, num_units/h]
|
||||||
values = torch.stack(torch.split(values, split_size, dim=2), dim=0) # [h, N, T_k, num_units/h]
|
values = torch.stack(torch.split(values, split_size, dim=2), dim=0) # [h, N, T_k, num_units/h]
|
||||||
|
|
||||||
# score = softmax(QK^T / (d_k**0.5))
|
# score = softmax(QK^T / (d_k ** 0.5))
|
||||||
scores = torch.matmul(queries, keys.transpose(2, 3)) # [h, N, T_q, T_k]
|
scores = torch.matmul(queries, keys.transpose(2, 3)) # [h, N, T_q, T_k]
|
||||||
scores = scores / (self.key_dim**0.5)
|
scores = scores / (self.key_dim**0.5)
|
||||||
scores = F.softmax(scores, dim=3)
|
scores = F.softmax(scores, dim=3)
|
||||||
|
|
||||||
# out = score * V
|
# out = score * V
|
||||||
out = torch.matmul(scores, values) # [h, N, T_q, num_units/h]
|
out = torch.matmul(scores, values) # [h, N, T_q, num_units/h]
|
||||||
out = torch.cat(torch.split(out, 1, dim=0), dim=3).squeeze(0) # [N, T_q, num_units]
|
return torch.cat(torch.split(out, 1, dim=0), dim=3).squeeze(0) # [N, T_q, num_units]
|
||||||
|
|
||||||
return out
|
|
||||||
|
|
|
@ -1,12 +1,16 @@
|
||||||
# coding: utf-8
|
# coding: utf-8
|
||||||
# adapted from https://github.com/r9y9/tacotron_pytorch
|
# adapted from https://github.com/r9y9/tacotron_pytorch
|
||||||
|
|
||||||
|
import logging
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
from torch import nn
|
from torch import nn
|
||||||
|
|
||||||
from .attentions import init_attn
|
from .attentions import init_attn
|
||||||
from .common_layers import Prenet
|
from .common_layers import Prenet
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class BatchNormConv1d(nn.Module):
|
class BatchNormConv1d(nn.Module):
|
||||||
r"""A wrapper for Conv1d with BatchNorm. It sets the activation
|
r"""A wrapper for Conv1d with BatchNorm. It sets the activation
|
||||||
|
@ -480,7 +484,7 @@ class Decoder(nn.Module):
|
||||||
if t > inputs.shape[1] / 4 and (stop_token > 0.6 or attention[:, -1].item() > 0.6):
|
if t > inputs.shape[1] / 4 and (stop_token > 0.6 or attention[:, -1].item() > 0.6):
|
||||||
break
|
break
|
||||||
if t > self.max_decoder_steps:
|
if t > self.max_decoder_steps:
|
||||||
print(" | > Decoder stopped with 'max_decoder_steps")
|
logger.info("Decoder stopped with `max_decoder_steps` %d", self.max_decoder_steps)
|
||||||
break
|
break
|
||||||
return self._parse_outputs(outputs, attentions, stop_tokens)
|
return self._parse_outputs(outputs, attentions, stop_tokens)
|
||||||
|
|
||||||
|
|
|
@ -1,3 +1,5 @@
|
||||||
|
import logging
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
from torch import nn
|
from torch import nn
|
||||||
from torch.nn import functional as F
|
from torch.nn import functional as F
|
||||||
|
@ -5,6 +7,8 @@ from torch.nn import functional as F
|
||||||
from .attentions import init_attn
|
from .attentions import init_attn
|
||||||
from .common_layers import Linear, Prenet
|
from .common_layers import Linear, Prenet
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
# pylint: disable=no-value-for-parameter
|
# pylint: disable=no-value-for-parameter
|
||||||
# pylint: disable=unexpected-keyword-arg
|
# pylint: disable=unexpected-keyword-arg
|
||||||
|
@ -356,7 +360,7 @@ class Decoder(nn.Module):
|
||||||
if stop_token > self.stop_threshold and t > inputs.shape[0] // 2:
|
if stop_token > self.stop_threshold and t > inputs.shape[0] // 2:
|
||||||
break
|
break
|
||||||
if len(outputs) == self.max_decoder_steps:
|
if len(outputs) == self.max_decoder_steps:
|
||||||
print(f" > Decoder stopped with `max_decoder_steps` {self.max_decoder_steps}")
|
logger.info("Decoder stopped with `max_decoder_steps` %d", self.max_decoder_steps)
|
||||||
break
|
break
|
||||||
|
|
||||||
memory = self._update_memory(decoder_output)
|
memory = self._update_memory(decoder_output)
|
||||||
|
@ -389,7 +393,7 @@ class Decoder(nn.Module):
|
||||||
if stop_token > 0.7:
|
if stop_token > 0.7:
|
||||||
break
|
break
|
||||||
if len(outputs) == self.max_decoder_steps:
|
if len(outputs) == self.max_decoder_steps:
|
||||||
print(" | > Decoder stopped with 'max_decoder_steps")
|
logger.info("Decoder stopped with `max_decoder_steps` %d", self.max_decoder_steps)
|
||||||
break
|
break
|
||||||
|
|
||||||
self.memory_truncated = decoder_output
|
self.memory_truncated = decoder_output
|
||||||
|
|
|
@ -1,6 +1,5 @@
|
||||||
import functools
|
import functools
|
||||||
import math
|
import math
|
||||||
import os
|
|
||||||
|
|
||||||
import fsspec
|
import fsspec
|
||||||
import torch
|
import torch
|
||||||
|
@ -10,6 +9,7 @@ import torchaudio
|
||||||
from transformers import LogitsWarper
|
from transformers import LogitsWarper
|
||||||
|
|
||||||
from TTS.tts.layers.tortoise.xtransformers import ContinuousTransformerWrapper, RelativePositionBias
|
from TTS.tts.layers.tortoise.xtransformers import ContinuousTransformerWrapper, RelativePositionBias
|
||||||
|
from TTS.utils.generic_utils import is_pytorch_at_least_2_4
|
||||||
|
|
||||||
|
|
||||||
def zero_module(module):
|
def zero_module(module):
|
||||||
|
@ -70,11 +70,10 @@ class QKVAttentionLegacy(nn.Module):
|
||||||
weight = rel_pos(weight.reshape(bs, self.n_heads, weight.shape[-2], weight.shape[-1])).reshape(
|
weight = rel_pos(weight.reshape(bs, self.n_heads, weight.shape[-2], weight.shape[-1])).reshape(
|
||||||
bs * self.n_heads, weight.shape[-2], weight.shape[-1]
|
bs * self.n_heads, weight.shape[-2], weight.shape[-1]
|
||||||
)
|
)
|
||||||
weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype)
|
|
||||||
if mask is not None:
|
if mask is not None:
|
||||||
# The proper way to do this is to mask before the softmax using -inf, but that doesn't work properly on CPUs.
|
mask = mask.repeat(self.n_heads, 1, 1)
|
||||||
mask = mask.repeat(self.n_heads, 1).unsqueeze(1)
|
weight[mask.logical_not()] = -torch.inf
|
||||||
weight = weight * mask
|
weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype)
|
||||||
a = torch.einsum("bts,bcs->bct", weight, v)
|
a = torch.einsum("bts,bcs->bct", weight, v)
|
||||||
|
|
||||||
return a.reshape(bs, -1, length)
|
return a.reshape(bs, -1, length)
|
||||||
|
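The change above masks the attention logits with `-inf` before the softmax instead of multiplying the probabilities afterwards, so masked keys receive exactly zero weight while each row still sums to 1. A small sketch of that pattern with hypothetical shapes:

```python
import torch

weight = torch.randn(2, 3, 5)                              # [batch*heads, T_q, T_k] logits
mask = torch.tensor([[[True, True, True, False, False]]])  # True = keep; the last two keys are padding
weight = weight.masked_fill(~mask, float("-inf"))          # mask *before* the softmax
probs = torch.softmax(weight.float(), dim=-1)
print(probs[0, 0])        # zero probability on the two masked keys
print(probs.sum(dim=-1))  # every row still sums to 1
```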
@ -93,12 +92,12 @@ class AttentionBlock(nn.Module):
|
||||||
channels,
|
channels,
|
||||||
num_heads=1,
|
num_heads=1,
|
||||||
num_head_channels=-1,
|
num_head_channels=-1,
|
||||||
do_checkpoint=True,
|
*,
|
||||||
relative_pos_embeddings=False,
|
relative_pos_embeddings=False,
|
||||||
|
tortoise_norm=False,
|
||||||
):
|
):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.channels = channels
|
self.channels = channels
|
||||||
self.do_checkpoint = do_checkpoint
|
|
||||||
if num_head_channels == -1:
|
if num_head_channels == -1:
|
||||||
self.num_heads = num_heads
|
self.num_heads = num_heads
|
||||||
else:
|
else:
|
||||||
|
@ -110,6 +109,7 @@ class AttentionBlock(nn.Module):
|
||||||
self.qkv = nn.Conv1d(channels, channels * 3, 1)
|
self.qkv = nn.Conv1d(channels, channels * 3, 1)
|
||||||
# split heads before split qkv
|
# split heads before split qkv
|
||||||
self.attention = QKVAttentionLegacy(self.num_heads)
|
self.attention = QKVAttentionLegacy(self.num_heads)
|
||||||
|
self.tortoise_norm = tortoise_norm
|
||||||
|
|
||||||
self.proj_out = zero_module(nn.Conv1d(channels, channels, 1))
|
self.proj_out = zero_module(nn.Conv1d(channels, channels, 1))
|
||||||
if relative_pos_embeddings:
|
if relative_pos_embeddings:
|
||||||
|
@ -126,10 +126,13 @@ class AttentionBlock(nn.Module):
|
||||||
def forward(self, x, mask=None):
|
def forward(self, x, mask=None):
|
||||||
b, c, *spatial = x.shape
|
b, c, *spatial = x.shape
|
||||||
x = x.reshape(b, c, -1)
|
x = x.reshape(b, c, -1)
|
||||||
qkv = self.qkv(self.norm(x))
|
x_norm = self.norm(x)
|
||||||
|
qkv = self.qkv(x_norm)
|
||||||
h = self.attention(qkv, mask, self.relative_pos_embeddings)
|
h = self.attention(qkv, mask, self.relative_pos_embeddings)
|
||||||
h = self.proj_out(h)
|
h = self.proj_out(h)
|
||||||
|
if self.tortoise_norm:
|
||||||
return (x + h).reshape(b, c, *spatial)
|
return (x + h).reshape(b, c, *spatial)
|
||||||
|
return (x_norm + h).reshape(b, c, *spatial)
|
||||||
|
|
||||||
|
|
||||||
class Upsample(nn.Module):
|
class Upsample(nn.Module):
|
@@ -185,115 +188,7 @@ class Downsample(nn.Module):
         return self.op(x)


-class ResBlock(nn.Module):
-    def __init__(
-        self,
-        channels,
-        dropout,
-        out_channels=None,
-        use_conv=False,
-        use_scale_shift_norm=False,
-        up=False,
-        down=False,
-        kernel_size=3,
-    ):
-        super().__init__()
-        self.channels = channels
-        self.dropout = dropout
-        self.out_channels = out_channels or channels
-        self.use_conv = use_conv
-        self.use_scale_shift_norm = use_scale_shift_norm
-        padding = 1 if kernel_size == 3 else 2
-
-        self.in_layers = nn.Sequential(
-            normalization(channels),
-            nn.SiLU(),
-            nn.Conv1d(channels, self.out_channels, kernel_size, padding=padding),
-        )
-
-        self.updown = up or down
-
-        if up:
-            self.h_upd = Upsample(channels, False)
-            self.x_upd = Upsample(channels, False)
-        elif down:
-            self.h_upd = Downsample(channels, False)
-            self.x_upd = Downsample(channels, False)
-        else:
-            self.h_upd = self.x_upd = nn.Identity()
-
-        self.out_layers = nn.Sequential(
-            normalization(self.out_channels),
-            nn.SiLU(),
-            nn.Dropout(p=dropout),
-            zero_module(nn.Conv1d(self.out_channels, self.out_channels, kernel_size, padding=padding)),
-        )
-
-        if self.out_channels == channels:
-            self.skip_connection = nn.Identity()
-        elif use_conv:
-            self.skip_connection = nn.Conv1d(channels, self.out_channels, kernel_size, padding=padding)
-        else:
-            self.skip_connection = nn.Conv1d(channels, self.out_channels, 1)
-
-    def forward(self, x):
-        if self.updown:
-            in_rest, in_conv = self.in_layers[:-1], self.in_layers[-1]
-            h = in_rest(x)
-            h = self.h_upd(h)
-            x = self.x_upd(x)
-            h = in_conv(h)
-        else:
-            h = self.in_layers(x)
-        h = self.out_layers(h)
-        return self.skip_connection(x) + h
-
-
-class AudioMiniEncoder(nn.Module):
-    def __init__(
-        self,
-        spec_dim,
-        embedding_dim,
-        base_channels=128,
-        depth=2,
-        resnet_blocks=2,
-        attn_blocks=4,
-        num_attn_heads=4,
-        dropout=0,
-        downsample_factor=2,
-        kernel_size=3,
-    ):
-        super().__init__()
-        self.init = nn.Sequential(nn.Conv1d(spec_dim, base_channels, 3, padding=1))
-        ch = base_channels
-        res = []
-        for l in range(depth):
-            for r in range(resnet_blocks):
-                res.append(ResBlock(ch, dropout, kernel_size=kernel_size))
-            res.append(Downsample(ch, use_conv=True, out_channels=ch * 2, factor=downsample_factor))
-            ch *= 2
-        self.res = nn.Sequential(*res)
-        self.final = nn.Sequential(normalization(ch), nn.SiLU(), nn.Conv1d(ch, embedding_dim, 1))
-        attn = []
-        for a in range(attn_blocks):
-            attn.append(
-                AttentionBlock(
-                    embedding_dim,
-                    num_attn_heads,
-                )
-            )
-        self.attn = nn.Sequential(*attn)
-        self.dim = embedding_dim
-
-    def forward(self, x):
-        h = self.init(x)
-        h = self.res(h)
-        h = self.final(h)
-        h = self.attn(h)
-        return h[:, :, 0]
-
-
-DEFAULT_MEL_NORM_FILE = "https://coqui.gateway.scarf.sh/v0.14.1_models/mel_norms.pth"
+DEFAULT_MEL_NORM_FILE = "https://github.com/coqui-ai/TTS/releases/download/v0.14.1_models/mel_norms.pth"


 class TorchMelSpectrogram(nn.Module):

@@ -333,7 +228,7 @@ class TorchMelSpectrogram(nn.Module):
         self.mel_norm_file = mel_norm_file
         if self.mel_norm_file is not None:
             with fsspec.open(self.mel_norm_file) as f:
-                self.mel_norms = torch.load(f)
+                self.mel_norms = torch.load(f, weights_only=is_pytorch_at_least_2_4())
         else:
             self.mel_norms = None
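The `torch.load(..., weights_only=...)` changes in this diff gate the safer deserialization mode on the PyTorch version, since `weights_only=True` is only reliable on newer releases. The helper's body is not shown here; a rough standalone sketch of what such a version check might look like (the function name comes from the import above, the implementation is an assumption):

```python
import torch
from packaging.version import Version


def is_pytorch_at_least_2_4() -> bool:
    # Assumed implementation: compare the installed torch version against 2.4.
    return Version(torch.__version__) >= Version("2.4")


# Checkpoints can then be loaded with weights_only enabled only where it is supported,
# e.g. torch.load(path, weights_only=is_pytorch_at_least_2_4())
```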
@@ -1,3 +1,4 @@
+import logging
 import os
 from glob import glob
 from typing import Dict, List

@@ -8,7 +9,10 @@ import torch
 import torchaudio
 from scipy.io.wavfile import read

-from TTS.utils.audio.torch_transforms import TorchSTFT
+from TTS.utils.audio.torch_transforms import TorchSTFT, amp_to_db
+from TTS.utils.generic_utils import is_pytorch_at_least_2_4
+
+logger = logging.getLogger(__name__)


 def load_wav_to_torch(full_path):

@@ -28,7 +32,7 @@ def check_audio(audio, audiopath: str):
     # Check some assumptions about audio range. This should be automatically fixed in load_wav_to_torch, but might not be in some edge cases, where we should squawk.
     # '2' is arbitrarily chosen since it seems like audio will often "overdrive" the [-1,1] bounds.
     if torch.any(audio > 2) or not torch.any(audio < 0):
-        print(f"Error with {audiopath}. Max={audio.max()} min={audio.min()}")
+        logger.error("Error with %s. Max=%.2f min=%.2f", audiopath, audio.max(), audio.min())
     audio.clip_(-1, 1)

@@ -84,24 +88,6 @@ def normalize_tacotron_mel(mel):
     return 2 * ((mel - TACOTRON_MEL_MIN) / (TACOTRON_MEL_MAX - TACOTRON_MEL_MIN)) - 1


-def dynamic_range_compression(x, C=1, clip_val=1e-5):
-    """
-    PARAMS
-    ------
-    C: compression factor
-    """
-    return torch.log(torch.clamp(x, min=clip_val) * C)
-
-
-def dynamic_range_decompression(x, C=1):
-    """
-    PARAMS
-    ------
-    C: compression factor used to compress
-    """
-    return torch.exp(x) / C
-
-
 def get_voices(extra_voice_dirs: List[str] = []):
     dirs = extra_voice_dirs
     voices: Dict[str, List[str]] = {}

@@ -121,7 +107,7 @@ def load_voice(voice: str, extra_voice_dirs: List[str] = []):
     voices = get_voices(extra_voice_dirs)
     paths = voices[voice]
     if len(paths) == 1 and paths[0].endswith(".pth"):
-        return None, torch.load(paths[0])
+        return None, torch.load(paths[0], weights_only=is_pytorch_at_least_2_4())
     else:
         conds = []
         for cond_path in paths:

@@ -136,7 +122,7 @@ def load_voices(voices: List[str], extra_voice_dirs: List[str] = []):
     for voice in voices:
         if voice == "random":
             if len(voices) > 1:
-                print("Cannot combine a random voice with a non-random voice. Just using a random voice.")
+                logger.warning("Cannot combine a random voice with a non-random voice. Just using a random voice.")
             return None, None
         clip, latent = load_voice(voice, extra_voice_dirs)
         if latent is None:

@@ -171,7 +157,7 @@ def wav_to_univnet_mel(wav, do_normalization=False, device="cuda"):
     )
     stft = stft.to(device)
     mel = stft(wav)
-    mel = dynamic_range_compression(mel)
+    mel = amp_to_db(mel)
     if do_normalization:
         mel = normalize_tacotron_mel(mel)
     return mel
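`wav_to_univnet_mel` now calls `amp_to_db` from `TTS.utils.audio.torch_transforms` instead of the local `dynamic_range_compression`, which is removed above. Assuming the shared helper applies the same clamped log compression with its default arguments (its body is not part of this diff), the two are interchangeable; a quick standalone check of that assumption:

```python
import torch

def dynamic_range_compression(x, C=1, clip_val=1e-5):
    # The helper removed in this diff.
    return torch.log(torch.clamp(x, min=clip_val) * C)

def amp_to_db_sketch(x, spec_gain=1.0, clip_val=1e-5):
    # Assumed equivalent of amp_to_db with default arguments; not the library's code.
    return torch.log(torch.clamp(x, min=clip_val) * spec_gain)

mel = torch.rand(80, 100)
print(torch.allclose(dynamic_range_compression(mel), amp_to_db_sketch(mel)))  # True under the assumption above
```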
@@ -1,14 +1,23 @@
 # AGPL: a notification must be added stating that changes have been made to that file.
 import functools
+import random
+from typing import Optional

 import torch
 import torch.nn as nn
 import torch.nn.functional as F
+import transformers
+from packaging.version import Version
 from transformers import GPT2Config, GPT2PreTrainedModel, LogitsProcessorList
 from transformers.modeling_outputs import CausalLMOutputWithCrossAttentions

 from TTS.tts.layers.tortoise.arch_utils import AttentionBlock, TypicalLogitsWarper

+if Version(transformers.__version__) >= Version("4.45"):
+    isin = transformers.pytorch_utils.isin_mps_friendly
+else:
+    isin = torch.isin
+

 def null_position_embeddings(range, dim):
     return torch.zeros((range.shape[0], range.shape[1], dim), device=range.device)

@@ -115,7 +124,7 @@ class GPT2InferenceModel(GPT2PreTrainedModel):
         else:
             emb = self.embeddings(input_ids)
             emb = emb + self.text_pos_embedding.get_fixed_embedding(
-                attention_mask.shape[1] - mel_len, attention_mask.device
+                attention_mask.shape[1] - (mel_len + 1), attention_mask.device
             )

         transformer_outputs = self.transformer(
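The module-level `isin` alias picks `transformers.pytorch_utils.isin_mps_friendly` on transformers 4.45 and newer (a drop-in for `torch.isin` that also works on Apple MPS) and falls back to `torch.isin` otherwise; `_prepare_attention_mask_for_generation` further down relies on it. A tiny standalone illustration of the call shape it has to support, using plain `torch.isin` with no transformers dependency:

```python
import torch

inputs = torch.tensor([[5, 7, 7, 9]])
stop_token = torch.tensor(7)

# Keyword form matching how the alias is called below: isin(elements=..., test_elements=...)
hits = torch.isin(elements=inputs, test_elements=stop_token)
print(hits)        # tensor([[False,  True,  True, False]])
print(hits.any())  # tensor(True)
```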
@@ -167,44 +176,56 @@ class ConditioningEncoder(nn.Module):
         embedding_dim,
         attn_blocks=6,
         num_attn_heads=4,
-        do_checkpointing=False,
-        mean=False,
+        *,
+        tortoise_norm=False,
     ):
         super().__init__()
         attn = []
         self.init = nn.Conv1d(spec_dim, embedding_dim, kernel_size=1)
         for a in range(attn_blocks):
-            attn.append(AttentionBlock(embedding_dim, num_attn_heads))
+            attn.append(AttentionBlock(embedding_dim, num_attn_heads, tortoise_norm=tortoise_norm))
         self.attn = nn.Sequential(*attn)
         self.dim = embedding_dim
-        self.do_checkpointing = do_checkpointing
-        self.mean = mean

     def forward(self, x):
+        """
+        x: (b, 80, s)
+        """
         h = self.init(x)
         h = self.attn(h)
-        if self.mean:
-            return h.mean(dim=2)
-        else:
-            return h[:, :, 0]
+        return h


 class LearnedPositionEmbeddings(nn.Module):
-    def __init__(self, seq_len, model_dim, init=0.02):
+    def __init__(self, seq_len, model_dim, init=0.02, relative=False):
         super().__init__()
         self.emb = nn.Embedding(seq_len, model_dim)
         # Initializing this way is standard for GPT-2
         self.emb.weight.data.normal_(mean=0.0, std=init)
+        self.relative = relative
+        self.seq_len = seq_len

     def forward(self, x):
         sl = x.shape[1]
-        return self.emb(torch.arange(0, sl, device=x.device))
+        if self.relative:
+            start = random.randint(sl, self.seq_len) - sl
+            return self.emb(torch.arange(start, start + sl, device=x.device))
+        else:
+            return self.emb(torch.arange(0, sl, device=x.device))

     def get_fixed_embedding(self, ind, dev):
-        return self.emb(torch.arange(0, ind, device=dev))[ind - 1 : ind]
+        return self.emb(torch.tensor([ind], device=dev)).unsqueeze(0)


-def build_hf_gpt_transformer(layers, model_dim, heads, max_mel_seq_len, max_text_seq_len, checkpointing):
+def build_hf_gpt_transformer(
+    layers: int,
+    model_dim: int,
+    heads: int,
+    max_mel_seq_len: int,
+    max_text_seq_len: int,
+    checkpointing: bool,
+    max_prompt_len: int = 0,
+):
     """
     GPT-2 implemented by the HuggingFace library.
     """
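With `relative=True`, `LearnedPositionEmbeddings` no longer always starts at index 0: it samples a random window of length `sl` from the `seq_len` learned positions, so every slot of the embedding table gets gradient signal even on short sequences. A standalone sketch of the sampling (toy sizes, not the repository's class):

```python
import random
import torch
import torch.nn as nn

seq_len, model_dim, sl = 100, 16, 12       # table size, embedding dim, current sequence length
emb = nn.Embedding(seq_len, model_dim)

# relative=True: pick a random contiguous window of sl positions inside [0, seq_len)
start = random.randint(sl, seq_len) - sl   # start lands in [0, seq_len - sl]
relative_pos = emb(torch.arange(start, start + sl))

# relative=False: always the first sl positions
absolute_pos = emb(torch.arange(0, sl))
print(relative_pos.shape, absolute_pos.shape)  # torch.Size([12, 16]) twice
```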
@@ -212,8 +233,8 @@ def build_hf_gpt_transformer(layers, model_dim, heads, max_mel_seq_len, max_text

     gpt_config = GPT2Config(
         vocab_size=256,  # Unused.
-        n_positions=max_mel_seq_len + max_text_seq_len,
-        n_ctx=max_mel_seq_len + max_text_seq_len,
+        n_positions=max_mel_seq_len + max_text_seq_len + max_prompt_len,
+        n_ctx=max_mel_seq_len + max_text_seq_len + max_prompt_len,
         n_embd=model_dim,
         n_layer=layers,
         n_head=heads,

@@ -226,13 +247,18 @@ def build_hf_gpt_transformer(layers, model_dim, heads, max_mel_seq_len, max_text
     gpt.wpe = functools.partial(null_position_embeddings, dim=model_dim)
     # Built-in token embeddings are unused.
     del gpt.wte
-    return (
-        gpt,
-        LearnedPositionEmbeddings(max_mel_seq_len, model_dim),
-        LearnedPositionEmbeddings(max_text_seq_len, model_dim),
-        None,
-        None,
-    )
+    mel_pos_emb = (
+        LearnedPositionEmbeddings(max_mel_seq_len, model_dim)
+        if max_mel_seq_len != -1
+        else functools.partial(null_position_embeddings, dim=model_dim)
+    )
+    text_pos_emb = (
+        LearnedPositionEmbeddings(max_text_seq_len, model_dim)
+        if max_mel_seq_len != -1
+        else functools.partial(null_position_embeddings, dim=model_dim)
+    )
+    return gpt, mel_pos_emb, text_pos_emb, None, None


 class MelEncoder(nn.Module):
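`n_positions` and `n_ctx` now also reserve room for an optional prompt segment, so the transformer's positional budget is the sum of the mel, text, and prompt token limits. A hypothetical sanity check with made-up limits (the numbers are illustrative only, not the model's configuration):

```python
# Hypothetical token budgets, only for illustration.
max_mel_seq_len = 604    # e.g. max_mel_tokens + 2 + max_conditioning_inputs
max_text_seq_len = 402   # e.g. max_text_tokens + 2
max_prompt_len = 70

n_positions = max_mel_seq_len + max_text_seq_len + max_prompt_len
assert n_positions == 1076  # every mel, text, and prompt position gets its own slot
```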
@@ -326,12 +352,12 @@ class UnifiedVoice(nn.Module):
             self.mel_layer_pos_embedding,
             self.text_layer_pos_embedding,
         ) = build_hf_gpt_transformer(
-            layers,
-            model_dim,
-            heads,
-            self.max_mel_tokens + 2 + self.max_conditioning_inputs,
-            self.max_text_tokens + 2,
-            checkpointing,
+            layers=layers,
+            model_dim=model_dim,
+            heads=heads,
+            max_mel_seq_len=self.max_mel_tokens + 2 + self.max_conditioning_inputs,
+            max_text_seq_len=self.max_text_tokens + 2,
+            checkpointing=checkpointing,
         )
         if train_solo_embeddings:
             self.mel_solo_embedding = nn.Parameter(torch.randn(1, 1, model_dim) * 0.02, requires_grad=True)

@@ -447,7 +473,7 @@ class UnifiedVoice(nn.Module):
         )
         conds = []
         for j in range(speech_conditioning_input.shape[1]):
-            conds.append(self.conditioning_encoder(speech_conditioning_input[:, j]))
+            conds.append(self.conditioning_encoder(speech_conditioning_input[:, j])[:, :, 0])
         conds = torch.stack(conds, dim=1)
         conds = conds.mean(dim=1)
         return conds
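Because `ConditioningEncoder.forward` now returns the full `(batch, dim, sequence)` activation instead of pre-slicing the first frame, the call site selects `[:, :, 0]` itself and other callers remain free to pool differently. A standalone sketch of the shape bookkeeping (dummy tensors with hypothetical sizes):

```python
import torch

batch, dim, seq = 2, 1024, 48
h = torch.randn(batch, dim, seq)   # what the encoder now returns for one conditioning clip

first_frame = h[:, :, 0]           # the call site's choice: keep only the first position
print(first_frame.shape)           # torch.Size([2, 1024])

# stacking several conditioning clips and averaging them, as in the loop in the hunk above
conds = torch.stack([first_frame, first_frame], dim=1).mean(dim=1)
print(conds.shape)                 # torch.Size([2, 1024])
```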
@@ -596,6 +622,8 @@ class UnifiedVoice(nn.Module):
         max_length = (
             trunc_index + self.max_mel_tokens - 1 if max_generate_length is None else trunc_index + max_generate_length
         )
+        stop_token_tensor = torch.tensor(self.stop_mel_token, device=inputs.device, dtype=torch.long)
+        attention_mask = _prepare_attention_mask_for_generation(inputs, stop_token_tensor, stop_token_tensor)
         gen = self.inference_model.generate(
             inputs,
             bos_token_id=self.start_mel_token,

@@ -604,11 +632,39 @@ class UnifiedVoice(nn.Module):
             max_length=max_length,
             logits_processor=logits_processor,
             num_return_sequences=num_return_sequences,
+            attention_mask=attention_mask,
             **hf_generate_kwargs,
         )
         return gen[:, trunc_index:]


+def _prepare_attention_mask_for_generation(
+    inputs: torch.Tensor,
+    pad_token_id: Optional[torch.Tensor],
+    eos_token_id: Optional[torch.Tensor],
+) -> torch.LongTensor:
+    # No information for attention mask inference -> return default attention mask
+    default_attention_mask = torch.ones(inputs.shape[:2], dtype=torch.long, device=inputs.device)
+    if pad_token_id is None:
+        return default_attention_mask
+
+    is_input_ids = len(inputs.shape) == 2 and inputs.dtype in [torch.int, torch.long]
+    if not is_input_ids:
+        return default_attention_mask
+
+    is_pad_token_in_inputs = (pad_token_id is not None) and (isin(elements=inputs, test_elements=pad_token_id).any())
+    is_pad_token_not_equal_to_eos_token_id = (eos_token_id is None) or ~(
+        isin(elements=eos_token_id, test_elements=pad_token_id).any()
+    )
+    can_infer_attention_mask = is_pad_token_in_inputs * is_pad_token_not_equal_to_eos_token_id
+    attention_mask_from_padding = inputs.ne(pad_token_id).long()
+
+    attention_mask = (
+        attention_mask_from_padding * can_infer_attention_mask + default_attention_mask * ~can_infer_attention_mask
+    )
+    return attention_mask
+
+
 if __name__ == "__main__":
     gpt = UnifiedVoice(
         model_dim=256,
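The added `_prepare_attention_mask_for_generation` mirrors the helper of the same name used inside Hugging Face `generate`: when the inputs are 2-D integer ids and the pad token actually occurs in them (and is distinct from EOS), the mask is inferred from padding; otherwise an all-ones mask is returned. A small standalone check of that behaviour, using `torch.isin` in place of the module-level `isin` alias:

```python
import torch

def prepare_mask(inputs, pad_token_id):
    # Condensed version of the logic above, assuming 2-D integer ids and pad != eos.
    default = torch.ones(inputs.shape[:2], dtype=torch.long, device=inputs.device)
    if pad_token_id is None or not torch.isin(inputs, pad_token_id).any():
        return default
    return inputs.ne(pad_token_id).long()

inputs = torch.tensor([[11, 12, 13, 8193, 8193],
                       [11, 12, 13, 14, 15]])
stop_mel_token = torch.tensor(8193)  # hypothetical stop/pad id, for illustration only
print(prepare_mask(inputs, stop_mel_token))
# tensor([[1, 1, 1, 0, 0],
#         [1, 1, 1, 1, 1]])
```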
@@ -16,7 +16,6 @@ class ResBlock(nn.Module):
         up=False,
         down=False,
         kernel_size=3,
-        do_checkpoint=True,
     ):
         super().__init__()
         self.channels = channels

@@ -24,7 +23,6 @@ class ResBlock(nn.Module):
         self.out_channels = out_channels or channels
         self.use_conv = use_conv
         self.use_scale_shift_norm = use_scale_shift_norm
-        self.do_checkpoint = do_checkpoint
         padding = 1 if kernel_size == 3 else 2

         self.in_layers = nn.Sequential(

@@ -92,14 +90,14 @@ class AudioMiniEncoder(nn.Module):
         self.layers = depth
         for l in range(depth):
             for r in range(resnet_blocks):
-                res.append(ResBlock(ch, dropout, do_checkpoint=False, kernel_size=kernel_size))
+                res.append(ResBlock(ch, dropout, kernel_size=kernel_size))
             res.append(Downsample(ch, use_conv=True, out_channels=ch * 2, factor=downsample_factor))
             ch *= 2
         self.res = nn.Sequential(*res)
         self.final = nn.Sequential(normalization(ch), nn.SiLU(), nn.Conv1d(ch, embedding_dim, 1))
         attn = []
         for a in range(attn_blocks):
-            attn.append(AttentionBlock(embedding_dim, num_attn_heads, do_checkpoint=False))
+            attn.append(AttentionBlock(embedding_dim, num_attn_heads, tortoise_norm=True))
         self.attn = nn.Sequential(*attn)
         self.dim = embedding_dim
Some files were not shown because too many files have changed in this diff.