mirror of https://github.com/coqui-ai/TTS.git
Merge branch 'dev' of github.com:idiap/coqui-ai-TTS into fix/macos-stream-generator
This commit is contained in:
commit
61ec4322d4
|
@ -1,5 +0,0 @@
|
||||||
linters:
|
|
||||||
- pylint:
|
|
||||||
# pylintrc: pylintrc
|
|
||||||
filefilter: ['- test_*.py', '+ *.py', '- *.npy']
|
|
||||||
# exclude:
|
|
|
@ -6,4 +6,4 @@ TTS.egg-info/
|
||||||
tests/outputs/*
|
tests/outputs/*
|
||||||
tests/train_outputs/*
|
tests/train_outputs/*
|
||||||
__pycache__/
|
__pycache__/
|
||||||
*.pyc
|
*.pyc
|
||||||
|
|
|
@ -59,7 +59,7 @@ body:
|
||||||
You can either run `TTS/bin/collect_env_info.py`
|
You can either run `TTS/bin/collect_env_info.py`
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
wget https://raw.githubusercontent.com/coqui-ai/TTS/main/TTS/bin/collect_env_info.py
|
wget https://raw.githubusercontent.com/idiap/coqui-ai-TTS/main/TTS/bin/collect_env_info.py
|
||||||
python collect_env_info.py
|
python collect_env_info.py
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
|
@ -1,8 +1,8 @@
|
||||||
blank_issues_enabled: false
|
blank_issues_enabled: false
|
||||||
contact_links:
|
contact_links:
|
||||||
- name: CoquiTTS GitHub Discussions
|
- name: CoquiTTS GitHub Discussions
|
||||||
url: https://github.com/coqui-ai/TTS/discussions
|
url: https://github.com/idiap/coqui-ai-TTS/discussions
|
||||||
about: Please ask and answer questions here.
|
about: Please ask and answer questions here.
|
||||||
- name: Coqui Security issue disclosure
|
- name: Coqui Security issue disclosure
|
||||||
url: mailto:info@coqui.ai
|
url: mailto:enno.hermann@gmail.com
|
||||||
about: Please report security vulnerabilities here.
|
about: Please report security vulnerabilities here.
|
||||||
|
|
|
@ -5,11 +5,3 @@ Welcome to the 🐸TTS project! We are excited to see your interest, and appreci
|
||||||
This repository is governed by the Contributor Covenant Code of Conduct. For more details, see the [CODE_OF_CONDUCT.md](CODE_OF_CONDUCT.md) file.
|
This repository is governed by the Contributor Covenant Code of Conduct. For more details, see the [CODE_OF_CONDUCT.md](CODE_OF_CONDUCT.md) file.
|
||||||
|
|
||||||
In order to make a good pull request, please see our [CONTRIBUTING.md](CONTRIBUTING.md) file.
|
In order to make a good pull request, please see our [CONTRIBUTING.md](CONTRIBUTING.md) file.
|
||||||
|
|
||||||
Before accepting your pull request, you will be asked to sign a [Contributor License Agreement](https://cla-assistant.io/coqui-ai/TTS).
|
|
||||||
|
|
||||||
This [Contributor License Agreement](https://cla-assistant.io/coqui-ai/TTS):
|
|
||||||
|
|
||||||
- Protects you, Coqui, and the users of the code.
|
|
||||||
- Does not change your rights to use your contributions for any purpose.
|
|
||||||
- Does not change the license of the 🐸TTS project. It just makes the terms of your contribution clearer and lets us know you are OK to contribute.
|
|
||||||
|
|
|
@ -15,4 +15,3 @@ markComment: >
|
||||||
for your contributions. You might also look our discussion channels.
|
for your contributions. You might also look our discussion channels.
|
||||||
# Comment to post when closing a stale issue. Set to `false` to disable
|
# Comment to post when closing a stale issue. Set to `false` to disable
|
||||||
closeComment: false
|
closeComment: false
|
||||||
|
|
||||||
|
|
|
@ -1,51 +0,0 @@
|
||||||
name: aux-tests
|
|
||||||
|
|
||||||
on:
|
|
||||||
push:
|
|
||||||
branches:
|
|
||||||
- main
|
|
||||||
pull_request:
|
|
||||||
types: [opened, synchronize, reopened]
|
|
||||||
jobs:
|
|
||||||
check_skip:
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
if: "! contains(github.event.head_commit.message, '[ci skip]')"
|
|
||||||
steps:
|
|
||||||
- run: echo "${{ github.event.head_commit.message }}"
|
|
||||||
|
|
||||||
test:
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
strategy:
|
|
||||||
fail-fast: false
|
|
||||||
matrix:
|
|
||||||
python-version: [3.9, "3.10", "3.11"]
|
|
||||||
experimental: [false]
|
|
||||||
steps:
|
|
||||||
- uses: actions/checkout@v3
|
|
||||||
- name: Set up Python ${{ matrix.python-version }}
|
|
||||||
uses: actions/setup-python@v4
|
|
||||||
with:
|
|
||||||
python-version: ${{ matrix.python-version }}
|
|
||||||
architecture: x64
|
|
||||||
cache: 'pip'
|
|
||||||
cache-dependency-path: 'requirements*'
|
|
||||||
- name: check OS
|
|
||||||
run: cat /etc/os-release
|
|
||||||
- name: set ENV
|
|
||||||
run: export TRAINER_TELEMETRY=0
|
|
||||||
- name: Install dependencies
|
|
||||||
run: |
|
|
||||||
sudo apt-get update
|
|
||||||
sudo apt-get install -y git make gcc
|
|
||||||
make system-deps
|
|
||||||
- name: Install/upgrade Python setup deps
|
|
||||||
run: python3 -m pip install --upgrade pip setuptools wheel
|
|
||||||
- name: Replace scarf urls
|
|
||||||
run: |
|
|
||||||
sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json
|
|
||||||
- name: Install TTS
|
|
||||||
run: |
|
|
||||||
python3 -m pip install .[all]
|
|
||||||
python3 setup.py egg_info
|
|
||||||
- name: Unit tests
|
|
||||||
run: make test_aux
|
|
|
@ -1,51 +0,0 @@
|
||||||
name: data-tests
|
|
||||||
|
|
||||||
on:
|
|
||||||
push:
|
|
||||||
branches:
|
|
||||||
- main
|
|
||||||
pull_request:
|
|
||||||
types: [opened, synchronize, reopened]
|
|
||||||
jobs:
|
|
||||||
check_skip:
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
if: "! contains(github.event.head_commit.message, '[ci skip]')"
|
|
||||||
steps:
|
|
||||||
- run: echo "${{ github.event.head_commit.message }}"
|
|
||||||
|
|
||||||
test:
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
strategy:
|
|
||||||
fail-fast: false
|
|
||||||
matrix:
|
|
||||||
python-version: [3.9, "3.10", "3.11"]
|
|
||||||
experimental: [false]
|
|
||||||
steps:
|
|
||||||
- uses: actions/checkout@v3
|
|
||||||
- name: Set up Python ${{ matrix.python-version }}
|
|
||||||
uses: actions/setup-python@v4
|
|
||||||
with:
|
|
||||||
python-version: ${{ matrix.python-version }}
|
|
||||||
architecture: x64
|
|
||||||
cache: 'pip'
|
|
||||||
cache-dependency-path: 'requirements*'
|
|
||||||
- name: check OS
|
|
||||||
run: cat /etc/os-release
|
|
||||||
- name: set ENV
|
|
||||||
run: export TRAINER_TELEMETRY=0
|
|
||||||
- name: Install dependencies
|
|
||||||
run: |
|
|
||||||
sudo apt-get update
|
|
||||||
sudo apt-get install -y --no-install-recommends git make gcc
|
|
||||||
make system-deps
|
|
||||||
- name: Install/upgrade Python setup deps
|
|
||||||
run: python3 -m pip install --upgrade pip setuptools wheel
|
|
||||||
- name: Replace scarf urls
|
|
||||||
run: |
|
|
||||||
sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json
|
|
||||||
- name: Install TTS
|
|
||||||
run: |
|
|
||||||
python3 -m pip install .[all]
|
|
||||||
python3 setup.py egg_info
|
|
||||||
- name: Unit tests
|
|
||||||
run: make data_tests
|
|
|
@ -10,7 +10,7 @@ on:
|
||||||
jobs:
|
jobs:
|
||||||
docker-build:
|
docker-build:
|
||||||
name: "Build and push Docker image"
|
name: "Build and push Docker image"
|
||||||
runs-on: ubuntu-20.04
|
runs-on: ubuntu-latest
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
arch: ["amd64"]
|
arch: ["amd64"]
|
||||||
|
@ -18,7 +18,7 @@ jobs:
|
||||||
- "nvidia/cuda:11.8.0-base-ubuntu22.04" # GPU enabled
|
- "nvidia/cuda:11.8.0-base-ubuntu22.04" # GPU enabled
|
||||||
- "python:3.10.8-slim" # CPU only
|
- "python:3.10.8-slim" # CPU only
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v2
|
- uses: actions/checkout@v4
|
||||||
- name: Log in to the Container registry
|
- name: Log in to the Container registry
|
||||||
uses: docker/login-action@v1
|
uses: docker/login-action@v1
|
||||||
with:
|
with:
|
||||||
|
@ -29,11 +29,11 @@ jobs:
|
||||||
id: compute-tag
|
id: compute-tag
|
||||||
run: |
|
run: |
|
||||||
set -ex
|
set -ex
|
||||||
base="ghcr.io/coqui-ai/tts"
|
base="ghcr.io/idiap/coqui-tts"
|
||||||
tags="" # PR build
|
tags="" # PR build
|
||||||
|
|
||||||
if [[ ${{ matrix.base }} = "python:3.10.8-slim" ]]; then
|
if [[ ${{ matrix.base }} = "python:3.10.8-slim" ]]; then
|
||||||
base="ghcr.io/coqui-ai/tts-cpu"
|
base="ghcr.io/idiap/coqui-tts-cpu"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [[ "${{ startsWith(github.ref, 'refs/heads/') }}" = "true" ]]; then
|
if [[ "${{ startsWith(github.ref, 'refs/heads/') }}" = "true" ]]; then
|
||||||
|
@ -42,7 +42,7 @@ jobs:
|
||||||
branch=${github_ref#*refs/heads/} # strip prefix to get branch name
|
branch=${github_ref#*refs/heads/} # strip prefix to get branch name
|
||||||
tags="${base}:${branch},${base}:${{ github.sha }},"
|
tags="${base}:${branch},${base}:${{ github.sha }},"
|
||||||
elif [[ "${{ startsWith(github.ref, 'refs/tags/') }}" = "true" ]]; then
|
elif [[ "${{ startsWith(github.ref, 'refs/tags/') }}" = "true" ]]; then
|
||||||
VERSION="v$(cat TTS/VERSION)"
|
VERSION="v$(grep -m 1 version pyproject.toml | grep -P '\d+\.\d+\.\d+' -o)"
|
||||||
if [[ "${{ github.ref }}" != "refs/tags/${VERSION}" ]]; then
|
if [[ "${{ github.ref }}" != "refs/tags/${VERSION}" ]]; then
|
||||||
echo "Pushed tag does not match VERSION file. Aborting push."
|
echo "Pushed tag does not match VERSION file. Aborting push."
|
||||||
exit 1
|
exit 1
|
||||||
|
@ -63,3 +63,58 @@ jobs:
|
||||||
push: ${{ github.event_name == 'push' }}
|
push: ${{ github.event_name == 'push' }}
|
||||||
build-args: "BASE=${{ matrix.base }}"
|
build-args: "BASE=${{ matrix.base }}"
|
||||||
tags: ${{ steps.compute-tag.outputs.tags }}
|
tags: ${{ steps.compute-tag.outputs.tags }}
|
||||||
|
docker-dev-build:
|
||||||
|
name: "Build the development Docker image"
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
strategy:
|
||||||
|
matrix:
|
||||||
|
arch: ["amd64"]
|
||||||
|
base:
|
||||||
|
- "nvidia/cuda:11.8.0-base-ubuntu22.04" # GPU enabled
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v4
|
||||||
|
- name: Log in to the Container registry
|
||||||
|
uses: docker/login-action@v1
|
||||||
|
with:
|
||||||
|
registry: ghcr.io
|
||||||
|
username: ${{ github.actor }}
|
||||||
|
password: ${{ secrets.GITHUB_TOKEN }}
|
||||||
|
- name: Compute Docker tags, check VERSION file matches tag
|
||||||
|
id: compute-tag
|
||||||
|
run: |
|
||||||
|
set -ex
|
||||||
|
base="ghcr.io/idiap/coqui-tts-dev"
|
||||||
|
tags="" # PR build
|
||||||
|
|
||||||
|
if [[ ${{ matrix.base }} = "python:3.10.8-slim" ]]; then
|
||||||
|
base="ghcr.io/idiap/coqui-tts-dev-cpu"
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [[ "${{ startsWith(github.ref, 'refs/heads/') }}" = "true" ]]; then
|
||||||
|
# Push to branch
|
||||||
|
github_ref="${{ github.ref }}"
|
||||||
|
branch=${github_ref#*refs/heads/} # strip prefix to get branch name
|
||||||
|
tags="${base}:${branch},${base}:${{ github.sha }},"
|
||||||
|
elif [[ "${{ startsWith(github.ref, 'refs/tags/') }}" = "true" ]]; then
|
||||||
|
VERSION="v$(grep -m 1 version pyproject.toml | grep -P '\d+\.\d+\.\d+' -o)"
|
||||||
|
if [[ "${{ github.ref }}" != "refs/tags/${VERSION}" ]]; then
|
||||||
|
echo "Pushed tag does not match VERSION file. Aborting push."
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
tags="${base}:${VERSION},${base}:latest,${base}:${{ github.sha }}"
|
||||||
|
fi
|
||||||
|
echo "::set-output name=tags::${tags}"
|
||||||
|
- name: Set up QEMU
|
||||||
|
uses: docker/setup-qemu-action@v1
|
||||||
|
- name: Set up Docker Buildx
|
||||||
|
id: buildx
|
||||||
|
uses: docker/setup-buildx-action@v1
|
||||||
|
- name: Build and push
|
||||||
|
uses: docker/build-push-action@v2
|
||||||
|
with:
|
||||||
|
context: .
|
||||||
|
file: dockerfiles/Dockerfile.dev
|
||||||
|
platforms: linux/${{ matrix.arch }}
|
||||||
|
push: false
|
||||||
|
build-args: "BASE=${{ matrix.base }}"
|
||||||
|
tags: ${{ steps.compute-tag.outputs.tags }}
|
||||||
|
|
|
@ -1,53 +0,0 @@
|
||||||
name: inference_tests
|
|
||||||
|
|
||||||
on:
|
|
||||||
push:
|
|
||||||
branches:
|
|
||||||
- main
|
|
||||||
pull_request:
|
|
||||||
types: [opened, synchronize, reopened]
|
|
||||||
jobs:
|
|
||||||
check_skip:
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
if: "! contains(github.event.head_commit.message, '[ci skip]')"
|
|
||||||
steps:
|
|
||||||
- run: echo "${{ github.event.head_commit.message }}"
|
|
||||||
|
|
||||||
test:
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
strategy:
|
|
||||||
fail-fast: false
|
|
||||||
matrix:
|
|
||||||
python-version: [3.9, "3.10", "3.11"]
|
|
||||||
experimental: [false]
|
|
||||||
steps:
|
|
||||||
- uses: actions/checkout@v3
|
|
||||||
- name: Set up Python ${{ matrix.python-version }}
|
|
||||||
uses: actions/setup-python@v4
|
|
||||||
with:
|
|
||||||
python-version: ${{ matrix.python-version }}
|
|
||||||
architecture: x64
|
|
||||||
cache: 'pip'
|
|
||||||
cache-dependency-path: 'requirements*'
|
|
||||||
- name: check OS
|
|
||||||
run: cat /etc/os-release
|
|
||||||
- name: set ENV
|
|
||||||
run: |
|
|
||||||
export TRAINER_TELEMETRY=0
|
|
||||||
- name: Install dependencies
|
|
||||||
run: |
|
|
||||||
sudo apt-get update
|
|
||||||
sudo apt-get install -y --no-install-recommends git make gcc
|
|
||||||
sudo apt-get install espeak-ng
|
|
||||||
make system-deps
|
|
||||||
- name: Install/upgrade Python setup deps
|
|
||||||
run: python3 -m pip install --upgrade pip setuptools wheel
|
|
||||||
- name: Replace scarf urls
|
|
||||||
run: |
|
|
||||||
sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json
|
|
||||||
- name: Install TTS
|
|
||||||
run: |
|
|
||||||
python3 -m pip install .[all]
|
|
||||||
python3 setup.py egg_info
|
|
||||||
- name: Unit tests
|
|
||||||
run: make inference_tests
|
|
|
@ -8,18 +8,18 @@ defaults:
|
||||||
bash
|
bash
|
||||||
jobs:
|
jobs:
|
||||||
build-sdist:
|
build-sdist:
|
||||||
runs-on: ubuntu-20.04
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v3
|
- uses: actions/checkout@v4
|
||||||
- name: Verify tag matches version
|
- name: Verify tag matches version
|
||||||
run: |
|
run: |
|
||||||
set -ex
|
set -ex
|
||||||
version=$(cat TTS/VERSION)
|
version=$(grep -m 1 version pyproject.toml | grep -P '\d+\.\d+\.\d+' -o)
|
||||||
tag="${GITHUB_REF/refs\/tags\/}"
|
tag="${GITHUB_REF/refs\/tags\/}"
|
||||||
if [[ "v$version" != "$tag" ]]; then
|
if [[ "v$version" != "$tag" ]]; then
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
- uses: actions/setup-python@v2
|
- uses: actions/setup-python@v5
|
||||||
with:
|
with:
|
||||||
python-version: 3.9
|
python-version: 3.9
|
||||||
- run: |
|
- run: |
|
||||||
|
@ -28,67 +28,63 @@ jobs:
|
||||||
python -m build
|
python -m build
|
||||||
- run: |
|
- run: |
|
||||||
pip install dist/*.tar.gz
|
pip install dist/*.tar.gz
|
||||||
- uses: actions/upload-artifact@v2
|
- uses: actions/upload-artifact@v4
|
||||||
with:
|
with:
|
||||||
name: sdist
|
name: sdist
|
||||||
path: dist/*.tar.gz
|
path: dist/*.tar.gz
|
||||||
build-wheels:
|
build-wheels:
|
||||||
runs-on: ubuntu-20.04
|
runs-on: ubuntu-latest
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
python-version: ["3.9", "3.10", "3.11"]
|
python-version: ["3.9", "3.10", "3.11", "3.12"]
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v3
|
- uses: actions/checkout@v4
|
||||||
- uses: actions/setup-python@v2
|
- uses: actions/setup-python@v5
|
||||||
with:
|
with:
|
||||||
python-version: ${{ matrix.python-version }}
|
python-version: ${{ matrix.python-version }}
|
||||||
- name: Install pip requirements
|
- name: Install build requirements
|
||||||
run: |
|
run: |
|
||||||
python -m pip install -U pip setuptools wheel build
|
python -m pip install -U pip setuptools wheel build numpy cython
|
||||||
python -m pip install -r requirements.txt
|
|
||||||
- name: Setup and install manylinux1_x86_64 wheel
|
- name: Setup and install manylinux1_x86_64 wheel
|
||||||
run: |
|
run: |
|
||||||
python setup.py bdist_wheel --plat-name=manylinux1_x86_64
|
python setup.py bdist_wheel --plat-name=manylinux1_x86_64
|
||||||
python -m pip install dist/*-manylinux*.whl
|
python -m pip install dist/*-manylinux*.whl
|
||||||
- uses: actions/upload-artifact@v2
|
- uses: actions/upload-artifact@v4
|
||||||
with:
|
with:
|
||||||
name: wheel-${{ matrix.python-version }}
|
name: wheel-${{ matrix.python-version }}
|
||||||
path: dist/*-manylinux*.whl
|
path: dist/*-manylinux*.whl
|
||||||
publish-artifacts:
|
publish-artifacts:
|
||||||
runs-on: ubuntu-20.04
|
runs-on: ubuntu-latest
|
||||||
needs: [build-sdist, build-wheels]
|
needs: [build-sdist, build-wheels]
|
||||||
|
environment:
|
||||||
|
name: release
|
||||||
|
url: https://pypi.org/p/coqui-tts
|
||||||
|
permissions:
|
||||||
|
id-token: write
|
||||||
steps:
|
steps:
|
||||||
- run: |
|
- run: |
|
||||||
mkdir dist
|
mkdir dist
|
||||||
- uses: actions/download-artifact@v2
|
- uses: actions/download-artifact@v4
|
||||||
with:
|
with:
|
||||||
name: "sdist"
|
name: "sdist"
|
||||||
path: "dist/"
|
path: "dist/"
|
||||||
- uses: actions/download-artifact@v2
|
- uses: actions/download-artifact@v4
|
||||||
with:
|
with:
|
||||||
name: "wheel-3.9"
|
name: "wheel-3.9"
|
||||||
path: "dist/"
|
path: "dist/"
|
||||||
- uses: actions/download-artifact@v2
|
- uses: actions/download-artifact@v4
|
||||||
with:
|
with:
|
||||||
name: "wheel-3.10"
|
name: "wheel-3.10"
|
||||||
path: "dist/"
|
path: "dist/"
|
||||||
- uses: actions/download-artifact@v2
|
- uses: actions/download-artifact@v4
|
||||||
with:
|
with:
|
||||||
name: "wheel-3.11"
|
name: "wheel-3.11"
|
||||||
path: "dist/"
|
path: "dist/"
|
||||||
|
- uses: actions/download-artifact@v4
|
||||||
|
with:
|
||||||
|
name: "wheel-3.12"
|
||||||
|
path: "dist/"
|
||||||
- run: |
|
- run: |
|
||||||
ls -lh dist/
|
ls -lh dist/
|
||||||
- name: Setup PyPI config
|
- name: Publish package distributions to PyPI
|
||||||
run: |
|
uses: pypa/gh-action-pypi-publish@release/v1
|
||||||
cat << EOF > ~/.pypirc
|
|
||||||
[pypi]
|
|
||||||
username=__token__
|
|
||||||
password=${{ secrets.PYPI_TOKEN }}
|
|
||||||
EOF
|
|
||||||
- uses: actions/setup-python@v2
|
|
||||||
with:
|
|
||||||
python-version: 3.9
|
|
||||||
- run: |
|
|
||||||
python -m pip install twine
|
|
||||||
- run: |
|
|
||||||
twine upload --repository pypi dist/*
|
|
||||||
|
|
|
@ -7,12 +7,6 @@ on:
|
||||||
pull_request:
|
pull_request:
|
||||||
types: [opened, synchronize, reopened]
|
types: [opened, synchronize, reopened]
|
||||||
jobs:
|
jobs:
|
||||||
check_skip:
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
if: "! contains(github.event.head_commit.message, '[ci skip]')"
|
|
||||||
steps:
|
|
||||||
- run: echo "${{ github.event.head_commit.message }}"
|
|
||||||
|
|
||||||
test:
|
test:
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
strategy:
|
strategy:
|
||||||
|
@ -21,26 +15,15 @@ jobs:
|
||||||
python-version: [3.9]
|
python-version: [3.9]
|
||||||
experimental: [false]
|
experimental: [false]
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v3
|
- uses: actions/checkout@v4
|
||||||
- name: Set up Python ${{ matrix.python-version }}
|
- name: Set up Python ${{ matrix.python-version }}
|
||||||
uses: actions/setup-python@v4
|
uses: actions/setup-python@v5
|
||||||
with:
|
with:
|
||||||
python-version: ${{ matrix.python-version }}
|
python-version: ${{ matrix.python-version }}
|
||||||
architecture: x64
|
architecture: x64
|
||||||
cache: 'pip'
|
cache: 'pip'
|
||||||
cache-dependency-path: 'requirements*'
|
cache-dependency-path: 'requirements*'
|
||||||
- name: check OS
|
- name: Install/upgrade dev dependencies
|
||||||
run: cat /etc/os-release
|
run: python3 -m pip install -r requirements.dev.txt
|
||||||
- name: Install dependencies
|
- name: Lint check
|
||||||
run: |
|
run: make lint
|
||||||
sudo apt-get update
|
|
||||||
sudo apt-get install -y git make gcc
|
|
||||||
make system-deps
|
|
||||||
- name: Install/upgrade Python setup deps
|
|
||||||
run: python3 -m pip install --upgrade pip setuptools wheel
|
|
||||||
- name: Install TTS
|
|
||||||
run: |
|
|
||||||
python3 -m pip install .[all]
|
|
||||||
python3 setup.py egg_info
|
|
||||||
- name: Style check
|
|
||||||
run: make style
|
|
||||||
|
|
|
@ -0,0 +1,81 @@
|
||||||
|
name: tests
|
||||||
|
|
||||||
|
on:
|
||||||
|
push:
|
||||||
|
branches:
|
||||||
|
- main
|
||||||
|
pull_request:
|
||||||
|
types: [opened, synchronize, reopened]
|
||||||
|
jobs:
|
||||||
|
test:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
strategy:
|
||||||
|
fail-fast: false
|
||||||
|
matrix:
|
||||||
|
python-version: [3.9, "3.10", "3.11", "3.12"]
|
||||||
|
subset: ["data_tests", "inference_tests", "test_aux", "test_text", "test_tts", "test_tts2", "test_vocoder", "test_xtts", "test_zoo0", "test_zoo1", "test_zoo2"]
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v4
|
||||||
|
- name: Set up Python ${{ matrix.python-version }}
|
||||||
|
uses: actions/setup-python@v5
|
||||||
|
with:
|
||||||
|
python-version: ${{ matrix.python-version }}
|
||||||
|
architecture: x64
|
||||||
|
cache: 'pip'
|
||||||
|
cache-dependency-path: 'requirements*'
|
||||||
|
- name: check OS
|
||||||
|
run: cat /etc/os-release
|
||||||
|
- name: set ENV
|
||||||
|
run: export TRAINER_TELEMETRY=0
|
||||||
|
- name: Install Espeak
|
||||||
|
if: contains(fromJSON('["inference_tests", "test_text", "test_tts", "test_tts2", "test_xtts", "test_zoo0", "test_zoo1", "test_zoo2"]'), matrix.subset)
|
||||||
|
run: |
|
||||||
|
sudo apt-get update
|
||||||
|
sudo apt-get install espeak espeak-ng
|
||||||
|
- name: Install dependencies
|
||||||
|
run: |
|
||||||
|
sudo apt-get update
|
||||||
|
sudo apt-get install -y --no-install-recommends git make gcc
|
||||||
|
make system-deps
|
||||||
|
- name: Install/upgrade Python setup deps
|
||||||
|
run: python3 -m pip install --upgrade pip setuptools wheel uv
|
||||||
|
- name: Replace scarf urls
|
||||||
|
if: contains(fromJSON('["data_tests", "inference_tests", "test_aux", "test_tts", "test_tts2", "test_xtts", "test_zoo0", "test_zoo1", "test_zoo2"]'), matrix.subset)
|
||||||
|
run: |
|
||||||
|
sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json
|
||||||
|
- name: Install TTS
|
||||||
|
run: |
|
||||||
|
resolution=highest
|
||||||
|
if [ "${{ matrix.python-version }}" == "3.9" ]; then
|
||||||
|
resolution=lowest-direct
|
||||||
|
fi
|
||||||
|
python3 -m uv pip install --resolution=$resolution --system "coqui-tts[dev,server,languages] @ ."
|
||||||
|
- name: Unit tests
|
||||||
|
run: make ${{ matrix.subset }}
|
||||||
|
- name: Upload coverage data
|
||||||
|
uses: actions/upload-artifact@v4
|
||||||
|
with:
|
||||||
|
name: coverage-data-${{ matrix.subset }}-${{ matrix.python-version }}
|
||||||
|
path: .coverage.*
|
||||||
|
if-no-files-found: ignore
|
||||||
|
coverage:
|
||||||
|
if: always()
|
||||||
|
needs: test
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v4
|
||||||
|
- uses: actions/setup-python@v5
|
||||||
|
with:
|
||||||
|
python-version: "3.12"
|
||||||
|
- uses: actions/download-artifact@v4
|
||||||
|
with:
|
||||||
|
pattern: coverage-data-*
|
||||||
|
merge-multiple: true
|
||||||
|
- name: Combine coverage
|
||||||
|
run: |
|
||||||
|
python -Im pip install --upgrade coverage[toml]
|
||||||
|
|
||||||
|
python -Im coverage combine
|
||||||
|
python -Im coverage html --skip-covered --skip-empty
|
||||||
|
|
||||||
|
python -Im coverage report --format=markdown >> $GITHUB_STEP_SUMMARY
|
|
@ -1,50 +0,0 @@
|
||||||
name: text-tests
|
|
||||||
|
|
||||||
on:
|
|
||||||
push:
|
|
||||||
branches:
|
|
||||||
- main
|
|
||||||
pull_request:
|
|
||||||
types: [opened, synchronize, reopened]
|
|
||||||
jobs:
|
|
||||||
check_skip:
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
if: "! contains(github.event.head_commit.message, '[ci skip]')"
|
|
||||||
steps:
|
|
||||||
- run: echo "${{ github.event.head_commit.message }}"
|
|
||||||
|
|
||||||
test:
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
strategy:
|
|
||||||
fail-fast: false
|
|
||||||
matrix:
|
|
||||||
python-version: [3.9, "3.10", "3.11"]
|
|
||||||
experimental: [false]
|
|
||||||
steps:
|
|
||||||
- uses: actions/checkout@v3
|
|
||||||
- name: Set up Python ${{ matrix.python-version }}
|
|
||||||
uses: actions/setup-python@v4
|
|
||||||
with:
|
|
||||||
python-version: ${{ matrix.python-version }}
|
|
||||||
architecture: x64
|
|
||||||
cache: 'pip'
|
|
||||||
cache-dependency-path: 'requirements*'
|
|
||||||
- name: check OS
|
|
||||||
run: cat /etc/os-release
|
|
||||||
- name: set ENV
|
|
||||||
run: export TRAINER_TELEMETRY=0
|
|
||||||
- name: Install dependencies
|
|
||||||
run: |
|
|
||||||
sudo apt-get update
|
|
||||||
sudo apt-get install -y --no-install-recommends git make gcc
|
|
||||||
sudo apt-get install espeak
|
|
||||||
sudo apt-get install espeak-ng
|
|
||||||
make system-deps
|
|
||||||
- name: Install/upgrade Python setup deps
|
|
||||||
run: python3 -m pip install --upgrade pip setuptools wheel
|
|
||||||
- name: Install TTS
|
|
||||||
run: |
|
|
||||||
python3 -m pip install .[all]
|
|
||||||
python3 setup.py egg_info
|
|
||||||
- name: Unit tests
|
|
||||||
run: make test_text
|
|
|
@ -1,53 +0,0 @@
|
||||||
name: tts-tests
|
|
||||||
|
|
||||||
on:
|
|
||||||
push:
|
|
||||||
branches:
|
|
||||||
- main
|
|
||||||
pull_request:
|
|
||||||
types: [opened, synchronize, reopened]
|
|
||||||
jobs:
|
|
||||||
check_skip:
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
if: "! contains(github.event.head_commit.message, '[ci skip]')"
|
|
||||||
steps:
|
|
||||||
- run: echo "${{ github.event.head_commit.message }}"
|
|
||||||
|
|
||||||
test:
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
strategy:
|
|
||||||
fail-fast: false
|
|
||||||
matrix:
|
|
||||||
python-version: [3.9, "3.10", "3.11"]
|
|
||||||
experimental: [false]
|
|
||||||
steps:
|
|
||||||
- uses: actions/checkout@v3
|
|
||||||
- name: Set up Python ${{ matrix.python-version }}
|
|
||||||
uses: actions/setup-python@v4
|
|
||||||
with:
|
|
||||||
python-version: ${{ matrix.python-version }}
|
|
||||||
architecture: x64
|
|
||||||
cache: 'pip'
|
|
||||||
cache-dependency-path: 'requirements*'
|
|
||||||
- name: check OS
|
|
||||||
run: cat /etc/os-release
|
|
||||||
- name: set ENV
|
|
||||||
run: export TRAINER_TELEMETRY=0
|
|
||||||
- name: Install dependencies
|
|
||||||
run: |
|
|
||||||
sudo apt-get update
|
|
||||||
sudo apt-get install -y --no-install-recommends git make gcc
|
|
||||||
sudo apt-get install espeak
|
|
||||||
sudo apt-get install espeak-ng
|
|
||||||
make system-deps
|
|
||||||
- name: Install/upgrade Python setup deps
|
|
||||||
run: python3 -m pip install --upgrade pip setuptools wheel
|
|
||||||
- name: Replace scarf urls
|
|
||||||
run: |
|
|
||||||
sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json
|
|
||||||
- name: Install TTS
|
|
||||||
run: |
|
|
||||||
python3 -m pip install .[all]
|
|
||||||
python3 setup.py egg_info
|
|
||||||
- name: Unit tests
|
|
||||||
run: make test_tts
|
|
|
@ -1,53 +0,0 @@
|
||||||
name: tts-tests2
|
|
||||||
|
|
||||||
on:
|
|
||||||
push:
|
|
||||||
branches:
|
|
||||||
- main
|
|
||||||
pull_request:
|
|
||||||
types: [opened, synchronize, reopened]
|
|
||||||
jobs:
|
|
||||||
check_skip:
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
if: "! contains(github.event.head_commit.message, '[ci skip]')"
|
|
||||||
steps:
|
|
||||||
- run: echo "${{ github.event.head_commit.message }}"
|
|
||||||
|
|
||||||
test:
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
strategy:
|
|
||||||
fail-fast: false
|
|
||||||
matrix:
|
|
||||||
python-version: [3.9, "3.10", "3.11"]
|
|
||||||
experimental: [false]
|
|
||||||
steps:
|
|
||||||
- uses: actions/checkout@v3
|
|
||||||
- name: Set up Python ${{ matrix.python-version }}
|
|
||||||
uses: actions/setup-python@v4
|
|
||||||
with:
|
|
||||||
python-version: ${{ matrix.python-version }}
|
|
||||||
architecture: x64
|
|
||||||
cache: 'pip'
|
|
||||||
cache-dependency-path: 'requirements*'
|
|
||||||
- name: check OS
|
|
||||||
run: cat /etc/os-release
|
|
||||||
- name: set ENV
|
|
||||||
run: export TRAINER_TELEMETRY=0
|
|
||||||
- name: Install dependencies
|
|
||||||
run: |
|
|
||||||
sudo apt-get update
|
|
||||||
sudo apt-get install -y --no-install-recommends git make gcc
|
|
||||||
sudo apt-get install espeak
|
|
||||||
sudo apt-get install espeak-ng
|
|
||||||
make system-deps
|
|
||||||
- name: Install/upgrade Python setup deps
|
|
||||||
run: python3 -m pip install --upgrade pip setuptools wheel
|
|
||||||
- name: Replace scarf urls
|
|
||||||
run: |
|
|
||||||
sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json
|
|
||||||
- name: Install TTS
|
|
||||||
run: |
|
|
||||||
python3 -m pip install .[all]
|
|
||||||
python3 setup.py egg_info
|
|
||||||
- name: Unit tests
|
|
||||||
run: make test_tts2
|
|
|
@ -1,48 +0,0 @@
|
||||||
name: vocoder-tests
|
|
||||||
|
|
||||||
on:
|
|
||||||
push:
|
|
||||||
branches:
|
|
||||||
- main
|
|
||||||
pull_request:
|
|
||||||
types: [opened, synchronize, reopened]
|
|
||||||
jobs:
|
|
||||||
check_skip:
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
if: "! contains(github.event.head_commit.message, '[ci skip]')"
|
|
||||||
steps:
|
|
||||||
- run: echo "${{ github.event.head_commit.message }}"
|
|
||||||
|
|
||||||
test:
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
strategy:
|
|
||||||
fail-fast: false
|
|
||||||
matrix:
|
|
||||||
python-version: [3.9, "3.10", "3.11"]
|
|
||||||
experimental: [false]
|
|
||||||
steps:
|
|
||||||
- uses: actions/checkout@v3
|
|
||||||
- name: Set up Python ${{ matrix.python-version }}
|
|
||||||
uses: actions/setup-python@v4
|
|
||||||
with:
|
|
||||||
python-version: ${{ matrix.python-version }}
|
|
||||||
architecture: x64
|
|
||||||
cache: 'pip'
|
|
||||||
cache-dependency-path: 'requirements*'
|
|
||||||
- name: check OS
|
|
||||||
run: cat /etc/os-release
|
|
||||||
- name: set ENV
|
|
||||||
run: export TRAINER_TELEMETRY=0
|
|
||||||
- name: Install dependencies
|
|
||||||
run: |
|
|
||||||
sudo apt-get update
|
|
||||||
sudo apt-get install -y git make gcc
|
|
||||||
make system-deps
|
|
||||||
- name: Install/upgrade Python setup deps
|
|
||||||
run: python3 -m pip install --upgrade pip setuptools wheel
|
|
||||||
- name: Install TTS
|
|
||||||
run: |
|
|
||||||
python3 -m pip install .[all]
|
|
||||||
python3 setup.py egg_info
|
|
||||||
- name: Unit tests
|
|
||||||
run: make test_vocoder
|
|
|
@ -1,53 +0,0 @@
|
||||||
name: xtts-tests
|
|
||||||
|
|
||||||
on:
|
|
||||||
push:
|
|
||||||
branches:
|
|
||||||
- main
|
|
||||||
pull_request:
|
|
||||||
types: [opened, synchronize, reopened]
|
|
||||||
jobs:
|
|
||||||
check_skip:
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
if: "! contains(github.event.head_commit.message, '[ci skip]')"
|
|
||||||
steps:
|
|
||||||
- run: echo "${{ github.event.head_commit.message }}"
|
|
||||||
|
|
||||||
test:
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
strategy:
|
|
||||||
fail-fast: false
|
|
||||||
matrix:
|
|
||||||
python-version: [3.9, "3.10", "3.11"]
|
|
||||||
experimental: [false]
|
|
||||||
steps:
|
|
||||||
- uses: actions/checkout@v3
|
|
||||||
- name: Set up Python ${{ matrix.python-version }}
|
|
||||||
uses: actions/setup-python@v4
|
|
||||||
with:
|
|
||||||
python-version: ${{ matrix.python-version }}
|
|
||||||
architecture: x64
|
|
||||||
cache: 'pip'
|
|
||||||
cache-dependency-path: 'requirements*'
|
|
||||||
- name: check OS
|
|
||||||
run: cat /etc/os-release
|
|
||||||
- name: set ENV
|
|
||||||
run: export TRAINER_TELEMETRY=0
|
|
||||||
- name: Install dependencies
|
|
||||||
run: |
|
|
||||||
sudo apt-get update
|
|
||||||
sudo apt-get install -y --no-install-recommends git make gcc
|
|
||||||
sudo apt-get install espeak
|
|
||||||
sudo apt-get install espeak-ng
|
|
||||||
make system-deps
|
|
||||||
- name: Install/upgrade Python setup deps
|
|
||||||
run: python3 -m pip install --upgrade pip setuptools wheel
|
|
||||||
- name: Replace scarf urls
|
|
||||||
run: |
|
|
||||||
sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json
|
|
||||||
- name: Install TTS
|
|
||||||
run: |
|
|
||||||
python3 -m pip install .[all]
|
|
||||||
python3 setup.py egg_info
|
|
||||||
- name: Unit tests
|
|
||||||
run: make test_xtts
|
|
|
@ -1,54 +0,0 @@
|
||||||
name: zoo-tests-0
|
|
||||||
|
|
||||||
on:
|
|
||||||
push:
|
|
||||||
branches:
|
|
||||||
- main
|
|
||||||
pull_request:
|
|
||||||
types: [opened, synchronize, reopened]
|
|
||||||
jobs:
|
|
||||||
check_skip:
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
if: "! contains(github.event.head_commit.message, '[ci skip]')"
|
|
||||||
steps:
|
|
||||||
- run: echo "${{ github.event.head_commit.message }}"
|
|
||||||
|
|
||||||
test:
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
strategy:
|
|
||||||
fail-fast: false
|
|
||||||
matrix:
|
|
||||||
python-version: [3.9, "3.10", "3.11"]
|
|
||||||
experimental: [false]
|
|
||||||
steps:
|
|
||||||
- uses: actions/checkout@v3
|
|
||||||
- name: Set up Python ${{ matrix.python-version }}
|
|
||||||
uses: actions/setup-python@v4
|
|
||||||
with:
|
|
||||||
python-version: ${{ matrix.python-version }}
|
|
||||||
architecture: x64
|
|
||||||
cache: 'pip'
|
|
||||||
cache-dependency-path: 'requirements*'
|
|
||||||
- name: check OS
|
|
||||||
run: cat /etc/os-release
|
|
||||||
- name: set ENV
|
|
||||||
run: export TRAINER_TELEMETRY=0
|
|
||||||
- name: Install dependencies
|
|
||||||
run: |
|
|
||||||
sudo apt-get update
|
|
||||||
sudo apt-get install -y git make gcc
|
|
||||||
sudo apt-get install espeak espeak-ng
|
|
||||||
make system-deps
|
|
||||||
- name: Install/upgrade Python setup deps
|
|
||||||
run: python3 -m pip install --upgrade pip setuptools wheel
|
|
||||||
- name: Replace scarf urls
|
|
||||||
run: |
|
|
||||||
sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json
|
|
||||||
- name: Install TTS
|
|
||||||
run: |
|
|
||||||
python3 -m pip install .[all]
|
|
||||||
python3 setup.py egg_info
|
|
||||||
- name: Unit tests
|
|
||||||
run: |
|
|
||||||
nose2 -F -v -B TTS tests.zoo_tests.test_models.test_models_offset_0_step_3
|
|
||||||
nose2 -F -v -B TTS tests.zoo_tests.test_models.test_voice_conversion
|
|
|
@ -1,53 +0,0 @@
|
||||||
name: zoo-tests-1
|
|
||||||
|
|
||||||
on:
|
|
||||||
push:
|
|
||||||
branches:
|
|
||||||
- main
|
|
||||||
pull_request:
|
|
||||||
types: [opened, synchronize, reopened]
|
|
||||||
jobs:
|
|
||||||
check_skip:
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
if: "! contains(github.event.head_commit.message, '[ci skip]')"
|
|
||||||
steps:
|
|
||||||
- run: echo "${{ github.event.head_commit.message }}"
|
|
||||||
|
|
||||||
test:
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
strategy:
|
|
||||||
fail-fast: false
|
|
||||||
matrix:
|
|
||||||
python-version: [3.9, "3.10", "3.11"]
|
|
||||||
experimental: [false]
|
|
||||||
steps:
|
|
||||||
- uses: actions/checkout@v3
|
|
||||||
- name: Set up Python ${{ matrix.python-version }}
|
|
||||||
uses: actions/setup-python@v4
|
|
||||||
with:
|
|
||||||
python-version: ${{ matrix.python-version }}
|
|
||||||
architecture: x64
|
|
||||||
cache: 'pip'
|
|
||||||
cache-dependency-path: 'requirements*'
|
|
||||||
- name: check OS
|
|
||||||
run: cat /etc/os-release
|
|
||||||
- name: set ENV
|
|
||||||
run: export TRAINER_TELEMETRY=0
|
|
||||||
- name: Install dependencies
|
|
||||||
run: |
|
|
||||||
sudo apt-get update
|
|
||||||
sudo apt-get install -y git make gcc
|
|
||||||
sudo apt-get install espeak espeak-ng
|
|
||||||
make system-deps
|
|
||||||
- name: Install/upgrade Python setup deps
|
|
||||||
run: python3 -m pip install --upgrade pip setuptools wheel
|
|
||||||
- name: Replace scarf urls
|
|
||||||
run: |
|
|
||||||
sed -i 's/https:\/\/coqui.gateway.scarf.sh\/hf\/bark\//https:\/\/huggingface.co\/erogol\/bark\/resolve\/main\//g' TTS/.models.json
|
|
||||||
sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json
|
|
||||||
- name: Install TTS
|
|
||||||
run: |
|
|
||||||
python3 -m pip install .[all]
|
|
||||||
python3 setup.py egg_info
|
|
||||||
- name: Unit tests
|
|
||||||
run: nose2 -F -v -B --with-coverage --coverage TTS tests.zoo_tests.test_models.test_models_offset_1_step_3
|
|
|
@ -1,52 +0,0 @@
|
||||||
name: zoo-tests-2
|
|
||||||
|
|
||||||
on:
|
|
||||||
push:
|
|
||||||
branches:
|
|
||||||
- main
|
|
||||||
pull_request:
|
|
||||||
types: [opened, synchronize, reopened]
|
|
||||||
jobs:
|
|
||||||
check_skip:
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
if: "! contains(github.event.head_commit.message, '[ci skip]')"
|
|
||||||
steps:
|
|
||||||
- run: echo "${{ github.event.head_commit.message }}"
|
|
||||||
|
|
||||||
test:
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
strategy:
|
|
||||||
fail-fast: false
|
|
||||||
matrix:
|
|
||||||
python-version: [3.9, "3.10", "3.11"]
|
|
||||||
experimental: [false]
|
|
||||||
steps:
|
|
||||||
- uses: actions/checkout@v3
|
|
||||||
- name: Set up Python ${{ matrix.python-version }}
|
|
||||||
uses: actions/setup-python@v4
|
|
||||||
with:
|
|
||||||
python-version: ${{ matrix.python-version }}
|
|
||||||
architecture: x64
|
|
||||||
cache: 'pip'
|
|
||||||
cache-dependency-path: 'requirements*'
|
|
||||||
- name: check OS
|
|
||||||
run: cat /etc/os-release
|
|
||||||
- name: set ENV
|
|
||||||
run: export TRAINER_TELEMETRY=0
|
|
||||||
- name: Install dependencies
|
|
||||||
run: |
|
|
||||||
sudo apt-get update
|
|
||||||
sudo apt-get install -y git make gcc
|
|
||||||
sudo apt-get install espeak espeak-ng
|
|
||||||
make system-deps
|
|
||||||
- name: Install/upgrade Python setup deps
|
|
||||||
run: python3 -m pip install --upgrade pip setuptools wheel
|
|
||||||
- name: Replace scarf urls
|
|
||||||
run: |
|
|
||||||
sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json
|
|
||||||
- name: Install TTS
|
|
||||||
run: |
|
|
||||||
python3 -m pip install .[all]
|
|
||||||
python3 setup.py egg_info
|
|
||||||
- name: Unit tests
|
|
||||||
run: nose2 -F -v -B --with-coverage --coverage TTS tests.zoo_tests.test_models.test_models_offset_2_step_3
|
|
|
@ -169,4 +169,4 @@ wandb
|
||||||
depot/*
|
depot/*
|
||||||
coqui_recipes/*
|
coqui_recipes/*
|
||||||
local_scripts/*
|
local_scripts/*
|
||||||
coqui_demos/*
|
coqui_demos/*
|
||||||
|
|
|
@ -1,27 +1,24 @@
|
||||||
repos:
|
repos:
|
||||||
- repo: 'https://github.com/pre-commit/pre-commit-hooks'
|
- repo: "https://github.com/pre-commit/pre-commit-hooks"
|
||||||
rev: v2.3.0
|
rev: v4.5.0
|
||||||
hooks:
|
hooks:
|
||||||
- id: check-yaml
|
- id: check-yaml
|
||||||
- id: end-of-file-fixer
|
- id: end-of-file-fixer
|
||||||
- id: trailing-whitespace
|
- id: trailing-whitespace
|
||||||
- repo: 'https://github.com/psf/black'
|
- repo: "https://github.com/psf/black"
|
||||||
rev: 22.3.0
|
rev: 24.2.0
|
||||||
hooks:
|
hooks:
|
||||||
- id: black
|
- id: black
|
||||||
language_version: python3
|
language_version: python3
|
||||||
- repo: https://github.com/pycqa/isort
|
- repo: https://github.com/astral-sh/ruff-pre-commit
|
||||||
rev: 5.8.0
|
rev: v0.3.0
|
||||||
hooks:
|
hooks:
|
||||||
- id: isort
|
- id: ruff
|
||||||
name: isort (python)
|
args: [--fix, --exit-non-zero-on-fix]
|
||||||
- id: isort
|
- repo: local
|
||||||
name: isort (cython)
|
|
||||||
types: [cython]
|
|
||||||
- id: isort
|
|
||||||
name: isort (pyi)
|
|
||||||
types: [pyi]
|
|
||||||
- repo: https://github.com/pycqa/pylint
|
|
||||||
rev: v2.8.2
|
|
||||||
hooks:
|
hooks:
|
||||||
- id: pylint
|
- id: generate_requirements.py
|
||||||
|
name: generate_requirements.py
|
||||||
|
language: system
|
||||||
|
entry: python scripts/generate_requirements.py
|
||||||
|
files: "pyproject.toml|requirements.*\\.txt|scripts/generate_requirements.py"
|
||||||
|
|
599
.pylintrc
599
.pylintrc
|
@ -1,599 +0,0 @@
|
||||||
[MASTER]
|
|
||||||
|
|
||||||
# A comma-separated list of package or module names from where C extensions may
|
|
||||||
# be loaded. Extensions are loaded into the active Python interpreter and may
|
|
||||||
# run arbitrary code.
|
|
||||||
extension-pkg-whitelist=
|
|
||||||
|
|
||||||
# Add files or directories to the blacklist. They should be base names, not
|
|
||||||
# paths.
|
|
||||||
ignore=CVS
|
|
||||||
|
|
||||||
# Add files or directories matching the regex patterns to the blacklist. The
|
|
||||||
# regex matches against base names, not paths.
|
|
||||||
ignore-patterns=
|
|
||||||
|
|
||||||
# Python code to execute, usually for sys.path manipulation such as
|
|
||||||
# pygtk.require().
|
|
||||||
#init-hook=
|
|
||||||
|
|
||||||
# Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the
|
|
||||||
# number of processors available to use.
|
|
||||||
jobs=1
|
|
||||||
|
|
||||||
# Control the amount of potential inferred values when inferring a single
|
|
||||||
# object. This can help the performance when dealing with large functions or
|
|
||||||
# complex, nested conditions.
|
|
||||||
limit-inference-results=100
|
|
||||||
|
|
||||||
# List of plugins (as comma separated values of python modules names) to load,
|
|
||||||
# usually to register additional checkers.
|
|
||||||
load-plugins=
|
|
||||||
|
|
||||||
# Pickle collected data for later comparisons.
|
|
||||||
persistent=yes
|
|
||||||
|
|
||||||
# Specify a configuration file.
|
|
||||||
#rcfile=
|
|
||||||
|
|
||||||
# When enabled, pylint would attempt to guess common misconfiguration and emit
|
|
||||||
# user-friendly hints instead of false-positive error messages.
|
|
||||||
suggestion-mode=yes
|
|
||||||
|
|
||||||
# Allow loading of arbitrary C extensions. Extensions are imported into the
|
|
||||||
# active Python interpreter and may run arbitrary code.
|
|
||||||
unsafe-load-any-extension=no
|
|
||||||
|
|
||||||
|
|
||||||
[MESSAGES CONTROL]
|
|
||||||
|
|
||||||
# Only show warnings with the listed confidence levels. Leave empty to show
|
|
||||||
# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED.
|
|
||||||
confidence=
|
|
||||||
|
|
||||||
# Disable the message, report, category or checker with the given id(s). You
|
|
||||||
# can either give multiple identifiers separated by comma (,) or put this
|
|
||||||
# option multiple times (only on the command line, not in the configuration
|
|
||||||
# file where it should appear only once). You can also use "--disable=all" to
|
|
||||||
# disable everything first and then reenable specific checks. For example, if
|
|
||||||
# you want to run only the similarities checker, you can use "--disable=all
|
|
||||||
# --enable=similarities". If you want to run only the classes checker, but have
|
|
||||||
# no Warning level messages displayed, use "--disable=all --enable=classes
|
|
||||||
# --disable=W".
|
|
||||||
disable=missing-docstring,
|
|
||||||
too-many-public-methods,
|
|
||||||
too-many-lines,
|
|
||||||
bare-except,
|
|
||||||
## for avoiding weird p3.6 CI linter error
|
|
||||||
## TODO: see later if we can remove this
|
|
||||||
assigning-non-slot,
|
|
||||||
unsupported-assignment-operation,
|
|
||||||
## end
|
|
||||||
line-too-long,
|
|
||||||
fixme,
|
|
||||||
wrong-import-order,
|
|
||||||
ungrouped-imports,
|
|
||||||
wrong-import-position,
|
|
||||||
import-error,
|
|
||||||
invalid-name,
|
|
||||||
too-many-instance-attributes,
|
|
||||||
arguments-differ,
|
|
||||||
arguments-renamed,
|
|
||||||
no-name-in-module,
|
|
||||||
no-member,
|
|
||||||
unsubscriptable-object,
|
|
||||||
print-statement,
|
|
||||||
parameter-unpacking,
|
|
||||||
unpacking-in-except,
|
|
||||||
old-raise-syntax,
|
|
||||||
backtick,
|
|
||||||
long-suffix,
|
|
||||||
old-ne-operator,
|
|
||||||
old-octal-literal,
|
|
||||||
import-star-module-level,
|
|
||||||
non-ascii-bytes-literal,
|
|
||||||
raw-checker-failed,
|
|
||||||
bad-inline-option,
|
|
||||||
locally-disabled,
|
|
||||||
file-ignored,
|
|
||||||
suppressed-message,
|
|
||||||
useless-suppression,
|
|
||||||
deprecated-pragma,
|
|
||||||
use-symbolic-message-instead,
|
|
||||||
useless-object-inheritance,
|
|
||||||
too-few-public-methods,
|
|
||||||
too-many-branches,
|
|
||||||
too-many-arguments,
|
|
||||||
too-many-locals,
|
|
||||||
too-many-statements,
|
|
||||||
apply-builtin,
|
|
||||||
basestring-builtin,
|
|
||||||
buffer-builtin,
|
|
||||||
cmp-builtin,
|
|
||||||
coerce-builtin,
|
|
||||||
execfile-builtin,
|
|
||||||
file-builtin,
|
|
||||||
long-builtin,
|
|
||||||
raw_input-builtin,
|
|
||||||
reduce-builtin,
|
|
||||||
standarderror-builtin,
|
|
||||||
unicode-builtin,
|
|
||||||
xrange-builtin,
|
|
||||||
coerce-method,
|
|
||||||
delslice-method,
|
|
||||||
getslice-method,
|
|
||||||
setslice-method,
|
|
||||||
no-absolute-import,
|
|
||||||
old-division,
|
|
||||||
dict-iter-method,
|
|
||||||
dict-view-method,
|
|
||||||
next-method-called,
|
|
||||||
metaclass-assignment,
|
|
||||||
indexing-exception,
|
|
||||||
raising-string,
|
|
||||||
reload-builtin,
|
|
||||||
oct-method,
|
|
||||||
hex-method,
|
|
||||||
nonzero-method,
|
|
||||||
cmp-method,
|
|
||||||
input-builtin,
|
|
||||||
round-builtin,
|
|
||||||
intern-builtin,
|
|
||||||
unichr-builtin,
|
|
||||||
map-builtin-not-iterating,
|
|
||||||
zip-builtin-not-iterating,
|
|
||||||
range-builtin-not-iterating,
|
|
||||||
filter-builtin-not-iterating,
|
|
||||||
using-cmp-argument,
|
|
||||||
eq-without-hash,
|
|
||||||
div-method,
|
|
||||||
idiv-method,
|
|
||||||
rdiv-method,
|
|
||||||
exception-message-attribute,
|
|
||||||
invalid-str-codec,
|
|
||||||
sys-max-int,
|
|
||||||
bad-python3-import,
|
|
||||||
deprecated-string-function,
|
|
||||||
deprecated-str-translate-call,
|
|
||||||
deprecated-itertools-function,
|
|
||||||
deprecated-types-field,
|
|
||||||
next-method-defined,
|
|
||||||
dict-items-not-iterating,
|
|
||||||
dict-keys-not-iterating,
|
|
||||||
dict-values-not-iterating,
|
|
||||||
deprecated-operator-function,
|
|
||||||
deprecated-urllib-function,
|
|
||||||
xreadlines-attribute,
|
|
||||||
deprecated-sys-function,
|
|
||||||
exception-escape,
|
|
||||||
comprehension-escape,
|
|
||||||
duplicate-code,
|
|
||||||
not-callable,
|
|
||||||
import-outside-toplevel,
|
|
||||||
logging-fstring-interpolation,
|
|
||||||
logging-not-lazy
|
|
||||||
|
|
||||||
# Enable the message, report, category or checker with the given id(s). You can
|
|
||||||
# either give multiple identifiers separated by comma (,) or put this option
|
|
||||||
# multiple times (only on the command line, not in the configuration file where
|
|
||||||
# it should appear only once). See also the "--disable" option for examples.
|
|
||||||
enable=c-extension-no-member
|
|
||||||
|
|
||||||
|
|
||||||
[REPORTS]
|
|
||||||
|
|
||||||
# Python expression which should return a note less than 10 (10 is the highest
|
|
||||||
# note). You have access to the variables error, warning, statement, which
|
|
||||||
# respectively contain the number of errors / warnings messages and the total
|
|
||||||
# number of statements analyzed. This is used by the global evaluation report
|
|
||||||
# (RP0004).
|
|
||||||
evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)
|
|
||||||
|
|
||||||
# Template used to display messages. This is a python new-style format string
|
|
||||||
# used to format the message information. See doc for all details.
|
|
||||||
#msg-template=
|
|
||||||
|
|
||||||
# Set the output format. Available formats are text, parseable, colorized, json
|
|
||||||
# and msvs (visual studio). You can also give a reporter class, e.g.
|
|
||||||
# mypackage.mymodule.MyReporterClass.
|
|
||||||
output-format=text
|
|
||||||
|
|
||||||
# Tells whether to display a full report or only the messages.
|
|
||||||
reports=no
|
|
||||||
|
|
||||||
# Activate the evaluation score.
|
|
||||||
score=yes
|
|
||||||
|
|
||||||
|
|
||||||
[REFACTORING]
|
|
||||||
|
|
||||||
# Maximum number of nested blocks for function / method body
|
|
||||||
max-nested-blocks=5
|
|
||||||
|
|
||||||
# Complete names of functions that never return. When checking for
|
|
||||||
# inconsistent-return-statements if a never returning function is called then
|
|
||||||
# it will be considered as an explicit return statement and no message will be
|
|
||||||
# printed.
|
|
||||||
never-returning-functions=sys.exit
|
|
||||||
|
|
||||||
|
|
||||||
[LOGGING]
|
|
||||||
|
|
||||||
# Format style used to check logging format string. `old` means using %
|
|
||||||
# formatting, while `new` is for `{}` formatting.
|
|
||||||
logging-format-style=old
|
|
||||||
|
|
||||||
# Logging modules to check that the string format arguments are in logging
|
|
||||||
# function parameter format.
|
|
||||||
logging-modules=logging
|
|
||||||
|
|
||||||
|
|
||||||
[SPELLING]
|
|
||||||
|
|
||||||
# Limits count of emitted suggestions for spelling mistakes.
|
|
||||||
max-spelling-suggestions=4
|
|
||||||
|
|
||||||
# Spelling dictionary name. Available dictionaries: none. To make it work,
|
|
||||||
# install the python-enchant package.
|
|
||||||
spelling-dict=
|
|
||||||
|
|
||||||
# List of comma separated words that should not be checked.
|
|
||||||
spelling-ignore-words=
|
|
||||||
|
|
||||||
# A path to a file that contains private dictionary; one word per line.
|
|
||||||
spelling-private-dict-file=
|
|
||||||
|
|
||||||
# Tells whether to store unknown words to indicated private dictionary in
|
|
||||||
# --spelling-private-dict-file option instead of raising a message.
|
|
||||||
spelling-store-unknown-words=no
|
|
||||||
|
|
||||||
|
|
||||||
[MISCELLANEOUS]
|
|
||||||
|
|
||||||
# List of note tags to take in consideration, separated by a comma.
|
|
||||||
notes=FIXME,
|
|
||||||
XXX,
|
|
||||||
TODO
|
|
||||||
|
|
||||||
|
|
||||||
[TYPECHECK]
|
|
||||||
|
|
||||||
# List of decorators that produce context managers, such as
|
|
||||||
# contextlib.contextmanager. Add to this list to register other decorators that
|
|
||||||
# produce valid context managers.
|
|
||||||
contextmanager-decorators=contextlib.contextmanager
|
|
||||||
|
|
||||||
# List of members which are set dynamically and missed by pylint inference
|
|
||||||
# system, and so shouldn't trigger E1101 when accessed. Python regular
|
|
||||||
# expressions are accepted.
|
|
||||||
generated-members=numpy.*,torch.*
|
|
||||||
|
|
||||||
# Tells whether missing members accessed in mixin class should be ignored. A
|
|
||||||
# mixin class is detected if its name ends with "mixin" (case insensitive).
|
|
||||||
ignore-mixin-members=yes
|
|
||||||
|
|
||||||
# Tells whether to warn about missing members when the owner of the attribute
|
|
||||||
# is inferred to be None.
|
|
||||||
ignore-none=yes
|
|
||||||
|
|
||||||
# This flag controls whether pylint should warn about no-member and similar
|
|
||||||
# checks whenever an opaque object is returned when inferring. The inference
|
|
||||||
# can return multiple potential results while evaluating a Python object, but
|
|
||||||
# some branches might not be evaluated, which results in partial inference. In
|
|
||||||
# that case, it might be useful to still emit no-member and other checks for
|
|
||||||
# the rest of the inferred objects.
|
|
||||||
ignore-on-opaque-inference=yes
|
|
||||||
|
|
||||||
# List of class names for which member attributes should not be checked (useful
|
|
||||||
# for classes with dynamically set attributes). This supports the use of
|
|
||||||
# qualified names.
|
|
||||||
ignored-classes=optparse.Values,thread._local,_thread._local
|
|
||||||
|
|
||||||
# List of module names for which member attributes should not be checked
|
|
||||||
# (useful for modules/projects where namespaces are manipulated during runtime
|
|
||||||
# and thus existing member attributes cannot be deduced by static analysis. It
|
|
||||||
# supports qualified module names, as well as Unix pattern matching.
|
|
||||||
ignored-modules=
|
|
||||||
|
|
||||||
# Show a hint with possible names when a member name was not found. The aspect
|
|
||||||
# of finding the hint is based on edit distance.
|
|
||||||
missing-member-hint=yes
|
|
||||||
|
|
||||||
# The minimum edit distance a name should have in order to be considered a
|
|
||||||
# similar match for a missing member name.
|
|
||||||
missing-member-hint-distance=1
|
|
||||||
|
|
||||||
# The total number of similar names that should be taken in consideration when
|
|
||||||
# showing a hint for a missing member.
|
|
||||||
missing-member-max-choices=1
|
|
||||||
|
|
||||||
|
|
||||||
[VARIABLES]
|
|
||||||
|
|
||||||
# List of additional names supposed to be defined in builtins. Remember that
|
|
||||||
# you should avoid defining new builtins when possible.
|
|
||||||
additional-builtins=
|
|
||||||
|
|
||||||
# Tells whether unused global variables should be treated as a violation.
|
|
||||||
allow-global-unused-variables=yes
|
|
||||||
|
|
||||||
# List of strings which can identify a callback function by name. A callback
|
|
||||||
# name must start or end with one of those strings.
|
|
||||||
callbacks=cb_,
|
|
||||||
_cb
|
|
||||||
|
|
||||||
# A regular expression matching the name of dummy variables (i.e. expected to
|
|
||||||
# not be used).
|
|
||||||
dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_
|
|
||||||
|
|
||||||
# Argument names that match this expression will be ignored. Default to name
|
|
||||||
# with leading underscore.
|
|
||||||
ignored-argument-names=_.*|^ignored_|^unused_
|
|
||||||
|
|
||||||
# Tells whether we should check for unused import in __init__ files.
|
|
||||||
init-import=no
|
|
||||||
|
|
||||||
# List of qualified module names which can have objects that can redefine
|
|
||||||
# builtins.
|
|
||||||
redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io
|
|
||||||
|
|
||||||
|
|
||||||
[FORMAT]
|
|
||||||
|
|
||||||
# Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
|
|
||||||
expected-line-ending-format=
|
|
||||||
|
|
||||||
# Regexp for a line that is allowed to be longer than the limit.
|
|
||||||
ignore-long-lines=^\s*(# )?<?https?://\S+>?$
|
|
||||||
|
|
||||||
# Number of spaces of indent required inside a hanging or continued line.
|
|
||||||
indent-after-paren=4
|
|
||||||
|
|
||||||
# String used as indentation unit. This is usually " " (4 spaces) or "\t" (1
|
|
||||||
# tab).
|
|
||||||
indent-string=' '
|
|
||||||
|
|
||||||
# Maximum number of characters on a single line.
|
|
||||||
max-line-length=120
|
|
||||||
|
|
||||||
# Maximum number of lines in a module.
|
|
||||||
max-module-lines=1000
|
|
||||||
|
|
||||||
# List of optional constructs for which whitespace checking is disabled. `dict-
|
|
||||||
# separator` is used to allow tabulation in dicts, etc.: {1 : 1,\n222: 2}.
|
|
||||||
# `trailing-comma` allows a space between comma and closing bracket: (a, ).
|
|
||||||
# `empty-line` allows space-only lines.
|
|
||||||
no-space-check=trailing-comma,
|
|
||||||
dict-separator
|
|
||||||
|
|
||||||
# Allow the body of a class to be on the same line as the declaration if body
|
|
||||||
# contains single statement.
|
|
||||||
single-line-class-stmt=no
|
|
||||||
|
|
||||||
# Allow the body of an if to be on the same line as the test if there is no
|
|
||||||
# else.
|
|
||||||
single-line-if-stmt=no
|
|
||||||
|
|
||||||
|
|
||||||
[SIMILARITIES]
|
|
||||||
|
|
||||||
# Ignore comments when computing similarities.
|
|
||||||
ignore-comments=yes
|
|
||||||
|
|
||||||
# Ignore docstrings when computing similarities.
|
|
||||||
ignore-docstrings=yes
|
|
||||||
|
|
||||||
# Ignore imports when computing similarities.
|
|
||||||
ignore-imports=no
|
|
||||||
|
|
||||||
# Minimum lines number of a similarity.
|
|
||||||
min-similarity-lines=4
|
|
||||||
|
|
||||||
|
|
||||||
[BASIC]
|
|
||||||
|
|
||||||
# Naming style matching correct argument names.
|
|
||||||
argument-naming-style=snake_case
|
|
||||||
|
|
||||||
# Regular expression matching correct argument names. Overrides argument-
|
|
||||||
# naming-style.
|
|
||||||
argument-rgx=[a-z_][a-z0-9_]{0,30}$
|
|
||||||
|
|
||||||
# Naming style matching correct attribute names.
|
|
||||||
attr-naming-style=snake_case
|
|
||||||
|
|
||||||
# Regular expression matching correct attribute names. Overrides attr-naming-
|
|
||||||
# style.
|
|
||||||
#attr-rgx=
|
|
||||||
|
|
||||||
# Bad variable names which should always be refused, separated by a comma.
|
|
||||||
bad-names=
|
|
||||||
|
|
||||||
# Naming style matching correct class attribute names.
|
|
||||||
class-attribute-naming-style=any
|
|
||||||
|
|
||||||
# Regular expression matching correct class attribute names. Overrides class-
|
|
||||||
# attribute-naming-style.
|
|
||||||
#class-attribute-rgx=
|
|
||||||
|
|
||||||
# Naming style matching correct class names.
|
|
||||||
class-naming-style=PascalCase
|
|
||||||
|
|
||||||
# Regular expression matching correct class names. Overrides class-naming-
|
|
||||||
# style.
|
|
||||||
#class-rgx=
|
|
||||||
|
|
||||||
# Naming style matching correct constant names.
|
|
||||||
const-naming-style=UPPER_CASE
|
|
||||||
|
|
||||||
# Regular expression matching correct constant names. Overrides const-naming-
|
|
||||||
# style.
|
|
||||||
#const-rgx=
|
|
||||||
|
|
||||||
# Minimum line length for functions/classes that require docstrings, shorter
|
|
||||||
# ones are exempt.
|
|
||||||
docstring-min-length=-1
|
|
||||||
|
|
||||||
# Naming style matching correct function names.
|
|
||||||
function-naming-style=snake_case
|
|
||||||
|
|
||||||
# Regular expression matching correct function names. Overrides function-
|
|
||||||
# naming-style.
|
|
||||||
#function-rgx=
|
|
||||||
|
|
||||||
# Good variable names which should always be accepted, separated by a comma.
|
|
||||||
good-names=i,
|
|
||||||
j,
|
|
||||||
k,
|
|
||||||
x,
|
|
||||||
ex,
|
|
||||||
Run,
|
|
||||||
_
|
|
||||||
|
|
||||||
# Include a hint for the correct naming format with invalid-name.
|
|
||||||
include-naming-hint=no
|
|
||||||
|
|
||||||
# Naming style matching correct inline iteration names.
|
|
||||||
inlinevar-naming-style=any
|
|
||||||
|
|
||||||
# Regular expression matching correct inline iteration names. Overrides
|
|
||||||
# inlinevar-naming-style.
|
|
||||||
#inlinevar-rgx=
|
|
||||||
|
|
||||||
# Naming style matching correct method names.
|
|
||||||
method-naming-style=snake_case
|
|
||||||
|
|
||||||
# Regular expression matching correct method names. Overrides method-naming-
|
|
||||||
# style.
|
|
||||||
#method-rgx=
|
|
||||||
|
|
||||||
# Naming style matching correct module names.
|
|
||||||
module-naming-style=snake_case
|
|
||||||
|
|
||||||
# Regular expression matching correct module names. Overrides module-naming-
|
|
||||||
# style.
|
|
||||||
#module-rgx=
|
|
||||||
|
|
||||||
# Colon-delimited sets of names that determine each other's naming style when
|
|
||||||
# the name regexes allow several styles.
|
|
||||||
name-group=
|
|
||||||
|
|
||||||
# Regular expression which should only match function or class names that do
|
|
||||||
# not require a docstring.
|
|
||||||
no-docstring-rgx=^_
|
|
||||||
|
|
||||||
# List of decorators that produce properties, such as abc.abstractproperty. Add
|
|
||||||
# to this list to register other decorators that produce valid properties.
|
|
||||||
# These decorators are taken in consideration only for invalid-name.
|
|
||||||
property-classes=abc.abstractproperty
|
|
||||||
|
|
||||||
# Naming style matching correct variable names.
|
|
||||||
variable-naming-style=snake_case
|
|
||||||
|
|
||||||
# Regular expression matching correct variable names. Overrides variable-
|
|
||||||
# naming-style.
|
|
||||||
variable-rgx=[a-z_][a-z0-9_]{0,30}$
|
|
||||||
|
|
||||||
|
|
||||||
[STRING]
|
|
||||||
|
|
||||||
# This flag controls whether the implicit-str-concat-in-sequence should
|
|
||||||
# generate a warning on implicit string concatenation in sequences defined over
|
|
||||||
# several lines.
|
|
||||||
check-str-concat-over-line-jumps=no
|
|
||||||
|
|
||||||
|
|
||||||
[IMPORTS]
|
|
||||||
|
|
||||||
# Allow wildcard imports from modules that define __all__.
|
|
||||||
allow-wildcard-with-all=no
|
|
||||||
|
|
||||||
# Analyse import fallback blocks. This can be used to support both Python 2 and
|
|
||||||
# 3 compatible code, which means that the block might have code that exists
|
|
||||||
# only in one or another interpreter, leading to false positives when analysed.
|
|
||||||
analyse-fallback-blocks=no
|
|
||||||
|
|
||||||
# Deprecated modules which should not be used, separated by a comma.
|
|
||||||
deprecated-modules=optparse,tkinter.tix
|
|
||||||
|
|
||||||
# Create a graph of external dependencies in the given file (report RP0402 must
|
|
||||||
# not be disabled).
|
|
||||||
ext-import-graph=
|
|
||||||
|
|
||||||
# Create a graph of every (i.e. internal and external) dependencies in the
|
|
||||||
# given file (report RP0402 must not be disabled).
|
|
||||||
import-graph=
|
|
||||||
|
|
||||||
# Create a graph of internal dependencies in the given file (report RP0402 must
|
|
||||||
# not be disabled).
|
|
||||||
int-import-graph=
|
|
||||||
|
|
||||||
# Force import order to recognize a module as part of the standard
|
|
||||||
# compatibility libraries.
|
|
||||||
known-standard-library=
|
|
||||||
|
|
||||||
# Force import order to recognize a module as part of a third party library.
|
|
||||||
known-third-party=enchant
|
|
||||||
|
|
||||||
|
|
||||||
[CLASSES]
|
|
||||||
|
|
||||||
# List of method names used to declare (i.e. assign) instance attributes.
|
|
||||||
defining-attr-methods=__init__,
|
|
||||||
__new__,
|
|
||||||
setUp
|
|
||||||
|
|
||||||
# List of member names, which should be excluded from the protected access
|
|
||||||
# warning.
|
|
||||||
exclude-protected=_asdict,
|
|
||||||
_fields,
|
|
||||||
_replace,
|
|
||||||
_source,
|
|
||||||
_make
|
|
||||||
|
|
||||||
# List of valid names for the first argument in a class method.
|
|
||||||
valid-classmethod-first-arg=cls
|
|
||||||
|
|
||||||
# List of valid names for the first argument in a metaclass class method.
|
|
||||||
valid-metaclass-classmethod-first-arg=cls
|
|
||||||
|
|
||||||
|
|
||||||
[DESIGN]
|
|
||||||
|
|
||||||
# Maximum number of arguments for function / method.
|
|
||||||
max-args=5
|
|
||||||
|
|
||||||
# Maximum number of attributes for a class (see R0902).
|
|
||||||
max-attributes=7
|
|
||||||
|
|
||||||
# Maximum number of boolean expressions in an if statement.
|
|
||||||
max-bool-expr=5
|
|
||||||
|
|
||||||
# Maximum number of branch for function / method body.
|
|
||||||
max-branches=12
|
|
||||||
|
|
||||||
# Maximum number of locals for function / method body.
|
|
||||||
max-locals=15
|
|
||||||
|
|
||||||
# Maximum number of parents for a class (see R0901).
|
|
||||||
max-parents=15
|
|
||||||
|
|
||||||
# Maximum number of public methods for a class (see R0904).
|
|
||||||
max-public-methods=20
|
|
||||||
|
|
||||||
# Maximum number of return / yield for function / method body.
|
|
||||||
max-returns=6
|
|
||||||
|
|
||||||
# Maximum number of statements in function / method body.
|
|
||||||
max-statements=50
|
|
||||||
|
|
||||||
# Minimum number of public methods for a class (see R0903).
|
|
||||||
min-public-methods=2
|
|
||||||
|
|
||||||
|
|
||||||
[EXCEPTIONS]
|
|
||||||
|
|
||||||
# Exceptions that will emit a warning when being caught. Defaults to
|
|
||||||
# "BaseException, Exception".
|
|
||||||
overgeneral-exceptions=BaseException,
|
|
||||||
Exception
|
|
|
@ -14,8 +14,9 @@ build:
|
||||||
# Optionally set the version of Python and requirements required to build your docs
|
# Optionally set the version of Python and requirements required to build your docs
|
||||||
python:
|
python:
|
||||||
install:
|
install:
|
||||||
- requirements: docs/requirements.txt
|
- path: .
|
||||||
- requirements: requirements.txt
|
extra_requirements:
|
||||||
|
- docs
|
||||||
|
|
||||||
# Build documentation in the docs/ directory with Sphinx
|
# Build documentation in the docs/ directory with Sphinx
|
||||||
sphinx:
|
sphinx:
|
||||||
|
|
|
@ -10,11 +10,11 @@ authors:
|
||||||
version: 1.4
|
version: 1.4
|
||||||
doi: 10.5281/zenodo.6334862
|
doi: 10.5281/zenodo.6334862
|
||||||
license: "MPL-2.0"
|
license: "MPL-2.0"
|
||||||
url: "https://www.coqui.ai"
|
url: "https://github.com/idiap/coqui-ai-TTS"
|
||||||
repository-code: "https://github.com/coqui-ai/TTS"
|
repository-code: "https://github.com/idiap/coqui-ai-TTS"
|
||||||
keywords:
|
keywords:
|
||||||
- machine learning
|
- machine learning
|
||||||
- deep learning
|
- deep learning
|
||||||
- artificial intelligence
|
- artificial intelligence
|
||||||
- text to speech
|
- text to speech
|
||||||
- TTS
|
- TTS
|
||||||
|
|
|
@ -119,11 +119,11 @@ This Code of Conduct is adapted from the [Contributor Covenant][homepage],
|
||||||
version 2.0, available at
|
version 2.0, available at
|
||||||
[https://www.contributor-covenant.org/version/2/0/code_of_conduct.html][v2.0].
|
[https://www.contributor-covenant.org/version/2/0/code_of_conduct.html][v2.0].
|
||||||
|
|
||||||
Community Impact Guidelines were inspired by
|
Community Impact Guidelines were inspired by
|
||||||
[Mozilla's code of conduct enforcement ladder][Mozilla CoC].
|
[Mozilla's code of conduct enforcement ladder][Mozilla CoC].
|
||||||
|
|
||||||
For answers to common questions about this code of conduct, see the FAQ at
|
For answers to common questions about this code of conduct, see the FAQ at
|
||||||
[https://www.contributor-covenant.org/faq][FAQ]. Translations are available
|
[https://www.contributor-covenant.org/faq][FAQ]. Translations are available
|
||||||
at [https://www.contributor-covenant.org/translations][translations].
|
at [https://www.contributor-covenant.org/translations][translations].
|
||||||
|
|
||||||
[homepage]: https://www.contributor-covenant.org
|
[homepage]: https://www.contributor-covenant.org
|
||||||
|
|
|
@ -2,7 +2,7 @@
|
||||||
|
|
||||||
Welcome to the 🐸TTS!
|
Welcome to the 🐸TTS!
|
||||||
|
|
||||||
This repository is governed by [the Contributor Covenant Code of Conduct](https://github.com/coqui-ai/TTS/blob/main/CODE_OF_CONDUCT.md).
|
This repository is governed by [the Contributor Covenant Code of Conduct](https://github.com/idiap/coqui-ai-TTS/blob/main/CODE_OF_CONDUCT.md).
|
||||||
|
|
||||||
## Where to start.
|
## Where to start.
|
||||||
We welcome everyone who likes to contribute to 🐸TTS.
|
We welcome everyone who likes to contribute to 🐸TTS.
|
||||||
|
@ -15,13 +15,13 @@ If you like to contribute code, squash a bug but if you don't know where to star
|
||||||
|
|
||||||
You can pick something out of our road map. We keep the progress of the project in this simple issue thread. It has new model proposals or developmental updates etc.
|
You can pick something out of our road map. We keep the progress of the project in this simple issue thread. It has new model proposals or developmental updates etc.
|
||||||
|
|
||||||
- [Github Issues Tracker](https://github.com/coqui-ai/TTS/issues)
|
- [Github Issues Tracker](https://github.com/idiap/coqui-ai-TTS/issues)
|
||||||
|
|
||||||
This is a place to find feature requests, bugs.
|
This is a place to find feature requests, bugs.
|
||||||
|
|
||||||
Issues with the ```good first issue``` tag are a good place for beginners to take on.
|
Issues with the ```good first issue``` tag are a good place for beginners to take on.
|
||||||
|
|
||||||
- ✨**PR**✨ [pages](https://github.com/coqui-ai/TTS/pulls) with the ```🚀new version``` tag.
|
- ✨**PR**✨ [pages](https://github.com/idiap/coqui-ai-TTS/pulls) with the ```🚀new version``` tag.
|
||||||
|
|
||||||
We list all the target improvements for the next version. You can pick one of them and start contributing.
|
We list all the target improvements for the next version. You can pick one of them and start contributing.
|
||||||
|
|
||||||
|
@ -46,21 +46,21 @@ Let us know if you encounter a problem along the way.
|
||||||
|
|
||||||
The following steps are tested on an Ubuntu system.
|
The following steps are tested on an Ubuntu system.
|
||||||
|
|
||||||
1. Fork [🐸TTS](https://github.com/coqui-ai/TTS) by clicking the fork button at the top right corner of the project page.
|
1. Fork [🐸TTS](https://github.com/idiap/coqui-ai-TTS) by clicking the fork button at the top right corner of the project page.
|
||||||
|
|
||||||
2. Clone 🐸TTS and add the main repo as a new remote named ```upstream```.
|
2. Clone 🐸TTS and add the main repo as a new remote named ```upstream```.
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
$ git clone git@github.com:<your Github name>/TTS.git
|
$ git clone git@github.com:<your Github name>/coqui-ai-TTS.git
|
||||||
$ cd TTS
|
$ cd coqui-ai-TTS
|
||||||
$ git remote add upstream https://github.com/coqui-ai/TTS.git
|
$ git remote add upstream https://github.com/idiap/coqui-ai-TTS.git
|
||||||
```
|
```
|
||||||
|
|
||||||
3. Install 🐸TTS for development.
|
3. Install 🐸TTS for development.
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
$ make system-deps # intended to be used on Ubuntu (Debian). Let us know if you have a different OS.
|
$ make system-deps # intended to be used on Ubuntu (Debian). Let us know if you have a different OS.
|
||||||
$ make install
|
$ make install_dev
|
||||||
```
|
```
|
||||||
|
|
||||||
4. Create a new branch with an informative name for your goal.
|
4. Create a new branch with an informative name for your goal.
|
||||||
|
@ -82,13 +82,13 @@ The following steps are tested on an Ubuntu system.
|
||||||
$ make test_all # run all the tests, report all the errors
|
$ make test_all # run all the tests, report all the errors
|
||||||
```
|
```
|
||||||
|
|
||||||
9. Format your code. We use ```black``` for code and ```isort``` for ```import``` formatting.
|
9. Format your code. We use ```black``` for code formatting.
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
$ make style
|
$ make style
|
||||||
```
|
```
|
||||||
|
|
||||||
10. Run the linter and correct the issues raised. We use ```pylint``` for linting. It helps to enforce a coding standard, offers simple refactoring suggestions.
|
10. Run the linter and correct the issues raised. We use ```ruff``` for linting. It helps to enforce a coding standard, offers simple refactoring suggestions.
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
$ make lint
|
$ make lint
|
||||||
|
@ -105,7 +105,7 @@ The following steps are tested on an Ubuntu system.
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
$ git fetch upstream
|
$ git fetch upstream
|
||||||
$ git rebase upstream/master
|
$ git rebase upstream/main
|
||||||
# or for the development version
|
# or for the development version
|
||||||
$ git rebase upstream/dev
|
$ git rebase upstream/dev
|
||||||
```
|
```
|
||||||
|
@ -124,7 +124,7 @@ The following steps are tested on an Ubuntu system.
|
||||||
|
|
||||||
13. Let's discuss until it is perfect. 💪
|
13. Let's discuss until it is perfect. 💪
|
||||||
|
|
||||||
We might ask you for certain changes that would appear in the ✨**PR**✨'s page under [🐸TTS](https://github.com/coqui-ai/TTS/pulls).
|
We might ask you for certain changes that would appear in the ✨**PR**✨'s page under [🐸TTS](https://github.com/idiap/coqui-ai-TTS/pulls).
|
||||||
|
|
||||||
14. Once things look perfect, We merge it to the ```dev``` branch and make it ready for the next version.
|
14. Once things look perfect, We merge it to the ```dev``` branch and make it ready for the next version.
|
||||||
|
|
||||||
|
@ -132,14 +132,14 @@ The following steps are tested on an Ubuntu system.
|
||||||
|
|
||||||
If you prefer working within a Docker container as your development environment, you can do the following:
|
If you prefer working within a Docker container as your development environment, you can do the following:
|
||||||
|
|
||||||
1. Fork [🐸TTS](https://github.com/coqui-ai/TTS) by clicking the fork button at the top right corner of the project page.
|
1. Fork [🐸TTS](https://github.com/idiap/coqui-ai-TTS) by clicking the fork button at the top right corner of the project page.
|
||||||
|
|
||||||
2. Clone 🐸TTS and add the main repo as a new remote named ```upstream```.
|
2. Clone 🐸TTS and add the main repo as a new remote named ```upstream```.
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
$ git clone git@github.com:<your Github name>/TTS.git
|
$ git clone git@github.com:<your Github name>/coqui-ai-TTS.git
|
||||||
$ cd TTS
|
$ cd coqui-ai-TTS
|
||||||
$ git remote add upstream https://github.com/coqui-ai/TTS.git
|
$ git remote add upstream https://github.com/idiap/coqui-ai-TTS.git
|
||||||
```
|
```
|
||||||
|
|
||||||
3. Build the Docker Image as your development environment (it installs all of the dependencies for you):
|
3. Build the Docker Image as your development environment (it installs all of the dependencies for you):
|
||||||
|
|
|
@ -3,6 +3,7 @@ FROM ${BASE}
|
||||||
|
|
||||||
RUN apt-get update && apt-get upgrade -y
|
RUN apt-get update && apt-get upgrade -y
|
||||||
RUN apt-get install -y --no-install-recommends gcc g++ make python3 python3-dev python3-pip python3-venv python3-wheel espeak-ng libsndfile1-dev && rm -rf /var/lib/apt/lists/*
|
RUN apt-get install -y --no-install-recommends gcc g++ make python3 python3-dev python3-pip python3-venv python3-wheel espeak-ng libsndfile1-dev && rm -rf /var/lib/apt/lists/*
|
||||||
|
RUN pip3 install -U pip setuptools
|
||||||
RUN pip3 install llvmlite --ignore-installed
|
RUN pip3 install llvmlite --ignore-installed
|
||||||
|
|
||||||
# Install Dependencies:
|
# Install Dependencies:
|
||||||
|
|
|
@ -35,7 +35,7 @@ Mozilla Public License Version 2.0
|
||||||
means any form of the work other than Source Code Form.
|
means any form of the work other than Source Code Form.
|
||||||
|
|
||||||
1.7. "Larger Work"
|
1.7. "Larger Work"
|
||||||
means a work that combines Covered Software with other material, in
|
means a work that combines Covered Software with other material, in
|
||||||
a separate file or files, that is not Covered Software.
|
a separate file or files, that is not Covered Software.
|
||||||
|
|
||||||
1.8. "License"
|
1.8. "License"
|
||||||
|
|
|
@ -1,9 +1,6 @@
|
||||||
include README.md
|
include README.md
|
||||||
include LICENSE.txt
|
include LICENSE.txt
|
||||||
include requirements.*.txt
|
|
||||||
include *.cff
|
include *.cff
|
||||||
include requirements.txt
|
|
||||||
include TTS/VERSION
|
|
||||||
recursive-include TTS *.json
|
recursive-include TTS *.json
|
||||||
recursive-include TTS *.html
|
recursive-include TTS *.html
|
||||||
recursive-include TTS *.png
|
recursive-include TTS *.png
|
||||||
|
@ -11,5 +8,3 @@ recursive-include TTS *.md
|
||||||
recursive-include TTS *.py
|
recursive-include TTS *.py
|
||||||
recursive-include TTS *.pyx
|
recursive-include TTS *.pyx
|
||||||
recursive-include images *.png
|
recursive-include images *.png
|
||||||
recursive-exclude tests *
|
|
||||||
prune tests*
|
|
||||||
|
|
52
Makefile
52
Makefile
|
@ -1,5 +1,5 @@
|
||||||
.DEFAULT_GOAL := help
|
.DEFAULT_GOAL := help
|
||||||
.PHONY: test system-deps dev-deps deps style lint install help docs
|
.PHONY: test system-deps dev-deps style lint install install_dev help docs
|
||||||
|
|
||||||
help:
|
help:
|
||||||
@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}'
|
@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}'
|
||||||
|
@ -11,47 +11,50 @@ test_all: ## run tests and don't stop on an error.
|
||||||
./run_bash_tests.sh
|
./run_bash_tests.sh
|
||||||
|
|
||||||
test: ## run tests.
|
test: ## run tests.
|
||||||
nose2 -F -v -B --with-coverage --coverage TTS tests
|
coverage run -m nose2 -F -v -B tests
|
||||||
|
|
||||||
test_vocoder: ## run vocoder tests.
|
test_vocoder: ## run vocoder tests.
|
||||||
nose2 -F -v -B --with-coverage --coverage TTS tests.vocoder_tests
|
coverage run -m nose2 -F -v -B tests.vocoder_tests
|
||||||
|
|
||||||
test_tts: ## run tts tests.
|
test_tts: ## run tts tests.
|
||||||
nose2 -F -v -B --with-coverage --coverage TTS tests.tts_tests
|
coverage run -m nose2 -F -v -B tests.tts_tests
|
||||||
|
|
||||||
test_tts2: ## run tts tests.
|
test_tts2: ## run tts tests.
|
||||||
nose2 -F -v -B --with-coverage --coverage TTS tests.tts_tests2
|
coverage run -m nose2 -F -v -B tests.tts_tests2
|
||||||
|
|
||||||
test_xtts:
|
test_xtts:
|
||||||
nose2 -F -v -B --with-coverage --coverage TTS tests.xtts_tests
|
coverage run -m nose2 -F -v -B tests.xtts_tests
|
||||||
|
|
||||||
test_aux: ## run aux tests.
|
test_aux: ## run aux tests.
|
||||||
nose2 -F -v -B --with-coverage --coverage TTS tests.aux_tests
|
coverage run -m nose2 -F -v -B tests.aux_tests
|
||||||
./run_bash_tests.sh
|
./run_bash_tests.sh
|
||||||
|
|
||||||
test_zoo: ## run zoo tests.
|
test_zoo0: ## run zoo tests.
|
||||||
nose2 -F -v -B --with-coverage --coverage TTS tests.zoo_tests
|
coverage run -m nose2 -F -v -B tests.zoo_tests.test_models.test_models_offset_0_step_3 \
|
||||||
|
tests.zoo_tests.test_models.test_voice_conversion
|
||||||
|
test_zoo1: ## run zoo tests.
|
||||||
|
coverage run -m nose2 -F -v -B tests.zoo_tests.test_models.test_models_offset_1_step_3
|
||||||
|
test_zoo2: ## run zoo tests.
|
||||||
|
coverage run -m nose2 -F -v -B tests.zoo_tests.test_models.test_models_offset_2_step_3
|
||||||
|
|
||||||
inference_tests: ## run inference tests.
|
inference_tests: ## run inference tests.
|
||||||
nose2 -F -v -B --with-coverage --coverage TTS tests.inference_tests
|
coverage run -m nose2 -F -v -B tests.inference_tests
|
||||||
|
|
||||||
data_tests: ## run data tests.
|
data_tests: ## run data tests.
|
||||||
nose2 -F -v -B --with-coverage --coverage TTS tests.data_tests
|
coverage run -m nose2 -F -v -B tests.data_tests
|
||||||
|
|
||||||
test_text: ## run text tests.
|
test_text: ## run text tests.
|
||||||
nose2 -F -v -B --with-coverage --coverage TTS tests.text_tests
|
coverage run -m nose2 -F -v -B tests.text_tests
|
||||||
|
|
||||||
test_failed: ## only run tests failed the last time.
|
test_failed: ## only run tests failed the last time.
|
||||||
nose2 -F -v -B --with-coverage --coverage TTS tests
|
coverage run -m nose2 -F -v -B tests
|
||||||
|
|
||||||
style: ## update code style.
|
style: ## update code style.
|
||||||
black ${target_dirs}
|
black ${target_dirs}
|
||||||
isort ${target_dirs}
|
|
||||||
|
|
||||||
lint: ## run pylint linter.
|
lint: ## run linters.
|
||||||
pylint ${target_dirs}
|
ruff check ${target_dirs}
|
||||||
black ${target_dirs} --check
|
black ${target_dirs} --check
|
||||||
isort ${target_dirs} --check-only
|
|
||||||
|
|
||||||
system-deps: ## install linux system deps
|
system-deps: ## install linux system deps
|
||||||
sudo apt-get install -y libsndfile1-dev
|
sudo apt-get install -y libsndfile1-dev
|
||||||
|
@ -59,20 +62,15 @@ system-deps: ## install linux system deps
|
||||||
dev-deps: ## install development deps
|
dev-deps: ## install development deps
|
||||||
pip install -r requirements.dev.txt
|
pip install -r requirements.dev.txt
|
||||||
|
|
||||||
doc-deps: ## install docs dependencies
|
|
||||||
pip install -r docs/requirements.txt
|
|
||||||
|
|
||||||
build-docs: ## build the docs
|
build-docs: ## build the docs
|
||||||
cd docs && make clean && make build
|
cd docs && make clean && make build
|
||||||
|
|
||||||
hub-deps: ## install deps for torch hub use
|
install: ## install 🐸 TTS
|
||||||
pip install -r requirements.hub.txt
|
|
||||||
|
|
||||||
deps: ## install 🐸 requirements.
|
|
||||||
pip install -r requirements.txt
|
|
||||||
|
|
||||||
install: ## install 🐸 TTS for development.
|
|
||||||
pip install -e .[all]
|
pip install -e .[all]
|
||||||
|
|
||||||
|
install_dev: ## install 🐸 TTS for development.
|
||||||
|
pip install -e .[all,dev]
|
||||||
|
pre-commit install
|
||||||
|
|
||||||
docs: ## build the docs
|
docs: ## build the docs
|
||||||
$(MAKE) -C docs clean && $(MAKE) -C docs html
|
$(MAKE) -C docs clean && $(MAKE) -C docs html
|
||||||
|
|
106
README.md
106
README.md
|
@ -1,17 +1,18 @@
|
||||||
|
|
||||||
## 🐸Coqui.ai News
|
## 🐸Coqui TTS News
|
||||||
|
- 📣 Fork of the [original, unmaintained repository](https://github.com/coqui-ai/TTS). New PyPI package: [coqui-tts](https://pypi.org/project/coqui-tts)
|
||||||
- 📣 ⓍTTSv2 is here with 16 languages and better performance across the board.
|
- 📣 ⓍTTSv2 is here with 16 languages and better performance across the board.
|
||||||
- 📣 ⓍTTS fine-tuning code is out. Check the [example recipes](https://github.com/coqui-ai/TTS/tree/dev/recipes/ljspeech).
|
- 📣 ⓍTTS fine-tuning code is out. Check the [example recipes](https://github.com/idiap/coqui-ai-TTS/tree/dev/recipes/ljspeech).
|
||||||
- 📣 ⓍTTS can now stream with <200ms latency.
|
- 📣 ⓍTTS can now stream with <200ms latency.
|
||||||
- 📣 ⓍTTS, our production TTS model that can speak 13 languages, is released [Blog Post](https://coqui.ai/blog/tts/open_xtts), [Demo](https://huggingface.co/spaces/coqui/xtts), [Docs](https://tts.readthedocs.io/en/dev/models/xtts.html)
|
- 📣 ⓍTTS, our production TTS model that can speak 13 languages, is released [Blog Post](https://coqui.ai/blog/tts/open_xtts), [Demo](https://huggingface.co/spaces/coqui/xtts), [Docs](https://coqui-tts.readthedocs.io/en/latest/models/xtts.html)
|
||||||
- 📣 [🐶Bark](https://github.com/suno-ai/bark) is now available for inference with unconstrained voice cloning. [Docs](https://tts.readthedocs.io/en/dev/models/bark.html)
|
- 📣 [🐶Bark](https://github.com/suno-ai/bark) is now available for inference with unconstrained voice cloning. [Docs](https://coqui-tts.readthedocs.io/en/latest/models/bark.html)
|
||||||
- 📣 You can use [~1100 Fairseq models](https://github.com/facebookresearch/fairseq/tree/main/examples/mms) with 🐸TTS.
|
- 📣 You can use [~1100 Fairseq models](https://github.com/facebookresearch/fairseq/tree/main/examples/mms) with 🐸TTS.
|
||||||
- 📣 🐸TTS now supports 🐢Tortoise with faster inference. [Docs](https://tts.readthedocs.io/en/dev/models/tortoise.html)
|
- 📣 🐸TTS now supports 🐢Tortoise with faster inference. [Docs](https://coqui-tts.readthedocs.io/en/latest/models/tortoise.html)
|
||||||
|
|
||||||
<div align="center">
|
<div align="center">
|
||||||
<img src="https://static.scarf.sh/a.png?x-pxid=cf317fe7-2188-4721-bc01-124bb5d5dbb2" />
|
<img src="https://static.scarf.sh/a.png?x-pxid=cf317fe7-2188-4721-bc01-124bb5d5dbb2" />
|
||||||
|
|
||||||
## <img src="https://raw.githubusercontent.com/coqui-ai/TTS/main/images/coqui-log-green-TTS.png" height="56"/>
|
## <img src="https://raw.githubusercontent.com/idiap/coqui-ai-TTS/main/images/coqui-log-green-TTS.png" height="56"/>
|
||||||
|
|
||||||
|
|
||||||
**🐸TTS is a library for advanced Text-to-Speech generation.**
|
**🐸TTS is a library for advanced Text-to-Speech generation.**
|
||||||
|
@ -25,23 +26,15 @@ ______________________________________________________________________
|
||||||
|
|
||||||
[](https://discord.gg/5eXr5seRrv)
|
[](https://discord.gg/5eXr5seRrv)
|
||||||
[](https://opensource.org/licenses/MPL-2.0)
|
[](https://opensource.org/licenses/MPL-2.0)
|
||||||
[](https://badge.fury.io/py/TTS)
|
[](https://badge.fury.io/py/coqui-tts)
|
||||||
[](https://github.com/coqui-ai/TTS/blob/master/CODE_OF_CONDUCT.md)
|
[](https://github.com/idiap/coqui-ai-TTS/blob/main/CODE_OF_CONDUCT.md)
|
||||||
[](https://pepy.tech/project/tts)
|
[](https://pepy.tech/project/coqui-tts)
|
||||||
[](https://zenodo.org/badge/latestdoi/265612440)
|
[](https://zenodo.org/badge/latestdoi/265612440)
|
||||||
|
|
||||||

|

|
||||||

|

|
||||||

|

|
||||||

|
[](https://coqui-tts.readthedocs.io/en/latest/)
|
||||||

|
|
||||||

|
|
||||||

|
|
||||||

|
|
||||||

|
|
||||||

|
|
||||||

|
|
||||||
[](https://tts.readthedocs.io/en/latest/)
|
|
||||||
|
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
@ -57,28 +50,26 @@ Please use our dedicated channels for questions and discussion. Help is much mor
|
||||||
| 👩💻 **Usage Questions** | [GitHub Discussions] |
|
| 👩💻 **Usage Questions** | [GitHub Discussions] |
|
||||||
| 🗯 **General Discussion** | [GitHub Discussions] or [Discord] |
|
| 🗯 **General Discussion** | [GitHub Discussions] or [Discord] |
|
||||||
|
|
||||||
[github issue tracker]: https://github.com/coqui-ai/tts/issues
|
[github issue tracker]: https://github.com/idiap/coqui-ai-TTS/issues
|
||||||
[github discussions]: https://github.com/coqui-ai/TTS/discussions
|
[github discussions]: https://github.com/idiap/coqui-ai-TTS/discussions
|
||||||
[discord]: https://discord.gg/5eXr5seRrv
|
[discord]: https://discord.gg/5eXr5seRrv
|
||||||
[Tutorials and Examples]: https://github.com/coqui-ai/TTS/wiki/TTS-Notebooks-and-Tutorials
|
[Tutorials and Examples]: https://github.com/coqui-ai/TTS/wiki/TTS-Notebooks-and-Tutorials
|
||||||
|
|
||||||
|
The [issues](https://github.com/coqui-ai/TTS/issues) and
|
||||||
|
[discussions](https://github.com/coqui-ai/TTS/discussions) in the original
|
||||||
|
repository are also still a useful source of information.
|
||||||
|
|
||||||
|
|
||||||
## 🔗 Links and Resources
|
## 🔗 Links and Resources
|
||||||
| Type | Links |
|
| Type | Links |
|
||||||
| ------------------------------- | --------------------------------------- |
|
| ------------------------------- | --------------------------------------- |
|
||||||
| 💼 **Documentation** | [ReadTheDocs](https://tts.readthedocs.io/en/latest/)
|
| 💼 **Documentation** | [ReadTheDocs](https://coqui-tts.readthedocs.io/en/latest/)
|
||||||
| 💾 **Installation** | [TTS/README.md](https://github.com/coqui-ai/TTS/tree/dev#installation)|
|
| 💾 **Installation** | [TTS/README.md](https://github.com/idiap/coqui-ai-TTS/tree/dev#installation)|
|
||||||
| 👩💻 **Contributing** | [CONTRIBUTING.md](https://github.com/coqui-ai/TTS/blob/main/CONTRIBUTING.md)|
|
| 👩💻 **Contributing** | [CONTRIBUTING.md](https://github.com/idiap/coqui-ai-TTS/blob/main/CONTRIBUTING.md)|
|
||||||
| 📌 **Road Map** | [Main Development Plans](https://github.com/coqui-ai/TTS/issues/378)
|
| 📌 **Road Map** | [Main Development Plans](https://github.com/coqui-ai/TTS/issues/378)
|
||||||
| 🚀 **Released Models** | [TTS Releases](https://github.com/coqui-ai/TTS/releases) and [Experimental Models](https://github.com/coqui-ai/TTS/wiki/Experimental-Released-Models)|
|
| 🚀 **Released Models** | [Standard models](https://github.com/idiap/coqui-ai-TTS/blob/dev/TTS/.models.json) and [Fairseq models in ~1100 languages](https://github.com/idiap/coqui-ai-TTS#example-text-to-speech-using-fairseq-models-in-1100-languages-)|
|
||||||
| 📰 **Papers** | [TTS Papers](https://github.com/erogol/TTS-papers)|
|
| 📰 **Papers** | [TTS Papers](https://github.com/erogol/TTS-papers)|
|
||||||
|
|
||||||
|
|
||||||
## 🥇 TTS Performance
|
|
||||||
<p align="center"><img src="https://raw.githubusercontent.com/coqui-ai/TTS/main/images/TTS-performance.png" width="800" /></p>
|
|
||||||
|
|
||||||
Underlined "TTS*" and "Judy*" are **internal** 🐸TTS models that are not released open-source. They are here to show the potential. Models prefixed with a dot (.Jofish .Abe and .Janice) are real human voices.
|
|
||||||
|
|
||||||
## Features
|
## Features
|
||||||
- High-performance Deep Learning models for Text2Speech tasks.
|
- High-performance Deep Learning models for Text2Speech tasks.
|
||||||
- Text2Spec models (Tacotron, Tacotron2, Glow-TTS, SpeedySpeech).
|
- Text2Spec models (Tacotron, Tacotron2, Glow-TTS, SpeedySpeech).
|
||||||
|
@ -144,21 +135,48 @@ Underlined "TTS*" and "Judy*" are **internal** 🐸TTS models that are not relea
|
||||||
You can also help us implement more models.
|
You can also help us implement more models.
|
||||||
|
|
||||||
## Installation
|
## Installation
|
||||||
🐸TTS is tested on Ubuntu 18.04 with **python >= 3.9, < 3.12**.
|
🐸TTS is tested on Ubuntu 22.04 with **python >= 3.9, < 3.13**.
|
||||||
|
|
||||||
If you are only interested in [synthesizing speech](https://tts.readthedocs.io/en/latest/inference.html) with the released 🐸TTS models, installing from PyPI is the easiest option.
|
If you are only interested in [synthesizing speech](https://coqui-tts.readthedocs.io/en/latest/inference.html) with the released 🐸TTS models, installing from PyPI is the easiest option.
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
pip install TTS
|
pip install coqui-tts
|
||||||
```
|
```
|
||||||
|
|
||||||
If you plan to code or train models, clone 🐸TTS and install it locally.
|
If you plan to code or train models, clone 🐸TTS and install it locally.
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
git clone https://github.com/coqui-ai/TTS
|
git clone https://github.com/idiap/coqui-ai-TTS
|
||||||
pip install -e .[all,dev,notebooks] # Select the relevant extras
|
cd coqui-ai-TTS
|
||||||
|
pip install -e .
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### Optional dependencies
|
||||||
|
|
||||||
|
The following extras allow the installation of optional dependencies:
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
|------|-------------|
|
||||||
|
| `all` | All optional dependencies, except `dev` and `docs` |
|
||||||
|
| `dev` | Development dependencies |
|
||||||
|
| `docs` | Dependencies for building the documentation |
|
||||||
|
| `notebooks` | Dependencies only used in notebooks |
|
||||||
|
| `server` | Dependencies to run the TTS server |
|
||||||
|
| `bn` | Bangla G2P |
|
||||||
|
| `ja` | Japanese G2P |
|
||||||
|
| `ko` | Korean G2P |
|
||||||
|
| `zh` | Chinese G2P |
|
||||||
|
| `languages` | All language-specific dependencies |
|
||||||
|
|
||||||
|
You can install extras with one of the following commands:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pip install coqui-tts[server,ja]
|
||||||
|
pip install -e .[server,ja]
|
||||||
|
```
|
||||||
|
|
||||||
|
### Platforms
|
||||||
|
|
||||||
If you are on Ubuntu (Debian), you can also run the following commands for installation.
|
If you are on Ubuntu (Debian), you can also run the following commands for installation.
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
|
@ -166,7 +184,9 @@ $ make system-deps # intended to be used on Ubuntu (Debian). Let us know if you
|
||||||
$ make install
|
$ make install
|
||||||
```
|
```
|
||||||
|
|
||||||
If you are on Windows, 👑@GuyPaddock wrote installation instructions [here](https://stackoverflow.com/questions/66726331/how-can-i-run-mozilla-tts-coqui-tts-training-with-cuda-on-a-windows-system).
|
If you are on Windows, 👑@GuyPaddock wrote installation instructions
|
||||||
|
[here](https://stackoverflow.com/questions/66726331/how-can-i-run-mozilla-tts-coqui-tts-training-with-cuda-on-a-windows-system)
|
||||||
|
(note that these are out of date, e.g. you need to have at least Python 3.9).
|
||||||
|
|
||||||
|
|
||||||
## Docker Image
|
## Docker Image
|
||||||
|
@ -180,7 +200,8 @@ python3 TTS/server/server.py --model_name tts_models/en/vctk/vits # To start a s
|
||||||
```
|
```
|
||||||
|
|
||||||
You can then enjoy the TTS server [here](http://[::1]:5002/)
|
You can then enjoy the TTS server [here](http://[::1]:5002/)
|
||||||
More details about the docker images (like GPU support) can be found [here](https://tts.readthedocs.io/en/latest/docker_images.html)
|
More details about the docker images (like GPU support) can be found
|
||||||
|
[here](https://coqui-tts.readthedocs.io/en/latest/docker_images.html)
|
||||||
|
|
||||||
|
|
||||||
## Synthesizing speech by 🐸TTS
|
## Synthesizing speech by 🐸TTS
|
||||||
|
@ -254,11 +275,10 @@ You can find the language ISO codes [here](https://dl.fbaipublicfiles.com/mms/tt
|
||||||
and learn about the Fairseq models [here](https://github.com/facebookresearch/fairseq/tree/main/examples/mms).
|
and learn about the Fairseq models [here](https://github.com/facebookresearch/fairseq/tree/main/examples/mms).
|
||||||
|
|
||||||
```python
|
```python
|
||||||
# TTS with on the fly voice conversion
|
# TTS with fairseq models
|
||||||
api = TTS("tts_models/deu/fairseq/vits")
|
api = TTS("tts_models/deu/fairseq/vits")
|
||||||
api.tts_with_vc_to_file(
|
api.tts_to_file(
|
||||||
"Wie sage ich auf Italienisch, dass ich dich liebe?",
|
"Wie sage ich auf Italienisch, dass ich dich liebe?",
|
||||||
speaker_wav="target/speaker.wav",
|
|
||||||
file_path="output.wav"
|
file_path="output.wav"
|
||||||
)
|
)
|
||||||
```
|
```
|
||||||
|
|
|
@ -46,7 +46,7 @@
|
||||||
"hf_url": [
|
"hf_url": [
|
||||||
"https://coqui.gateway.scarf.sh/hf/bark/coarse_2.pt",
|
"https://coqui.gateway.scarf.sh/hf/bark/coarse_2.pt",
|
||||||
"https://coqui.gateway.scarf.sh/hf/bark/fine_2.pt",
|
"https://coqui.gateway.scarf.sh/hf/bark/fine_2.pt",
|
||||||
"https://coqui.gateway.scarf.sh/hf/text_2.pt",
|
"https://coqui.gateway.scarf.sh/hf/bark/text_2.pt",
|
||||||
"https://coqui.gateway.scarf.sh/hf/bark/config.json",
|
"https://coqui.gateway.scarf.sh/hf/bark/config.json",
|
||||||
"https://coqui.gateway.scarf.sh/hf/bark/hubert.pt",
|
"https://coqui.gateway.scarf.sh/hf/bark/hubert.pt",
|
||||||
"https://coqui.gateway.scarf.sh/hf/bark/tokenizer.pth"
|
"https://coqui.gateway.scarf.sh/hf/bark/tokenizer.pth"
|
||||||
|
|
|
@ -1 +0,0 @@
|
||||||
0.22.0
|
|
|
@ -1,6 +1,3 @@
|
||||||
import os
|
import importlib.metadata
|
||||||
|
|
||||||
with open(os.path.join(os.path.dirname(__file__), "VERSION"), "r", encoding="utf-8") as f:
|
__version__ = importlib.metadata.version("coqui-tts")
|
||||||
version = f.read().strip()
|
|
||||||
|
|
||||||
__version__ = version
|
|
||||||
|
|
22
TTS/api.py
22
TTS/api.py
|
@ -1,15 +1,16 @@
|
||||||
|
import logging
|
||||||
import tempfile
|
import tempfile
|
||||||
import warnings
|
import warnings
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Union
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
from torch import nn
|
from torch import nn
|
||||||
|
|
||||||
|
from TTS.config import load_config
|
||||||
from TTS.utils.audio.numpy_transforms import save_wav
|
from TTS.utils.audio.numpy_transforms import save_wav
|
||||||
from TTS.utils.manage import ModelManager
|
from TTS.utils.manage import ModelManager
|
||||||
from TTS.utils.synthesizer import Synthesizer
|
from TTS.utils.synthesizer import Synthesizer
|
||||||
from TTS.config import load_config
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class TTS(nn.Module):
|
class TTS(nn.Module):
|
||||||
|
@ -61,7 +62,7 @@ class TTS(nn.Module):
|
||||||
gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
|
gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
|
||||||
"""
|
"""
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.manager = ModelManager(models_file=self.get_models_file_path(), progress_bar=progress_bar, verbose=False)
|
self.manager = ModelManager(models_file=self.get_models_file_path(), progress_bar=progress_bar)
|
||||||
self.config = load_config(config_path) if config_path else None
|
self.config = load_config(config_path) if config_path else None
|
||||||
self.synthesizer = None
|
self.synthesizer = None
|
||||||
self.voice_converter = None
|
self.voice_converter = None
|
||||||
|
@ -99,7 +100,7 @@ class TTS(nn.Module):
|
||||||
isinstance(self.model_name, str)
|
isinstance(self.model_name, str)
|
||||||
and "xtts" in self.model_name
|
and "xtts" in self.model_name
|
||||||
or self.config
|
or self.config
|
||||||
and ("xtts" in self.config.model or len(self.config.languages) > 1)
|
and ("xtts" in self.config.model or "languages" in self.config and len(self.config.languages) > 1)
|
||||||
):
|
):
|
||||||
return True
|
return True
|
||||||
if hasattr(self.synthesizer.tts_model, "language_manager") and self.synthesizer.tts_model.language_manager:
|
if hasattr(self.synthesizer.tts_model, "language_manager") and self.synthesizer.tts_model.language_manager:
|
||||||
|
@ -122,8 +123,9 @@ class TTS(nn.Module):
|
||||||
def get_models_file_path():
|
def get_models_file_path():
|
||||||
return Path(__file__).parent / ".models.json"
|
return Path(__file__).parent / ".models.json"
|
||||||
|
|
||||||
def list_models(self):
|
@staticmethod
|
||||||
return ModelManager(models_file=TTS.get_models_file_path(), progress_bar=False, verbose=False)
|
def list_models():
|
||||||
|
return ModelManager(models_file=TTS.get_models_file_path(), progress_bar=False).list_models()
|
||||||
|
|
||||||
def download_model_by_name(self, model_name: str):
|
def download_model_by_name(self, model_name: str):
|
||||||
model_path, config_path, model_item = self.manager.download_model(model_name)
|
model_path, config_path, model_item = self.manager.download_model(model_name)
|
||||||
|
@ -168,9 +170,7 @@ class TTS(nn.Module):
|
||||||
self.synthesizer = None
|
self.synthesizer = None
|
||||||
self.model_name = model_name
|
self.model_name = model_name
|
||||||
|
|
||||||
model_path, config_path, vocoder_path, vocoder_config_path, model_dir = self.download_model_by_name(
|
model_path, config_path, vocoder_path, vocoder_config_path, model_dir = self.download_model_by_name(model_name)
|
||||||
model_name
|
|
||||||
)
|
|
||||||
|
|
||||||
# init synthesizer
|
# init synthesizer
|
||||||
# None values are fetch from the model
|
# None values are fetch from the model
|
||||||
|
@ -231,7 +231,7 @@ class TTS(nn.Module):
|
||||||
raise ValueError("Model is not multi-speaker but `speaker` is provided.")
|
raise ValueError("Model is not multi-speaker but `speaker` is provided.")
|
||||||
if not self.is_multi_lingual and language is not None:
|
if not self.is_multi_lingual and language is not None:
|
||||||
raise ValueError("Model is not multi-lingual but `language` is provided.")
|
raise ValueError("Model is not multi-lingual but `language` is provided.")
|
||||||
if not emotion is None and not speed is None:
|
if emotion is not None and speed is not None:
|
||||||
raise ValueError("Emotion and speed can only be used with Coqui Studio models. Which is discontinued.")
|
raise ValueError("Emotion and speed can only be used with Coqui Studio models. Which is discontinued.")
|
||||||
|
|
||||||
def tts(
|
def tts(
|
||||||
|
|
|
@ -1,4 +1,6 @@
|
||||||
"""Get detailed info about the working environment."""
|
"""Get detailed info about the working environment."""
|
||||||
|
|
||||||
|
import json
|
||||||
import os
|
import os
|
||||||
import platform
|
import platform
|
||||||
import sys
|
import sys
|
||||||
|
@ -6,11 +8,10 @@ import sys
|
||||||
import numpy
|
import numpy
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
sys.path += [os.path.abspath(".."), os.path.abspath(".")]
|
|
||||||
import json
|
|
||||||
|
|
||||||
import TTS
|
import TTS
|
||||||
|
|
||||||
|
sys.path += [os.path.abspath(".."), os.path.abspath(".")]
|
||||||
|
|
||||||
|
|
||||||
def system_info():
|
def system_info():
|
||||||
return {
|
return {
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
import argparse
|
import argparse
|
||||||
import importlib
|
import importlib
|
||||||
|
import logging
|
||||||
import os
|
import os
|
||||||
from argparse import RawTextHelpFormatter
|
from argparse import RawTextHelpFormatter
|
||||||
|
|
||||||
|
@ -7,15 +8,18 @@ import numpy as np
|
||||||
import torch
|
import torch
|
||||||
from torch.utils.data import DataLoader
|
from torch.utils.data import DataLoader
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
from trainer.io import load_checkpoint
|
||||||
|
|
||||||
from TTS.config import load_config
|
from TTS.config import load_config
|
||||||
from TTS.tts.datasets.TTSDataset import TTSDataset
|
from TTS.tts.datasets.TTSDataset import TTSDataset
|
||||||
from TTS.tts.models import setup_model
|
from TTS.tts.models import setup_model
|
||||||
from TTS.tts.utils.text.characters import make_symbols, phonemes, symbols
|
from TTS.tts.utils.text.characters import make_symbols, phonemes, symbols
|
||||||
from TTS.utils.audio import AudioProcessor
|
from TTS.utils.audio import AudioProcessor
|
||||||
from TTS.utils.io import load_checkpoint
|
from TTS.utils.generic_utils import ConsoleFormatter, setup_logger
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter())
|
||||||
|
|
||||||
# pylint: disable=bad-option-value
|
# pylint: disable=bad-option-value
|
||||||
parser = argparse.ArgumentParser(
|
parser = argparse.ArgumentParser(
|
||||||
description="""Extract attention masks from trained Tacotron/Tacotron2 models.
|
description="""Extract attention masks from trained Tacotron/Tacotron2 models.
|
||||||
|
@ -31,7 +35,7 @@ Example run:
|
||||||
--data_path /root/LJSpeech-1.1/
|
--data_path /root/LJSpeech-1.1/
|
||||||
--batch_size 32
|
--batch_size 32
|
||||||
--dataset ljspeech
|
--dataset ljspeech
|
||||||
--use_cuda True
|
--use_cuda
|
||||||
""",
|
""",
|
||||||
formatter_class=RawTextHelpFormatter,
|
formatter_class=RawTextHelpFormatter,
|
||||||
)
|
)
|
||||||
|
@ -58,7 +62,7 @@ Example run:
|
||||||
help="Dataset metafile inclusing file paths with transcripts.",
|
help="Dataset metafile inclusing file paths with transcripts.",
|
||||||
)
|
)
|
||||||
parser.add_argument("--data_path", type=str, default="", help="Defines the data path. It overwrites config.json.")
|
parser.add_argument("--data_path", type=str, default="", help="Defines the data path. It overwrites config.json.")
|
||||||
parser.add_argument("--use_cuda", type=bool, default=False, help="enable/disable cuda.")
|
parser.add_argument("--use_cuda", action=argparse.BooleanOptionalAction, default=False, help="enable/disable cuda.")
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--batch_size", default=16, type=int, help="Batch size for the model. Use batch_size=1 if you have no CUDA."
|
"--batch_size", default=16, type=int, help="Batch size for the model. Use batch_size=1 if you have no CUDA."
|
||||||
|
@ -70,7 +74,7 @@ Example run:
|
||||||
|
|
||||||
# if the vocabulary was passed, replace the default
|
# if the vocabulary was passed, replace the default
|
||||||
if "characters" in C.keys():
|
if "characters" in C.keys():
|
||||||
symbols, phonemes = make_symbols(**C.characters)
|
symbols, phonemes = make_symbols(**C.characters) # noqa: F811
|
||||||
|
|
||||||
# load the model
|
# load the model
|
||||||
num_chars = len(phonemes) if C.use_phonemes else len(symbols)
|
num_chars = len(phonemes) if C.use_phonemes else len(symbols)
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
import argparse
|
import argparse
|
||||||
|
import logging
|
||||||
import os
|
import os
|
||||||
from argparse import RawTextHelpFormatter
|
from argparse import RawTextHelpFormatter
|
||||||
|
|
||||||
|
@ -10,6 +11,7 @@ from TTS.config.shared_configs import BaseDatasetConfig
|
||||||
from TTS.tts.datasets import load_tts_samples
|
from TTS.tts.datasets import load_tts_samples
|
||||||
from TTS.tts.utils.managers import save_file
|
from TTS.tts.utils.managers import save_file
|
||||||
from TTS.tts.utils.speakers import SpeakerManager
|
from TTS.tts.utils.speakers import SpeakerManager
|
||||||
|
from TTS.utils.generic_utils import ConsoleFormatter, setup_logger
|
||||||
|
|
||||||
|
|
||||||
def compute_embeddings(
|
def compute_embeddings(
|
||||||
|
@ -100,6 +102,8 @@ def compute_embeddings(
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter())
|
||||||
|
|
||||||
parser = argparse.ArgumentParser(
|
parser = argparse.ArgumentParser(
|
||||||
description="""Compute embedding vectors for each audio file in a dataset and store them keyed by `{dataset_name}#{file_path}` in a .pth file\n\n"""
|
description="""Compute embedding vectors for each audio file in a dataset and store them keyed by `{dataset_name}#{file_path}` in a .pth file\n\n"""
|
||||||
"""
|
"""
|
||||||
|
@ -146,7 +150,7 @@ if __name__ == "__main__":
|
||||||
default=False,
|
default=False,
|
||||||
action="store_true",
|
action="store_true",
|
||||||
)
|
)
|
||||||
parser.add_argument("--disable_cuda", type=bool, help="Flag to disable cuda.", default=False)
|
parser.add_argument("--disable_cuda", action="store_true", help="Flag to disable cuda.", default=False)
|
||||||
parser.add_argument("--no_eval", help="Do not compute eval?. Default False", default=False, action="store_true")
|
parser.add_argument("--no_eval", help="Do not compute eval?. Default False", default=False, action="store_true")
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--formatter_name",
|
"--formatter_name",
|
||||||
|
|
|
@ -3,6 +3,7 @@
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import glob
|
import glob
|
||||||
|
import logging
|
||||||
import os
|
import os
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
@ -12,10 +13,13 @@ from tqdm import tqdm
|
||||||
from TTS.config import load_config
|
from TTS.config import load_config
|
||||||
from TTS.tts.datasets import load_tts_samples
|
from TTS.tts.datasets import load_tts_samples
|
||||||
from TTS.utils.audio import AudioProcessor
|
from TTS.utils.audio import AudioProcessor
|
||||||
|
from TTS.utils.generic_utils import ConsoleFormatter, setup_logger
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
"""Run preprocessing process."""
|
"""Run preprocessing process."""
|
||||||
|
setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter())
|
||||||
|
|
||||||
parser = argparse.ArgumentParser(description="Compute mean and variance of spectrogtram features.")
|
parser = argparse.ArgumentParser(description="Compute mean and variance of spectrogtram features.")
|
||||||
parser.add_argument("config_path", type=str, help="TTS config file path to define audio processin parameters.")
|
parser.add_argument("config_path", type=str, help="TTS config file path to define audio processin parameters.")
|
||||||
parser.add_argument("out_path", type=str, help="save path (directory and filename).")
|
parser.add_argument("out_path", type=str, help="save path (directory and filename).")
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
import argparse
|
import argparse
|
||||||
|
import logging
|
||||||
from argparse import RawTextHelpFormatter
|
from argparse import RawTextHelpFormatter
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
|
@ -7,6 +8,7 @@ from tqdm import tqdm
|
||||||
from TTS.config import load_config
|
from TTS.config import load_config
|
||||||
from TTS.tts.datasets import load_tts_samples
|
from TTS.tts.datasets import load_tts_samples
|
||||||
from TTS.tts.utils.speakers import SpeakerManager
|
from TTS.tts.utils.speakers import SpeakerManager
|
||||||
|
from TTS.utils.generic_utils import ConsoleFormatter, setup_logger
|
||||||
|
|
||||||
|
|
||||||
def compute_encoder_accuracy(dataset_items, encoder_manager):
|
def compute_encoder_accuracy(dataset_items, encoder_manager):
|
||||||
|
@ -51,6 +53,8 @@ def compute_encoder_accuracy(dataset_items, encoder_manager):
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter())
|
||||||
|
|
||||||
parser = argparse.ArgumentParser(
|
parser = argparse.ArgumentParser(
|
||||||
description="""Compute the accuracy of the encoder.\n\n"""
|
description="""Compute the accuracy of the encoder.\n\n"""
|
||||||
"""
|
"""
|
||||||
|
@ -71,8 +75,8 @@ if __name__ == "__main__":
|
||||||
type=str,
|
type=str,
|
||||||
help="Path to dataset config file.",
|
help="Path to dataset config file.",
|
||||||
)
|
)
|
||||||
parser.add_argument("--use_cuda", type=bool, help="flag to set cuda.", default=True)
|
parser.add_argument("--use_cuda", action=argparse.BooleanOptionalAction, help="flag to set cuda.", default=True)
|
||||||
parser.add_argument("--eval", type=bool, help="compute eval.", default=True)
|
parser.add_argument("--eval", action=argparse.BooleanOptionalAction, help="compute eval.", default=True)
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
|
|
@ -2,12 +2,14 @@
|
||||||
"""Extract Mel spectrograms with teacher forcing."""
|
"""Extract Mel spectrograms with teacher forcing."""
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
|
import logging
|
||||||
import os
|
import os
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import torch
|
import torch
|
||||||
from torch.utils.data import DataLoader
|
from torch.utils.data import DataLoader
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
from trainer.generic_utils import count_parameters
|
||||||
|
|
||||||
from TTS.config import load_config
|
from TTS.config import load_config
|
||||||
from TTS.tts.datasets import TTSDataset, load_tts_samples
|
from TTS.tts.datasets import TTSDataset, load_tts_samples
|
||||||
|
@ -16,12 +18,12 @@ from TTS.tts.utils.speakers import SpeakerManager
|
||||||
from TTS.tts.utils.text.tokenizer import TTSTokenizer
|
from TTS.tts.utils.text.tokenizer import TTSTokenizer
|
||||||
from TTS.utils.audio import AudioProcessor
|
from TTS.utils.audio import AudioProcessor
|
||||||
from TTS.utils.audio.numpy_transforms import quantize
|
from TTS.utils.audio.numpy_transforms import quantize
|
||||||
from TTS.utils.generic_utils import count_parameters
|
from TTS.utils.generic_utils import ConsoleFormatter, setup_logger
|
||||||
|
|
||||||
use_cuda = torch.cuda.is_available()
|
use_cuda = torch.cuda.is_available()
|
||||||
|
|
||||||
|
|
||||||
def setup_loader(ap, r, verbose=False):
|
def setup_loader(ap, r):
|
||||||
tokenizer, _ = TTSTokenizer.init_from_config(c)
|
tokenizer, _ = TTSTokenizer.init_from_config(c)
|
||||||
dataset = TTSDataset(
|
dataset = TTSDataset(
|
||||||
outputs_per_step=r,
|
outputs_per_step=r,
|
||||||
|
@ -37,7 +39,6 @@ def setup_loader(ap, r, verbose=False):
|
||||||
phoneme_cache_path=c.phoneme_cache_path,
|
phoneme_cache_path=c.phoneme_cache_path,
|
||||||
precompute_num_workers=0,
|
precompute_num_workers=0,
|
||||||
use_noise_augment=False,
|
use_noise_augment=False,
|
||||||
verbose=verbose,
|
|
||||||
speaker_id_mapping=speaker_manager.name_to_id if c.use_speaker_embedding else None,
|
speaker_id_mapping=speaker_manager.name_to_id if c.use_speaker_embedding else None,
|
||||||
d_vector_mapping=speaker_manager.embeddings if c.use_d_vector_file else None,
|
d_vector_mapping=speaker_manager.embeddings if c.use_d_vector_file else None,
|
||||||
)
|
)
|
||||||
|
@ -257,7 +258,7 @@ def main(args): # pylint: disable=redefined-outer-name
|
||||||
print("\n > Model has {} parameters".format(num_params), flush=True)
|
print("\n > Model has {} parameters".format(num_params), flush=True)
|
||||||
# set r
|
# set r
|
||||||
r = 1 if c.model.lower() == "glow_tts" else model.decoder.r
|
r = 1 if c.model.lower() == "glow_tts" else model.decoder.r
|
||||||
own_loader = setup_loader(ap, r, verbose=True)
|
own_loader = setup_loader(ap, r)
|
||||||
|
|
||||||
extract_spectrograms(
|
extract_spectrograms(
|
||||||
own_loader,
|
own_loader,
|
||||||
|
@ -272,6 +273,8 @@ def main(args): # pylint: disable=redefined-outer-name
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter())
|
||||||
|
|
||||||
parser = argparse.ArgumentParser()
|
parser = argparse.ArgumentParser()
|
||||||
parser.add_argument("--config_path", type=str, help="Path to config file for training.", required=True)
|
parser.add_argument("--config_path", type=str, help="Path to config file for training.", required=True)
|
||||||
parser.add_argument("--checkpoint_path", type=str, help="Model file to be restored.", required=True)
|
parser.add_argument("--checkpoint_path", type=str, help="Model file to be restored.", required=True)
|
||||||
|
@ -279,7 +282,7 @@ if __name__ == "__main__":
|
||||||
parser.add_argument("--debug", default=False, action="store_true", help="Save audio files for debug")
|
parser.add_argument("--debug", default=False, action="store_true", help="Save audio files for debug")
|
||||||
parser.add_argument("--save_audio", default=False, action="store_true", help="Save audio files")
|
parser.add_argument("--save_audio", default=False, action="store_true", help="Save audio files")
|
||||||
parser.add_argument("--quantize_bits", type=int, default=0, help="Save quantized audio files if non-zero")
|
parser.add_argument("--quantize_bits", type=int, default=0, help="Save quantized audio files if non-zero")
|
||||||
parser.add_argument("--eval", type=bool, help="compute eval.", default=True)
|
parser.add_argument("--eval", action=argparse.BooleanOptionalAction, help="compute eval.", default=True)
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
c = load_config(args.config_path)
|
c = load_config(args.config_path)
|
||||||
|
|
|
@ -1,12 +1,17 @@
|
||||||
"""Find all the unique characters in a dataset"""
|
"""Find all the unique characters in a dataset"""
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
|
import logging
|
||||||
from argparse import RawTextHelpFormatter
|
from argparse import RawTextHelpFormatter
|
||||||
|
|
||||||
from TTS.config import load_config
|
from TTS.config import load_config
|
||||||
from TTS.tts.datasets import load_tts_samples
|
from TTS.tts.datasets import find_unique_chars, load_tts_samples
|
||||||
|
from TTS.utils.generic_utils import ConsoleFormatter, setup_logger
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
|
setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter())
|
||||||
|
|
||||||
# pylint: disable=bad-option-value
|
# pylint: disable=bad-option-value
|
||||||
parser = argparse.ArgumentParser(
|
parser = argparse.ArgumentParser(
|
||||||
description="""Find all the unique characters or phonemes in a dataset.\n\n"""
|
description="""Find all the unique characters or phonemes in a dataset.\n\n"""
|
||||||
|
@ -28,17 +33,7 @@ def main():
|
||||||
)
|
)
|
||||||
|
|
||||||
items = train_items + eval_items
|
items = train_items + eval_items
|
||||||
|
find_unique_chars(items)
|
||||||
texts = "".join(item["text"] for item in items)
|
|
||||||
chars = set(texts)
|
|
||||||
lower_chars = filter(lambda c: c.islower(), chars)
|
|
||||||
chars_force_lower = [c.lower() for c in chars]
|
|
||||||
chars_force_lower = set(chars_force_lower)
|
|
||||||
|
|
||||||
print(f" > Number of unique characters: {len(chars)}")
|
|
||||||
print(f" > Unique characters: {''.join(sorted(chars))}")
|
|
||||||
print(f" > Unique lower characters: {''.join(sorted(lower_chars))}")
|
|
||||||
print(f" > Unique all forced to lower characters: {''.join(sorted(chars_force_lower))}")
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
|
@ -1,5 +1,7 @@
|
||||||
"""Find all the unique characters in a dataset"""
|
"""Find all the unique characters in a dataset"""
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
|
import logging
|
||||||
import multiprocessing
|
import multiprocessing
|
||||||
from argparse import RawTextHelpFormatter
|
from argparse import RawTextHelpFormatter
|
||||||
|
|
||||||
|
@ -8,15 +10,18 @@ from tqdm.contrib.concurrent import process_map
|
||||||
from TTS.config import load_config
|
from TTS.config import load_config
|
||||||
from TTS.tts.datasets import load_tts_samples
|
from TTS.tts.datasets import load_tts_samples
|
||||||
from TTS.tts.utils.text.phonemizers import Gruut
|
from TTS.tts.utils.text.phonemizers import Gruut
|
||||||
|
from TTS.utils.generic_utils import ConsoleFormatter, setup_logger
|
||||||
|
|
||||||
|
|
||||||
def compute_phonemes(item):
|
def compute_phonemes(item):
|
||||||
text = item["text"]
|
text = item["text"]
|
||||||
ph = phonemizer.phonemize(text).replace("|", "")
|
ph = phonemizer.phonemize(text).replace("|", "")
|
||||||
return set(list(ph))
|
return set(ph)
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
|
setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter())
|
||||||
|
|
||||||
# pylint: disable=W0601
|
# pylint: disable=W0601
|
||||||
global c, phonemizer
|
global c, phonemizer
|
||||||
# pylint: disable=bad-option-value
|
# pylint: disable=bad-option-value
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
import argparse
|
import argparse
|
||||||
import glob
|
import glob
|
||||||
|
import logging
|
||||||
import multiprocessing
|
import multiprocessing
|
||||||
import os
|
import os
|
||||||
import pathlib
|
import pathlib
|
||||||
|
@ -7,6 +8,7 @@ import pathlib
|
||||||
import torch
|
import torch
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
from TTS.utils.generic_utils import ConsoleFormatter, setup_logger
|
||||||
from TTS.utils.vad import get_vad_model_and_utils, remove_silence
|
from TTS.utils.vad import get_vad_model_and_utils, remove_silence
|
||||||
|
|
||||||
torch.set_num_threads(1)
|
torch.set_num_threads(1)
|
||||||
|
@ -75,8 +77,10 @@ def preprocess_audios():
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter())
|
||||||
|
|
||||||
parser = argparse.ArgumentParser(
|
parser = argparse.ArgumentParser(
|
||||||
description="python TTS/bin/remove_silence_using_vad.py -i=VCTK-Corpus/ -o=VCTK-Corpus-removed-silence/ -g=wav48_silence_trimmed/*/*_mic1.flac --trim_just_beginning_and_end True"
|
description="python TTS/bin/remove_silence_using_vad.py -i=VCTK-Corpus/ -o=VCTK-Corpus-removed-silence/ -g=wav48_silence_trimmed/*/*_mic1.flac --trim_just_beginning_and_end"
|
||||||
)
|
)
|
||||||
parser.add_argument("-i", "--input_dir", type=str, help="Dataset root dir", required=True)
|
parser.add_argument("-i", "--input_dir", type=str, help="Dataset root dir", required=True)
|
||||||
parser.add_argument("-o", "--output_dir", type=str, help="Output Dataset dir", default="")
|
parser.add_argument("-o", "--output_dir", type=str, help="Output Dataset dir", default="")
|
||||||
|
@ -91,20 +95,20 @@ if __name__ == "__main__":
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"-t",
|
"-t",
|
||||||
"--trim_just_beginning_and_end",
|
"--trim_just_beginning_and_end",
|
||||||
type=bool,
|
action=argparse.BooleanOptionalAction,
|
||||||
default=True,
|
default=True,
|
||||||
help="If True this script will trim just the beginning and end nonspeech parts. If False all nonspeech parts will be trim. Default True",
|
help="If True this script will trim just the beginning and end nonspeech parts. If False all nonspeech parts will be trimmed.",
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"-c",
|
"-c",
|
||||||
"--use_cuda",
|
"--use_cuda",
|
||||||
type=bool,
|
action=argparse.BooleanOptionalAction,
|
||||||
default=False,
|
default=False,
|
||||||
help="If True use cuda",
|
help="If True use cuda",
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--use_onnx",
|
"--use_onnx",
|
||||||
type=bool,
|
action=argparse.BooleanOptionalAction,
|
||||||
default=False,
|
default=False,
|
||||||
help="If True use onnx",
|
help="If True use onnx",
|
||||||
)
|
)
|
||||||
|
|
|
@ -1,14 +1,20 @@
|
||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
# -*- coding: utf-8 -*-
|
|
||||||
|
"""Command line interface."""
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import contextlib
|
import contextlib
|
||||||
|
import logging
|
||||||
import sys
|
import sys
|
||||||
from argparse import RawTextHelpFormatter
|
from argparse import RawTextHelpFormatter
|
||||||
|
|
||||||
# pylint: disable=redefined-outer-name, unused-argument
|
# pylint: disable=redefined-outer-name, unused-argument
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
from TTS.utils.generic_utils import ConsoleFormatter, setup_logger
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
description = """
|
description = """
|
||||||
Synthesize speech on command line.
|
Synthesize speech on command line.
|
||||||
|
|
||||||
|
@ -131,17 +137,8 @@ $ tts --out_path output/path/speech.wav --model_name "<language>/<dataset>/<mode
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
def str2bool(v):
|
def parse_args() -> argparse.Namespace:
|
||||||
if isinstance(v, bool):
|
"""Parse arguments."""
|
||||||
return v
|
|
||||||
if v.lower() in ("yes", "true", "t", "y", "1"):
|
|
||||||
return True
|
|
||||||
if v.lower() in ("no", "false", "f", "n", "0"):
|
|
||||||
return False
|
|
||||||
raise argparse.ArgumentTypeError("Boolean value expected.")
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
|
||||||
parser = argparse.ArgumentParser(
|
parser = argparse.ArgumentParser(
|
||||||
description=description.replace(" ```\n", ""),
|
description=description.replace(" ```\n", ""),
|
||||||
formatter_class=RawTextHelpFormatter,
|
formatter_class=RawTextHelpFormatter,
|
||||||
|
@ -149,10 +146,7 @@ def main():
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--list_models",
|
"--list_models",
|
||||||
type=str2bool,
|
action="store_true",
|
||||||
nargs="?",
|
|
||||||
const=True,
|
|
||||||
default=False,
|
|
||||||
help="list available pre-trained TTS and vocoder models.",
|
help="list available pre-trained TTS and vocoder models.",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -200,7 +194,7 @@ def main():
|
||||||
default="tts_output.wav",
|
default="tts_output.wav",
|
||||||
help="Output wav file path.",
|
help="Output wav file path.",
|
||||||
)
|
)
|
||||||
parser.add_argument("--use_cuda", type=bool, help="Run model on CUDA.", default=False)
|
parser.add_argument("--use_cuda", action="store_true", help="Run model on CUDA.")
|
||||||
parser.add_argument("--device", type=str, help="Device to run model on.", default="cpu")
|
parser.add_argument("--device", type=str, help="Device to run model on.", default="cpu")
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--vocoder_path",
|
"--vocoder_path",
|
||||||
|
@ -219,12 +213,9 @@ def main():
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--pipe_out",
|
"--pipe_out",
|
||||||
help="stdout the generated TTS wav file for shell pipe.",
|
help="stdout the generated TTS wav file for shell pipe.",
|
||||||
type=str2bool,
|
action="store_true",
|
||||||
nargs="?",
|
|
||||||
const=True,
|
|
||||||
default=False,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
# args for multi-speaker synthesis
|
# args for multi-speaker synthesis
|
||||||
parser.add_argument("--speakers_file_path", type=str, help="JSON file for multi-speaker model.", default=None)
|
parser.add_argument("--speakers_file_path", type=str, help="JSON file for multi-speaker model.", default=None)
|
||||||
parser.add_argument("--language_ids_file_path", type=str, help="JSON file for multi-lingual model.", default=None)
|
parser.add_argument("--language_ids_file_path", type=str, help="JSON file for multi-lingual model.", default=None)
|
||||||
|
@ -254,25 +245,18 @@ def main():
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--list_speaker_idxs",
|
"--list_speaker_idxs",
|
||||||
help="List available speaker ids for the defined multi-speaker model.",
|
help="List available speaker ids for the defined multi-speaker model.",
|
||||||
type=str2bool,
|
action="store_true",
|
||||||
nargs="?",
|
|
||||||
const=True,
|
|
||||||
default=False,
|
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--list_language_idxs",
|
"--list_language_idxs",
|
||||||
help="List available language ids for the defined multi-lingual model.",
|
help="List available language ids for the defined multi-lingual model.",
|
||||||
type=str2bool,
|
action="store_true",
|
||||||
nargs="?",
|
|
||||||
const=True,
|
|
||||||
default=False,
|
|
||||||
)
|
)
|
||||||
# aux args
|
# aux args
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--save_spectogram",
|
"--save_spectogram",
|
||||||
type=bool,
|
action="store_true",
|
||||||
help="If true save raw spectogram for further (vocoder) processing in out_path.",
|
help="Save raw spectogram for further (vocoder) processing in out_path.",
|
||||||
default=False,
|
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--reference_wav",
|
"--reference_wav",
|
||||||
|
@ -288,8 +272,8 @@ def main():
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--progress_bar",
|
"--progress_bar",
|
||||||
type=str2bool,
|
action=argparse.BooleanOptionalAction,
|
||||||
help="If true shows a progress bar for the model download. Defaults to True",
|
help="Show a progress bar for the model download.",
|
||||||
default=True,
|
default=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -330,19 +314,23 @@ def main():
|
||||||
]
|
]
|
||||||
if not any(check_args):
|
if not any(check_args):
|
||||||
parser.parse_args(["-h"])
|
parser.parse_args(["-h"])
|
||||||
|
return args
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter())
|
||||||
|
args = parse_args()
|
||||||
|
|
||||||
pipe_out = sys.stdout if args.pipe_out else None
|
pipe_out = sys.stdout if args.pipe_out else None
|
||||||
|
|
||||||
with contextlib.redirect_stdout(None if args.pipe_out else sys.stdout):
|
with contextlib.redirect_stdout(None if args.pipe_out else sys.stdout):
|
||||||
# Late-import to make things load faster
|
# Late-import to make things load faster
|
||||||
from TTS.api import TTS
|
|
||||||
from TTS.utils.manage import ModelManager
|
from TTS.utils.manage import ModelManager
|
||||||
from TTS.utils.synthesizer import Synthesizer
|
from TTS.utils.synthesizer import Synthesizer
|
||||||
|
|
||||||
# load model manager
|
# load model manager
|
||||||
path = Path(__file__).parent / "../.models.json"
|
path = Path(__file__).parent / "../.models.json"
|
||||||
manager = ModelManager(path, progress_bar=args.progress_bar)
|
manager = ModelManager(path, progress_bar=args.progress_bar)
|
||||||
api = TTS()
|
|
||||||
|
|
||||||
tts_path = None
|
tts_path = None
|
||||||
tts_config_path = None
|
tts_config_path = None
|
||||||
|
@ -379,10 +367,8 @@ def main():
|
||||||
if model_item["model_type"] == "tts_models":
|
if model_item["model_type"] == "tts_models":
|
||||||
tts_path = model_path
|
tts_path = model_path
|
||||||
tts_config_path = config_path
|
tts_config_path = config_path
|
||||||
if "default_vocoder" in model_item:
|
if args.vocoder_name is None and "default_vocoder" in model_item:
|
||||||
args.vocoder_name = (
|
args.vocoder_name = model_item["default_vocoder"]
|
||||||
model_item["default_vocoder"] if args.vocoder_name is None else args.vocoder_name
|
|
||||||
)
|
|
||||||
|
|
||||||
# voice conversion model
|
# voice conversion model
|
||||||
if model_item["model_type"] == "voice_conversion_models":
|
if model_item["model_type"] == "voice_conversion_models":
|
||||||
|
@ -437,31 +423,37 @@ def main():
|
||||||
|
|
||||||
# query speaker ids of a multi-speaker model.
|
# query speaker ids of a multi-speaker model.
|
||||||
if args.list_speaker_idxs:
|
if args.list_speaker_idxs:
|
||||||
print(
|
if synthesizer.tts_model.speaker_manager is None:
|
||||||
" > Available speaker ids: (Set --speaker_idx flag to one of these values to use the multi-speaker model."
|
logger.info("Model only has a single speaker.")
|
||||||
|
return
|
||||||
|
logger.info(
|
||||||
|
"Available speaker ids: (Set --speaker_idx flag to one of these values to use the multi-speaker model."
|
||||||
)
|
)
|
||||||
print(synthesizer.tts_model.speaker_manager.name_to_id)
|
logger.info(synthesizer.tts_model.speaker_manager.name_to_id)
|
||||||
return
|
return
|
||||||
|
|
||||||
# query language ids of a multi-lingual model.
|
# query language ids of a multi-lingual model.
|
||||||
if args.list_language_idxs:
|
if args.list_language_idxs:
|
||||||
print(
|
if synthesizer.tts_model.language_manager is None:
|
||||||
" > Available language ids: (Set --language_idx flag to one of these values to use the multi-lingual model."
|
logger.info("Monolingual model.")
|
||||||
|
return
|
||||||
|
logger.info(
|
||||||
|
"Available language ids: (Set --language_idx flag to one of these values to use the multi-lingual model."
|
||||||
)
|
)
|
||||||
print(synthesizer.tts_model.language_manager.name_to_id)
|
logger.info(synthesizer.tts_model.language_manager.name_to_id)
|
||||||
return
|
return
|
||||||
|
|
||||||
# check the arguments against a multi-speaker model.
|
# check the arguments against a multi-speaker model.
|
||||||
if synthesizer.tts_speakers_file and (not args.speaker_idx and not args.speaker_wav):
|
if synthesizer.tts_speakers_file and (not args.speaker_idx and not args.speaker_wav):
|
||||||
print(
|
logger.error(
|
||||||
" [!] Looks like you use a multi-speaker model. Define `--speaker_idx` to "
|
"Looks like you use a multi-speaker model. Define `--speaker_idx` to "
|
||||||
"select the target speaker. You can list the available speakers for this model by `--list_speaker_idxs`."
|
"select the target speaker. You can list the available speakers for this model by `--list_speaker_idxs`."
|
||||||
)
|
)
|
||||||
return
|
return
|
||||||
|
|
||||||
# RUN THE SYNTHESIS
|
# RUN THE SYNTHESIS
|
||||||
if args.text:
|
if args.text:
|
||||||
print(" > Text: {}".format(args.text))
|
logger.info("Text: %s", args.text)
|
||||||
|
|
||||||
# kick it
|
# kick it
|
||||||
if tts_path is not None:
|
if tts_path is not None:
|
||||||
|
@ -486,8 +478,8 @@ def main():
|
||||||
)
|
)
|
||||||
|
|
||||||
# save the results
|
# save the results
|
||||||
print(" > Saving output to {}".format(args.out_path))
|
|
||||||
synthesizer.save_wav(wav, args.out_path, pipe_out=pipe_out)
|
synthesizer.save_wav(wav, args.out_path, pipe_out=pipe_out)
|
||||||
|
logger.info("Saved output to %s", args.out_path)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
import logging
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
import time
|
import time
|
||||||
|
@ -8,6 +9,7 @@ import traceback
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
from torch.utils.data import DataLoader
|
from torch.utils.data import DataLoader
|
||||||
|
from trainer.generic_utils import count_parameters, remove_experiment_folder
|
||||||
from trainer.io import copy_model_files, save_best_model, save_checkpoint
|
from trainer.io import copy_model_files, save_best_model, save_checkpoint
|
||||||
from trainer.torch import NoamLR
|
from trainer.torch import NoamLR
|
||||||
from trainer.trainer_utils import get_optimizer
|
from trainer.trainer_utils import get_optimizer
|
||||||
|
@ -18,7 +20,7 @@ from TTS.encoder.utils.training import init_training
|
||||||
from TTS.encoder.utils.visual import plot_embeddings
|
from TTS.encoder.utils.visual import plot_embeddings
|
||||||
from TTS.tts.datasets import load_tts_samples
|
from TTS.tts.datasets import load_tts_samples
|
||||||
from TTS.utils.audio import AudioProcessor
|
from TTS.utils.audio import AudioProcessor
|
||||||
from TTS.utils.generic_utils import count_parameters, remove_experiment_folder
|
from TTS.utils.generic_utils import ConsoleFormatter, setup_logger
|
||||||
from TTS.utils.samplers import PerfectBatchSampler
|
from TTS.utils.samplers import PerfectBatchSampler
|
||||||
from TTS.utils.training import check_update
|
from TTS.utils.training import check_update
|
||||||
|
|
||||||
|
@ -31,7 +33,7 @@ print(" > Using CUDA: ", use_cuda)
|
||||||
print(" > Number of GPUs: ", num_gpus)
|
print(" > Number of GPUs: ", num_gpus)
|
||||||
|
|
||||||
|
|
||||||
def setup_loader(ap: AudioProcessor, is_val: bool = False, verbose: bool = False):
|
def setup_loader(ap: AudioProcessor, is_val: bool = False):
|
||||||
num_utter_per_class = c.num_utter_per_class if not is_val else c.eval_num_utter_per_class
|
num_utter_per_class = c.num_utter_per_class if not is_val else c.eval_num_utter_per_class
|
||||||
num_classes_in_batch = c.num_classes_in_batch if not is_val else c.eval_num_classes_in_batch
|
num_classes_in_batch = c.num_classes_in_batch if not is_val else c.eval_num_classes_in_batch
|
||||||
|
|
||||||
|
@ -42,7 +44,6 @@ def setup_loader(ap: AudioProcessor, is_val: bool = False, verbose: bool = False
|
||||||
voice_len=c.voice_len,
|
voice_len=c.voice_len,
|
||||||
num_utter_per_class=num_utter_per_class,
|
num_utter_per_class=num_utter_per_class,
|
||||||
num_classes_in_batch=num_classes_in_batch,
|
num_classes_in_batch=num_classes_in_batch,
|
||||||
verbose=verbose,
|
|
||||||
augmentation_config=c.audio_augmentation if not is_val else None,
|
augmentation_config=c.audio_augmentation if not is_val else None,
|
||||||
use_torch_spec=c.model_params.get("use_torch_spec", False),
|
use_torch_spec=c.model_params.get("use_torch_spec", False),
|
||||||
)
|
)
|
||||||
|
@ -160,9 +161,6 @@ def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader,
|
||||||
loader_time = time.time() - end_time
|
loader_time = time.time() - end_time
|
||||||
global_step += 1
|
global_step += 1
|
||||||
|
|
||||||
# setup lr
|
|
||||||
if c.lr_decay:
|
|
||||||
scheduler.step()
|
|
||||||
optimizer.zero_grad()
|
optimizer.zero_grad()
|
||||||
|
|
||||||
# dispatch data to GPU
|
# dispatch data to GPU
|
||||||
|
@ -181,6 +179,10 @@ def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader,
|
||||||
grad_norm, _ = check_update(model, c.grad_clip)
|
grad_norm, _ = check_update(model, c.grad_clip)
|
||||||
optimizer.step()
|
optimizer.step()
|
||||||
|
|
||||||
|
# setup lr
|
||||||
|
if c.lr_decay:
|
||||||
|
scheduler.step()
|
||||||
|
|
||||||
step_time = time.time() - start_time
|
step_time = time.time() - start_time
|
||||||
epoch_time += step_time
|
epoch_time += step_time
|
||||||
|
|
||||||
|
@ -278,9 +280,9 @@ def main(args): # pylint: disable=redefined-outer-name
|
||||||
# pylint: disable=redefined-outer-name
|
# pylint: disable=redefined-outer-name
|
||||||
meta_data_train, meta_data_eval = load_tts_samples(c.datasets, eval_split=True)
|
meta_data_train, meta_data_eval = load_tts_samples(c.datasets, eval_split=True)
|
||||||
|
|
||||||
train_data_loader, train_classes, map_classid_to_classname = setup_loader(ap, is_val=False, verbose=True)
|
train_data_loader, train_classes, map_classid_to_classname = setup_loader(ap, is_val=False)
|
||||||
if c.run_eval:
|
if c.run_eval:
|
||||||
eval_data_loader, _, _ = setup_loader(ap, is_val=True, verbose=True)
|
eval_data_loader, _, _ = setup_loader(ap, is_val=True)
|
||||||
else:
|
else:
|
||||||
eval_data_loader = None
|
eval_data_loader = None
|
||||||
|
|
||||||
|
@ -316,6 +318,8 @@ def main(args): # pylint: disable=redefined-outer-name
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter())
|
||||||
|
|
||||||
args, c, OUT_PATH, AUDIO_PATH, c_logger, dashboard_logger = init_training()
|
args, c, OUT_PATH, AUDIO_PATH, c_logger, dashboard_logger = init_training()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
|
|
@ -1,3 +1,4 @@
|
||||||
|
import logging
|
||||||
import os
|
import os
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
|
|
||||||
|
@ -6,6 +7,7 @@ from trainer import Trainer, TrainerArgs
|
||||||
from TTS.config import load_config, register_config
|
from TTS.config import load_config, register_config
|
||||||
from TTS.tts.datasets import load_tts_samples
|
from TTS.tts.datasets import load_tts_samples
|
||||||
from TTS.tts.models import setup_model
|
from TTS.tts.models import setup_model
|
||||||
|
from TTS.utils.generic_utils import ConsoleFormatter, setup_logger
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
|
@ -15,6 +17,8 @@ class TrainTTSArgs(TrainerArgs):
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
"""Run `tts` model training directly by a `config.json` file."""
|
"""Run `tts` model training directly by a `config.json` file."""
|
||||||
|
setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter())
|
||||||
|
|
||||||
# init trainer args
|
# init trainer args
|
||||||
train_args = TrainTTSArgs()
|
train_args = TrainTTSArgs()
|
||||||
parser = train_args.init_argparse(arg_prefix="")
|
parser = train_args.init_argparse(arg_prefix="")
|
||||||
|
|
|
@ -1,3 +1,4 @@
|
||||||
|
import logging
|
||||||
import os
|
import os
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
|
|
||||||
|
@ -5,6 +6,7 @@ from trainer import Trainer, TrainerArgs
|
||||||
|
|
||||||
from TTS.config import load_config, register_config
|
from TTS.config import load_config, register_config
|
||||||
from TTS.utils.audio import AudioProcessor
|
from TTS.utils.audio import AudioProcessor
|
||||||
|
from TTS.utils.generic_utils import ConsoleFormatter, setup_logger
|
||||||
from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data
|
from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data
|
||||||
from TTS.vocoder.models import setup_model
|
from TTS.vocoder.models import setup_model
|
||||||
|
|
||||||
|
@ -16,6 +18,8 @@ class TrainVocoderArgs(TrainerArgs):
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
"""Run `tts` model training directly by a `config.json` file."""
|
"""Run `tts` model training directly by a `config.json` file."""
|
||||||
|
setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter())
|
||||||
|
|
||||||
# init trainer args
|
# init trainer args
|
||||||
train_args = TrainVocoderArgs()
|
train_args = TrainVocoderArgs()
|
||||||
parser = train_args.init_argparse(arg_prefix="")
|
parser = train_args.init_argparse(arg_prefix="")
|
||||||
|
|
|
@ -1,5 +1,7 @@
|
||||||
"""Search a good noise schedule for WaveGrad for a given number of inference iterations"""
|
"""Search a good noise schedule for WaveGrad for a given number of inference iterations"""
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
|
import logging
|
||||||
from itertools import product as cartesian_product
|
from itertools import product as cartesian_product
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
@ -9,11 +11,14 @@ from tqdm import tqdm
|
||||||
|
|
||||||
from TTS.config import load_config
|
from TTS.config import load_config
|
||||||
from TTS.utils.audio import AudioProcessor
|
from TTS.utils.audio import AudioProcessor
|
||||||
|
from TTS.utils.generic_utils import ConsoleFormatter, setup_logger
|
||||||
from TTS.vocoder.datasets.preprocess import load_wav_data
|
from TTS.vocoder.datasets.preprocess import load_wav_data
|
||||||
from TTS.vocoder.datasets.wavegrad_dataset import WaveGradDataset
|
from TTS.vocoder.datasets.wavegrad_dataset import WaveGradDataset
|
||||||
from TTS.vocoder.models import setup_model
|
from TTS.vocoder.models import setup_model
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter())
|
||||||
|
|
||||||
parser = argparse.ArgumentParser()
|
parser = argparse.ArgumentParser()
|
||||||
parser.add_argument("--model_path", type=str, help="Path to model checkpoint.")
|
parser.add_argument("--model_path", type=str, help="Path to model checkpoint.")
|
||||||
parser.add_argument("--config_path", type=str, help="Path to model config file.")
|
parser.add_argument("--config_path", type=str, help="Path to model config file.")
|
||||||
|
@ -54,7 +59,6 @@ if __name__ == "__main__":
|
||||||
return_segments=False,
|
return_segments=False,
|
||||||
use_noise_augment=False,
|
use_noise_augment=False,
|
||||||
use_cache=False,
|
use_cache=False,
|
||||||
verbose=True,
|
|
||||||
)
|
)
|
||||||
loader = DataLoader(
|
loader = DataLoader(
|
||||||
dataset,
|
dataset,
|
||||||
|
|
|
@ -17,9 +17,12 @@ def read_json_with_comments(json_path):
|
||||||
with fsspec.open(json_path, "r", encoding="utf-8") as f:
|
with fsspec.open(json_path, "r", encoding="utf-8") as f:
|
||||||
input_str = f.read()
|
input_str = f.read()
|
||||||
# handle comments but not urls with //
|
# handle comments but not urls with //
|
||||||
input_str = re.sub(r"(\"(?:[^\"\\]|\\.)*\")|(/\*(?:.|[\\n\\r])*?\*/)|(//.*)", lambda m: m.group(1) or m.group(2) or "", input_str)
|
input_str = re.sub(
|
||||||
|
r"(\"(?:[^\"\\]|\\.)*\")|(/\*(?:.|[\\n\\r])*?\*/)|(//.*)", lambda m: m.group(1) or m.group(2) or "", input_str
|
||||||
|
)
|
||||||
return json.loads(input_str)
|
return json.loads(input_str)
|
||||||
|
|
||||||
|
|
||||||
def register_config(model_name: str) -> Coqpit:
|
def register_config(model_name: str) -> Coqpit:
|
||||||
"""Find the right config for the given model name.
|
"""Find the right config for the given model name.
|
||||||
|
|
||||||
|
|
|
@ -1,2 +1,2 @@
|
||||||
faster_whisper==0.9.0
|
faster_whisper==0.9.0
|
||||||
gradio==4.7.1
|
gradio==4.7.1
|
||||||
|
|
|
@ -1,23 +1,17 @@
|
||||||
import os
|
|
||||||
import gc
|
import gc
|
||||||
import torchaudio
|
import os
|
||||||
|
|
||||||
import pandas
|
import pandas
|
||||||
from faster_whisper import WhisperModel
|
|
||||||
from glob import glob
|
|
||||||
|
|
||||||
from tqdm import tqdm
|
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
import torchaudio
|
import torchaudio
|
||||||
# torch.set_num_threads(1)
|
from faster_whisper import WhisperModel
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
# torch.set_num_threads(1)
|
||||||
from TTS.tts.layers.xtts.tokenizer import multilingual_cleaners
|
from TTS.tts.layers.xtts.tokenizer import multilingual_cleaners
|
||||||
|
|
||||||
torch.set_num_threads(16)
|
torch.set_num_threads(16)
|
||||||
|
|
||||||
|
|
||||||
import os
|
|
||||||
|
|
||||||
audio_types = (".wav", ".mp3", ".flac")
|
audio_types = (".wav", ".mp3", ".flac")
|
||||||
|
|
||||||
|
|
||||||
|
@ -25,9 +19,10 @@ def list_audios(basePath, contains=None):
|
||||||
# return the set of files that are valid
|
# return the set of files that are valid
|
||||||
return list_files(basePath, validExts=audio_types, contains=contains)
|
return list_files(basePath, validExts=audio_types, contains=contains)
|
||||||
|
|
||||||
|
|
||||||
def list_files(basePath, validExts=None, contains=None):
|
def list_files(basePath, validExts=None, contains=None):
|
||||||
# loop over the directory structure
|
# loop over the directory structure
|
||||||
for (rootDir, dirNames, filenames) in os.walk(basePath):
|
for rootDir, dirNames, filenames in os.walk(basePath):
|
||||||
# loop over the filenames in the current directory
|
# loop over the filenames in the current directory
|
||||||
for filename in filenames:
|
for filename in filenames:
|
||||||
# if the contains string is not none and the filename does not contain
|
# if the contains string is not none and the filename does not contain
|
||||||
|
@ -36,7 +31,7 @@ def list_files(basePath, validExts=None, contains=None):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# determine the file extension of the current file
|
# determine the file extension of the current file
|
||||||
ext = filename[filename.rfind("."):].lower()
|
ext = filename[filename.rfind(".") :].lower()
|
||||||
|
|
||||||
# check to see if the file is an audio and should be processed
|
# check to see if the file is an audio and should be processed
|
||||||
if validExts is None or ext.endswith(validExts):
|
if validExts is None or ext.endswith(validExts):
|
||||||
|
@ -44,13 +39,22 @@ def list_files(basePath, validExts=None, contains=None):
|
||||||
audioPath = os.path.join(rootDir, filename)
|
audioPath = os.path.join(rootDir, filename)
|
||||||
yield audioPath
|
yield audioPath
|
||||||
|
|
||||||
def format_audio_list(audio_files, target_language="en", out_path=None, buffer=0.2, eval_percentage=0.15, speaker_name="coqui", gradio_progress=None):
|
|
||||||
|
def format_audio_list(
|
||||||
|
audio_files,
|
||||||
|
target_language="en",
|
||||||
|
out_path=None,
|
||||||
|
buffer=0.2,
|
||||||
|
eval_percentage=0.15,
|
||||||
|
speaker_name="coqui",
|
||||||
|
gradio_progress=None,
|
||||||
|
):
|
||||||
audio_total_size = 0
|
audio_total_size = 0
|
||||||
# make sure that the output directory exists
|
# make sure that the output directory exists
|
||||||
os.makedirs(out_path, exist_ok=True)
|
os.makedirs(out_path, exist_ok=True)
|
||||||
|
|
||||||
# Loading Whisper
|
# Loading Whisper
|
||||||
device = "cuda" if torch.cuda.is_available() else "cpu"
|
device = "cuda" if torch.cuda.is_available() else "cpu"
|
||||||
|
|
||||||
print("Loading Whisper Model!")
|
print("Loading Whisper Model!")
|
||||||
asr_model = WhisperModel("large-v2", device=device, compute_type="float16")
|
asr_model = WhisperModel("large-v2", device=device, compute_type="float16")
|
||||||
|
@ -69,7 +73,7 @@ def format_audio_list(audio_files, target_language="en", out_path=None, buffer=0
|
||||||
wav = torch.mean(wav, dim=0, keepdim=True)
|
wav = torch.mean(wav, dim=0, keepdim=True)
|
||||||
|
|
||||||
wav = wav.squeeze()
|
wav = wav.squeeze()
|
||||||
audio_total_size += (wav.size(-1) / sr)
|
audio_total_size += wav.size(-1) / sr
|
||||||
|
|
||||||
segments, _ = asr_model.transcribe(audio_path, word_timestamps=True, language=target_language)
|
segments, _ = asr_model.transcribe(audio_path, word_timestamps=True, language=target_language)
|
||||||
segments = list(segments)
|
segments = list(segments)
|
||||||
|
@ -94,7 +98,7 @@ def format_audio_list(audio_files, target_language="en", out_path=None, buffer=0
|
||||||
# get previous sentence end
|
# get previous sentence end
|
||||||
previous_word_end = words_list[word_idx - 1].end
|
previous_word_end = words_list[word_idx - 1].end
|
||||||
# add buffer or take the midpoint of the silence between the previous sentence and the current one
|
# add buffer or take the midpoint of the silence between the previous sentence and the current one
|
||||||
sentence_start = max(sentence_start - buffer, (previous_word_end + sentence_start)/2)
|
sentence_start = max(sentence_start - buffer, (previous_word_end + sentence_start) / 2)
|
||||||
|
|
||||||
sentence = word.word
|
sentence = word.word
|
||||||
first_word = False
|
first_word = False
|
||||||
|
@ -118,19 +122,16 @@ def format_audio_list(audio_files, target_language="en", out_path=None, buffer=0
|
||||||
|
|
||||||
# Average the current word end and next word start
|
# Average the current word end and next word start
|
||||||
word_end = min((word.end + next_word_start) / 2, word.end + buffer)
|
word_end = min((word.end + next_word_start) / 2, word.end + buffer)
|
||||||
|
|
||||||
absoulte_path = os.path.join(out_path, audio_file)
|
absoulte_path = os.path.join(out_path, audio_file)
|
||||||
os.makedirs(os.path.dirname(absoulte_path), exist_ok=True)
|
os.makedirs(os.path.dirname(absoulte_path), exist_ok=True)
|
||||||
i += 1
|
i += 1
|
||||||
first_word = True
|
first_word = True
|
||||||
|
|
||||||
audio = wav[int(sr*sentence_start):int(sr*word_end)].unsqueeze(0)
|
audio = wav[int(sr * sentence_start) : int(sr * word_end)].unsqueeze(0)
|
||||||
# if the audio is too short ignore it (i.e < 0.33 seconds)
|
# if the audio is too short ignore it (i.e < 0.33 seconds)
|
||||||
if audio.size(-1) >= sr/3:
|
if audio.size(-1) >= sr / 3:
|
||||||
torchaudio.save(absoulte_path,
|
torchaudio.save(absoulte_path, audio, sr)
|
||||||
audio,
|
|
||||||
sr
|
|
||||||
)
|
|
||||||
else:
|
else:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
@ -140,21 +141,21 @@ def format_audio_list(audio_files, target_language="en", out_path=None, buffer=0
|
||||||
|
|
||||||
df = pandas.DataFrame(metadata)
|
df = pandas.DataFrame(metadata)
|
||||||
df = df.sample(frac=1)
|
df = df.sample(frac=1)
|
||||||
num_val_samples = int(len(df)*eval_percentage)
|
num_val_samples = int(len(df) * eval_percentage)
|
||||||
|
|
||||||
df_eval = df[:num_val_samples]
|
df_eval = df[:num_val_samples]
|
||||||
df_train = df[num_val_samples:]
|
df_train = df[num_val_samples:]
|
||||||
|
|
||||||
df_train = df_train.sort_values('audio_file')
|
df_train = df_train.sort_values("audio_file")
|
||||||
train_metadata_path = os.path.join(out_path, "metadata_train.csv")
|
train_metadata_path = os.path.join(out_path, "metadata_train.csv")
|
||||||
df_train.to_csv(train_metadata_path, sep="|", index=False)
|
df_train.to_csv(train_metadata_path, sep="|", index=False)
|
||||||
|
|
||||||
eval_metadata_path = os.path.join(out_path, "metadata_eval.csv")
|
eval_metadata_path = os.path.join(out_path, "metadata_eval.csv")
|
||||||
df_eval = df_eval.sort_values('audio_file')
|
df_eval = df_eval.sort_values("audio_file")
|
||||||
df_eval.to_csv(eval_metadata_path, sep="|", index=False)
|
df_eval.to_csv(eval_metadata_path, sep="|", index=False)
|
||||||
|
|
||||||
# deallocate VRAM and RAM
|
# deallocate VRAM and RAM
|
||||||
del asr_model, df_train, df_eval, df, metadata
|
del asr_model, df_train, df_eval, df, metadata
|
||||||
gc.collect()
|
gc.collect()
|
||||||
|
|
||||||
return train_metadata_path, eval_metadata_path, audio_total_size
|
return train_metadata_path, eval_metadata_path, audio_total_size
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
import os
|
|
||||||
import gc
|
import gc
|
||||||
|
import os
|
||||||
|
|
||||||
from trainer import Trainer, TrainerArgs
|
from trainer import Trainer, TrainerArgs
|
||||||
|
|
||||||
|
@ -25,7 +25,6 @@ def train_gpt(language, num_epochs, batch_size, grad_acumm, train_csv, eval_csv,
|
||||||
BATCH_SIZE = batch_size # set here the batch size
|
BATCH_SIZE = batch_size # set here the batch size
|
||||||
GRAD_ACUMM_STEPS = grad_acumm # set here the grad accumulation steps
|
GRAD_ACUMM_STEPS = grad_acumm # set here the grad accumulation steps
|
||||||
|
|
||||||
|
|
||||||
# Define here the dataset that you want to use for the fine-tuning on.
|
# Define here the dataset that you want to use for the fine-tuning on.
|
||||||
config_dataset = BaseDatasetConfig(
|
config_dataset = BaseDatasetConfig(
|
||||||
formatter="coqui",
|
formatter="coqui",
|
||||||
|
@ -43,7 +42,6 @@ def train_gpt(language, num_epochs, batch_size, grad_acumm, train_csv, eval_csv,
|
||||||
CHECKPOINTS_OUT_PATH = os.path.join(OUT_PATH, "XTTS_v2.0_original_model_files/")
|
CHECKPOINTS_OUT_PATH = os.path.join(OUT_PATH, "XTTS_v2.0_original_model_files/")
|
||||||
os.makedirs(CHECKPOINTS_OUT_PATH, exist_ok=True)
|
os.makedirs(CHECKPOINTS_OUT_PATH, exist_ok=True)
|
||||||
|
|
||||||
|
|
||||||
# DVAE files
|
# DVAE files
|
||||||
DVAE_CHECKPOINT_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/dvae.pth"
|
DVAE_CHECKPOINT_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/dvae.pth"
|
||||||
MEL_NORM_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/mel_stats.pth"
|
MEL_NORM_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/mel_stats.pth"
|
||||||
|
@ -55,8 +53,9 @@ def train_gpt(language, num_epochs, batch_size, grad_acumm, train_csv, eval_csv,
|
||||||
# download DVAE files if needed
|
# download DVAE files if needed
|
||||||
if not os.path.isfile(DVAE_CHECKPOINT) or not os.path.isfile(MEL_NORM_FILE):
|
if not os.path.isfile(DVAE_CHECKPOINT) or not os.path.isfile(MEL_NORM_FILE):
|
||||||
print(" > Downloading DVAE files!")
|
print(" > Downloading DVAE files!")
|
||||||
ModelManager._download_model_files([MEL_NORM_LINK, DVAE_CHECKPOINT_LINK], CHECKPOINTS_OUT_PATH, progress_bar=True)
|
ModelManager._download_model_files(
|
||||||
|
[MEL_NORM_LINK, DVAE_CHECKPOINT_LINK], CHECKPOINTS_OUT_PATH, progress_bar=True
|
||||||
|
)
|
||||||
|
|
||||||
# Download XTTS v2.0 checkpoint if needed
|
# Download XTTS v2.0 checkpoint if needed
|
||||||
TOKENIZER_FILE_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/vocab.json"
|
TOKENIZER_FILE_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/vocab.json"
|
||||||
|
@ -160,7 +159,7 @@ def train_gpt(language, num_epochs, batch_size, grad_acumm, train_csv, eval_csv,
|
||||||
|
|
||||||
# get the longest text audio file to use as speaker reference
|
# get the longest text audio file to use as speaker reference
|
||||||
samples_len = [len(item["text"].split(" ")) for item in train_samples]
|
samples_len = [len(item["text"].split(" ")) for item in train_samples]
|
||||||
longest_text_idx = samples_len.index(max(samples_len))
|
longest_text_idx = samples_len.index(max(samples_len))
|
||||||
speaker_ref = train_samples[longest_text_idx]["audio_file"]
|
speaker_ref = train_samples[longest_text_idx]["audio_file"]
|
||||||
|
|
||||||
trainer_out_path = trainer.output_path
|
trainer_out_path = trainer.output_path
|
||||||
|
|
|
@ -1,19 +1,16 @@
|
||||||
import argparse
|
import argparse
|
||||||
|
import logging
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
import tempfile
|
import tempfile
|
||||||
|
import traceback
|
||||||
|
|
||||||
import gradio as gr
|
import gradio as gr
|
||||||
import librosa.display
|
|
||||||
import numpy as np
|
|
||||||
|
|
||||||
import os
|
|
||||||
import torch
|
import torch
|
||||||
import torchaudio
|
import torchaudio
|
||||||
import traceback
|
|
||||||
from TTS.demos.xtts_ft_demo.utils.formatter import format_audio_list
|
from TTS.demos.xtts_ft_demo.utils.formatter import format_audio_list
|
||||||
from TTS.demos.xtts_ft_demo.utils.gpt_train import train_gpt
|
from TTS.demos.xtts_ft_demo.utils.gpt_train import train_gpt
|
||||||
|
|
||||||
from TTS.tts.configs.xtts_config import XttsConfig
|
from TTS.tts.configs.xtts_config import XttsConfig
|
||||||
from TTS.tts.models.xtts import Xtts
|
from TTS.tts.models.xtts import Xtts
|
||||||
|
|
||||||
|
@ -23,7 +20,10 @@ def clear_gpu_cache():
|
||||||
if torch.cuda.is_available():
|
if torch.cuda.is_available():
|
||||||
torch.cuda.empty_cache()
|
torch.cuda.empty_cache()
|
||||||
|
|
||||||
|
|
||||||
XTTS_MODEL = None
|
XTTS_MODEL = None
|
||||||
|
|
||||||
|
|
||||||
def load_model(xtts_checkpoint, xtts_config, xtts_vocab):
|
def load_model(xtts_checkpoint, xtts_config, xtts_vocab):
|
||||||
global XTTS_MODEL
|
global XTTS_MODEL
|
||||||
clear_gpu_cache()
|
clear_gpu_cache()
|
||||||
|
@ -40,17 +40,23 @@ def load_model(xtts_checkpoint, xtts_config, xtts_vocab):
|
||||||
print("Model Loaded!")
|
print("Model Loaded!")
|
||||||
return "Model Loaded!"
|
return "Model Loaded!"
|
||||||
|
|
||||||
|
|
||||||
def run_tts(lang, tts_text, speaker_audio_file):
|
def run_tts(lang, tts_text, speaker_audio_file):
|
||||||
if XTTS_MODEL is None or not speaker_audio_file:
|
if XTTS_MODEL is None or not speaker_audio_file:
|
||||||
return "You need to run the previous step to load the model !!", None, None
|
return "You need to run the previous step to load the model !!", None, None
|
||||||
|
|
||||||
gpt_cond_latent, speaker_embedding = XTTS_MODEL.get_conditioning_latents(audio_path=speaker_audio_file, gpt_cond_len=XTTS_MODEL.config.gpt_cond_len, max_ref_length=XTTS_MODEL.config.max_ref_len, sound_norm_refs=XTTS_MODEL.config.sound_norm_refs)
|
gpt_cond_latent, speaker_embedding = XTTS_MODEL.get_conditioning_latents(
|
||||||
|
audio_path=speaker_audio_file,
|
||||||
|
gpt_cond_len=XTTS_MODEL.config.gpt_cond_len,
|
||||||
|
max_ref_length=XTTS_MODEL.config.max_ref_len,
|
||||||
|
sound_norm_refs=XTTS_MODEL.config.sound_norm_refs,
|
||||||
|
)
|
||||||
out = XTTS_MODEL.inference(
|
out = XTTS_MODEL.inference(
|
||||||
text=tts_text,
|
text=tts_text,
|
||||||
language=lang,
|
language=lang,
|
||||||
gpt_cond_latent=gpt_cond_latent,
|
gpt_cond_latent=gpt_cond_latent,
|
||||||
speaker_embedding=speaker_embedding,
|
speaker_embedding=speaker_embedding,
|
||||||
temperature=XTTS_MODEL.config.temperature, # Add custom parameters here
|
temperature=XTTS_MODEL.config.temperature, # Add custom parameters here
|
||||||
length_penalty=XTTS_MODEL.config.length_penalty,
|
length_penalty=XTTS_MODEL.config.length_penalty,
|
||||||
repetition_penalty=XTTS_MODEL.config.repetition_penalty,
|
repetition_penalty=XTTS_MODEL.config.repetition_penalty,
|
||||||
top_k=XTTS_MODEL.config.top_k,
|
top_k=XTTS_MODEL.config.top_k,
|
||||||
|
@ -65,9 +71,7 @@ def run_tts(lang, tts_text, speaker_audio_file):
|
||||||
return "Speech generated !", out_path, speaker_audio_file
|
return "Speech generated !", out_path, speaker_audio_file
|
||||||
|
|
||||||
|
|
||||||
|
# define a logger to redirect
|
||||||
|
|
||||||
# define a logger to redirect
|
|
||||||
class Logger:
|
class Logger:
|
||||||
def __init__(self, filename="log.out"):
|
def __init__(self, filename="log.out"):
|
||||||
self.log_file = filename
|
self.log_file = filename
|
||||||
|
@ -85,21 +89,19 @@ class Logger:
|
||||||
def isatty(self):
|
def isatty(self):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
# redirect stdout and stderr to a file
|
# redirect stdout and stderr to a file
|
||||||
sys.stdout = Logger()
|
sys.stdout = Logger()
|
||||||
sys.stderr = sys.stdout
|
sys.stderr = sys.stdout
|
||||||
|
|
||||||
|
|
||||||
# logging.basicConfig(stream=sys.stdout, level=logging.INFO)
|
# logging.basicConfig(stream=sys.stdout, level=logging.INFO)
|
||||||
import logging
|
|
||||||
logging.basicConfig(
|
logging.basicConfig(
|
||||||
level=logging.INFO,
|
level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s", handlers=[logging.StreamHandler(sys.stdout)]
|
||||||
format="%(asctime)s [%(levelname)s] %(message)s",
|
|
||||||
handlers=[
|
|
||||||
logging.StreamHandler(sys.stdout)
|
|
||||||
]
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def read_logs():
|
def read_logs():
|
||||||
sys.stdout.flush()
|
sys.stdout.flush()
|
||||||
with open(sys.stdout.log_file, "r") as f:
|
with open(sys.stdout.log_file, "r") as f:
|
||||||
|
@ -107,12 +109,11 @@ def read_logs():
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
||||||
parser = argparse.ArgumentParser(
|
parser = argparse.ArgumentParser(
|
||||||
description="""XTTS fine-tuning demo\n\n"""
|
description="""XTTS fine-tuning demo\n\n"""
|
||||||
"""
|
"""
|
||||||
Example runs:
|
Example runs:
|
||||||
python3 TTS/demos/xtts_ft_demo/xtts_demo.py --port
|
python3 TTS/demos/xtts_ft_demo/xtts_demo.py --port
|
||||||
""",
|
""",
|
||||||
formatter_class=argparse.RawTextHelpFormatter,
|
formatter_class=argparse.RawTextHelpFormatter,
|
||||||
)
|
)
|
||||||
|
@ -190,12 +191,11 @@ if __name__ == "__main__":
|
||||||
"zh",
|
"zh",
|
||||||
"hu",
|
"hu",
|
||||||
"ko",
|
"ko",
|
||||||
"ja"
|
"ja",
|
||||||
|
"hi",
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
progress_data = gr.Label(
|
progress_data = gr.Label(label="Progress:")
|
||||||
label="Progress:"
|
|
||||||
)
|
|
||||||
logs = gr.Textbox(
|
logs = gr.Textbox(
|
||||||
label="Logs:",
|
label="Logs:",
|
||||||
interactive=False,
|
interactive=False,
|
||||||
|
@ -203,20 +203,30 @@ if __name__ == "__main__":
|
||||||
demo.load(read_logs, None, logs, every=1)
|
demo.load(read_logs, None, logs, every=1)
|
||||||
|
|
||||||
prompt_compute_btn = gr.Button(value="Step 1 - Create dataset")
|
prompt_compute_btn = gr.Button(value="Step 1 - Create dataset")
|
||||||
|
|
||||||
def preprocess_dataset(audio_path, language, out_path, progress=gr.Progress(track_tqdm=True)):
|
def preprocess_dataset(audio_path, language, out_path, progress=gr.Progress(track_tqdm=True)):
|
||||||
clear_gpu_cache()
|
clear_gpu_cache()
|
||||||
out_path = os.path.join(out_path, "dataset")
|
out_path = os.path.join(out_path, "dataset")
|
||||||
os.makedirs(out_path, exist_ok=True)
|
os.makedirs(out_path, exist_ok=True)
|
||||||
if audio_path is None:
|
if audio_path is None:
|
||||||
return "You should provide one or multiple audio files! If you provided it, probably the upload of the files is not finished yet!", "", ""
|
return (
|
||||||
|
"You should provide one or multiple audio files! If you provided it, probably the upload of the files is not finished yet!",
|
||||||
|
"",
|
||||||
|
"",
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
try:
|
try:
|
||||||
train_meta, eval_meta, audio_total_size = format_audio_list(audio_path, target_language=language, out_path=out_path, gradio_progress=progress)
|
train_meta, eval_meta, audio_total_size = format_audio_list(
|
||||||
|
audio_path, target_language=language, out_path=out_path, gradio_progress=progress
|
||||||
|
)
|
||||||
except:
|
except:
|
||||||
traceback.print_exc()
|
traceback.print_exc()
|
||||||
error = traceback.format_exc()
|
error = traceback.format_exc()
|
||||||
return f"The data processing was interrupted due an error !! Please check the console to verify the full error message! \n Error summary: {error}", "", ""
|
return (
|
||||||
|
f"The data processing was interrupted due an error !! Please check the console to verify the full error message! \n Error summary: {error}",
|
||||||
|
"",
|
||||||
|
"",
|
||||||
|
)
|
||||||
|
|
||||||
clear_gpu_cache()
|
clear_gpu_cache()
|
||||||
|
|
||||||
|
@ -236,7 +246,7 @@ if __name__ == "__main__":
|
||||||
eval_csv = gr.Textbox(
|
eval_csv = gr.Textbox(
|
||||||
label="Eval CSV:",
|
label="Eval CSV:",
|
||||||
)
|
)
|
||||||
num_epochs = gr.Slider(
|
num_epochs = gr.Slider(
|
||||||
label="Number of epochs:",
|
label="Number of epochs:",
|
||||||
minimum=1,
|
minimum=1,
|
||||||
maximum=100,
|
maximum=100,
|
||||||
|
@ -264,9 +274,7 @@ if __name__ == "__main__":
|
||||||
step=1,
|
step=1,
|
||||||
value=args.max_audio_length,
|
value=args.max_audio_length,
|
||||||
)
|
)
|
||||||
progress_train = gr.Label(
|
progress_train = gr.Label(label="Progress:")
|
||||||
label="Progress:"
|
|
||||||
)
|
|
||||||
logs_tts_train = gr.Textbox(
|
logs_tts_train = gr.Textbox(
|
||||||
label="Logs:",
|
label="Logs:",
|
||||||
interactive=False,
|
interactive=False,
|
||||||
|
@ -274,18 +282,41 @@ if __name__ == "__main__":
|
||||||
demo.load(read_logs, None, logs_tts_train, every=1)
|
demo.load(read_logs, None, logs_tts_train, every=1)
|
||||||
train_btn = gr.Button(value="Step 2 - Run the training")
|
train_btn = gr.Button(value="Step 2 - Run the training")
|
||||||
|
|
||||||
def train_model(language, train_csv, eval_csv, num_epochs, batch_size, grad_acumm, output_path, max_audio_length):
|
def train_model(
|
||||||
|
language, train_csv, eval_csv, num_epochs, batch_size, grad_acumm, output_path, max_audio_length
|
||||||
|
):
|
||||||
clear_gpu_cache()
|
clear_gpu_cache()
|
||||||
if not train_csv or not eval_csv:
|
if not train_csv or not eval_csv:
|
||||||
return "You need to run the data processing step or manually set `Train CSV` and `Eval CSV` fields !", "", "", "", ""
|
return (
|
||||||
|
"You need to run the data processing step or manually set `Train CSV` and `Eval CSV` fields !",
|
||||||
|
"",
|
||||||
|
"",
|
||||||
|
"",
|
||||||
|
"",
|
||||||
|
)
|
||||||
try:
|
try:
|
||||||
# convert seconds to waveform frames
|
# convert seconds to waveform frames
|
||||||
max_audio_length = int(max_audio_length * 22050)
|
max_audio_length = int(max_audio_length * 22050)
|
||||||
config_path, original_xtts_checkpoint, vocab_file, exp_path, speaker_wav = train_gpt(language, num_epochs, batch_size, grad_acumm, train_csv, eval_csv, output_path=output_path, max_audio_length=max_audio_length)
|
config_path, original_xtts_checkpoint, vocab_file, exp_path, speaker_wav = train_gpt(
|
||||||
|
language,
|
||||||
|
num_epochs,
|
||||||
|
batch_size,
|
||||||
|
grad_acumm,
|
||||||
|
train_csv,
|
||||||
|
eval_csv,
|
||||||
|
output_path=output_path,
|
||||||
|
max_audio_length=max_audio_length,
|
||||||
|
)
|
||||||
except:
|
except:
|
||||||
traceback.print_exc()
|
traceback.print_exc()
|
||||||
error = traceback.format_exc()
|
error = traceback.format_exc()
|
||||||
return f"The training was interrupted due an error !! Please check the console to check the full error message! \n Error summary: {error}", "", "", "", ""
|
return (
|
||||||
|
f"The training was interrupted due an error !! Please check the console to check the full error message! \n Error summary: {error}",
|
||||||
|
"",
|
||||||
|
"",
|
||||||
|
"",
|
||||||
|
"",
|
||||||
|
)
|
||||||
|
|
||||||
# copy original files to avoid parameters changes issues
|
# copy original files to avoid parameters changes issues
|
||||||
os.system(f"cp {config_path} {exp_path}")
|
os.system(f"cp {config_path} {exp_path}")
|
||||||
|
@ -312,9 +343,7 @@ if __name__ == "__main__":
|
||||||
label="XTTS vocab path:",
|
label="XTTS vocab path:",
|
||||||
value="",
|
value="",
|
||||||
)
|
)
|
||||||
progress_load = gr.Label(
|
progress_load = gr.Label(label="Progress:")
|
||||||
label="Progress:"
|
|
||||||
)
|
|
||||||
load_btn = gr.Button(value="Step 3 - Load Fine-tuned XTTS model")
|
load_btn = gr.Button(value="Step 3 - Load Fine-tuned XTTS model")
|
||||||
|
|
||||||
with gr.Column() as col2:
|
with gr.Column() as col2:
|
||||||
|
@ -342,7 +371,8 @@ if __name__ == "__main__":
|
||||||
"hu",
|
"hu",
|
||||||
"ko",
|
"ko",
|
||||||
"ja",
|
"ja",
|
||||||
]
|
"hi",
|
||||||
|
],
|
||||||
)
|
)
|
||||||
tts_text = gr.Textbox(
|
tts_text = gr.Textbox(
|
||||||
label="Input Text.",
|
label="Input Text.",
|
||||||
|
@ -351,9 +381,7 @@ if __name__ == "__main__":
|
||||||
tts_btn = gr.Button(value="Step 4 - Inference")
|
tts_btn = gr.Button(value="Step 4 - Inference")
|
||||||
|
|
||||||
with gr.Column() as col3:
|
with gr.Column() as col3:
|
||||||
progress_gen = gr.Label(
|
progress_gen = gr.Label(label="Progress:")
|
||||||
label="Progress:"
|
|
||||||
)
|
|
||||||
tts_output_audio = gr.Audio(label="Generated Audio.")
|
tts_output_audio = gr.Audio(label="Generated Audio.")
|
||||||
reference_audio = gr.Audio(label="Reference audio used.")
|
reference_audio = gr.Audio(label="Reference audio used.")
|
||||||
|
|
||||||
|
@ -371,7 +399,6 @@ if __name__ == "__main__":
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
train_btn.click(
|
train_btn.click(
|
||||||
fn=train_model,
|
fn=train_model,
|
||||||
inputs=[
|
inputs=[
|
||||||
|
@ -386,14 +413,10 @@ if __name__ == "__main__":
|
||||||
],
|
],
|
||||||
outputs=[progress_train, xtts_config, xtts_vocab, xtts_checkpoint, speaker_reference_audio],
|
outputs=[progress_train, xtts_config, xtts_vocab, xtts_checkpoint, speaker_reference_audio],
|
||||||
)
|
)
|
||||||
|
|
||||||
load_btn.click(
|
load_btn.click(
|
||||||
fn=load_model,
|
fn=load_model,
|
||||||
inputs=[
|
inputs=[xtts_checkpoint, xtts_config, xtts_vocab],
|
||||||
xtts_checkpoint,
|
|
||||||
xtts_config,
|
|
||||||
xtts_vocab
|
|
||||||
],
|
|
||||||
outputs=[progress_load],
|
outputs=[progress_load],
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -407,9 +430,4 @@ if __name__ == "__main__":
|
||||||
outputs=[progress_gen, tts_output_audio, reference_audio],
|
outputs=[progress_gen, tts_output_audio, reference_audio],
|
||||||
)
|
)
|
||||||
|
|
||||||
demo.launch(
|
demo.launch(share=True, debug=False, server_port=args.port, server_name="0.0.0.0")
|
||||||
share=True,
|
|
||||||
debug=False,
|
|
||||||
server_port=args.port,
|
|
||||||
server_name="0.0.0.0"
|
|
||||||
)
|
|
||||||
|
|
|
@ -14,5 +14,5 @@ To run the code, you need to follow the same flow as in TTS.
|
||||||
|
|
||||||
- Define 'config.json' for your needs. Note that the audio parameters should match those of your TTS model.
|
- Define 'config.json' for your needs. Note that the audio parameters should match those of your TTS model.
|
||||||
- Example training call ```python speaker_encoder/train.py --config_path speaker_encoder/config.json --data_path ~/Data/Libri-TTS/train-clean-360```
|
- Example training call ```python speaker_encoder/train.py --config_path speaker_encoder/config.json --data_path ~/Data/Libri-TTS/train-clean-360```
|
||||||
- Generate embedding vectors ```python speaker_encoder/compute_embeddings.py --use_cuda true /model/path/best_model.pth model/config/path/config.json dataset/path/ output_path``` . This code parses all .wav files at the given dataset path and generates the same folder structure under the output path with the generated embedding files.
|
- Generate embedding vectors ```python speaker_encoder/compute_embeddings.py --use_cuda /model/path/best_model.pth model/config/path/config.json dataset/path/ output_path``` . This code parses all .wav files at the given dataset path and generates the same folder structure under the output path with the generated embedding files.
|
||||||
- Watch training on Tensorboard as in TTS
|
- Watch training on Tensorboard as in TTS
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
from dataclasses import asdict, dataclass
|
from dataclasses import dataclass
|
||||||
|
|
||||||
from TTS.encoder.configs.base_encoder_config import BaseEncoderConfig
|
from TTS.encoder.configs.base_encoder_config import BaseEncoderConfig
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
from dataclasses import asdict, dataclass
|
from dataclasses import dataclass
|
||||||
|
|
||||||
from TTS.encoder.configs.base_encoder_config import BaseEncoderConfig
|
from TTS.encoder.configs.base_encoder_config import BaseEncoderConfig
|
||||||
|
|
||||||
|
|
|
@ -1,3 +1,4 @@
|
||||||
|
import logging
|
||||||
import random
|
import random
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
|
@ -5,6 +6,8 @@ from torch.utils.data import Dataset
|
||||||
|
|
||||||
from TTS.encoder.utils.generic_utils import AugmentWAV
|
from TTS.encoder.utils.generic_utils import AugmentWAV
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class EncoderDataset(Dataset):
|
class EncoderDataset(Dataset):
|
||||||
def __init__(
|
def __init__(
|
||||||
|
@ -15,7 +18,6 @@ class EncoderDataset(Dataset):
|
||||||
voice_len=1.6,
|
voice_len=1.6,
|
||||||
num_classes_in_batch=64,
|
num_classes_in_batch=64,
|
||||||
num_utter_per_class=10,
|
num_utter_per_class=10,
|
||||||
verbose=False,
|
|
||||||
augmentation_config=None,
|
augmentation_config=None,
|
||||||
use_torch_spec=None,
|
use_torch_spec=None,
|
||||||
):
|
):
|
||||||
|
@ -24,7 +26,6 @@ class EncoderDataset(Dataset):
|
||||||
ap (TTS.tts.utils.AudioProcessor): audio processor object.
|
ap (TTS.tts.utils.AudioProcessor): audio processor object.
|
||||||
meta_data (list): list of dataset instances.
|
meta_data (list): list of dataset instances.
|
||||||
seq_len (int): voice segment length in seconds.
|
seq_len (int): voice segment length in seconds.
|
||||||
verbose (bool): print diagnostic information.
|
|
||||||
"""
|
"""
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.config = config
|
self.config = config
|
||||||
|
@ -33,7 +34,6 @@ class EncoderDataset(Dataset):
|
||||||
self.seq_len = int(voice_len * self.sample_rate)
|
self.seq_len = int(voice_len * self.sample_rate)
|
||||||
self.num_utter_per_class = num_utter_per_class
|
self.num_utter_per_class = num_utter_per_class
|
||||||
self.ap = ap
|
self.ap = ap
|
||||||
self.verbose = verbose
|
|
||||||
self.use_torch_spec = use_torch_spec
|
self.use_torch_spec = use_torch_spec
|
||||||
self.classes, self.items = self.__parse_items()
|
self.classes, self.items = self.__parse_items()
|
||||||
|
|
||||||
|
@ -50,13 +50,12 @@ class EncoderDataset(Dataset):
|
||||||
if "gaussian" in augmentation_config.keys():
|
if "gaussian" in augmentation_config.keys():
|
||||||
self.gaussian_augmentation_config = augmentation_config["gaussian"]
|
self.gaussian_augmentation_config = augmentation_config["gaussian"]
|
||||||
|
|
||||||
if self.verbose:
|
logger.info("DataLoader initialization")
|
||||||
print("\n > DataLoader initialization")
|
logger.info(" | Classes per batch: %d", num_classes_in_batch)
|
||||||
print(f" | > Classes per Batch: {num_classes_in_batch}")
|
logger.info(" | Number of instances: %d", len(self.items))
|
||||||
print(f" | > Number of instances : {len(self.items)}")
|
logger.info(" | Sequence length: %d", self.seq_len)
|
||||||
print(f" | > Sequence length: {self.seq_len}")
|
logger.info(" | Number of classes: %d", len(self.classes))
|
||||||
print(f" | > Num Classes: {len(self.classes)}")
|
logger.info(" | Classes: %s", self.classes)
|
||||||
print(f" | > Classes: {self.classes}")
|
|
||||||
|
|
||||||
def load_wav(self, filename):
|
def load_wav(self, filename):
|
||||||
audio = self.ap.load_wav(filename, sr=self.ap.sample_rate)
|
audio = self.ap.load_wav(filename, sr=self.ap.sample_rate)
|
||||||
|
|
|
@ -1,7 +1,11 @@
|
||||||
|
import logging
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
import torch.nn.functional as F
|
import torch.nn.functional as F
|
||||||
from torch import nn
|
from torch import nn
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
# adapted from https://github.com/cvqluu/GE2E-Loss
|
# adapted from https://github.com/cvqluu/GE2E-Loss
|
||||||
class GE2ELoss(nn.Module):
|
class GE2ELoss(nn.Module):
|
||||||
|
@ -23,7 +27,7 @@ class GE2ELoss(nn.Module):
|
||||||
self.b = nn.Parameter(torch.tensor(init_b))
|
self.b = nn.Parameter(torch.tensor(init_b))
|
||||||
self.loss_method = loss_method
|
self.loss_method = loss_method
|
||||||
|
|
||||||
print(" > Initialized Generalized End-to-End loss")
|
logger.info("Initialized Generalized End-to-End loss")
|
||||||
|
|
||||||
assert self.loss_method in ["softmax", "contrast"]
|
assert self.loss_method in ["softmax", "contrast"]
|
||||||
|
|
||||||
|
@ -139,7 +143,7 @@ class AngleProtoLoss(nn.Module):
|
||||||
self.b = nn.Parameter(torch.tensor(init_b))
|
self.b = nn.Parameter(torch.tensor(init_b))
|
||||||
self.criterion = torch.nn.CrossEntropyLoss()
|
self.criterion = torch.nn.CrossEntropyLoss()
|
||||||
|
|
||||||
print(" > Initialized Angular Prototypical loss")
|
logger.info("Initialized Angular Prototypical loss")
|
||||||
|
|
||||||
def forward(self, x, _label=None):
|
def forward(self, x, _label=None):
|
||||||
"""
|
"""
|
||||||
|
@ -177,7 +181,7 @@ class SoftmaxLoss(nn.Module):
|
||||||
self.criterion = torch.nn.CrossEntropyLoss()
|
self.criterion = torch.nn.CrossEntropyLoss()
|
||||||
self.fc = nn.Linear(embedding_dim, n_speakers)
|
self.fc = nn.Linear(embedding_dim, n_speakers)
|
||||||
|
|
||||||
print("Initialised Softmax Loss")
|
logger.info("Initialised Softmax Loss")
|
||||||
|
|
||||||
def forward(self, x, label=None):
|
def forward(self, x, label=None):
|
||||||
# reshape for compatibility
|
# reshape for compatibility
|
||||||
|
@ -212,7 +216,7 @@ class SoftmaxAngleProtoLoss(nn.Module):
|
||||||
self.softmax = SoftmaxLoss(embedding_dim, n_speakers)
|
self.softmax = SoftmaxLoss(embedding_dim, n_speakers)
|
||||||
self.angleproto = AngleProtoLoss(init_w, init_b)
|
self.angleproto = AngleProtoLoss(init_w, init_b)
|
||||||
|
|
||||||
print("Initialised SoftmaxAnglePrototypical Loss")
|
logger.info("Initialised SoftmaxAnglePrototypical Loss")
|
||||||
|
|
||||||
def forward(self, x, label=None):
|
def forward(self, x, label=None):
|
||||||
"""
|
"""
|
||||||
|
|
|
@ -1,12 +1,16 @@
|
||||||
|
import logging
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import torch
|
import torch
|
||||||
import torchaudio
|
import torchaudio
|
||||||
from coqpit import Coqpit
|
from coqpit import Coqpit
|
||||||
from torch import nn
|
from torch import nn
|
||||||
|
from trainer.io import load_fsspec
|
||||||
|
|
||||||
from TTS.encoder.losses import AngleProtoLoss, GE2ELoss, SoftmaxAngleProtoLoss
|
from TTS.encoder.losses import AngleProtoLoss, GE2ELoss, SoftmaxAngleProtoLoss
|
||||||
from TTS.utils.generic_utils import set_init_dict
|
from TTS.utils.generic_utils import set_init_dict
|
||||||
from TTS.utils.io import load_fsspec
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class PreEmphasis(nn.Module):
|
class PreEmphasis(nn.Module):
|
||||||
|
@ -118,13 +122,13 @@ class BaseEncoder(nn.Module):
|
||||||
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
|
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
|
||||||
try:
|
try:
|
||||||
self.load_state_dict(state["model"])
|
self.load_state_dict(state["model"])
|
||||||
print(" > Model fully restored. ")
|
logger.info("Model fully restored. ")
|
||||||
except (KeyError, RuntimeError) as error:
|
except (KeyError, RuntimeError) as error:
|
||||||
# If eval raise the error
|
# If eval raise the error
|
||||||
if eval:
|
if eval:
|
||||||
raise error
|
raise error
|
||||||
|
|
||||||
print(" > Partial model initialization.")
|
logger.info("Partial model initialization.")
|
||||||
model_dict = self.state_dict()
|
model_dict = self.state_dict()
|
||||||
model_dict = set_init_dict(model_dict, state["model"], c)
|
model_dict = set_init_dict(model_dict, state["model"], c)
|
||||||
self.load_state_dict(model_dict)
|
self.load_state_dict(model_dict)
|
||||||
|
@ -135,7 +139,7 @@ class BaseEncoder(nn.Module):
|
||||||
try:
|
try:
|
||||||
criterion.load_state_dict(state["criterion"])
|
criterion.load_state_dict(state["criterion"])
|
||||||
except (KeyError, RuntimeError) as error:
|
except (KeyError, RuntimeError) as error:
|
||||||
print(" > Criterion load ignored because of:", error)
|
logger.exception("Criterion load ignored because of: %s", error)
|
||||||
|
|
||||||
# instance and load the criterion for the encoder classifier in inference time
|
# instance and load the criterion for the encoder classifier in inference time
|
||||||
if (
|
if (
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
import glob
|
import glob
|
||||||
|
import logging
|
||||||
import os
|
import os
|
||||||
import random
|
import random
|
||||||
|
|
||||||
|
@ -8,6 +9,8 @@ from scipy import signal
|
||||||
from TTS.encoder.models.lstm import LSTMSpeakerEncoder
|
from TTS.encoder.models.lstm import LSTMSpeakerEncoder
|
||||||
from TTS.encoder.models.resnet import ResNetSpeakerEncoder
|
from TTS.encoder.models.resnet import ResNetSpeakerEncoder
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class AugmentWAV(object):
|
class AugmentWAV(object):
|
||||||
def __init__(self, ap, augmentation_config):
|
def __init__(self, ap, augmentation_config):
|
||||||
|
@ -34,12 +37,14 @@ class AugmentWAV(object):
|
||||||
# ignore not listed directories
|
# ignore not listed directories
|
||||||
if noise_dir not in self.additive_noise_types:
|
if noise_dir not in self.additive_noise_types:
|
||||||
continue
|
continue
|
||||||
if not noise_dir in self.noise_list:
|
if noise_dir not in self.noise_list:
|
||||||
self.noise_list[noise_dir] = []
|
self.noise_list[noise_dir] = []
|
||||||
self.noise_list[noise_dir].append(wav_file)
|
self.noise_list[noise_dir].append(wav_file)
|
||||||
|
|
||||||
print(
|
logger.info(
|
||||||
f" | > Using Additive Noise Augmentation: with {len(additive_files)} audios instances from {self.additive_noise_types}"
|
"Using Additive Noise Augmentation: with %d audios instances from %s",
|
||||||
|
len(additive_files),
|
||||||
|
self.additive_noise_types,
|
||||||
)
|
)
|
||||||
|
|
||||||
self.use_rir = False
|
self.use_rir = False
|
||||||
|
@ -50,7 +55,7 @@ class AugmentWAV(object):
|
||||||
self.rir_files = glob.glob(os.path.join(self.rir_config["rir_path"], "**/*.wav"), recursive=True)
|
self.rir_files = glob.glob(os.path.join(self.rir_config["rir_path"], "**/*.wav"), recursive=True)
|
||||||
self.use_rir = True
|
self.use_rir = True
|
||||||
|
|
||||||
print(f" | > Using RIR Noise Augmentation: with {len(self.rir_files)} audios instances")
|
logger.info("Using RIR Noise Augmentation: with %d audios instances", len(self.rir_files))
|
||||||
|
|
||||||
self.create_augmentation_global_list()
|
self.create_augmentation_global_list()
|
||||||
|
|
||||||
|
|
|
@ -19,15 +19,19 @@
|
||||||
# pylint: disable=too-many-locals, too-many-statements, too-many-arguments, too-many-instance-attributes
|
# pylint: disable=too-many-locals, too-many-statements, too-many-arguments, too-many-instance-attributes
|
||||||
""" voxceleb 1 & 2 """
|
""" voxceleb 1 & 2 """
|
||||||
|
|
||||||
|
import csv
|
||||||
import hashlib
|
import hashlib
|
||||||
|
import logging
|
||||||
import os
|
import os
|
||||||
import subprocess
|
import subprocess
|
||||||
import sys
|
import sys
|
||||||
import zipfile
|
import zipfile
|
||||||
|
|
||||||
import pandas
|
|
||||||
import soundfile as sf
|
import soundfile as sf
|
||||||
from absl import logging
|
|
||||||
|
from TTS.utils.generic_utils import ConsoleFormatter, setup_logger
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
SUBSETS = {
|
SUBSETS = {
|
||||||
"vox1_dev_wav": [
|
"vox1_dev_wav": [
|
||||||
|
@ -77,14 +81,14 @@ def download_and_extract(directory, subset, urls):
|
||||||
zip_filepath = os.path.join(directory, url.split("/")[-1])
|
zip_filepath = os.path.join(directory, url.split("/")[-1])
|
||||||
if os.path.exists(zip_filepath):
|
if os.path.exists(zip_filepath):
|
||||||
continue
|
continue
|
||||||
logging.info("Downloading %s to %s" % (url, zip_filepath))
|
logger.info("Downloading %s to %s" % (url, zip_filepath))
|
||||||
subprocess.call(
|
subprocess.call(
|
||||||
"wget %s --user %s --password %s -O %s" % (url, USER["user"], USER["password"], zip_filepath),
|
"wget %s --user %s --password %s -O %s" % (url, USER["user"], USER["password"], zip_filepath),
|
||||||
shell=True,
|
shell=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
statinfo = os.stat(zip_filepath)
|
statinfo = os.stat(zip_filepath)
|
||||||
logging.info("Successfully downloaded %s, size(bytes): %d" % (url, statinfo.st_size))
|
logger.info("Successfully downloaded %s, size(bytes): %d" % (url, statinfo.st_size))
|
||||||
|
|
||||||
# concatenate all parts into zip files
|
# concatenate all parts into zip files
|
||||||
if ".zip" not in zip_filepath:
|
if ".zip" not in zip_filepath:
|
||||||
|
@ -118,9 +122,9 @@ def exec_cmd(cmd):
|
||||||
try:
|
try:
|
||||||
retcode = subprocess.call(cmd, shell=True)
|
retcode = subprocess.call(cmd, shell=True)
|
||||||
if retcode < 0:
|
if retcode < 0:
|
||||||
logging.info(f"Child was terminated by signal {retcode}")
|
logger.info(f"Child was terminated by signal {retcode}")
|
||||||
except OSError as e:
|
except OSError as e:
|
||||||
logging.info(f"Execution failed: {e}")
|
logger.info(f"Execution failed: {e}")
|
||||||
retcode = -999
|
retcode = -999
|
||||||
return retcode
|
return retcode
|
||||||
|
|
||||||
|
@ -134,11 +138,11 @@ def decode_aac_with_ffmpeg(aac_file, wav_file):
|
||||||
bool, True if success.
|
bool, True if success.
|
||||||
"""
|
"""
|
||||||
cmd = f"ffmpeg -i {aac_file} {wav_file}"
|
cmd = f"ffmpeg -i {aac_file} {wav_file}"
|
||||||
logging.info(f"Decoding aac file using command line: {cmd}")
|
logger.info(f"Decoding aac file using command line: {cmd}")
|
||||||
ret = exec_cmd(cmd)
|
ret = exec_cmd(cmd)
|
||||||
if ret != 0:
|
if ret != 0:
|
||||||
logging.error(f"Failed to decode aac file with retcode {ret}")
|
logger.error(f"Failed to decode aac file with retcode {ret}")
|
||||||
logging.error("Please check your ffmpeg installation.")
|
logger.error("Please check your ffmpeg installation.")
|
||||||
return False
|
return False
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
@ -152,7 +156,7 @@ def convert_audio_and_make_label(input_dir, subset, output_dir, output_file):
|
||||||
output_file: the name of the newly generated csv file. e.g. vox1_dev_wav.csv
|
output_file: the name of the newly generated csv file. e.g. vox1_dev_wav.csv
|
||||||
"""
|
"""
|
||||||
|
|
||||||
logging.info("Preprocessing audio and label for subset %s" % subset)
|
logger.info("Preprocessing audio and label for subset %s" % subset)
|
||||||
source_dir = os.path.join(input_dir, subset)
|
source_dir = os.path.join(input_dir, subset)
|
||||||
|
|
||||||
files = []
|
files = []
|
||||||
|
@ -185,9 +189,12 @@ def convert_audio_and_make_label(input_dir, subset, output_dir, output_file):
|
||||||
# Write to CSV file which contains four columns:
|
# Write to CSV file which contains four columns:
|
||||||
# "wav_filename", "wav_length_ms", "speaker_id", "speaker_name".
|
# "wav_filename", "wav_length_ms", "speaker_id", "speaker_name".
|
||||||
csv_file_path = os.path.join(output_dir, output_file)
|
csv_file_path = os.path.join(output_dir, output_file)
|
||||||
df = pandas.DataFrame(data=files, columns=["wav_filename", "wav_length_ms", "speaker_id", "speaker_name"])
|
with open(csv_file_path, "w", newline="", encoding="utf-8") as f:
|
||||||
df.to_csv(csv_file_path, index=False, sep="\t")
|
writer = csv.writer(f, delimiter="\t")
|
||||||
logging.info("Successfully generated csv file {}".format(csv_file_path))
|
writer.writerow(["wav_filename", "wav_length_ms", "speaker_id", "speaker_name"])
|
||||||
|
for wav_file in files:
|
||||||
|
writer.writerow(wav_file)
|
||||||
|
logger.info("Successfully generated csv file {}".format(csv_file_path))
|
||||||
|
|
||||||
|
|
||||||
def processor(directory, subset, force_process):
|
def processor(directory, subset, force_process):
|
||||||
|
@ -200,16 +207,16 @@ def processor(directory, subset, force_process):
|
||||||
if not force_process and os.path.exists(subset_csv):
|
if not force_process and os.path.exists(subset_csv):
|
||||||
return subset_csv
|
return subset_csv
|
||||||
|
|
||||||
logging.info("Downloading and process the voxceleb in %s", directory)
|
logger.info("Downloading and process the voxceleb in %s", directory)
|
||||||
logging.info("Preparing subset %s", subset)
|
logger.info("Preparing subset %s", subset)
|
||||||
download_and_extract(directory, subset, urls[subset])
|
download_and_extract(directory, subset, urls[subset])
|
||||||
convert_audio_and_make_label(directory, subset, directory, subset + ".csv")
|
convert_audio_and_make_label(directory, subset, directory, subset + ".csv")
|
||||||
logging.info("Finished downloading and processing")
|
logger.info("Finished downloading and processing")
|
||||||
return subset_csv
|
return subset_csv
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
logging.set_verbosity(logging.INFO)
|
setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter())
|
||||||
if len(sys.argv) != 4:
|
if len(sys.argv) != 4:
|
||||||
print("Usage: python prepare_data.py save_directory user password")
|
print("Usage: python prepare_data.py save_directory user password")
|
||||||
sys.exit()
|
sys.exit()
|
||||||
|
|
|
@ -3,13 +3,13 @@ from dataclasses import dataclass, field
|
||||||
|
|
||||||
from coqpit import Coqpit
|
from coqpit import Coqpit
|
||||||
from trainer import TrainerArgs, get_last_checkpoint
|
from trainer import TrainerArgs, get_last_checkpoint
|
||||||
|
from trainer.generic_utils import get_experiment_folder_path, get_git_branch
|
||||||
from trainer.io import copy_model_files
|
from trainer.io import copy_model_files
|
||||||
from trainer.logging import logger_factory
|
from trainer.logging import logger_factory
|
||||||
from trainer.logging.console_logger import ConsoleLogger
|
from trainer.logging.console_logger import ConsoleLogger
|
||||||
|
|
||||||
from TTS.config import load_config, register_config
|
from TTS.config import load_config, register_config
|
||||||
from TTS.tts.utils.text.characters import parse_symbols
|
from TTS.tts.utils.text.characters import parse_symbols
|
||||||
from TTS.utils.generic_utils import get_experiment_folder_path, get_git_branch
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
|
@ -29,7 +29,7 @@ def process_args(args, config=None):
|
||||||
args (argparse.Namespace or dict like): Parsed input arguments.
|
args (argparse.Namespace or dict like): Parsed input arguments.
|
||||||
config (Coqpit): Model config. If none, it is generated from `args`. Defaults to None.
|
config (Coqpit): Model config. If none, it is generated from `args`. Defaults to None.
|
||||||
Returns:
|
Returns:
|
||||||
c (TTS.utils.io.AttrDict): Config paramaters.
|
c (Coqpit): Config parameters.
|
||||||
out_path (str): Path to save models and logging.
|
out_path (str): Path to save models and logging.
|
||||||
audio_path (str): Path to save generated test audios.
|
audio_path (str): Path to save generated test audios.
|
||||||
c_logger (TTS.utils.console_logger.ConsoleLogger): Class that does
|
c_logger (TTS.utils.console_logger.ConsoleLogger): Class that does
|
||||||
|
|
21
TTS/model.py
21
TTS/model.py
|
@ -1,5 +1,6 @@
|
||||||
|
import os
|
||||||
from abc import abstractmethod
|
from abc import abstractmethod
|
||||||
from typing import Dict
|
from typing import Any, Union
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
from coqpit import Coqpit
|
from coqpit import Coqpit
|
||||||
|
@ -16,7 +17,7 @@ class BaseTrainerModel(TrainerModel):
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def init_from_config(config: Coqpit):
|
def init_from_config(config: Coqpit) -> "BaseTrainerModel":
|
||||||
"""Init the model and all its attributes from the given config.
|
"""Init the model and all its attributes from the given config.
|
||||||
|
|
||||||
Override this depending on your model.
|
Override this depending on your model.
|
||||||
|
@ -24,7 +25,7 @@ class BaseTrainerModel(TrainerModel):
|
||||||
...
|
...
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def inference(self, input: torch.Tensor, aux_input={}) -> Dict:
|
def inference(self, input: torch.Tensor, aux_input: dict[str, Any] = {}) -> dict[str, Any]:
|
||||||
"""Forward pass for inference.
|
"""Forward pass for inference.
|
||||||
|
|
||||||
It must return a dictionary with the main model output and all the auxiliary outputs. The key ```model_outputs```
|
It must return a dictionary with the main model output and all the auxiliary outputs. The key ```model_outputs```
|
||||||
|
@ -45,15 +46,21 @@ class BaseTrainerModel(TrainerModel):
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def load_checkpoint(
|
def load_checkpoint(
|
||||||
self, config: Coqpit, checkpoint_path: str, eval: bool = False, strict: bool = True, cache=False
|
self,
|
||||||
|
config: Coqpit,
|
||||||
|
checkpoint_path: Union[str, os.PathLike[Any]],
|
||||||
|
eval: bool = False,
|
||||||
|
strict: bool = True,
|
||||||
|
cache: bool = False,
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Load a model checkpoint gile and get ready for training or inference.
|
"""Load a model checkpoint file and get ready for training or inference.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
config (Coqpit): Model configuration.
|
config (Coqpit): Model configuration.
|
||||||
checkpoint_path (str): Path to the model checkpoint file.
|
checkpoint_path (str | os.PathLike): Path to the model checkpoint file.
|
||||||
eval (bool, optional): If true, init model for inference else for training. Defaults to False.
|
eval (bool, optional): If true, init model for inference else for training. Defaults to False.
|
||||||
strict (bool, optional): Match all checkpoint keys to model's keys. Defaults to True.
|
strict (bool, optional): Match all checkpoint keys to model's keys. Defaults to True.
|
||||||
cache (bool, optional): If True, cache the file locally for subsequent calls. It is cached under `get_user_data_dir()/tts_cache`. Defaults to False.
|
cache (bool, optional): If True, cache the file locally for subsequent calls.
|
||||||
|
It is cached under `trainer.io.get_user_data_dir()/tts_cache`. Defaults to False.
|
||||||
"""
|
"""
|
||||||
...
|
...
|
||||||
|
|
|
@ -1,5 +1,8 @@
|
||||||
# :frog: TTS demo server
|
# :frog: TTS demo server
|
||||||
Before you use the server, make sure you [install](https://github.com/coqui-ai/TTS/tree/dev#install-tts)) :frog: TTS properly. Then, you can follow the steps below.
|
Before you use the server, make sure you
|
||||||
|
[install](https://github.com/idiap/coqui-ai-TTS/tree/dev#install-tts) :frog: TTS
|
||||||
|
properly and install the additional dependencies with `pip install
|
||||||
|
coqui-tts[server]`. Then, you can follow the steps below.
|
||||||
|
|
||||||
**Note:** If you install :frog:TTS using ```pip```, you can also use the ```tts-server``` end point on the terminal.
|
**Note:** If you install :frog:TTS using ```pip```, you can also use the ```tts-server``` end point on the terminal.
|
||||||
|
|
||||||
|
@ -12,7 +15,7 @@ Run the server with the official models.
|
||||||
```python TTS/server/server.py --model_name tts_models/en/ljspeech/tacotron2-DCA --vocoder_name vocoder_models/en/ljspeech/multiband-melgan```
|
```python TTS/server/server.py --model_name tts_models/en/ljspeech/tacotron2-DCA --vocoder_name vocoder_models/en/ljspeech/multiband-melgan```
|
||||||
|
|
||||||
Run the server with the official models on a GPU.
|
Run the server with the official models on a GPU.
|
||||||
```CUDA_VISIBLE_DEVICES="0" python TTS/server/server.py --model_name tts_models/en/ljspeech/tacotron2-DCA --vocoder_name vocoder_models/en/ljspeech/multiband-melgan --use_cuda True```
|
```CUDA_VISIBLE_DEVICES="0" python TTS/server/server.py --model_name tts_models/en/ljspeech/tacotron2-DCA --vocoder_name vocoder_models/en/ljspeech/multiband-melgan --use_cuda```
|
||||||
|
|
||||||
Run the server with custom models.
|
Run the server with custom models.
|
||||||
```python TTS/server/server.py --tts_checkpoint /path/to/tts/model.pth --tts_config /path/to/tts/config.json --vocoder_checkpoint /path/to/vocoder/model.pth --vocoder_config /path/to/vocoder/config.json```
|
```python TTS/server/server.py --tts_checkpoint /path/to/tts/model.pth --tts_config /path/to/tts/config.json --vocoder_checkpoint /path/to/vocoder/model.pth --vocoder_config /path/to/vocoder/config.json```
|
||||||
|
|
|
@ -1,7 +1,11 @@
|
||||||
#!flask/bin/python
|
#!flask/bin/python
|
||||||
|
|
||||||
|
"""TTS demo server."""
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import io
|
import io
|
||||||
import json
|
import json
|
||||||
|
import logging
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
@ -9,24 +13,26 @@ from threading import Lock
|
||||||
from typing import Union
|
from typing import Union
|
||||||
from urllib.parse import parse_qs
|
from urllib.parse import parse_qs
|
||||||
|
|
||||||
from flask import Flask, render_template, render_template_string, request, send_file
|
try:
|
||||||
|
from flask import Flask, render_template, render_template_string, request, send_file
|
||||||
|
except ImportError as e:
|
||||||
|
msg = "Server requires requires flask, use `pip install coqui-tts[server]`"
|
||||||
|
raise ImportError(msg) from e
|
||||||
|
|
||||||
from TTS.config import load_config
|
from TTS.config import load_config
|
||||||
|
from TTS.utils.generic_utils import ConsoleFormatter, setup_logger
|
||||||
from TTS.utils.manage import ModelManager
|
from TTS.utils.manage import ModelManager
|
||||||
from TTS.utils.synthesizer import Synthesizer
|
from TTS.utils.synthesizer import Synthesizer
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
setup_logger("TTS", level=logging.INFO, screen=True, formatter=ConsoleFormatter())
|
||||||
|
|
||||||
def create_argparser():
|
|
||||||
def convert_boolean(x):
|
|
||||||
return x.lower() in ["true", "1", "yes"]
|
|
||||||
|
|
||||||
|
def create_argparser() -> argparse.ArgumentParser:
|
||||||
parser = argparse.ArgumentParser()
|
parser = argparse.ArgumentParser()
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--list_models",
|
"--list_models",
|
||||||
type=convert_boolean,
|
action="store_true",
|
||||||
nargs="?",
|
|
||||||
const=True,
|
|
||||||
default=False,
|
|
||||||
help="list available pre-trained tts and vocoder models.",
|
help="list available pre-trained tts and vocoder models.",
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
|
@ -54,9 +60,13 @@ def create_argparser():
|
||||||
parser.add_argument("--vocoder_config_path", type=str, help="Path to vocoder model config file.", default=None)
|
parser.add_argument("--vocoder_config_path", type=str, help="Path to vocoder model config file.", default=None)
|
||||||
parser.add_argument("--speakers_file_path", type=str, help="JSON file for multi-speaker model.", default=None)
|
parser.add_argument("--speakers_file_path", type=str, help="JSON file for multi-speaker model.", default=None)
|
||||||
parser.add_argument("--port", type=int, default=5002, help="port to listen on.")
|
parser.add_argument("--port", type=int, default=5002, help="port to listen on.")
|
||||||
parser.add_argument("--use_cuda", type=convert_boolean, default=False, help="true to use CUDA.")
|
parser.add_argument("--use_cuda", action=argparse.BooleanOptionalAction, default=False, help="true to use CUDA.")
|
||||||
parser.add_argument("--debug", type=convert_boolean, default=False, help="true to enable Flask debug mode.")
|
parser.add_argument(
|
||||||
parser.add_argument("--show_details", type=convert_boolean, default=False, help="Generate model detail page.")
|
"--debug", action=argparse.BooleanOptionalAction, default=False, help="true to enable Flask debug mode."
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--show_details", action=argparse.BooleanOptionalAction, default=False, help="Generate model detail page."
|
||||||
|
)
|
||||||
return parser
|
return parser
|
||||||
|
|
||||||
|
|
||||||
|
@ -66,10 +76,6 @@ args = create_argparser().parse_args()
|
||||||
path = Path(__file__).parent / "../.models.json"
|
path = Path(__file__).parent / "../.models.json"
|
||||||
manager = ModelManager(path)
|
manager = ModelManager(path)
|
||||||
|
|
||||||
if args.list_models:
|
|
||||||
manager.list_models()
|
|
||||||
sys.exit()
|
|
||||||
|
|
||||||
# update in-use models to the specified released models.
|
# update in-use models to the specified released models.
|
||||||
model_path = None
|
model_path = None
|
||||||
config_path = None
|
config_path = None
|
||||||
|
@ -164,17 +170,15 @@ def index():
|
||||||
def details():
|
def details():
|
||||||
if args.config_path is not None and os.path.isfile(args.config_path):
|
if args.config_path is not None and os.path.isfile(args.config_path):
|
||||||
model_config = load_config(args.config_path)
|
model_config = load_config(args.config_path)
|
||||||
else:
|
elif args.model_name is not None:
|
||||||
if args.model_name is not None:
|
model_config = load_config(config_path)
|
||||||
model_config = load_config(config_path)
|
|
||||||
|
|
||||||
if args.vocoder_config_path is not None and os.path.isfile(args.vocoder_config_path):
|
if args.vocoder_config_path is not None and os.path.isfile(args.vocoder_config_path):
|
||||||
vocoder_config = load_config(args.vocoder_config_path)
|
vocoder_config = load_config(args.vocoder_config_path)
|
||||||
|
elif args.vocoder_name is not None:
|
||||||
|
vocoder_config = load_config(vocoder_config_path)
|
||||||
else:
|
else:
|
||||||
if args.vocoder_name is not None:
|
vocoder_config = None
|
||||||
vocoder_config = load_config(vocoder_config_path)
|
|
||||||
else:
|
|
||||||
vocoder_config = None
|
|
||||||
|
|
||||||
return render_template(
|
return render_template(
|
||||||
"details.html",
|
"details.html",
|
||||||
|
@ -197,9 +201,9 @@ def tts():
|
||||||
style_wav = request.headers.get("style-wav") or request.values.get("style_wav", "")
|
style_wav = request.headers.get("style-wav") or request.values.get("style_wav", "")
|
||||||
style_wav = style_wav_uri_to_dict(style_wav)
|
style_wav = style_wav_uri_to_dict(style_wav)
|
||||||
|
|
||||||
print(f" > Model input: {text}")
|
logger.info("Model input: %s", text)
|
||||||
print(f" > Speaker Idx: {speaker_idx}")
|
logger.info("Speaker idx: %s", speaker_idx)
|
||||||
print(f" > Language Idx: {language_idx}")
|
logger.info("Language idx: %s", language_idx)
|
||||||
wavs = synthesizer.tts(text, speaker_name=speaker_idx, language_name=language_idx, style_wav=style_wav)
|
wavs = synthesizer.tts(text, speaker_name=speaker_idx, language_name=language_idx, style_wav=style_wav)
|
||||||
out = io.BytesIO()
|
out = io.BytesIO()
|
||||||
synthesizer.save_wav(wavs, out)
|
synthesizer.save_wav(wavs, out)
|
||||||
|
@ -243,7 +247,7 @@ def mary_tts_api_process():
|
||||||
text = data.get("INPUT_TEXT", [""])[0]
|
text = data.get("INPUT_TEXT", [""])[0]
|
||||||
else:
|
else:
|
||||||
text = request.args.get("INPUT_TEXT", "")
|
text = request.args.get("INPUT_TEXT", "")
|
||||||
print(f" > Model input: {text}")
|
logger.info("Model input: %s", text)
|
||||||
wavs = synthesizer.tts(text)
|
wavs = synthesizer.tts(text)
|
||||||
out = io.BytesIO()
|
out = io.BytesIO()
|
||||||
synthesizer.save_wav(wavs, out)
|
synthesizer.save_wav(wavs, out)
|
||||||
|
|
|
@ -128,4 +128,4 @@
|
||||||
|
|
||||||
</body>
|
</body>
|
||||||
|
|
||||||
</html>
|
</html>
|
||||||
|
|
|
@ -30,7 +30,7 @@
|
||||||
</head>
|
</head>
|
||||||
|
|
||||||
<body>
|
<body>
|
||||||
<a href="https://github.com/coqui-ai/TTS"><img style="position: absolute; z-index:1000; top: 0; left: 0; border: 0;"
|
<a href="https://github.com/idiap/coqui-ai-TTS"><img style="position: absolute; z-index:1000; top: 0; left: 0; border: 0;"
|
||||||
src="https://s3.amazonaws.com/github/ribbons/forkme_left_darkblue_121621.png" alt="Fork me on GitHub"></a>
|
src="https://s3.amazonaws.com/github/ribbons/forkme_left_darkblue_121621.png" alt="Fork me on GitHub"></a>
|
||||||
|
|
||||||
<!-- Navigation -->
|
<!-- Navigation -->
|
||||||
|
@ -151,4 +151,4 @@
|
||||||
|
|
||||||
</body>
|
</body>
|
||||||
|
|
||||||
</html>
|
</html>
|
||||||
|
|
|
@ -2,11 +2,12 @@ import os
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
from typing import Dict
|
from typing import Dict
|
||||||
|
|
||||||
|
from trainer.io import get_user_data_dir
|
||||||
|
|
||||||
from TTS.tts.configs.shared_configs import BaseTTSConfig
|
from TTS.tts.configs.shared_configs import BaseTTSConfig
|
||||||
from TTS.tts.layers.bark.model import GPTConfig
|
from TTS.tts.layers.bark.model import GPTConfig
|
||||||
from TTS.tts.layers.bark.model_fine import FineGPTConfig
|
from TTS.tts.layers.bark.model_fine import FineGPTConfig
|
||||||
from TTS.tts.models.bark import BarkAudioConfig
|
from TTS.tts.models.bark import BarkAudioConfig
|
||||||
from TTS.utils.generic_utils import get_user_data_dir
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
|
|
|
@ -1,3 +1,4 @@
|
||||||
|
import logging
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
from collections import Counter
|
from collections import Counter
|
||||||
|
@ -9,6 +10,8 @@ import numpy as np
|
||||||
from TTS.tts.datasets.dataset import *
|
from TTS.tts.datasets.dataset import *
|
||||||
from TTS.tts.datasets.formatters import *
|
from TTS.tts.datasets.formatters import *
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
def split_dataset(items, eval_split_max_size=None, eval_split_size=0.01):
|
def split_dataset(items, eval_split_max_size=None, eval_split_size=0.01):
|
||||||
"""Split a dataset into train and eval. Consider speaker distribution in multi-speaker training.
|
"""Split a dataset into train and eval. Consider speaker distribution in multi-speaker training.
|
||||||
|
@ -122,7 +125,7 @@ def load_tts_samples(
|
||||||
|
|
||||||
meta_data_train = add_extra_keys(meta_data_train, language, dataset_name)
|
meta_data_train = add_extra_keys(meta_data_train, language, dataset_name)
|
||||||
|
|
||||||
print(f" | > Found {len(meta_data_train)} files in {Path(root_path).resolve()}")
|
logger.info("Found %d files in %s", len(meta_data_train), Path(root_path).resolve())
|
||||||
# load evaluation split if set
|
# load evaluation split if set
|
||||||
if eval_split:
|
if eval_split:
|
||||||
if meta_file_val:
|
if meta_file_val:
|
||||||
|
@ -166,16 +169,15 @@ def _get_formatter_by_name(name):
|
||||||
return getattr(thismodule, name.lower())
|
return getattr(thismodule, name.lower())
|
||||||
|
|
||||||
|
|
||||||
def find_unique_chars(data_samples, verbose=True):
|
def find_unique_chars(data_samples):
|
||||||
texts = "".join(item[0] for item in data_samples)
|
texts = "".join(item["text"] for item in data_samples)
|
||||||
chars = set(texts)
|
chars = set(texts)
|
||||||
lower_chars = filter(lambda c: c.islower(), chars)
|
lower_chars = filter(lambda c: c.islower(), chars)
|
||||||
chars_force_lower = [c.lower() for c in chars]
|
chars_force_lower = [c.lower() for c in chars]
|
||||||
chars_force_lower = set(chars_force_lower)
|
chars_force_lower = set(chars_force_lower)
|
||||||
|
|
||||||
if verbose:
|
logger.info("Number of unique characters: %d", len(chars))
|
||||||
print(f" > Number of unique characters: {len(chars)}")
|
logger.info("Unique characters: %s", "".join(sorted(chars)))
|
||||||
print(f" > Unique characters: {''.join(sorted(chars))}")
|
logger.info("Unique lower characters: %s", "".join(sorted(lower_chars)))
|
||||||
print(f" > Unique lower characters: {''.join(sorted(lower_chars))}")
|
logger.info("Unique all forced to lower characters: %s", "".join(sorted(chars_force_lower)))
|
||||||
print(f" > Unique all forced to lower characters: {''.join(sorted(chars_force_lower))}")
|
|
||||||
return chars_force_lower
|
return chars_force_lower
|
||||||
|
|
|
@ -1,11 +1,13 @@
|
||||||
import base64
|
import base64
|
||||||
import collections
|
import collections
|
||||||
|
import logging
|
||||||
import os
|
import os
|
||||||
import random
|
import random
|
||||||
from typing import Dict, List, Union
|
from typing import Dict, List, Union
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import torch
|
import torch
|
||||||
|
import torchaudio
|
||||||
import tqdm
|
import tqdm
|
||||||
from torch.utils.data import Dataset
|
from torch.utils.data import Dataset
|
||||||
|
|
||||||
|
@ -13,7 +15,7 @@ from TTS.tts.utils.data import prepare_data, prepare_stop_target, prepare_tensor
|
||||||
from TTS.utils.audio import AudioProcessor
|
from TTS.utils.audio import AudioProcessor
|
||||||
from TTS.utils.audio.numpy_transforms import compute_energy as calculate_energy
|
from TTS.utils.audio.numpy_transforms import compute_energy as calculate_energy
|
||||||
|
|
||||||
import mutagen
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
# to prevent too many open files error as suggested here
|
# to prevent too many open files error as suggested here
|
||||||
# https://github.com/pytorch/pytorch/issues/11201#issuecomment-421146936
|
# https://github.com/pytorch/pytorch/issues/11201#issuecomment-421146936
|
||||||
|
@ -44,13 +46,15 @@ def string2filename(string):
|
||||||
return filename
|
return filename
|
||||||
|
|
||||||
|
|
||||||
def get_audio_size(audiopath):
|
def get_audio_size(audiopath) -> int:
|
||||||
|
"""Return the number of samples in the audio file."""
|
||||||
extension = audiopath.rpartition(".")[-1].lower()
|
extension = audiopath.rpartition(".")[-1].lower()
|
||||||
if extension not in {"mp3", "wav", "flac"}:
|
if extension not in {"mp3", "wav", "flac"}:
|
||||||
raise RuntimeError(f"The audio format {extension} is not supported, please convert the audio files to mp3, flac, or wav format!")
|
raise RuntimeError(
|
||||||
|
f"The audio format {extension} is not supported, please convert the audio files to mp3, flac, or wav format!"
|
||||||
|
)
|
||||||
|
|
||||||
audio_info = mutagen.File(audiopath).info
|
return torchaudio.info(audiopath).num_frames
|
||||||
return int(audio_info.length * audio_info.sample_rate)
|
|
||||||
|
|
||||||
|
|
||||||
class TTSDataset(Dataset):
|
class TTSDataset(Dataset):
|
||||||
|
@ -78,7 +82,6 @@ class TTSDataset(Dataset):
|
||||||
language_id_mapping: Dict = None,
|
language_id_mapping: Dict = None,
|
||||||
use_noise_augment: bool = False,
|
use_noise_augment: bool = False,
|
||||||
start_by_longest: bool = False,
|
start_by_longest: bool = False,
|
||||||
verbose: bool = False,
|
|
||||||
):
|
):
|
||||||
"""Generic 📂 data loader for `tts` models. It is configurable for different outputs and needs.
|
"""Generic 📂 data loader for `tts` models. It is configurable for different outputs and needs.
|
||||||
|
|
||||||
|
@ -136,8 +139,6 @@ class TTSDataset(Dataset):
|
||||||
use_noise_augment (bool): Enable adding random noise to wav for augmentation. Defaults to False.
|
use_noise_augment (bool): Enable adding random noise to wav for augmentation. Defaults to False.
|
||||||
|
|
||||||
start_by_longest (bool): Start by longest sequence. It is especially useful to check OOM. Defaults to False.
|
start_by_longest (bool): Start by longest sequence. It is especially useful to check OOM. Defaults to False.
|
||||||
|
|
||||||
verbose (bool): Print diagnostic information. Defaults to false.
|
|
||||||
"""
|
"""
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.batch_group_size = batch_group_size
|
self.batch_group_size = batch_group_size
|
||||||
|
@ -161,7 +162,6 @@ class TTSDataset(Dataset):
|
||||||
self.use_noise_augment = use_noise_augment
|
self.use_noise_augment = use_noise_augment
|
||||||
self.start_by_longest = start_by_longest
|
self.start_by_longest = start_by_longest
|
||||||
|
|
||||||
self.verbose = verbose
|
|
||||||
self.rescue_item_idx = 1
|
self.rescue_item_idx = 1
|
||||||
self.pitch_computed = False
|
self.pitch_computed = False
|
||||||
self.tokenizer = tokenizer
|
self.tokenizer = tokenizer
|
||||||
|
@ -179,8 +179,7 @@ class TTSDataset(Dataset):
|
||||||
self.energy_dataset = EnergyDataset(
|
self.energy_dataset = EnergyDataset(
|
||||||
self.samples, self.ap, cache_path=energy_cache_path, precompute_num_workers=precompute_num_workers
|
self.samples, self.ap, cache_path=energy_cache_path, precompute_num_workers=precompute_num_workers
|
||||||
)
|
)
|
||||||
if self.verbose:
|
self.print_logs()
|
||||||
self.print_logs()
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def lengths(self):
|
def lengths(self):
|
||||||
|
@ -213,11 +212,10 @@ class TTSDataset(Dataset):
|
||||||
|
|
||||||
def print_logs(self, level: int = 0) -> None:
|
def print_logs(self, level: int = 0) -> None:
|
||||||
indent = "\t" * level
|
indent = "\t" * level
|
||||||
print("\n")
|
logger.info("%sDataLoader initialization", indent)
|
||||||
print(f"{indent}> DataLoader initialization")
|
logger.info("%s| Tokenizer:", indent)
|
||||||
print(f"{indent}| > Tokenizer:")
|
|
||||||
self.tokenizer.print_logs(level + 1)
|
self.tokenizer.print_logs(level + 1)
|
||||||
print(f"{indent}| > Number of instances : {len(self.samples)}")
|
logger.info("%s| Number of instances : %d", indent, len(self.samples))
|
||||||
|
|
||||||
def load_wav(self, filename):
|
def load_wav(self, filename):
|
||||||
waveform = self.ap.load_wav(filename)
|
waveform = self.ap.load_wav(filename)
|
||||||
|
@ -389,17 +387,15 @@ class TTSDataset(Dataset):
|
||||||
text_lengths = [s["text_length"] for s in samples]
|
text_lengths = [s["text_length"] for s in samples]
|
||||||
self.samples = samples
|
self.samples = samples
|
||||||
|
|
||||||
if self.verbose:
|
logger.info("Preprocessing samples")
|
||||||
print(" | > Preprocessing samples")
|
logger.info("Max text length: {}".format(np.max(text_lengths)))
|
||||||
print(" | > Max text length: {}".format(np.max(text_lengths)))
|
logger.info("Min text length: {}".format(np.min(text_lengths)))
|
||||||
print(" | > Min text length: {}".format(np.min(text_lengths)))
|
logger.info("Avg text length: {}".format(np.mean(text_lengths)))
|
||||||
print(" | > Avg text length: {}".format(np.mean(text_lengths)))
|
logger.info("Max audio length: {}".format(np.max(audio_lengths)))
|
||||||
print(" | ")
|
logger.info("Min audio length: {}".format(np.min(audio_lengths)))
|
||||||
print(" | > Max audio length: {}".format(np.max(audio_lengths)))
|
logger.info("Avg audio length: {}".format(np.mean(audio_lengths)))
|
||||||
print(" | > Min audio length: {}".format(np.min(audio_lengths)))
|
logger.info("Num. instances discarded samples: %d", len(ignore_idx))
|
||||||
print(" | > Avg audio length: {}".format(np.mean(audio_lengths)))
|
logger.info("Batch group size: {}.".format(self.batch_group_size))
|
||||||
print(f" | > Num. instances discarded samples: {len(ignore_idx)}")
|
|
||||||
print(" | > Batch group size: {}.".format(self.batch_group_size))
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _sort_batch(batch, text_lengths):
|
def _sort_batch(batch, text_lengths):
|
||||||
|
@ -456,9 +452,11 @@ class TTSDataset(Dataset):
|
||||||
|
|
||||||
# lengths adjusted by the reduction factor
|
# lengths adjusted by the reduction factor
|
||||||
mel_lengths_adjusted = [
|
mel_lengths_adjusted = [
|
||||||
m.shape[1] + (self.outputs_per_step - (m.shape[1] % self.outputs_per_step))
|
(
|
||||||
if m.shape[1] % self.outputs_per_step
|
m.shape[1] + (self.outputs_per_step - (m.shape[1] % self.outputs_per_step))
|
||||||
else m.shape[1]
|
if m.shape[1] % self.outputs_per_step
|
||||||
|
else m.shape[1]
|
||||||
|
)
|
||||||
for m in mel
|
for m in mel
|
||||||
]
|
]
|
||||||
|
|
||||||
|
@ -640,7 +638,7 @@ class PhonemeDataset(Dataset):
|
||||||
|
|
||||||
We use pytorch dataloader because we are lazy.
|
We use pytorch dataloader because we are lazy.
|
||||||
"""
|
"""
|
||||||
print("[*] Pre-computing phonemes...")
|
logger.info("Pre-computing phonemes...")
|
||||||
with tqdm.tqdm(total=len(self)) as pbar:
|
with tqdm.tqdm(total=len(self)) as pbar:
|
||||||
batch_size = num_workers if num_workers > 0 else 1
|
batch_size = num_workers if num_workers > 0 else 1
|
||||||
dataloder = torch.utils.data.DataLoader(
|
dataloder = torch.utils.data.DataLoader(
|
||||||
|
@ -662,11 +660,10 @@ class PhonemeDataset(Dataset):
|
||||||
|
|
||||||
def print_logs(self, level: int = 0) -> None:
|
def print_logs(self, level: int = 0) -> None:
|
||||||
indent = "\t" * level
|
indent = "\t" * level
|
||||||
print("\n")
|
logger.info("%sPhonemeDataset", indent)
|
||||||
print(f"{indent}> PhonemeDataset ")
|
logger.info("%s| Tokenizer:", indent)
|
||||||
print(f"{indent}| > Tokenizer:")
|
|
||||||
self.tokenizer.print_logs(level + 1)
|
self.tokenizer.print_logs(level + 1)
|
||||||
print(f"{indent}| > Number of instances : {len(self.samples)}")
|
logger.info("%s| Number of instances : %d", indent, len(self.samples))
|
||||||
|
|
||||||
|
|
||||||
class F0Dataset:
|
class F0Dataset:
|
||||||
|
@ -698,14 +695,12 @@ class F0Dataset:
|
||||||
samples: Union[List[List], List[Dict]],
|
samples: Union[List[List], List[Dict]],
|
||||||
ap: "AudioProcessor",
|
ap: "AudioProcessor",
|
||||||
audio_config=None, # pylint: disable=unused-argument
|
audio_config=None, # pylint: disable=unused-argument
|
||||||
verbose=False,
|
|
||||||
cache_path: str = None,
|
cache_path: str = None,
|
||||||
precompute_num_workers=0,
|
precompute_num_workers=0,
|
||||||
normalize_f0=True,
|
normalize_f0=True,
|
||||||
):
|
):
|
||||||
self.samples = samples
|
self.samples = samples
|
||||||
self.ap = ap
|
self.ap = ap
|
||||||
self.verbose = verbose
|
|
||||||
self.cache_path = cache_path
|
self.cache_path = cache_path
|
||||||
self.normalize_f0 = normalize_f0
|
self.normalize_f0 = normalize_f0
|
||||||
self.pad_id = 0.0
|
self.pad_id = 0.0
|
||||||
|
@ -729,7 +724,7 @@ class F0Dataset:
|
||||||
return len(self.samples)
|
return len(self.samples)
|
||||||
|
|
||||||
def precompute(self, num_workers=0):
|
def precompute(self, num_workers=0):
|
||||||
print("[*] Pre-computing F0s...")
|
logger.info("Pre-computing F0s...")
|
||||||
with tqdm.tqdm(total=len(self)) as pbar:
|
with tqdm.tqdm(total=len(self)) as pbar:
|
||||||
batch_size = num_workers if num_workers > 0 else 1
|
batch_size = num_workers if num_workers > 0 else 1
|
||||||
# we do not normalize at preprocessing
|
# we do not normalize at preprocessing
|
||||||
|
@ -816,9 +811,8 @@ class F0Dataset:
|
||||||
|
|
||||||
def print_logs(self, level: int = 0) -> None:
|
def print_logs(self, level: int = 0) -> None:
|
||||||
indent = "\t" * level
|
indent = "\t" * level
|
||||||
print("\n")
|
logger.info("%sF0Dataset", indent)
|
||||||
print(f"{indent}> F0Dataset ")
|
logger.info("%s| Number of instances : %d", indent, len(self.samples))
|
||||||
print(f"{indent}| > Number of instances : {len(self.samples)}")
|
|
||||||
|
|
||||||
|
|
||||||
class EnergyDataset:
|
class EnergyDataset:
|
||||||
|
@ -849,14 +843,12 @@ class EnergyDataset:
|
||||||
self,
|
self,
|
||||||
samples: Union[List[List], List[Dict]],
|
samples: Union[List[List], List[Dict]],
|
||||||
ap: "AudioProcessor",
|
ap: "AudioProcessor",
|
||||||
verbose=False,
|
|
||||||
cache_path: str = None,
|
cache_path: str = None,
|
||||||
precompute_num_workers=0,
|
precompute_num_workers=0,
|
||||||
normalize_energy=True,
|
normalize_energy=True,
|
||||||
):
|
):
|
||||||
self.samples = samples
|
self.samples = samples
|
||||||
self.ap = ap
|
self.ap = ap
|
||||||
self.verbose = verbose
|
|
||||||
self.cache_path = cache_path
|
self.cache_path = cache_path
|
||||||
self.normalize_energy = normalize_energy
|
self.normalize_energy = normalize_energy
|
||||||
self.pad_id = 0.0
|
self.pad_id = 0.0
|
||||||
|
@ -880,7 +872,7 @@ class EnergyDataset:
|
||||||
return len(self.samples)
|
return len(self.samples)
|
||||||
|
|
||||||
def precompute(self, num_workers=0):
|
def precompute(self, num_workers=0):
|
||||||
print("[*] Pre-computing energys...")
|
logger.info("Pre-computing energys...")
|
||||||
with tqdm.tqdm(total=len(self)) as pbar:
|
with tqdm.tqdm(total=len(self)) as pbar:
|
||||||
batch_size = num_workers if num_workers > 0 else 1
|
batch_size = num_workers if num_workers > 0 else 1
|
||||||
# we do not normalize at preprocessing
|
# we do not normalize at preprocessing
|
||||||
|
@ -968,6 +960,5 @@ class EnergyDataset:
|
||||||
|
|
||||||
def print_logs(self, level: int = 0) -> None:
|
def print_logs(self, level: int = 0) -> None:
|
||||||
indent = "\t" * level
|
indent = "\t" * level
|
||||||
print("\n")
|
logger.info("%senergyDataset")
|
||||||
print(f"{indent}> energyDataset ")
|
logger.info("%s| Number of instances : %d", indent, len(self.samples))
|
||||||
print(f"{indent}| > Number of instances : {len(self.samples)}")
|
|
||||||
|
|
|
@ -1,3 +1,5 @@
|
||||||
|
import csv
|
||||||
|
import logging
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
import xml.etree.ElementTree as ET
|
import xml.etree.ElementTree as ET
|
||||||
|
@ -5,9 +7,10 @@ from glob import glob
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
import pandas as pd
|
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
########################
|
########################
|
||||||
# DATASETS
|
# DATASETS
|
||||||
########################
|
########################
|
||||||
|
@ -23,32 +26,34 @@ def cml_tts(root_path, meta_file, ignored_speakers=None):
|
||||||
num_cols = len(lines[0].split("|")) # take the first row as reference
|
num_cols = len(lines[0].split("|")) # take the first row as reference
|
||||||
for idx, line in enumerate(lines[1:]):
|
for idx, line in enumerate(lines[1:]):
|
||||||
if len(line.split("|")) != num_cols:
|
if len(line.split("|")) != num_cols:
|
||||||
print(f" > Missing column in line {idx + 1} -> {line.strip()}")
|
logger.warning("Missing column in line %d -> %s", idx + 1, line.strip())
|
||||||
# load metadata
|
# load metadata
|
||||||
metadata = pd.read_csv(os.path.join(root_path, meta_file), sep="|")
|
with open(Path(root_path) / meta_file, newline="", encoding="utf-8") as f:
|
||||||
assert all(x in metadata.columns for x in ["wav_filename", "transcript"])
|
reader = csv.DictReader(f, delimiter="|")
|
||||||
client_id = None if "client_id" in metadata.columns else "default"
|
metadata = list(reader)
|
||||||
emotion_name = None if "emotion_name" in metadata.columns else "neutral"
|
assert all(x in metadata[0] for x in ["wav_filename", "transcript"])
|
||||||
|
client_id = None if "client_id" in metadata[0] else "default"
|
||||||
|
emotion_name = None if "emotion_name" in metadata[0] else "neutral"
|
||||||
items = []
|
items = []
|
||||||
not_found_counter = 0
|
not_found_counter = 0
|
||||||
for row in metadata.itertuples():
|
for row in metadata:
|
||||||
if client_id is None and ignored_speakers is not None and row.client_id in ignored_speakers:
|
if client_id is None and ignored_speakers is not None and row["client_id"] in ignored_speakers:
|
||||||
continue
|
continue
|
||||||
audio_path = os.path.join(root_path, row.wav_filename)
|
audio_path = os.path.join(root_path, row["wav_filename"])
|
||||||
if not os.path.exists(audio_path):
|
if not os.path.exists(audio_path):
|
||||||
not_found_counter += 1
|
not_found_counter += 1
|
||||||
continue
|
continue
|
||||||
items.append(
|
items.append(
|
||||||
{
|
{
|
||||||
"text": row.transcript,
|
"text": row["transcript"],
|
||||||
"audio_file": audio_path,
|
"audio_file": audio_path,
|
||||||
"speaker_name": client_id if client_id is not None else row.client_id,
|
"speaker_name": client_id if client_id is not None else row["client_id"],
|
||||||
"emotion_name": emotion_name if emotion_name is not None else row.emotion_name,
|
"emotion_name": emotion_name if emotion_name is not None else row["emotion_name"],
|
||||||
"root_path": root_path,
|
"root_path": root_path,
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
if not_found_counter > 0:
|
if not_found_counter > 0:
|
||||||
print(f" | > [!] {not_found_counter} files not found")
|
logger.warning("%d files not found", not_found_counter)
|
||||||
return items
|
return items
|
||||||
|
|
||||||
|
|
||||||
|
@ -61,32 +66,34 @@ def coqui(root_path, meta_file, ignored_speakers=None):
|
||||||
num_cols = len(lines[0].split("|")) # take the first row as reference
|
num_cols = len(lines[0].split("|")) # take the first row as reference
|
||||||
for idx, line in enumerate(lines[1:]):
|
for idx, line in enumerate(lines[1:]):
|
||||||
if len(line.split("|")) != num_cols:
|
if len(line.split("|")) != num_cols:
|
||||||
print(f" > Missing column in line {idx + 1} -> {line.strip()}")
|
logger.warning("Missing column in line %d -> %s", idx + 1, line.strip())
|
||||||
# load metadata
|
# load metadata
|
||||||
metadata = pd.read_csv(os.path.join(root_path, meta_file), sep="|")
|
with open(Path(root_path) / meta_file, newline="", encoding="utf-8") as f:
|
||||||
assert all(x in metadata.columns for x in ["audio_file", "text"])
|
reader = csv.DictReader(f, delimiter="|")
|
||||||
speaker_name = None if "speaker_name" in metadata.columns else "coqui"
|
metadata = list(reader)
|
||||||
emotion_name = None if "emotion_name" in metadata.columns else "neutral"
|
assert all(x in metadata[0] for x in ["audio_file", "text"])
|
||||||
|
speaker_name = None if "speaker_name" in metadata[0] else "coqui"
|
||||||
|
emotion_name = None if "emotion_name" in metadata[0] else "neutral"
|
||||||
items = []
|
items = []
|
||||||
not_found_counter = 0
|
not_found_counter = 0
|
||||||
for row in metadata.itertuples():
|
for row in metadata:
|
||||||
if speaker_name is None and ignored_speakers is not None and row.speaker_name in ignored_speakers:
|
if speaker_name is None and ignored_speakers is not None and row["speaker_name"] in ignored_speakers:
|
||||||
continue
|
continue
|
||||||
audio_path = os.path.join(root_path, row.audio_file)
|
audio_path = os.path.join(root_path, row["audio_file"])
|
||||||
if not os.path.exists(audio_path):
|
if not os.path.exists(audio_path):
|
||||||
not_found_counter += 1
|
not_found_counter += 1
|
||||||
continue
|
continue
|
||||||
items.append(
|
items.append(
|
||||||
{
|
{
|
||||||
"text": row.text,
|
"text": row["text"],
|
||||||
"audio_file": audio_path,
|
"audio_file": audio_path,
|
||||||
"speaker_name": speaker_name if speaker_name is not None else row.speaker_name,
|
"speaker_name": speaker_name if speaker_name is not None else row["speaker_name"],
|
||||||
"emotion_name": emotion_name if emotion_name is not None else row.emotion_name,
|
"emotion_name": emotion_name if emotion_name is not None else row["emotion_name"],
|
||||||
"root_path": root_path,
|
"root_path": root_path,
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
if not_found_counter > 0:
|
if not_found_counter > 0:
|
||||||
print(f" | > [!] {not_found_counter} files not found")
|
logger.warning("%d files not found", not_found_counter)
|
||||||
return items
|
return items
|
||||||
|
|
||||||
|
|
||||||
|
@ -169,7 +176,7 @@ def mailabs(root_path, meta_files=None, ignored_speakers=None):
|
||||||
if isinstance(ignored_speakers, list):
|
if isinstance(ignored_speakers, list):
|
||||||
if speaker_name in ignored_speakers:
|
if speaker_name in ignored_speakers:
|
||||||
continue
|
continue
|
||||||
print(" | > {}".format(csv_file))
|
logger.info(csv_file)
|
||||||
with open(txt_file, "r", encoding="utf-8") as ttf:
|
with open(txt_file, "r", encoding="utf-8") as ttf:
|
||||||
for line in ttf:
|
for line in ttf:
|
||||||
cols = line.split("|")
|
cols = line.split("|")
|
||||||
|
@ -184,7 +191,7 @@ def mailabs(root_path, meta_files=None, ignored_speakers=None):
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
# M-AI-Labs have some missing samples, so just print the warning
|
# M-AI-Labs have some missing samples, so just print the warning
|
||||||
print("> File %s does not exist!" % (wav_file))
|
logger.warning("File %s does not exist!", wav_file)
|
||||||
return items
|
return items
|
||||||
|
|
||||||
|
|
||||||
|
@ -249,7 +256,7 @@ def sam_accenture(root_path, meta_file, **kwargs): # pylint: disable=unused-arg
|
||||||
text = item.text
|
text = item.text
|
||||||
wav_file = os.path.join(root_path, "vo_voice_quality_transformation", item.get("id") + ".wav")
|
wav_file = os.path.join(root_path, "vo_voice_quality_transformation", item.get("id") + ".wav")
|
||||||
if not os.path.exists(wav_file):
|
if not os.path.exists(wav_file):
|
||||||
print(f" [!] {wav_file} in metafile does not exist. Skipping...")
|
logger.warning("%s in metafile does not exist. Skipping...", wav_file)
|
||||||
continue
|
continue
|
||||||
items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path})
|
items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path})
|
||||||
return items
|
return items
|
||||||
|
@ -370,7 +377,7 @@ def custom_turkish(root_path, meta_file, **kwargs): # pylint: disable=unused-ar
|
||||||
continue
|
continue
|
||||||
text = cols[1].strip()
|
text = cols[1].strip()
|
||||||
items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path})
|
items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path})
|
||||||
print(f" [!] {len(skipped_files)} files skipped. They don't exist...")
|
logger.warning("%d files skipped. They don't exist...", len(skipped_files))
|
||||||
return items
|
return items
|
||||||
|
|
||||||
|
|
||||||
|
@ -438,7 +445,7 @@ def vctk(root_path, meta_files=None, wavs_path="wav48_silence_trimmed", mic="mic
|
||||||
{"text": text, "audio_file": wav_file, "speaker_name": "VCTK_" + speaker_id, "root_path": root_path}
|
{"text": text, "audio_file": wav_file, "speaker_name": "VCTK_" + speaker_id, "root_path": root_path}
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
print(f" [!] wav files don't exist - {wav_file}")
|
logger.warning("Wav file doesn't exist - %s", wav_file)
|
||||||
return items
|
return items
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,11 +1,14 @@
|
||||||
# From https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer
|
# From https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer
|
||||||
|
|
||||||
|
import logging
|
||||||
import os.path
|
import os.path
|
||||||
import shutil
|
import shutil
|
||||||
import urllib.request
|
import urllib.request
|
||||||
|
|
||||||
import huggingface_hub
|
import huggingface_hub
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class HubertManager:
|
class HubertManager:
|
||||||
@staticmethod
|
@staticmethod
|
||||||
|
@ -13,9 +16,9 @@ class HubertManager:
|
||||||
download_url: str = "https://dl.fbaipublicfiles.com/hubert/hubert_base_ls960.pt", model_path: str = ""
|
download_url: str = "https://dl.fbaipublicfiles.com/hubert/hubert_base_ls960.pt", model_path: str = ""
|
||||||
):
|
):
|
||||||
if not os.path.isfile(model_path):
|
if not os.path.isfile(model_path):
|
||||||
print("Downloading HuBERT base model")
|
logger.info("Downloading HuBERT base model")
|
||||||
urllib.request.urlretrieve(download_url, model_path)
|
urllib.request.urlretrieve(download_url, model_path)
|
||||||
print("Downloaded HuBERT")
|
logger.info("Downloaded HuBERT")
|
||||||
return model_path
|
return model_path
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
@ -27,9 +30,9 @@ class HubertManager:
|
||||||
):
|
):
|
||||||
model_dir = os.path.dirname(model_path)
|
model_dir = os.path.dirname(model_path)
|
||||||
if not os.path.isfile(model_path):
|
if not os.path.isfile(model_path):
|
||||||
print("Downloading HuBERT custom tokenizer")
|
logger.info("Downloading HuBERT custom tokenizer")
|
||||||
huggingface_hub.hf_hub_download(repo, model, local_dir=model_dir, local_dir_use_symlinks=False)
|
huggingface_hub.hf_hub_download(repo, model, local_dir=model_dir, local_dir_use_symlinks=False)
|
||||||
shutil.move(os.path.join(model_dir, model), model_path)
|
shutil.move(os.path.join(model_dir, model), model_path)
|
||||||
print("Downloaded tokenizer")
|
logger.info("Downloaded tokenizer")
|
||||||
return model_path
|
return model_path
|
||||||
return None
|
return None
|
||||||
|
|
|
@ -7,8 +7,6 @@ License: MIT
|
||||||
|
|
||||||
# Modified code from https://github.com/lucidrains/audiolm-pytorch/blob/main/audiolm_pytorch/hubert_kmeans.py
|
# Modified code from https://github.com/lucidrains/audiolm-pytorch/blob/main/audiolm_pytorch/hubert_kmeans.py
|
||||||
|
|
||||||
import logging
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
from einops import pack, unpack
|
from einops import pack, unpack
|
||||||
|
|
|
@ -5,6 +5,7 @@ License: MIT
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import json
|
import json
|
||||||
|
import logging
|
||||||
import os.path
|
import os.path
|
||||||
from zipfile import ZipFile
|
from zipfile import ZipFile
|
||||||
|
|
||||||
|
@ -12,6 +13,8 @@ import numpy
|
||||||
import torch
|
import torch
|
||||||
from torch import nn, optim
|
from torch import nn, optim
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class HubertTokenizer(nn.Module):
|
class HubertTokenizer(nn.Module):
|
||||||
def __init__(self, hidden_size=1024, input_size=768, output_size=10000, version=0):
|
def __init__(self, hidden_size=1024, input_size=768, output_size=10000, version=0):
|
||||||
|
@ -85,7 +88,7 @@ class HubertTokenizer(nn.Module):
|
||||||
|
|
||||||
# Print loss
|
# Print loss
|
||||||
if log_loss:
|
if log_loss:
|
||||||
print("Loss", loss.item())
|
logger.info("Loss %.3f", loss.item())
|
||||||
|
|
||||||
# Backward pass
|
# Backward pass
|
||||||
loss.backward()
|
loss.backward()
|
||||||
|
@ -157,10 +160,10 @@ def auto_train(data_path, save_path="model.pth", load_model: str = None, save_ep
|
||||||
data_x, data_y = [], []
|
data_x, data_y = [], []
|
||||||
|
|
||||||
if load_model and os.path.isfile(load_model):
|
if load_model and os.path.isfile(load_model):
|
||||||
print("Loading model from", load_model)
|
logger.info("Loading model from %s", load_model)
|
||||||
model_training = HubertTokenizer.load_from_checkpoint(load_model, "cuda")
|
model_training = HubertTokenizer.load_from_checkpoint(load_model, "cuda")
|
||||||
else:
|
else:
|
||||||
print("Creating new model.")
|
logger.info("Creating new model.")
|
||||||
model_training = HubertTokenizer(version=1).to("cuda") # Settings for the model to run without lstm
|
model_training = HubertTokenizer(version=1).to("cuda") # Settings for the model to run without lstm
|
||||||
save_path = os.path.join(data_path, save_path)
|
save_path = os.path.join(data_path, save_path)
|
||||||
base_save_path = ".".join(save_path.split(".")[:-1])
|
base_save_path = ".".join(save_path.split(".")[:-1])
|
||||||
|
@ -191,5 +194,5 @@ def auto_train(data_path, save_path="model.pth", load_model: str = None, save_ep
|
||||||
save_p_2 = f"{base_save_path}_epoch_{epoch}.pth"
|
save_p_2 = f"{base_save_path}_epoch_{epoch}.pth"
|
||||||
model_training.save(save_p)
|
model_training.save(save_p)
|
||||||
model_training.save(save_p_2)
|
model_training.save(save_p_2)
|
||||||
print(f"Epoch {epoch} completed")
|
logger.info("Epoch %d completed", epoch)
|
||||||
epoch += 1
|
epoch += 1
|
||||||
|
|
|
@ -2,10 +2,11 @@ import logging
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
from glob import glob
|
from glob import glob
|
||||||
from typing import Dict, List
|
from typing import Dict, List, Optional, Tuple
|
||||||
|
|
||||||
import librosa
|
import librosa
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
import numpy.typing as npt
|
||||||
import torch
|
import torch
|
||||||
import torchaudio
|
import torchaudio
|
||||||
import tqdm
|
import tqdm
|
||||||
|
@ -48,7 +49,7 @@ def get_voices(extra_voice_dirs: List[str] = []): # pylint: disable=dangerous-d
|
||||||
return voices
|
return voices
|
||||||
|
|
||||||
|
|
||||||
def load_npz(npz_file):
|
def load_npz(npz_file: str) -> Tuple[npt.NDArray[np.int64], npt.NDArray[np.int64], npt.NDArray[np.int64]]:
|
||||||
x_history = np.load(npz_file)
|
x_history = np.load(npz_file)
|
||||||
semantic = x_history["semantic_prompt"]
|
semantic = x_history["semantic_prompt"]
|
||||||
coarse = x_history["coarse_prompt"]
|
coarse = x_history["coarse_prompt"]
|
||||||
|
@ -56,7 +57,11 @@ def load_npz(npz_file):
|
||||||
return semantic, coarse, fine
|
return semantic, coarse, fine
|
||||||
|
|
||||||
|
|
||||||
def load_voice(model, voice: str, extra_voice_dirs: List[str] = []): # pylint: disable=dangerous-default-value
|
def load_voice(
|
||||||
|
model, voice: str, extra_voice_dirs: List[str] = []
|
||||||
|
) -> Tuple[
|
||||||
|
Optional[npt.NDArray[np.int64]], Optional[npt.NDArray[np.int64]], Optional[npt.NDArray[np.int64]]
|
||||||
|
]: # pylint: disable=dangerous-default-value
|
||||||
if voice == "random":
|
if voice == "random":
|
||||||
return None, None, None
|
return None, None, None
|
||||||
|
|
||||||
|
@ -107,11 +112,10 @@ def generate_voice(
|
||||||
model,
|
model,
|
||||||
output_path,
|
output_path,
|
||||||
):
|
):
|
||||||
"""Generate a new voice from a given audio and text prompt.
|
"""Generate a new voice from a given audio.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
audio (np.ndarray): The audio to use as a base for the new voice.
|
audio (np.ndarray): The audio to use as a base for the new voice.
|
||||||
text (str): Transcription of the audio you are clonning.
|
|
||||||
model (BarkModel): The BarkModel to use for generating the new voice.
|
model (BarkModel): The BarkModel to use for generating the new voice.
|
||||||
output_path (str): The path to save the generated voice to.
|
output_path (str): The path to save the generated voice to.
|
||||||
"""
|
"""
|
||||||
|
|
|
@ -2,6 +2,7 @@
|
||||||
Much of this code is adapted from Andrej Karpathy's NanoGPT
|
Much of this code is adapted from Andrej Karpathy's NanoGPT
|
||||||
(https://github.com/karpathy/nanoGPT)
|
(https://github.com/karpathy/nanoGPT)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import math
|
import math
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
|
|
@ -2,6 +2,7 @@
|
||||||
Much of this code is adapted from Andrej Karpathy's NanoGPT
|
Much of this code is adapted from Andrej Karpathy's NanoGPT
|
||||||
(https://github.com/karpathy/nanoGPT)
|
(https://github.com/karpathy/nanoGPT)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import math
|
import math
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
### credit: https://github.com/dunky11/voicesmith
|
### credit: https://github.com/dunky11/voicesmith
|
||||||
|
import logging
|
||||||
from typing import Callable, Dict, Tuple
|
from typing import Callable, Dict, Tuple
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
|
@ -20,6 +21,8 @@ from TTS.tts.layers.delightful_tts.variance_predictor import VariancePredictor
|
||||||
from TTS.tts.layers.generic.aligner import AlignmentNetwork
|
from TTS.tts.layers.generic.aligner import AlignmentNetwork
|
||||||
from TTS.tts.utils.helpers import generate_path, maximum_path, sequence_mask
|
from TTS.tts.utils.helpers import generate_path, maximum_path, sequence_mask
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class AcousticModel(torch.nn.Module):
|
class AcousticModel(torch.nn.Module):
|
||||||
def __init__(
|
def __init__(
|
||||||
|
@ -217,7 +220,7 @@ class AcousticModel(torch.nn.Module):
|
||||||
def _init_speaker_embedding(self):
|
def _init_speaker_embedding(self):
|
||||||
# pylint: disable=attribute-defined-outside-init
|
# pylint: disable=attribute-defined-outside-init
|
||||||
if self.num_speakers > 0:
|
if self.num_speakers > 0:
|
||||||
print(" > initialization of speaker-embedding layers.")
|
logger.info("Initialization of speaker-embedding layers.")
|
||||||
self.embedded_speaker_dim = self.args.speaker_embedding_channels
|
self.embedded_speaker_dim = self.args.speaker_embedding_channels
|
||||||
self.emb_g = nn.Embedding(self.num_speakers, self.embedded_speaker_dim)
|
self.emb_g = nn.Embedding(self.num_speakers, self.embedded_speaker_dim)
|
||||||
|
|
||||||
|
@ -362,7 +365,7 @@ class AcousticModel(torch.nn.Module):
|
||||||
|
|
||||||
pos_encoding = positional_encoding(
|
pos_encoding = positional_encoding(
|
||||||
self.emb_dim,
|
self.emb_dim,
|
||||||
max(token_embeddings.shape[1], max(mel_lens)),
|
max(token_embeddings.shape[1], *mel_lens),
|
||||||
device=token_embeddings.device,
|
device=token_embeddings.device,
|
||||||
)
|
)
|
||||||
encoder_outputs = self.encoder(
|
encoder_outputs = self.encoder(
|
||||||
|
|
|
@ -1,5 +1,4 @@
|
||||||
import torch
|
import torch
|
||||||
from packaging.version import Version
|
|
||||||
from torch import nn
|
from torch import nn
|
||||||
from torch.nn import functional as F
|
from torch.nn import functional as F
|
||||||
|
|
||||||
|
@ -90,10 +89,7 @@ class InvConvNear(nn.Module):
|
||||||
self.no_jacobian = no_jacobian
|
self.no_jacobian = no_jacobian
|
||||||
self.weight_inv = None
|
self.weight_inv = None
|
||||||
|
|
||||||
if Version(torch.__version__) < Version("1.9"):
|
w_init = torch.linalg.qr(torch.FloatTensor(self.num_splits, self.num_splits).normal_(), "complete")[0]
|
||||||
w_init = torch.qr(torch.FloatTensor(self.num_splits, self.num_splits).normal_())[0]
|
|
||||||
else:
|
|
||||||
w_init = torch.linalg.qr(torch.FloatTensor(self.num_splits, self.num_splits).normal_(), "complete")[0]
|
|
||||||
|
|
||||||
if torch.det(w_init) < 0:
|
if torch.det(w_init) < 0:
|
||||||
w_init[:, 0] = -1 * w_init[:, 0]
|
w_init[:, 0] = -1 * w_init[:, 0]
|
||||||
|
|
|
@ -5,6 +5,7 @@ from torch import nn
|
||||||
from torch.nn import functional as F
|
from torch.nn import functional as F
|
||||||
|
|
||||||
from TTS.tts.layers.generic.normalization import LayerNorm, LayerNorm2
|
from TTS.tts.layers.generic.normalization import LayerNorm, LayerNorm2
|
||||||
|
from TTS.tts.utils.helpers import convert_pad_shape
|
||||||
|
|
||||||
|
|
||||||
class RelativePositionMultiHeadAttention(nn.Module):
|
class RelativePositionMultiHeadAttention(nn.Module):
|
||||||
|
@ -300,7 +301,7 @@ class FeedForwardNetwork(nn.Module):
|
||||||
pad_l = self.kernel_size - 1
|
pad_l = self.kernel_size - 1
|
||||||
pad_r = 0
|
pad_r = 0
|
||||||
padding = [[0, 0], [0, 0], [pad_l, pad_r]]
|
padding = [[0, 0], [0, 0], [pad_l, pad_r]]
|
||||||
x = F.pad(x, self._pad_shape(padding))
|
x = F.pad(x, convert_pad_shape(padding))
|
||||||
return x
|
return x
|
||||||
|
|
||||||
def _same_padding(self, x):
|
def _same_padding(self, x):
|
||||||
|
@ -309,15 +310,9 @@ class FeedForwardNetwork(nn.Module):
|
||||||
pad_l = (self.kernel_size - 1) // 2
|
pad_l = (self.kernel_size - 1) // 2
|
||||||
pad_r = self.kernel_size // 2
|
pad_r = self.kernel_size // 2
|
||||||
padding = [[0, 0], [0, 0], [pad_l, pad_r]]
|
padding = [[0, 0], [0, 0], [pad_l, pad_r]]
|
||||||
x = F.pad(x, self._pad_shape(padding))
|
x = F.pad(x, convert_pad_shape(padding))
|
||||||
return x
|
return x
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _pad_shape(padding):
|
|
||||||
l = padding[::-1]
|
|
||||||
pad_shape = [item for sublist in l for item in sublist]
|
|
||||||
return pad_shape
|
|
||||||
|
|
||||||
|
|
||||||
class RelativePositionTransformer(nn.Module):
|
class RelativePositionTransformer(nn.Module):
|
||||||
"""Transformer with Relative Positional Encoding.
|
"""Transformer with Relative Positional Encoding.
|
||||||
|
|
|
@ -1,3 +1,4 @@
|
||||||
|
import logging
|
||||||
import math
|
import math
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
@ -10,6 +11,8 @@ from TTS.tts.utils.helpers import sequence_mask
|
||||||
from TTS.tts.utils.ssim import SSIMLoss as _SSIMLoss
|
from TTS.tts.utils.ssim import SSIMLoss as _SSIMLoss
|
||||||
from TTS.utils.audio.torch_transforms import TorchSTFT
|
from TTS.utils.audio.torch_transforms import TorchSTFT
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
# pylint: disable=abstract-method
|
# pylint: disable=abstract-method
|
||||||
# relates https://github.com/pytorch/pytorch/issues/42305
|
# relates https://github.com/pytorch/pytorch/issues/42305
|
||||||
|
@ -132,11 +135,11 @@ class SSIMLoss(torch.nn.Module):
|
||||||
ssim_loss = self.loss_func((y_norm * mask).unsqueeze(1), (y_hat_norm * mask).unsqueeze(1))
|
ssim_loss = self.loss_func((y_norm * mask).unsqueeze(1), (y_hat_norm * mask).unsqueeze(1))
|
||||||
|
|
||||||
if ssim_loss.item() > 1.0:
|
if ssim_loss.item() > 1.0:
|
||||||
print(f" > SSIM loss is out-of-range {ssim_loss.item()}, setting it 1.0")
|
logger.info("SSIM loss is out-of-range (%.2f), setting it to 1.0", ssim_loss.item())
|
||||||
ssim_loss = torch.tensor(1.0, device=ssim_loss.device)
|
ssim_loss = torch.tensor(1.0, device=ssim_loss.device)
|
||||||
|
|
||||||
if ssim_loss.item() < 0.0:
|
if ssim_loss.item() < 0.0:
|
||||||
print(f" > SSIM loss is out-of-range {ssim_loss.item()}, setting it 0.0")
|
logger.info("SSIM loss is out-of-range (%.2f), setting it to 0.0", ssim_loss.item())
|
||||||
ssim_loss = torch.tensor(0.0, device=ssim_loss.device)
|
ssim_loss = torch.tensor(0.0, device=ssim_loss.device)
|
||||||
|
|
||||||
return ssim_loss
|
return ssim_loss
|
||||||
|
@ -252,7 +255,7 @@ class GuidedAttentionLoss(torch.nn.Module):
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _make_ga_mask(ilen, olen, sigma):
|
def _make_ga_mask(ilen, olen, sigma):
|
||||||
grid_x, grid_y = torch.meshgrid(torch.arange(olen).to(olen), torch.arange(ilen).to(ilen))
|
grid_x, grid_y = torch.meshgrid(torch.arange(olen).to(olen), torch.arange(ilen).to(ilen), indexing="ij")
|
||||||
grid_x, grid_y = grid_x.float(), grid_y.float()
|
grid_x, grid_y = grid_x.float(), grid_y.float()
|
||||||
return 1.0 - torch.exp(-((grid_y / ilen - grid_x / olen) ** 2) / (2 * (sigma**2)))
|
return 1.0 - torch.exp(-((grid_y / ilen - grid_x / olen) ** 2) / (2 * (sigma**2)))
|
||||||
|
|
||||||
|
|
|
@ -1,3 +1,4 @@
|
||||||
|
import logging
|
||||||
from typing import List, Tuple
|
from typing import List, Tuple
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
|
@ -8,6 +9,8 @@ from tqdm.auto import tqdm
|
||||||
from TTS.tts.layers.tacotron.common_layers import Linear
|
from TTS.tts.layers.tacotron.common_layers import Linear
|
||||||
from TTS.tts.layers.tacotron.tacotron2 import ConvBNBlock
|
from TTS.tts.layers.tacotron.tacotron2 import ConvBNBlock
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class Encoder(nn.Module):
|
class Encoder(nn.Module):
|
||||||
r"""Neural HMM Encoder
|
r"""Neural HMM Encoder
|
||||||
|
@ -213,8 +216,8 @@ class Outputnet(nn.Module):
|
||||||
original_tensor = std.clone().detach()
|
original_tensor = std.clone().detach()
|
||||||
std = torch.clamp(std, min=self.std_floor)
|
std = torch.clamp(std, min=self.std_floor)
|
||||||
if torch.any(original_tensor != std):
|
if torch.any(original_tensor != std):
|
||||||
print(
|
logger.info(
|
||||||
"[*] Standard deviation was floored! The model is preventing overfitting, nothing serious to worry about"
|
"Standard deviation was floored! The model is preventing overfitting, nothing serious to worry about"
|
||||||
)
|
)
|
||||||
return std
|
return std
|
||||||
|
|
||||||
|
|
|
@ -128,7 +128,8 @@ class NeuralHMM(nn.Module):
|
||||||
# Get mean, std and transition vector from decoder for this timestep
|
# Get mean, std and transition vector from decoder for this timestep
|
||||||
# Note: Gradient checkpointing currently doesn't work with multiple gpus inside a loop
|
# Note: Gradient checkpointing currently doesn't work with multiple gpus inside a loop
|
||||||
if self.use_grad_checkpointing and self.training:
|
if self.use_grad_checkpointing and self.training:
|
||||||
mean, std, transition_vector = checkpoint(self.output_net, h_memory, inputs)
|
# TODO: use_reentrant=False is recommended
|
||||||
|
mean, std, transition_vector = checkpoint(self.output_net, h_memory, inputs, use_reentrant=True)
|
||||||
else:
|
else:
|
||||||
mean, std, transition_vector = self.output_net(h_memory, inputs)
|
mean, std, transition_vector = self.output_net(h_memory, inputs)
|
||||||
|
|
||||||
|
|
|
@ -71,7 +71,7 @@ def plot_transition_probabilities_to_numpy(states, transition_probabilities, out
|
||||||
ax.set_title("Transition probability of state")
|
ax.set_title("Transition probability of state")
|
||||||
ax.set_xlabel("hidden state")
|
ax.set_xlabel("hidden state")
|
||||||
ax.set_ylabel("probability")
|
ax.set_ylabel("probability")
|
||||||
ax.set_xticks([i for i in range(len(transition_probabilities))]) # pylint: disable=unnecessary-comprehension
|
ax.set_xticks(list(range(len(transition_probabilities))))
|
||||||
ax.set_xticklabels([int(x) for x in states], rotation=90)
|
ax.set_xticklabels([int(x) for x in states], rotation=90)
|
||||||
plt.tight_layout()
|
plt.tight_layout()
|
||||||
if not output_fig:
|
if not output_fig:
|
||||||
|
|
|
@ -1,12 +1,16 @@
|
||||||
# coding: utf-8
|
# coding: utf-8
|
||||||
# adapted from https://github.com/r9y9/tacotron_pytorch
|
# adapted from https://github.com/r9y9/tacotron_pytorch
|
||||||
|
|
||||||
|
import logging
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
from torch import nn
|
from torch import nn
|
||||||
|
|
||||||
from .attentions import init_attn
|
from .attentions import init_attn
|
||||||
from .common_layers import Prenet
|
from .common_layers import Prenet
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class BatchNormConv1d(nn.Module):
|
class BatchNormConv1d(nn.Module):
|
||||||
r"""A wrapper for Conv1d with BatchNorm. It sets the activation
|
r"""A wrapper for Conv1d with BatchNorm. It sets the activation
|
||||||
|
@ -480,7 +484,7 @@ class Decoder(nn.Module):
|
||||||
if t > inputs.shape[1] / 4 and (stop_token > 0.6 or attention[:, -1].item() > 0.6):
|
if t > inputs.shape[1] / 4 and (stop_token > 0.6 or attention[:, -1].item() > 0.6):
|
||||||
break
|
break
|
||||||
if t > self.max_decoder_steps:
|
if t > self.max_decoder_steps:
|
||||||
print(" | > Decoder stopped with 'max_decoder_steps")
|
logger.info("Decoder stopped with `max_decoder_steps` %d", self.max_decoder_steps)
|
||||||
break
|
break
|
||||||
return self._parse_outputs(outputs, attentions, stop_tokens)
|
return self._parse_outputs(outputs, attentions, stop_tokens)
|
||||||
|
|
||||||
|
|
|
@ -1,3 +1,5 @@
|
||||||
|
import logging
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
from torch import nn
|
from torch import nn
|
||||||
from torch.nn import functional as F
|
from torch.nn import functional as F
|
||||||
|
@ -5,6 +7,8 @@ from torch.nn import functional as F
|
||||||
from .attentions import init_attn
|
from .attentions import init_attn
|
||||||
from .common_layers import Linear, Prenet
|
from .common_layers import Linear, Prenet
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
# pylint: disable=no-value-for-parameter
|
# pylint: disable=no-value-for-parameter
|
||||||
# pylint: disable=unexpected-keyword-arg
|
# pylint: disable=unexpected-keyword-arg
|
||||||
|
@ -356,7 +360,7 @@ class Decoder(nn.Module):
|
||||||
if stop_token > self.stop_threshold and t > inputs.shape[0] // 2:
|
if stop_token > self.stop_threshold and t > inputs.shape[0] // 2:
|
||||||
break
|
break
|
||||||
if len(outputs) == self.max_decoder_steps:
|
if len(outputs) == self.max_decoder_steps:
|
||||||
print(f" > Decoder stopped with `max_decoder_steps` {self.max_decoder_steps}")
|
logger.info("Decoder stopped with `max_decoder_steps` %d", self.max_decoder_steps)
|
||||||
break
|
break
|
||||||
|
|
||||||
memory = self._update_memory(decoder_output)
|
memory = self._update_memory(decoder_output)
|
||||||
|
@ -389,7 +393,7 @@ class Decoder(nn.Module):
|
||||||
if stop_token > 0.7:
|
if stop_token > 0.7:
|
||||||
break
|
break
|
||||||
if len(outputs) == self.max_decoder_steps:
|
if len(outputs) == self.max_decoder_steps:
|
||||||
print(" | > Decoder stopped with 'max_decoder_steps")
|
logger.info("Decoder stopped with `max_decoder_steps` %d", self.max_decoder_steps)
|
||||||
break
|
break
|
||||||
|
|
||||||
self.memory_truncated = decoder_output
|
self.memory_truncated = decoder_output
|
||||||
|
|
|
@ -1,6 +1,5 @@
|
||||||
import functools
|
import functools
|
||||||
import math
|
import math
|
||||||
import os
|
|
||||||
|
|
||||||
import fsspec
|
import fsspec
|
||||||
import torch
|
import torch
|
||||||
|
|
|
@ -1,3 +1,4 @@
|
||||||
|
import logging
|
||||||
import os
|
import os
|
||||||
from glob import glob
|
from glob import glob
|
||||||
from typing import Dict, List
|
from typing import Dict, List
|
||||||
|
@ -10,6 +11,8 @@ from scipy.io.wavfile import read
|
||||||
|
|
||||||
from TTS.utils.audio.torch_transforms import TorchSTFT
|
from TTS.utils.audio.torch_transforms import TorchSTFT
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
def load_wav_to_torch(full_path):
|
def load_wav_to_torch(full_path):
|
||||||
sampling_rate, data = read(full_path)
|
sampling_rate, data = read(full_path)
|
||||||
|
@ -28,7 +31,7 @@ def check_audio(audio, audiopath: str):
|
||||||
# Check some assumptions about audio range. This should be automatically fixed in load_wav_to_torch, but might not be in some edge cases, where we should squawk.
|
# Check some assumptions about audio range. This should be automatically fixed in load_wav_to_torch, but might not be in some edge cases, where we should squawk.
|
||||||
# '2' is arbitrarily chosen since it seems like audio will often "overdrive" the [-1,1] bounds.
|
# '2' is arbitrarily chosen since it seems like audio will often "overdrive" the [-1,1] bounds.
|
||||||
if torch.any(audio > 2) or not torch.any(audio < 0):
|
if torch.any(audio > 2) or not torch.any(audio < 0):
|
||||||
print(f"Error with {audiopath}. Max={audio.max()} min={audio.min()}")
|
logger.error("Error with %s. Max=%.2f min=%.2f", audiopath, audio.max(), audio.min())
|
||||||
audio.clip_(-1, 1)
|
audio.clip_(-1, 1)
|
||||||
|
|
||||||
|
|
||||||
|
@ -136,7 +139,7 @@ def load_voices(voices: List[str], extra_voice_dirs: List[str] = []):
|
||||||
for voice in voices:
|
for voice in voices:
|
||||||
if voice == "random":
|
if voice == "random":
|
||||||
if len(voices) > 1:
|
if len(voices) > 1:
|
||||||
print("Cannot combine a random voice with a non-random voice. Just using a random voice.")
|
logger.warning("Cannot combine a random voice with a non-random voice. Just using a random voice.")
|
||||||
return None, None
|
return None, None
|
||||||
clip, latent = load_voice(voice, extra_voice_dirs)
|
clip, latent = load_voice(voice, extra_voice_dirs)
|
||||||
if latent is None:
|
if latent is None:
|
||||||
|
|
|
@ -126,7 +126,7 @@ class CLVP(nn.Module):
|
||||||
text_latents = self.to_text_latent(text_latents)
|
text_latents = self.to_text_latent(text_latents)
|
||||||
speech_latents = self.to_speech_latent(speech_latents)
|
speech_latents = self.to_speech_latent(speech_latents)
|
||||||
|
|
||||||
text_latents, speech_latents = map(lambda t: F.normalize(t, p=2, dim=-1), (text_latents, speech_latents))
|
text_latents, speech_latents = (F.normalize(t, p=2, dim=-1) for t in (text_latents, speech_latents))
|
||||||
|
|
||||||
temp = self.temperature.exp()
|
temp = self.temperature.exp()
|
||||||
|
|
||||||
|
|
|
@ -972,7 +972,7 @@ class GaussianDiffusion:
|
||||||
assert False # not currently supported for this type of diffusion.
|
assert False # not currently supported for this type of diffusion.
|
||||||
elif self.loss_type == LossType.MSE or self.loss_type == LossType.RESCALED_MSE:
|
elif self.loss_type == LossType.MSE or self.loss_type == LossType.RESCALED_MSE:
|
||||||
model_outputs = model(x_t, x_start, self._scale_timesteps(t), **model_kwargs)
|
model_outputs = model(x_t, x_start, self._scale_timesteps(t), **model_kwargs)
|
||||||
terms.update({k: o for k, o in zip(model_output_keys, model_outputs)})
|
terms.update(dict(zip(model_output_keys, model_outputs)))
|
||||||
model_output = terms[gd_out_key]
|
model_output = terms[gd_out_key]
|
||||||
if self.model_var_type in [
|
if self.model_var_type in [
|
||||||
ModelVarType.LEARNED,
|
ModelVarType.LEARNED,
|
||||||
|
|
|
@ -1,7 +1,10 @@
|
||||||
|
import logging
|
||||||
import math
|
import math
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class NoiseScheduleVP:
|
class NoiseScheduleVP:
|
||||||
def __init__(
|
def __init__(
|
||||||
|
@ -1171,7 +1174,7 @@ class DPM_Solver:
|
||||||
lambda_0 - lambda_s,
|
lambda_0 - lambda_s,
|
||||||
)
|
)
|
||||||
nfe += order
|
nfe += order
|
||||||
print("adaptive solver nfe", nfe)
|
logger.debug("adaptive solver nfe %d", nfe)
|
||||||
return x
|
return x
|
||||||
|
|
||||||
def add_noise(self, x, t, noise=None):
|
def add_noise(self, x, t, noise=None):
|
||||||
|
|
|
@ -37,7 +37,7 @@ def route_args(router, args, depth):
|
||||||
for key in matched_keys:
|
for key in matched_keys:
|
||||||
val = args[key]
|
val = args[key]
|
||||||
for depth, ((f_args, g_args), routes) in enumerate(zip(routed_args, router[key])):
|
for depth, ((f_args, g_args), routes) in enumerate(zip(routed_args, router[key])):
|
||||||
new_f_args, new_g_args = map(lambda route: ({key: val} if route else {}), routes)
|
new_f_args, new_g_args = (({key: val} if route else {}) for route in routes)
|
||||||
routed_args[depth] = ({**f_args, **new_f_args}, {**g_args, **new_g_args})
|
routed_args[depth] = ({**f_args, **new_f_args}, {**g_args, **new_g_args})
|
||||||
return routed_args
|
return routed_args
|
||||||
|
|
||||||
|
@ -152,7 +152,7 @@ class Attention(nn.Module):
|
||||||
softmax = torch.softmax
|
softmax = torch.softmax
|
||||||
|
|
||||||
qkv = self.to_qkv(x).chunk(3, dim=-1)
|
qkv = self.to_qkv(x).chunk(3, dim=-1)
|
||||||
q, k, v = map(lambda t: rearrange(t, "b n (h d) -> b h n d", h=h), qkv)
|
q, k, v = (rearrange(t, "b n (h d) -> b h n d", h=h) for t in qkv)
|
||||||
|
|
||||||
q = q * self.scale
|
q = q * self.scale
|
||||||
|
|
||||||
|
|
|
@ -1,8 +1,11 @@
|
||||||
|
import logging
|
||||||
import os
|
import os
|
||||||
from urllib import request
|
from urllib import request
|
||||||
|
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
DEFAULT_MODELS_DIR = os.path.join(os.path.expanduser("~"), ".cache", "tortoise", "models")
|
DEFAULT_MODELS_DIR = os.path.join(os.path.expanduser("~"), ".cache", "tortoise", "models")
|
||||||
MODELS_DIR = os.environ.get("TORTOISE_MODELS_DIR", DEFAULT_MODELS_DIR)
|
MODELS_DIR = os.environ.get("TORTOISE_MODELS_DIR", DEFAULT_MODELS_DIR)
|
||||||
MODELS_DIR = "/data/speech_synth/models/"
|
MODELS_DIR = "/data/speech_synth/models/"
|
||||||
|
@ -28,10 +31,10 @@ def download_models(specific_models=None):
|
||||||
model_path = os.path.join(MODELS_DIR, model_name)
|
model_path = os.path.join(MODELS_DIR, model_name)
|
||||||
if os.path.exists(model_path):
|
if os.path.exists(model_path):
|
||||||
continue
|
continue
|
||||||
print(f"Downloading {model_name} from {url}...")
|
logger.info("Downloading %s from %s...", model_name, url)
|
||||||
with tqdm(unit="B", unit_scale=True, unit_divisor=1024, miniters=1) as t:
|
with tqdm(unit="B", unit_scale=True, unit_divisor=1024, miniters=1) as t:
|
||||||
request.urlretrieve(url, model_path, lambda nb, bs, fs, t=t: t.update(nb * bs - t.n))
|
request.urlretrieve(url, model_path, lambda nb, bs, fs, t=t: t.update(nb * bs - t.n))
|
||||||
print("Done.")
|
logger.info("Done.")
|
||||||
|
|
||||||
|
|
||||||
def get_model_path(model_name, models_dir=MODELS_DIR):
|
def get_model_path(model_name, models_dir=MODELS_DIR):
|
||||||
|
|
|
@ -84,7 +84,7 @@ def init_zero_(layer):
|
||||||
|
|
||||||
|
|
||||||
def pick_and_pop(keys, d):
|
def pick_and_pop(keys, d):
|
||||||
values = list(map(lambda key: d.pop(key), keys))
|
values = [d.pop(key) for key in keys]
|
||||||
return dict(zip(keys, values))
|
return dict(zip(keys, values))
|
||||||
|
|
||||||
|
|
||||||
|
@ -107,7 +107,7 @@ def group_by_key_prefix(prefix, d):
|
||||||
|
|
||||||
def groupby_prefix_and_trim(prefix, d):
|
def groupby_prefix_and_trim(prefix, d):
|
||||||
kwargs_with_prefix, kwargs = group_dict_by_key(partial(string_begins_with, prefix), d)
|
kwargs_with_prefix, kwargs = group_dict_by_key(partial(string_begins_with, prefix), d)
|
||||||
kwargs_without_prefix = dict(map(lambda x: (x[0][len(prefix) :], x[1]), tuple(kwargs_with_prefix.items())))
|
kwargs_without_prefix = {x[0][len(prefix) :]: x[1] for x in tuple(kwargs_with_prefix.items())}
|
||||||
return kwargs_without_prefix, kwargs
|
return kwargs_without_prefix, kwargs
|
||||||
|
|
||||||
|
|
||||||
|
@ -428,7 +428,7 @@ class ShiftTokens(nn.Module):
|
||||||
feats_per_shift = x.shape[-1] // segments
|
feats_per_shift = x.shape[-1] // segments
|
||||||
splitted = x.split(feats_per_shift, dim=-1)
|
splitted = x.split(feats_per_shift, dim=-1)
|
||||||
segments_to_shift, rest = splitted[:segments], splitted[segments:]
|
segments_to_shift, rest = splitted[:segments], splitted[segments:]
|
||||||
segments_to_shift = list(map(lambda args: shift(*args, mask=mask), zip(segments_to_shift, shifts)))
|
segments_to_shift = [shift(*args, mask=mask) for args in zip(segments_to_shift, shifts)]
|
||||||
x = torch.cat((*segments_to_shift, *rest), dim=-1)
|
x = torch.cat((*segments_to_shift, *rest), dim=-1)
|
||||||
return self.fn(x, **kwargs)
|
return self.fn(x, **kwargs)
|
||||||
|
|
||||||
|
@ -635,7 +635,7 @@ class Attention(nn.Module):
|
||||||
v = self.to_v(v_input)
|
v = self.to_v(v_input)
|
||||||
|
|
||||||
if not collab_heads:
|
if not collab_heads:
|
||||||
q, k, v = map(lambda t: rearrange(t, "b n (h d) -> b h n d", h=h), (q, k, v))
|
q, k, v = (rearrange(t, "b n (h d) -> b h n d", h=h) for t in (q, k, v))
|
||||||
else:
|
else:
|
||||||
q = einsum("b i d, h d -> b h i d", q, self.collab_mixing)
|
q = einsum("b i d, h d -> b h i d", q, self.collab_mixing)
|
||||||
k = rearrange(k, "b n d -> b () n d")
|
k = rearrange(k, "b n d -> b () n d")
|
||||||
|
@ -650,9 +650,9 @@ class Attention(nn.Module):
|
||||||
|
|
||||||
if exists(rotary_pos_emb) and not has_context:
|
if exists(rotary_pos_emb) and not has_context:
|
||||||
l = rotary_pos_emb.shape[-1]
|
l = rotary_pos_emb.shape[-1]
|
||||||
(ql, qr), (kl, kr), (vl, vr) = map(lambda t: (t[..., :l], t[..., l:]), (q, k, v))
|
(ql, qr), (kl, kr), (vl, vr) = ((t[..., :l], t[..., l:]) for t in (q, k, v))
|
||||||
ql, kl, vl = map(lambda t: apply_rotary_pos_emb(t, rotary_pos_emb), (ql, kl, vl))
|
ql, kl, vl = (apply_rotary_pos_emb(t, rotary_pos_emb) for t in (ql, kl, vl))
|
||||||
q, k, v = map(lambda t: torch.cat(t, dim=-1), ((ql, qr), (kl, kr), (vl, vr)))
|
q, k, v = (torch.cat(t, dim=-1) for t in ((ql, qr), (kl, kr), (vl, vr)))
|
||||||
|
|
||||||
input_mask = None
|
input_mask = None
|
||||||
if any(map(exists, (mask, context_mask))):
|
if any(map(exists, (mask, context_mask))):
|
||||||
|
@ -664,7 +664,7 @@ class Attention(nn.Module):
|
||||||
input_mask = q_mask * k_mask
|
input_mask = q_mask * k_mask
|
||||||
|
|
||||||
if self.num_mem_kv > 0:
|
if self.num_mem_kv > 0:
|
||||||
mem_k, mem_v = map(lambda t: repeat(t, "h n d -> b h n d", b=b), (self.mem_k, self.mem_v))
|
mem_k, mem_v = (repeat(t, "h n d -> b h n d", b=b) for t in (self.mem_k, self.mem_v))
|
||||||
k = torch.cat((mem_k, k), dim=-2)
|
k = torch.cat((mem_k, k), dim=-2)
|
||||||
v = torch.cat((mem_v, v), dim=-2)
|
v = torch.cat((mem_v, v), dim=-2)
|
||||||
if exists(input_mask):
|
if exists(input_mask):
|
||||||
|
@ -964,9 +964,7 @@ class AttentionLayers(nn.Module):
|
||||||
seq_len = x.shape[1]
|
seq_len = x.shape[1]
|
||||||
if past_key_values is not None:
|
if past_key_values is not None:
|
||||||
seq_len += past_key_values[0][0].shape[-2]
|
seq_len += past_key_values[0][0].shape[-2]
|
||||||
max_rotary_emb_length = max(
|
max_rotary_emb_length = max([(m.shape[1] if exists(m) else 0) + seq_len for m in mems] + [expected_seq_len])
|
||||||
list(map(lambda m: (m.shape[1] if exists(m) else 0) + seq_len, mems)) + [expected_seq_len]
|
|
||||||
)
|
|
||||||
rotary_pos_emb = self.rotary_pos_emb(max_rotary_emb_length, x.device)
|
rotary_pos_emb = self.rotary_pos_emb(max_rotary_emb_length, x.device)
|
||||||
|
|
||||||
present_key_values = []
|
present_key_values = []
|
||||||
|
@ -1200,7 +1198,7 @@ class TransformerWrapper(nn.Module):
|
||||||
|
|
||||||
res = [out]
|
res = [out]
|
||||||
if return_attn:
|
if return_attn:
|
||||||
attn_maps = list(map(lambda t: t.post_softmax_attn, intermediates.attn_intermediates))
|
attn_maps = [t.post_softmax_attn for t in intermediates.attn_intermediates]
|
||||||
res.append(attn_maps)
|
res.append(attn_maps)
|
||||||
if use_cache:
|
if use_cache:
|
||||||
res.append(intermediates.past_key_values)
|
res.append(intermediates.past_key_values)
|
||||||
|
@ -1249,7 +1247,7 @@ class ContinuousTransformerWrapper(nn.Module):
|
||||||
|
|
||||||
res = [out]
|
res = [out]
|
||||||
if return_attn:
|
if return_attn:
|
||||||
attn_maps = list(map(lambda t: t.post_softmax_attn, intermediates.attn_intermediates))
|
attn_maps = [t.post_softmax_attn for t in intermediates.attn_intermediates]
|
||||||
res.append(attn_maps)
|
res.append(attn_maps)
|
||||||
if use_cache:
|
if use_cache:
|
||||||
res.append(intermediates.past_key_values)
|
res.append(intermediates.past_key_values)
|
||||||
|
|
|
@ -2,7 +2,7 @@ import torch
|
||||||
from torch import nn
|
from torch import nn
|
||||||
from torch.nn.modules.conv import Conv1d
|
from torch.nn.modules.conv import Conv1d
|
||||||
|
|
||||||
from TTS.vocoder.models.hifigan_discriminator import DiscriminatorP, MultiPeriodDiscriminator
|
from TTS.vocoder.models.hifigan_discriminator import DiscriminatorP
|
||||||
|
|
||||||
|
|
||||||
class DiscriminatorS(torch.nn.Module):
|
class DiscriminatorS(torch.nn.Module):
|
||||||
|
|
|
@ -10,22 +10,6 @@ from TTS.tts.utils.helpers import sequence_mask
|
||||||
LRELU_SLOPE = 0.1
|
LRELU_SLOPE = 0.1
|
||||||
|
|
||||||
|
|
||||||
def convert_pad_shape(pad_shape):
|
|
||||||
l = pad_shape[::-1]
|
|
||||||
pad_shape = [item for sublist in l for item in sublist]
|
|
||||||
return pad_shape
|
|
||||||
|
|
||||||
|
|
||||||
def init_weights(m, mean=0.0, std=0.01):
|
|
||||||
classname = m.__class__.__name__
|
|
||||||
if classname.find("Conv") != -1:
|
|
||||||
m.weight.data.normal_(mean, std)
|
|
||||||
|
|
||||||
|
|
||||||
def get_padding(kernel_size, dilation=1):
|
|
||||||
return int((kernel_size * dilation - dilation) / 2)
|
|
||||||
|
|
||||||
|
|
||||||
class TextEncoder(nn.Module):
|
class TextEncoder(nn.Module):
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue