mirror of https://github.com/coqui-ai/TTS.git
commit 56ba616a03
@@ -1,2 +1,9 @@
 .git/
 Dockerfile
+build/
+dist/
+TTS.egg-info/
+tests/outputs/*
+tests/train_outputs/*
+__pycache__/
+*.pyc
@@ -21,9 +21,9 @@ jobs:
         python-version: [3.7, 3.8, 3.9, "3.10"]
         experimental: [false]
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
       - name: Set up Python ${{ matrix.python-version }}
-        uses: coqui-ai/setup-python@pip-cache-key-py-ver
+        uses: actions/setup-python@v4
         with:
           python-version: ${{ matrix.python-version }}
           architecture: x64
@@ -21,9 +21,9 @@ jobs:
         python-version: [3.7, 3.8, 3.9, "3.10"]
         experimental: [false]
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
       - name: Set up Python ${{ matrix.python-version }}
-        uses: coqui-ai/setup-python@pip-cache-key-py-ver
+        uses: actions/setup-python@v4
         with:
           python-version: ${{ matrix.python-version }}
           architecture: x64
@@ -15,8 +15,8 @@ jobs:
       matrix:
         arch: ["amd64"]
         base:
-          - "nvcr.io/nvidia/pytorch:22.03-py3" # GPU enabled
-          - "ubuntu:20.04" # CPU only
+          - "nvidia/cuda:11.8.0-base-ubuntu22.04" # GPU enabled
+          - "python:3.10.8-slim" # CPU only
     steps:
       - uses: actions/checkout@v2
       - name: Log in to the Container registry
@@ -32,7 +32,7 @@ jobs:
           base="ghcr.io/coqui-ai/tts"
           tags="" # PR build

-          if [[ ${{ matrix.base }} = "ubuntu:20.04" ]]; then
+          if [[ ${{ matrix.base }} = "python:3.10.8-slim" ]]; then
             base="ghcr.io/coqui-ai/tts-cpu"
           fi
@@ -21,9 +21,9 @@ jobs:
         python-version: [3.7, 3.8, 3.9, "3.10"]
         experimental: [false]
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
       - name: Set up Python ${{ matrix.python-version }}
-        uses: coqui-ai/setup-python@pip-cache-key-py-ver
+        uses: actions/setup-python@v4
         with:
           python-version: ${{ matrix.python-version }}
           architecture: x64
@@ -21,9 +21,9 @@ jobs:
         python-version: [3.9]
         experimental: [false]
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
       - name: Set up Python ${{ matrix.python-version }}
-        uses: coqui-ai/setup-python@pip-cache-key-py-ver
+        uses: actions/setup-python@v4
         with:
           python-version: ${{ matrix.python-version }}
           architecture: x64
@@ -21,9 +21,9 @@ jobs:
         python-version: [3.7, 3.8, 3.9, "3.10"]
         experimental: [false]
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
      - name: Set up Python ${{ matrix.python-version }}
-        uses: coqui-ai/setup-python@pip-cache-key-py-ver
+        uses: actions/setup-python@v4
         with:
           python-version: ${{ matrix.python-version }}
           architecture: x64
@@ -21,9 +21,9 @@ jobs:
         python-version: [3.7, 3.8, 3.9, "3.10"]
         experimental: [false]
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
       - name: Set up Python ${{ matrix.python-version }}
-        uses: coqui-ai/setup-python@pip-cache-key-py-ver
+        uses: actions/setup-python@v4
         with:
           python-version: ${{ matrix.python-version }}
           architecture: x64
@@ -21,9 +21,9 @@ jobs:
         python-version: [3.7, 3.8, 3.9, "3.10"]
         experimental: [false]
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
       - name: Set up Python ${{ matrix.python-version }}
-        uses: coqui-ai/setup-python@pip-cache-key-py-ver
+        uses: actions/setup-python@v4
         with:
           python-version: ${{ matrix.python-version }}
           architecture: x64
@@ -0,0 +1,52 @@
+name: zoo-tests-0
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+    types: [opened, synchronize, reopened]
+jobs:
+  check_skip:
+    runs-on: ubuntu-latest
+    if: "! contains(github.event.head_commit.message, '[ci skip]')"
+    steps:
+      - run: echo "${{ github.event.head_commit.message }}"
+
+  test:
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: [3.7, 3.8, 3.9, "3.10"]
+        experimental: [false]
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+          architecture: x64
+          cache: 'pip'
+          cache-dependency-path: 'requirements*'
+      - name: check OS
+        run: cat /etc/os-release
+      - name: Install dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y git make gcc
+          sudo apt-get install espeak espeak-ng
+          make system-deps
+      - name: Install/upgrade Python setup deps
+        run: python3 -m pip install --upgrade pip setuptools wheel
+      - name: Replace scarf urls
+        run: |
+          sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json
+      - name: Install TTS
+        run: |
+          python3 -m pip install .[all]
+          python3 setup.py egg_info
+      - name: Unit tests
+        run: |
+          nose2 -F -v -B TTS tests.zoo_tests.test_models.test_models_offset_0_step_3
+          nose2 -F -v -B TTS tests.zoo_tests.test_models.test_voice_conversion
@@ -1,4 +1,4 @@
-name: zoo-tests
+name: zoo-tests-1

 on:
   push:
@@ -21,9 +21,9 @@ jobs:
         python-version: [3.7, 3.8, 3.9, "3.10"]
         experimental: [false]
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
       - name: Set up Python ${{ matrix.python-version }}
-        uses: coqui-ai/setup-python@pip-cache-key-py-ver
+        uses: actions/setup-python@v4
         with:
           python-version: ${{ matrix.python-version }}
           architecture: x64
@@ -47,4 +47,4 @@ jobs:
           python3 -m pip install .[all]
           python3 setup.py egg_info
       - name: Unit tests
-        run: make test_zoo
+        run: nose2 -F -v -B --with-coverage --coverage TTS tests.zoo_tests.test_models.test_models_offset_1_step_3
@@ -0,0 +1,50 @@
+name: zoo-tests-2
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+    types: [opened, synchronize, reopened]
+jobs:
+  check_skip:
+    runs-on: ubuntu-latest
+    if: "! contains(github.event.head_commit.message, '[ci skip]')"
+    steps:
+      - run: echo "${{ github.event.head_commit.message }}"
+
+  test:
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: [3.7, 3.8, 3.9, "3.10"]
+        experimental: [false]
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+          architecture: x64
+          cache: 'pip'
+          cache-dependency-path: 'requirements*'
+      - name: check OS
+        run: cat /etc/os-release
+      - name: Install dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y git make gcc
+          sudo apt-get install espeak espeak-ng
+          make system-deps
+      - name: Install/upgrade Python setup deps
+        run: python3 -m pip install --upgrade pip setuptools wheel
+      - name: Replace scarf urls
+        run: |
+          sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json
+      - name: Install TTS
+        run: |
+          python3 -m pip install .[all]
+          python3 setup.py egg_info
+      - name: Unit tests
+        run: nose2 -F -v -B --with-coverage --coverage TTS tests.zoo_tests.test_models.test_models_offset_2_step_3
Dockerfile (18 changes)
@@ -1,20 +1,12 @@
-ARG BASE=nvcr.io/nvidia/pytorch:22.03-py3
+ARG BASE=nvidia/cuda:11.8.0-base-ubuntu22.04
 FROM ${BASE}
-RUN apt-get update && apt-get install -y --no-install-recommends gcc g++ make python3 python3-dev python3-pip python3-venv python3-wheel espeak espeak-ng libsndfile1-dev && rm -rf /var/lib/apt/lists/*
-RUN pip install llvmlite --ignore-installed
-# Create and activate virtual env
-ENV VIRTUAL_ENV=/venv
-RUN python3 -m venv $VIRTUAL_ENV
-ENV PATH="$VIRTUAL_ENV/bin:$PATH"
-RUN pip install -U pip setuptools wheel
+RUN apt-get update && apt-get upgrade -y
+RUN apt-get install -y --no-install-recommends gcc g++ make python3 python3-dev python3-pip python3-venv python3-wheel espeak-ng libsndfile1-dev && rm -rf /var/lib/apt/lists/*
+RUN pip3 install llvmlite --ignore-installed

 WORKDIR /root
-COPY requirements.txt /root
-COPY requirements.dev.txt /root
-COPY requirements.notebooks.txt /root
-RUN ["/bin/bash", "-c", "pip install -r <(cat requirements.txt requirements.dev.txt requirements.notebooks.txt)"]
 COPY . /root
+RUN pip3 install torch torchaudio --extra-index-url https://download.pytorch.org/whl/cu118
 RUN make install
 ENTRYPOINT ["tts"]
 CMD ["--help"]
README.md (43 changes)
@@ -1,9 +1,16 @@
-# <img src="https://raw.githubusercontent.com/coqui-ai/TTS/main/images/coqui-log-green-TTS.png" height="56"/>
+<img src="https://raw.githubusercontent.com/coqui-ai/TTS/main/images/coqui-log-green-TTS.png" height="56"/>

+----
+
+### 📣 Clone your voice with a single click on [🐸Coqui.ai](https://app.coqui.ai/auth/signin)
+### 📣 🐸Coqui Studio is launching soon!! Join our [waiting list](https://coqui.ai/)!!
+
+----
+
 🐸TTS is a library for advanced Text-to-Speech generation. It's built on the latest research, was designed to achieve the best trade-off among ease-of-training, speed and quality.
 🐸TTS comes with pretrained models, tools for measuring dataset quality and already used in **20+ languages** for products and research projects.

-[](https://gitter.im/coqui-ai/TTS?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge)
+[](https://discord.gg/5eXr5seRrv)
 [](https://opensource.org/licenses/MPL-2.0)
 [](https://badge.fury.io/py/TTS)
 [](https://github.com/coqui-ai/TTS/blob/master/CODE_OF_CONDUCT.md)
@@ -36,12 +43,12 @@ Please use our dedicated channels for questions and discussion. Help is much mor
 | ------------------------------- | --------------------------------------- |
 | 🚨 **Bug Reports** | [GitHub Issue Tracker] |
 | 🎁 **Feature Requests & Ideas** | [GitHub Issue Tracker] |
-| 👩💻 **Usage Questions** | [Github Discussions] |
-| 🗯 **General Discussion** | [Github Discussions] or [Gitter Room] |
+| 👩💻 **Usage Questions** | [GitHub Discussions] |
+| 🗯 **General Discussion** | [GitHub Discussions] or [Discord] |

 [github issue tracker]: https://github.com/coqui-ai/tts/issues
 [github discussions]: https://github.com/coqui-ai/TTS/discussions
-[gitter room]: https://gitter.im/coqui-ai/TTS?utm_source=share-link&utm_medium=link&utm_campaign=share-link
+[discord]: https://discord.gg/5eXr5seRrv
 [Tutorials and Examples]: https://github.com/coqui-ai/TTS/wiki/TTS-Notebooks-and-Tutorials

@@ -75,7 +82,7 @@ Underlined "TTS*" and "Judy*" are 🐸TTS models
 - Modular (but not too much) code base enabling easy implementation of new ideas.

 ## Implemented Models
-### Text-to-Spectrogram
+### Spectrogram models
 - Tacotron: [paper](https://arxiv.org/abs/1703.10135)
 - Tacotron2: [paper](https://arxiv.org/abs/1712.05884)
 - Glow-TTS: [paper](https://arxiv.org/abs/2005.11129)
@@ -83,9 +90,12 @@ Underlined "TTS*" and "Judy*" are 🐸TTS models
 - Align-TTS: [paper](https://arxiv.org/abs/2003.01950)
 - FastPitch: [paper](https://arxiv.org/pdf/2006.06873.pdf)
 - FastSpeech: [paper](https://arxiv.org/abs/1905.09263)
+- SC-GlowTTS: [paper](https://arxiv.org/abs/2104.05557)
+- Capacitron: [paper](https://arxiv.org/abs/1906.03402)

 ### End-to-End Models
 - VITS: [paper](https://arxiv.org/pdf/2106.06103)
+- YourTTS: [paper](https://arxiv.org/abs/2112.02418)

 ### Attention Methods
 - Guided Attention: [paper](https://arxiv.org/abs/1710.08969)
@@ -136,6 +146,21 @@ $ make install

 If you are on Windows, 👑@GuyPaddock wrote installation instructions [here](https://stackoverflow.com/questions/66726331/how-can-i-run-mozilla-tts-coqui-tts-training-with-cuda-on-a-windows-system).

+
+## Docker Image
+You can also try TTS without install with the docker image.
+Simply run the following command and you will be able to run TTS without installing it.
+
+```bash
+docker run --rm -it -p 5002:5002 --entrypoint /bin/bash ghcr.io/coqui-ai/tts-cpu
+python3 TTS/server/server.py --list_models #To get the list of available models
+python3 TTS/server/server.py --model_name tts_models/en/vctk/vits # To start a server
+```
+
+You can then enjoy the TTS server [here](http://[::1]:5002/)
+More details about the docker images (like GPU support) can be found [here](https://tts.readthedocs.io/en/latest/docker_images.html)
+
+
 ## Use TTS

 ### Single Speaker Models
@@ -208,7 +233,7 @@ If you are on Windows, 👑@GuyPaddock wrote installation instructions [here](ht

 - Run your own TTS and Vocoder models:
 ```
-$ tts --text "Text for TTS" --model_path path/to/config.json --config_path path/to/model.pth --out_path output/path/speech.wav
+$ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav
     --vocoder_path path/to/vocoder.pth --vocoder_config_path path/to/vocoder_config.json
 ```

@@ -229,7 +254,7 @@ If you are on Windows, 👑@GuyPaddock wrote installation instructions [here](ht
 - Run your own multi-speaker TTS model:

 ```
-$ tts --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/config.json --config_path path/to/model.pth --speakers_file_path path/to/speaker.json --speaker_idx <speaker_id>
+$ tts --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/model.pth --config_path path/to/config.json --speakers_file_path path/to/speaker.json --speaker_idx <speaker_id>
 ```

 ## Directory Structure
@@ -239,8 +264,6 @@ If you are on Windows, 👑@GuyPaddock wrote installation instructions [here](ht
 |- TTS
     |- bin/             (folder for all the executables.)
       |- train*.py                  (train your target model.)
-      |- distribute.py              (train your TTS model using Multiple GPUs.)
-      |- compute_statistics.py      (compute dataset statistics for normalization.)
       |- ...
     |- tts/             (text to speech models)
         |- layers/                (model layer definitions)
TTS/.models.json (257 changes)
@@ -12,6 +12,61 @@
                 }
             }
         },
+        "bg": {
+            "cv": {
+                "vits":{
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--bg--cv--vits.zip",
+                    "default_vocoder": null,
+                    "commit": null,
+                    "author": "@NeonGeckoCom",
+                    "license": "bsd-3-clause"
+                }
+            }
+        },
+        "cs": {
+            "cv": {
+                "vits":{
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--cs--cv--vits.zip",
+                    "default_vocoder": null,
+                    "commit": null,
+                    "author": "@NeonGeckoCom",
+                    "license": "bsd-3-clause"
+                }
+            }
+        },
+        "da": {
+            "cv": {
+                "vits":{
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--da--cv--vits.zip",
+                    "default_vocoder": null,
+                    "commit": null,
+                    "author": "@NeonGeckoCom",
+                    "license": "bsd-3-clause"
+                }
+            }
+        },
+        "et": {
+            "cv": {
+                "vits":{
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--et--cv--vits.zip",
+                    "default_vocoder": null,
+                    "commit": null,
+                    "author": "@NeonGeckoCom",
+                    "license": "bsd-3-clause"
+                }
+            }
+        },
+        "ga": {
+            "cv": {
+                "vits":{
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--ga--cv--vits.zip",
+                    "default_vocoder": null,
+                    "commit": null,
+                    "author": "@NeonGeckoCom",
+                    "license": "bsd-3-clause"
+                }
+            }
+        },
         "en": {
             "ek1": {
                 "tacotron2": {
@@ -79,6 +134,14 @@
                     "license": "apache 2.0",
                     "contact": "egolge@coqui.com"
                 },
+                "vits--neon": {
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--en--ljspeech--vits.zip",
+                    "default_vocoder": null,
+                    "author": "@NeonGeckoCom",
+                    "license": "bsd-3-clause",
+                    "contact": null,
+                    "commit": null
+                },
                 "fast_pitch": {
                     "description": "FastPitch model trained on LJSpeech using the Aligner Network",
                     "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--fast_pitch.zip",
@@ -130,10 +193,10 @@
                     "license": "apache 2.0",
                     "contact": "adamfroghyar@gmail.com"
                 },
-                "capacitron-t2-c150": {
+                "capacitron-t2-c150_v2": {
                     "description": "Capacitron additions to Tacotron 2 with Capacity at 150 as in https://arxiv.org/pdf/1906.03402.pdf",
-                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.0_models/tts_models--en--blizzard2013--capacitron-t2-c150.zip",
-                    "commit": "d6284e7",
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.1_models/tts_models--en--blizzard2013--capacitron-t2-c150_v2.zip",
+                    "commit": "a67039d",
                     "default_vocoder": "vocoder_models/en/blizzard2013/hifigan_v2",
                     "author": "Adam Froghyar @a-froghyar",
                     "license": "apache 2.0",
@@ -151,6 +214,15 @@
                     "license": "MPL",
                     "contact": "egolge@coqui.com"
                 }
+            },
+            "css10":{
+                "vits":{
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--es--css10--vits.zip",
+                    "default_vocoder": null,
+                    "commit": null,
+                    "author": "@NeonGeckoCom",
+                    "license": "bsd-3-clause"
+                }
             }
         },
         "fr": {
@@ -158,11 +230,20 @@
             "tacotron2-DDC": {
                 "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--fr--mai--tacotron2-DDC.zip",
                 "default_vocoder": "vocoder_models/universal/libri-tts/fullband-melgan",
-                "commit": "",
+                "commit": null,
                 "author": "Eren Gölge @erogol",
                 "license": "MPL",
                 "contact": "egolge@coqui.com"
             }
+        },
+        "css10":{
+            "vits":{
+                "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--fr--css10--vits.zip",
+                "default_vocoder": null,
+                "commit": null,
+                "author": "@NeonGeckoCom",
+                "license": "bsd-3-clause"
+            }
         }
     },
     "uk":{
@@ -174,6 +255,13 @@
                 "license": "MIT",
                 "contact": "",
                 "default_vocoder": "vocoder_models/uk/mai/multiband-melgan"
+            },
+            "vits":{
+                "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--uk--mai--vits.zip",
+                "default_vocoder": null,
+                "commit": null,
+                "author": "@NeonGeckoCom",
+                "license": "bsd-3-clause"
             }
         }
     },
@@ -198,6 +286,15 @@
                 "stats_file": null,
                 "commit": "540d811"
             }
+        },
+        "css10":{
+            "vits":{
+                "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--nl--css10--vits.zip",
+                "default_vocoder": null,
+                "commit": null,
+                "author": "@NeonGeckoCom",
+                "license": "bsd-3-clause"
+            }
         }
     },
     "de": {
@@ -224,6 +321,15 @@
                 "license": "apache 2.0",
                 "commit": "unknown"
             }
+        },
+        "css10": {
+            "vits-neon":{
+                "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--de--css10--vits.zip",
+                "default_vocoder": null,
+                "author": "@NeonGeckoCom",
+                "license": "bsd-3-clause",
+                "commit": null
+            }
         }
     },
     "ja": {
@@ -359,6 +465,149 @@
                     "commit": "1b22f03"
                 }
             }
+        },
+        "hu": {
+            "css10": {
+                "vits": {
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--hu--css10--vits.zip",
+                    "default_vocoder": null,
+                    "commit": null,
+                    "author": "@NeonGeckoCom",
+                    "license": "bsd-3-clause"
+                }
+            }
+        },
+        "el": {
+            "cv": {
+                "vits": {
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--el--cv--vits.zip",
+                    "default_vocoder": null,
+                    "commit": null,
+                    "author": "@NeonGeckoCom",
+                    "license": "bsd-3-clause"
+                }
+            }
+        },
+        "fi": {
+            "css10": {
+                "vits":{
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--fi--css10--vits.zip",
+                    "default_vocoder": null,
+                    "commit": null,
+                    "author": "@NeonGeckoCom",
+                    "license": "bsd-3-clause"
+                }
+            }
+        },
+        "hr": {
+            "cv": {
+                "vits":{
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--hr--cv--vits.zip",
+                    "default_vocoder": null,
+                    "commit": null,
+                    "author": "@NeonGeckoCom",
+                    "license": "bsd-3-clause"
+                }
+            }
+        },
+        "lt": {
+            "cv": {
+                "vits":{
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--lt--cv--vits.zip",
+                    "default_vocoder": null,
+                    "commit": null,
+                    "author": "@NeonGeckoCom",
+                    "license": "bsd-3-clause"
+                }
+            }
+        },
+        "lv": {
+            "cv": {
+                "vits":{
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--lv--cv--vits.zip",
+                    "default_vocoder": null,
+                    "commit": null,
+                    "author": "@NeonGeckoCom",
+                    "license": "bsd-3-clause"
+                }
+            }
+        },
+        "mt": {
+            "cv": {
+                "vits":{
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--mt--cv--vits.zip",
+                    "default_vocoder": null,
+                    "commit": null,
+                    "author": "@NeonGeckoCom",
+                    "license": "bsd-3-clause"
+                }
+            }
+        },
+        "pl": {
+            "mai_female": {
+                "vits":{
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--pl--mai_female--vits.zip",
+                    "default_vocoder": null,
+                    "commit": null,
+                    "author": "@NeonGeckoCom",
+                    "license": "bsd-3-clause"
+                }
+            }
+        },
+        "pt": {
+            "cv": {
+                "vits":{
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--pt--cv--vits.zip",
+                    "default_vocoder": null,
+                    "commit": null,
+                    "author": "@NeonGeckoCom",
+                    "license": "bsd-3-clause"
+                }
+            }
+        },
+        "ro": {
+            "cv": {
+                "vits":{
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--ro--cv--vits.zip",
+                    "default_vocoder": null,
+                    "commit": null,
+                    "author": "@NeonGeckoCom",
+                    "license": "bsd-3-clause"
+                }
+            }
+        },
+        "sk": {
+            "cv": {
+                "vits":{
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--sk--cv--vits.zip",
+                    "default_vocoder": null,
+                    "commit": null,
+                    "author": "@NeonGeckoCom",
+                    "license": "bsd-3-clause"
+                }
+            }
+        },
+        "sl": {
+            "cv": {
+                "vits":{
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--sl--cv--vits.zip",
+                    "default_vocoder": null,
+                    "commit": null,
+                    "author": "@NeonGeckoCom",
+                    "license": "bsd-3-clause"
+                }
+            }
+        },
+        "sv": {
+            "cv": {
+                "vits":{
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--sv--cv--vits.zip",
+                    "default_vocoder": null,
+                    "commit": null,
+                    "author": "@NeonGeckoCom",
+                    "license": "bsd-3-clause"
+                }
+            }
         }
     },
     "vocoder_models": {
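For illustration, one of the newly listed VITS models can be fetched through the model manager, which also gains a `progress_bar` option later in this commit. This is only a sketch, not part of the commit: it assumes 🐸TTS is installed so the packaged `.models.json` resolves, that the release archive behind the scarf URL is reachable, and that `download_model` returns the (model path, config path, model item) triple as in the CLI code.

```python
# Sketch: download one of the models added above via ModelManager (assumptions noted above).
from pathlib import Path

import TTS
from TTS.utils.manage import ModelManager

models_file = Path(TTS.__file__).parent / ".models.json"
manager = ModelManager(models_file, progress_bar=True)  # progress_bar is the option added in this commit

# "tts_models/bg/cv/vits" is one of the new entries; any other listed name works the same way.
model_path, config_path, model_item = manager.download_model("tts_models/bg/cv/vits")
print(model_path, config_path, model_item["license"])
```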
@@ -1 +1 @@
-0.8.0
+0.9.0
@@ -6,38 +6,87 @@ import torch
 from tqdm import tqdm

 from TTS.config import load_config
+from TTS.config.shared_configs import BaseDatasetConfig
 from TTS.tts.datasets import load_tts_samples
 from TTS.tts.utils.managers import save_file
 from TTS.tts.utils.speakers import SpeakerManager

 parser = argparse.ArgumentParser(
-    description="""Compute embedding vectors for each wav file in a dataset.\n\n"""
+    description="""Compute embedding vectors for each audio file in a dataset and store them keyed by `{dataset_name}#{file_path}` in a .pth file\n\n"""
     """
     Example runs:
-    python TTS/bin/compute_embeddings.py speaker_encoder_model.pth speaker_encoder_config.json dataset_config.json
+    python TTS/bin/compute_embeddings.py --model_path speaker_encoder_model.pth --config_path speaker_encoder_config.json --config_dataset_path dataset_config.json
+
+    python TTS/bin/compute_embeddings.py --model_path speaker_encoder_model.pth --config_path speaker_encoder_config.json --fomatter vctk --dataset_path /path/to/vctk/dataset --dataset_name my_vctk --metafile /path/to/vctk/metafile.csv
     """,
     formatter_class=RawTextHelpFormatter,
 )
-parser.add_argument("model_path", type=str, help="Path to model checkpoint file.")
-parser.add_argument("config_path", type=str, help="Path to model config file.")
-parser.add_argument("config_dataset_path", type=str, help="Path to dataset config file.")
+parser.add_argument(
+    "--model_path",
+    type=str,
+    help="Path to model checkpoint file. It defaults to the released speaker encoder.",
+    default="https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar",
+)
+parser.add_argument(
+    "--config_path",
+    type=str,
+    help="Path to model config file. It defaults to the released speaker encoder config.",
+    default="https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json",
+)
+parser.add_argument(
+    "--config_dataset_path",
+    type=str,
+    help="Path to dataset config file. You either need to provide this or `formatter_name`, `dataset_name` and `dataset_path` arguments.",
+    default=None,
+)
 parser.add_argument("--output_path", type=str, help="Path for output `pth` or `json` file.", default="speakers.pth")
 parser.add_argument("--old_file", type=str, help="Previous embedding file to only compute new audios.", default=None)
 parser.add_argument("--disable_cuda", type=bool, help="Flag to disable cuda.", default=False)
 parser.add_argument("--no_eval", type=bool, help="Do not compute eval?. Default False", default=False)
+parser.add_argument(
+    "--formatter_name",
+    type=str,
+    help="Name of the formatter to use. You either need to provide this or `config_dataset_path`",
+    default=None,
+)
+parser.add_argument(
+    "--dataset_name",
+    type=str,
+    help="Name of the dataset to use. You either need to provide this or `config_dataset_path`",
+    default=None,
+)
+parser.add_argument(
+    "--dataset_path",
+    type=str,
+    help="Path to the dataset. You either need to provide this or `config_dataset_path`",
+    default=None,
+)
+parser.add_argument(
+    "--metafile",
+    type=str,
+    help="Path to the meta file. If not set, dataset formatter uses the default metafile if it is defined in the formatter. You either need to provide this or `config_dataset_path`",
+    default=None,
+)
 args = parser.parse_args()

 use_cuda = torch.cuda.is_available() and not args.disable_cuda

-c_dataset = load_config(args.config_dataset_path)
-
-meta_data_train, meta_data_eval = load_tts_samples(c_dataset.datasets, eval_split=not args.no_eval)
+if args.config_dataset_path is not None:
+    c_dataset = load_config(args.config_dataset_path)
+    meta_data_train, meta_data_eval = load_tts_samples(c_dataset.datasets, eval_split=not args.no_eval)
+else:
+    c_dataset = BaseDatasetConfig()
+    c_dataset.formatter = args.formatter_name
+    c_dataset.dataset_name = args.dataset_name
+    c_dataset.path = args.dataset_path
+    c_dataset.meta_file_train = args.metafile if args.metafile else None
+    meta_data_train, meta_data_eval = load_tts_samples(c_dataset, eval_split=not args.no_eval)

 if meta_data_eval is None:
-    wav_files = meta_data_train
+    samples = meta_data_train
 else:
-    wav_files = meta_data_train + meta_data_eval
+    samples = meta_data_train + meta_data_eval

 encoder_manager = SpeakerManager(
     encoder_model_path=args.model_path,
@@ -50,25 +99,23 @@ class_name_key = encoder_manager.encoder_config.class_name_key

 # compute speaker embeddings
 speaker_mapping = {}
-for idx, wav_file in enumerate(tqdm(wav_files)):
-    if isinstance(wav_file, dict):
-        class_name = wav_file[class_name_key]
-        wav_file = wav_file["audio_file"]
-    else:
-        class_name = None
+for idx, fields in enumerate(tqdm(samples)):
+    class_name = fields[class_name_key]
+    audio_file = fields["audio_file"]
+    embedding_key = fields["audio_unique_name"]
+    root_path = fields["root_path"]

-    wav_file_name = os.path.basename(wav_file)
-    if args.old_file is not None and wav_file_name in encoder_manager.clip_ids:
+    if args.old_file is not None and embedding_key in encoder_manager.clip_ids:
         # get the embedding from the old file
-        embedd = encoder_manager.get_embedding_by_clip(wav_file_name)
+        embedd = encoder_manager.get_embedding_by_clip(embedding_key)
     else:
         # extract the embedding
-        embedd = encoder_manager.compute_embedding_from_clip(wav_file)
+        embedd = encoder_manager.compute_embedding_from_clip(audio_file)

     # create speaker_mapping if target dataset is defined
-    speaker_mapping[wav_file_name] = {}
-    speaker_mapping[wav_file_name]["name"] = class_name
-    speaker_mapping[wav_file_name]["embedding"] = embedd
+    speaker_mapping[embedding_key] = {}
+    speaker_mapping[embedding_key]["name"] = class_name
+    speaker_mapping[embedding_key]["embedding"] = embedd

 if speaker_mapping:
     # save speaker_mapping if target dataset is defined
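For illustration only, the new flow in `TTS/bin/compute_embeddings.py` can be sketched programmatically. The dataset path is a placeholder, the encoder URLs are simply the new argparse defaults shown above (assumed to be accepted as fsspec paths), and the hard-coded `"speaker_name"` key stands in for the `class_name_key` the script reads from the encoder config.

```python
# Sketch: build the dataset config in code (mirroring --formatter_name/--dataset_name/
# --dataset_path) and key the embeddings by each sample's `audio_unique_name`.
from TTS.config.shared_configs import BaseDatasetConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.utils.speakers import SpeakerManager

dataset_config = BaseDatasetConfig()
dataset_config.formatter = "vctk"        # a formatter defined in TTS.tts.datasets.formatters
dataset_config.dataset_name = "my_vctk"  # used to build the `{dataset_name}#{file_path}` keys
dataset_config.path = "/path/to/vctk/dataset"  # placeholder

train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
samples = train_samples + (eval_samples or [])

encoder_manager = SpeakerManager(
    encoder_model_path="https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar",
    encoder_config_path="https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json",
    use_cuda=False,
)

speaker_mapping = {}
for fields in samples:
    embedding = encoder_manager.compute_embedding_from_clip(fields["audio_file"])
    speaker_mapping[fields["audio_unique_name"]] = {  # keyed by audio_unique_name, not basename
        "name": fields["speaker_name"],
        "embedding": embedding,
    }
```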
@@ -37,7 +37,7 @@ def setup_loader(ap, r, verbose=False):
         precompute_num_workers=0,
         use_noise_augment=False,
         verbose=verbose,
-        speaker_id_mapping=speaker_manager.ids if c.use_speaker_embedding else None,
+        speaker_id_mapping=speaker_manager.name_to_id if c.use_speaker_embedding else None,
         d_vector_mapping=speaker_manager.embeddings if c.use_d_vector_file else None,
     )

@@ -7,30 +7,25 @@ from tqdm.contrib.concurrent import process_map

 from TTS.config import load_config
 from TTS.tts.datasets import load_tts_samples
-from TTS.tts.utils.text.phonemizers.gruut_wrapper import Gruut
-
-phonemizer = Gruut(language="en-us")
+from TTS.tts.utils.text.phonemizers import Gruut


 def compute_phonemes(item):
-    try:
-        text = item[0]
-        ph = phonemizer.phonemize(text).split("|")
-    except:
-        return []
-    return list(set(ph))
+    text = item["text"]
+    ph = phonemizer.phonemize(text).replace("|", "")
+    return set(list(ph))


 def main():
     # pylint: disable=W0601
-    global c
+    global c, phonemizer
     # pylint: disable=bad-option-value
     parser = argparse.ArgumentParser(
         description="""Find all the unique characters or phonemes in a dataset.\n\n"""
         """
     Example runs:

-    python TTS/bin/find_unique_chars.py --config_path config.json
+    python TTS/bin/find_unique_phonemes.py --config_path config.json
     """,
         formatter_class=RawTextHelpFormatter,
     )
@@ -46,15 +41,24 @@ def main():
     items = train_items + eval_items
     print("Num items:", len(items))

-    is_lang_def = all(item["language"] for item in items)
+    language_list = [item["language"] for item in items]
+    is_lang_def = all(language_list)

     if not c.phoneme_language or not is_lang_def:
         raise ValueError("Phoneme language must be defined in config.")

+    if not language_list.count(language_list[0]) == len(language_list):
+        raise ValueError(
+            "Currently, just one phoneme language per config file is supported !! Please split the dataset config into different configs and run it individually for each language !!"
+        )
+
+    phonemizer = Gruut(language=language_list[0], keep_puncs=True)
+
     phonemes = process_map(compute_phonemes, items, max_workers=multiprocessing.cpu_count(), chunksize=15)
     phones = []
     for ph in phonemes:
         phones.extend(ph)

     phones = set(phones)
     lower_phones = filter(lambda c: c.islower(), phones)
     phones_force_lower = [c.lower() for c in phones]
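As a small illustration of the phonemizer that `TTS/bin/find_unique_phonemes.py` now builds per config (assuming `gruut` with English data is installed; "en-us" is just an example language):

```python
# Sketch of the Gruut wrapper usage introduced above; keep_puncs=True matches the new call in main().
from TTS.tts.utils.text.phonemizers import Gruut

phonemizer = Gruut(language="en-us", keep_puncs=True)
ph = phonemizer.phonemize("Be a voice, not an echo.").replace("|", "")  # strip the "|" separators
print(ph)
```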
@@ -17,7 +17,7 @@ def adjust_path_and_remove_silence(audio_path):
     # create all directory structure
     pathlib.Path(output_path).parent.mkdir(parents=True, exist_ok=True)
     # remove the silence and save the audio
-    output_path = remove_silence(
+    output_path, is_speech = remove_silence(
         model_and_utils,
         audio_path,
         output_path,
@@ -25,26 +25,34 @@ def adjust_path_and_remove_silence(audio_path):
         use_cuda=args.use_cuda,
     )

-    return output_path
+    return output_path, is_speech


 def preprocess_audios():
     files = sorted(glob.glob(os.path.join(args.input_dir, args.glob), recursive=True))
     print("> Number of files: ", len(files))
     if not args.force:
-        print("> Ignoring files that already exist in the output directory.")
+        print("> Ignoring files that already exist in the output idrectory.")

     if args.trim_just_beginning_and_end:
         print("> Trimming just the beginning and the end with nonspeech parts.")
     else:
         print("> Trimming all nonspeech parts.")

+    filtered_files = []
     if files:
         # create threads
         # num_threads = multiprocessing.cpu_count()
         # process_map(adjust_path_and_remove_silence, files, max_workers=num_threads, chunksize=15)
         for f in tqdm(files):
-            adjust_path_and_remove_silence(f)
+            output_path, is_speech = adjust_path_and_remove_silence(f)
+            if not is_speech:
+                filtered_files.append(output_path)
+
+        # write files that do not have speech
+        with open(os.path.join(args.output_dir, "filtered_files.txt"), "w", encoding="utf-8") as f:
+            for file in filtered_files:
+                f.write(file + "\n")
     else:
         print("> No files Found !")
@@ -238,6 +238,13 @@ If you don't specify any models, then it uses LJSpeech based English model.
         help="speaker ID of the reference_wav speaker (If not provided the embedding will be computed using the Speaker Encoder).",
         default=None,
     )
+    parser.add_argument(
+        "--progress_bar",
+        type=str2bool,
+        help="If true shows a progress bar for the model download. Defaults to True",
+        default=True,
+    )
+
     args = parser.parse_args()

     # print the description if either text or list_models is not set
@@ -255,7 +262,7 @@ If you don't specify any models, then it uses LJSpeech based English model.

     # load model manager
     path = Path(__file__).parent / "../.models.json"
-    manager = ModelManager(path)
+    manager = ModelManager(path, progress_bar=args.progress_bar)

     model_path = None
     config_path = None
@@ -323,7 +330,7 @@ If you don't specify any models, then it uses LJSpeech based English model.
         print(
             " > Available speaker ids: (Set --speaker_idx flag to one of these values to use the multi-speaker model."
         )
-        print(synthesizer.tts_model.speaker_manager.ids)
+        print(synthesizer.tts_model.speaker_manager.name_to_id)
         return

     # query langauge ids of a multi-lingual model.
@@ -331,7 +338,7 @@ If you don't specify any models, then it uses LJSpeech based English model.
         print(
             " > Available language ids: (Set --language_idx flag to one of these values to use the multi-lingual model."
         )
-        print(synthesizer.tts_model.language_manager.ids)
+        print(synthesizer.tts_model.language_manager.name_to_id)
         return

     # check the arguments against a multi-speaker model.
@@ -62,7 +62,7 @@ class BaseAudioConfig(Coqpit):
             Maximum frequency of the F0 frames. Defaults to ```640```.

         pitch_fmin (float, optional):
-            Minimum frequency of the F0 frames. Defaults to ```0```.
+            Minimum frequency of the F0 frames. Defaults to ```1```.

         trim_db (int):
             Silence threshold used for silence trimming. Defaults to 45.
@@ -144,7 +144,7 @@ class BaseAudioConfig(Coqpit):
     do_amp_to_db_mel: bool = True
     # f0 params
     pitch_fmax: float = 640.0
-    pitch_fmin: float = 0.0
+    pitch_fmin: float = 1.0
     # normalization params
     signal_norm: bool = True
     min_level_db: int = -100
@@ -193,21 +193,24 @@ class BaseDatasetConfig(Coqpit):
     """Base config for TTS datasets.

     Args:
-        name (str):
-            Dataset name that defines the preprocessor in use. Defaults to None.
+        formatter (str):
+            Formatter name that defines used formatter in ```TTS.tts.datasets.formatter```. Defaults to `""`.
+
+        dataset_name (str):
+            Unique name for the dataset. Defaults to `""`.

         path (str):
-            Root path to the dataset files. Defaults to None.
+            Root path to the dataset files. Defaults to `""`.

         meta_file_train (str):
             Name of the dataset meta file. Or a list of speakers to be ignored at training for multi-speaker datasets.
-            Defaults to None.
+            Defaults to `""`.

         ignored_speakers (List):
             List of speakers IDs that are not used at the training. Default None.

         language (str):
-            Language code of the dataset. If defined, it overrides `phoneme_language`. Defaults to None.
+            Language code of the dataset. If defined, it overrides `phoneme_language`. Defaults to `""`.

         meta_file_val (str):
             Name of the dataset meta file that defines the instances used at validation.
@@ -217,7 +220,8 @@ class BaseDatasetConfig(Coqpit):
         train the duration predictor.
     """

-    name: str = ""
+    formatter: str = ""
+    dataset_name: str = ""
     path: str = ""
     meta_file_train: str = ""
     ignored_speakers: List[str] = None
@@ -230,7 +234,7 @@ class BaseDatasetConfig(Coqpit):
     ):
         """Check config fields"""
         c = asdict(self)
-        check_argument("name", c, restricted=True)
+        check_argument("formatter", c, restricted=True)
         check_argument("path", c, restricted=True)
         check_argument("meta_file_train", c, restricted=True)
         check_argument("meta_file_val", c, restricted=False)
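A minimal sketch of the renamed dataset-config fields; the paths are placeholders and the snippet only illustrates the new field names, not a full training setup:

```python
# `name` is now `formatter`, and `dataset_name` is a new, separate identifier for the dataset.
from TTS.config.shared_configs import BaseDatasetConfig

dataset_config = BaseDatasetConfig(
    formatter="ljspeech",            # was `name` before this change
    dataset_name="ljspeech",         # new field: unique name for this dataset instance
    path="/path/to/LJSpeech-1.1/",   # placeholder
    meta_file_train="metadata.csv",
    language="en-us",
)
print(dataset_config.formatter, dataset_config.dataset_name)
```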
@@ -107,11 +107,18 @@ class BaseEncoder(nn.Module):
         return criterion

     def load_checkpoint(
-        self, config: Coqpit, checkpoint_path: str, eval: bool = False, use_cuda: bool = False, criterion=None
+        self,
+        config: Coqpit,
+        checkpoint_path: str,
+        eval: bool = False,
+        use_cuda: bool = False,
+        criterion=None,
+        cache=False,
     ):
-        state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"))
+        state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
         try:
             self.load_state_dict(state["model"])
+            print(" > Model fully restored. ")
         except (KeyError, RuntimeError) as error:
             # If eval raise the error
             if eval:
@@ -44,13 +44,16 @@ class BaseTrainerModel(TrainerModel):
         return outputs_dict

     @abstractmethod
-    def load_checkpoint(self, config: Coqpit, checkpoint_path: str, eval: bool = False, strict: bool = True) -> None:
+    def load_checkpoint(
+        self, config: Coqpit, checkpoint_path: str, eval: bool = False, strict: bool = True, cache=False
+    ) -> None:
         """Load a model checkpoint gile and get ready for training or inference.

         Args:
             config (Coqpit): Model configuration.
             checkpoint_path (str): Path to the model checkpoint file.
             eval (bool, optional): If true, init model for inference else for training. Defaults to False.
-            strcit (bool, optional): Match all checkpoint keys to model's keys. Defaults to True.
+            strict (bool, optional): Match all checkpoint keys to model's keys. Defaults to True.
+            cache (bool, optional): If True, cache the file locally for subsequent calls. It is cached under `get_user_data_dir()/tts_cache`. Defaults to False.
         """
         ...
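For illustration, the fsspec-backed loader that the encoder above now calls with `cache=cache` can be exercised directly. This is a sketch under a few assumptions: `load_fsspec` lives in `TTS.utils.io`, the released speaker-encoder checkpoint URL (the default used elsewhere in this commit) is reachable, and the archive loads as a regular torch checkpoint.

```python
# Sketch: load a remote checkpoint once and keep a local copy for later calls.
# With cache=True the file is kept locally (tts_cache under the user data dir),
# as the docstring above describes.
import torch
from TTS.utils.io import load_fsspec

state = load_fsspec(
    "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar",
    map_location=torch.device("cpu"),
    cache=True,
)
print(list(state.keys()))  # expected to include "model" with the state_dict
```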
@@ -5,6 +5,7 @@ import json
 import os
 import sys
 from pathlib import Path
+from threading import Lock
 from typing import Union

 from flask import Flask, render_template, request, send_file
@@ -146,7 +147,7 @@ def index():
         "index.html",
         show_details=args.show_details,
         use_multi_speaker=use_multi_speaker,
-        speaker_ids=speaker_manager.ids if speaker_manager is not None else None,
+        speaker_ids=speaker_manager.name_to_id if speaker_manager is not None else None,
         use_gst=use_gst,
     )

@@ -168,17 +169,21 @@ def details():
     )


+lock = Lock()
+
+
 @app.route("/api/tts", methods=["GET"])
 def tts():
-    text = request.args.get("text")
-    speaker_idx = request.args.get("speaker_id", "")
-    style_wav = request.args.get("style_wav", "")
-    style_wav = style_wav_uri_to_dict(style_wav)
-    print(" > Model input: {}".format(text))
-    print(" > Speaker Idx: {}".format(speaker_idx))
-    wavs = synthesizer.tts(text, speaker_name=speaker_idx, style_wav=style_wav)
-    out = io.BytesIO()
-    synthesizer.save_wav(wavs, out)
+    with lock:
+        text = request.args.get("text")
+        speaker_idx = request.args.get("speaker_id", "")
+        style_wav = request.args.get("style_wav", "")
+        style_wav = style_wav_uri_to_dict(style_wav)
+        print(" > Model input: {}".format(text))
+        print(" > Speaker Idx: {}".format(speaker_idx))
+        wavs = synthesizer.tts(text, speaker_name=speaker_idx, style_wav=style_wav)
+        out = io.BytesIO()
+        synthesizer.save_wav(wavs, out)
     return send_file(out, mimetype="audio/wav")

@ -1,3 +1,4 @@
|
||||||
|
import os
|
||||||
import sys
|
import sys
|
||||||
from collections import Counter
|
from collections import Counter
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
@ -12,20 +13,16 @@ from TTS.tts.datasets.formatters import *
|
||||||
def split_dataset(items, eval_split_max_size=None, eval_split_size=0.01):
|
def split_dataset(items, eval_split_max_size=None, eval_split_size=0.01):
|
||||||
"""Split a dataset into train and eval. Consider speaker distribution in multi-speaker training.
|
"""Split a dataset into train and eval. Consider speaker distribution in multi-speaker training.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
<<<<<<< HEAD
|
items (List[List]):
|
||||||
items (List[List]):
|
A list of samples. Each sample is a list of `[audio_path, text, speaker_id]`.
|
||||||
A list of samples. Each sample is a list of `[audio_path, text, speaker_id]`.
|
|
||||||
|
|
||||||
eval_split_max_size (int):
|
eval_split_max_size (int):
|
||||||
Maximum number of samples to be used for evaluation in the proportion split. Defaults to None (Disabled).
|
Maximum number of samples to be used for evaluation in the proportion split. Defaults to None (Disabled).
|
||||||
|
|
||||||
eval_split_size (float):
|
eval_split_size (float):
|
||||||
If between 0.0 and 1.0, represents the proportion of the dataset to include in the evaluation set.
|
If between 0.0 and 1.0, represents the proportion of the dataset to include in the evaluation set.
|
||||||
If > 1, represents the absolute number of evaluation samples. Defaults to 0.01 (1%).
|
If > 1, represents the absolute number of evaluation samples. Defaults to 0.01 (1%).
|
||||||
=======
|
|
||||||
items (List[List]): A list of samples. Each sample is a list of `[text, audio_path, speaker_id]`.
|
|
||||||
>>>>>>> Fix docstring
|
|
||||||
"""
|
"""
|
||||||
speakers = [item["speaker_name"] for item in items]
|
speakers = [item["speaker_name"] for item in items]
|
||||||
is_multi_speaker = len(set(speakers)) > 1
|
is_multi_speaker = len(set(speakers)) > 1
|
||||||
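A quick illustration of the `eval_split_size` semantics documented above (hypothetical sample counts, sketching the intended behaviour only):

    # 1000 samples, eval_split_size=0.01  -> proportional split, about 10 eval samples
    # 1000 samples, eval_split_size=50    -> absolute split, exactly 50 eval samples
    # eval_split_max_size=20 would additionally cap the proportional eval set at 20 samples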
|
@ -59,6 +56,17 @@ def split_dataset(items, eval_split_max_size=None, eval_split_size=0.01):
|
||||||
return items[:eval_split_size], items[eval_split_size:]
|
return items[:eval_split_size], items[eval_split_size:]
|
||||||
|
|
||||||
|
|
||||||
|
def add_extra_keys(metadata, language, dataset_name):
|
||||||
|
for item in metadata:
|
||||||
|
# add language name
|
||||||
|
item["language"] = language
|
||||||
|
# add unique audio name
|
||||||
|
relfilepath = os.path.splitext(os.path.relpath(item["audio_file"], item["root_path"]))[0]
|
||||||
|
audio_unique_name = f"{dataset_name}#{relfilepath}"
|
||||||
|
item["audio_unique_name"] = audio_unique_name
|
||||||
|
return metadata
|
||||||
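For illustration, a minimal sketch of the `language` and `audio_unique_name` keys that `add_extra_keys` attaches to each sample (paths and names below are hypothetical, POSIX separators assumed):

    item = {"audio_file": "/data/LJSpeech/wavs/LJ001-0001.wav",
            "root_path": "/data/LJSpeech", "text": "some text.", "speaker_name": "ljspeech"}
    add_extra_keys([item], language="en", dataset_name="ljspeech")
    # item["language"]          == "en"
    # item["audio_unique_name"] == "ljspeech#wavs/LJ001-0001"   (relative path without extension)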
|
|
||||||
|
|
||||||
def load_tts_samples(
|
def load_tts_samples(
|
||||||
datasets: Union[List[Dict], Dict],
|
datasets: Union[List[Dict], Dict],
|
||||||
eval_split=True,
|
eval_split=True,
|
||||||
|
@ -97,7 +105,8 @@ def load_tts_samples(
|
||||||
if not isinstance(datasets, list):
|
if not isinstance(datasets, list):
|
||||||
datasets = [datasets]
|
datasets = [datasets]
|
||||||
for dataset in datasets:
|
for dataset in datasets:
|
||||||
name = dataset["name"]
|
formatter_name = dataset["formatter"]
|
||||||
|
dataset_name = dataset["dataset_name"]
|
||||||
root_path = dataset["path"]
|
root_path = dataset["path"]
|
||||||
meta_file_train = dataset["meta_file_train"]
|
meta_file_train = dataset["meta_file_train"]
|
||||||
meta_file_val = dataset["meta_file_val"]
|
meta_file_val = dataset["meta_file_val"]
|
||||||
|
@ -106,17 +115,19 @@ def load_tts_samples(
|
||||||
|
|
||||||
# setup the right data processor
|
# setup the right data processor
|
||||||
if formatter is None:
|
if formatter is None:
|
||||||
formatter = _get_formatter_by_name(name)
|
formatter = _get_formatter_by_name(formatter_name)
|
||||||
# load train set
|
# load train set
|
||||||
meta_data_train = formatter(root_path, meta_file_train, ignored_speakers=ignored_speakers)
|
meta_data_train = formatter(root_path, meta_file_train, ignored_speakers=ignored_speakers)
|
||||||
meta_data_train = [{**item, **{"language": language}} for item in meta_data_train]
|
assert len(meta_data_train) > 0, f" [!] No training samples found in {root_path}/{meta_file_train}"
|
||||||
|
|
||||||
|
meta_data_train = add_extra_keys(meta_data_train, language, dataset_name)
|
||||||
|
|
||||||
print(f" | > Found {len(meta_data_train)} files in {Path(root_path).resolve()}")
|
print(f" | > Found {len(meta_data_train)} files in {Path(root_path).resolve()}")
|
||||||
# load evaluation split if set
|
# load evaluation split if set
|
||||||
if eval_split:
|
if eval_split:
|
||||||
if meta_file_val:
|
if meta_file_val:
|
||||||
meta_data_eval = formatter(root_path, meta_file_val, ignored_speakers=ignored_speakers)
|
meta_data_eval = formatter(root_path, meta_file_val, ignored_speakers=ignored_speakers)
|
||||||
meta_data_eval = [{**item, **{"language": language}} for item in meta_data_eval]
|
meta_data_eval = add_extra_keys(meta_data_eval, language, dataset_name)
|
||||||
else:
|
else:
|
||||||
meta_data_eval, meta_data_train = split_dataset(meta_data_train, eval_split_max_size, eval_split_size)
|
meta_data_eval, meta_data_train = split_dataset(meta_data_train, eval_split_max_size, eval_split_size)
|
||||||
meta_data_eval_all += meta_data_eval
|
meta_data_eval_all += meta_data_eval
|
||||||
|
|
|
@ -1,3 +1,4 @@
|
||||||
|
import base64
|
||||||
import collections
|
import collections
|
||||||
import os
|
import os
|
||||||
import random
|
import random
|
||||||
|
@ -34,6 +35,12 @@ def noise_augment_audio(wav):
|
||||||
return wav + (1.0 / 32768.0) * np.random.rand(*wav.shape)
|
return wav + (1.0 / 32768.0) * np.random.rand(*wav.shape)
|
||||||
|
|
||||||
|
|
||||||
|
def string2filename(string):
|
||||||
|
# generate a safe and reversible filename based on a string
|
||||||
|
filename = base64.urlsafe_b64encode(string.encode("utf-8")).decode("utf-8", "ignore")
|
||||||
|
return filename
|
||||||
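A minimal round-trip sketch of `string2filename` (the key is hypothetical); since the filename is plain URL-safe base64, it decodes back to the original key:

    >>> import base64
    >>> name = string2filename("ljspeech#wavs/LJ001-0001")
    >>> base64.urlsafe_b64decode(name).decode("utf-8")
    'ljspeech#wavs/LJ001-0001'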
|
|
||||||
|
|
||||||
class TTSDataset(Dataset):
|
class TTSDataset(Dataset):
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
|
@ -201,7 +208,7 @@ class TTSDataset(Dataset):
|
||||||
def get_f0(self, idx):
|
def get_f0(self, idx):
|
||||||
out_dict = self.f0_dataset[idx]
|
out_dict = self.f0_dataset[idx]
|
||||||
item = self.samples[idx]
|
item = self.samples[idx]
|
||||||
assert item["audio_file"] == out_dict["audio_file"]
|
assert item["audio_unique_name"] == out_dict["audio_unique_name"]
|
||||||
return out_dict
|
return out_dict
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
|
@ -256,6 +263,7 @@ class TTSDataset(Dataset):
|
||||||
"speaker_name": item["speaker_name"],
|
"speaker_name": item["speaker_name"],
|
||||||
"language_name": item["language"],
|
"language_name": item["language"],
|
||||||
"wav_file_name": os.path.basename(item["audio_file"]),
|
"wav_file_name": os.path.basename(item["audio_file"]),
|
||||||
|
"audio_unique_name": item["audio_unique_name"],
|
||||||
}
|
}
|
||||||
return sample
|
return sample
|
||||||
|
|
||||||
|
@ -397,8 +405,8 @@ class TTSDataset(Dataset):
|
||||||
language_ids = None
|
language_ids = None
|
||||||
# get pre-computed d-vectors
|
# get pre-computed d-vectors
|
||||||
if self.d_vector_mapping is not None:
|
if self.d_vector_mapping is not None:
|
||||||
wav_files_names = list(batch["wav_file_name"])
|
embedding_keys = list(batch["audio_unique_name"])
|
||||||
d_vectors = [self.d_vector_mapping[w]["embedding"] for w in wav_files_names]
|
d_vectors = [self.d_vector_mapping[w]["embedding"] for w in embedding_keys]
|
||||||
else:
|
else:
|
||||||
d_vectors = None
|
d_vectors = None
|
||||||
|
|
||||||
|
@ -560,19 +568,18 @@ class PhonemeDataset(Dataset):
|
||||||
|
|
||||||
def __getitem__(self, index):
|
def __getitem__(self, index):
|
||||||
item = self.samples[index]
|
item = self.samples[index]
|
||||||
ids = self.compute_or_load(item["audio_file"], item["text"])
|
ids = self.compute_or_load(string2filename(item["audio_unique_name"]), item["text"])
|
||||||
ph_hat = self.tokenizer.ids_to_text(ids)
|
ph_hat = self.tokenizer.ids_to_text(ids)
|
||||||
return {"text": item["text"], "ph_hat": ph_hat, "token_ids": ids, "token_ids_len": len(ids)}
|
return {"text": item["text"], "ph_hat": ph_hat, "token_ids": ids, "token_ids_len": len(ids)}
|
||||||
|
|
||||||
def __len__(self):
|
def __len__(self):
|
||||||
return len(self.samples)
|
return len(self.samples)
|
||||||
|
|
||||||
def compute_or_load(self, wav_file, text):
|
def compute_or_load(self, file_name, text):
|
||||||
"""Compute phonemes for the given text.
|
"""Compute phonemes for the given text.
|
||||||
|
|
||||||
If the phonemes are already cached, load them from cache.
|
If the phonemes are already cached, load them from cache.
|
||||||
"""
|
"""
|
||||||
file_name = os.path.splitext(os.path.basename(wav_file))[0]
|
|
||||||
file_ext = "_phoneme.npy"
|
file_ext = "_phoneme.npy"
|
||||||
cache_path = os.path.join(self.cache_path, file_name + file_ext)
|
cache_path = os.path.join(self.cache_path, file_name + file_ext)
|
||||||
try:
|
try:
|
||||||
|
@ -669,11 +676,11 @@ class F0Dataset:
|
||||||
|
|
||||||
def __getitem__(self, idx):
|
def __getitem__(self, idx):
|
||||||
item = self.samples[idx]
|
item = self.samples[idx]
|
||||||
f0 = self.compute_or_load(item["audio_file"])
|
f0 = self.compute_or_load(item["audio_file"], string2filename(item["audio_unique_name"]))
|
||||||
if self.normalize_f0:
|
if self.normalize_f0:
|
||||||
assert self.mean is not None and self.std is not None, " [!] Mean and STD is not available"
|
assert self.mean is not None and self.std is not None, " [!] Mean and STD is not available"
|
||||||
f0 = self.normalize(f0)
|
f0 = self.normalize(f0)
|
||||||
return {"audio_file": item["audio_file"], "f0": f0}
|
return {"audio_unique_name": item["audio_unique_name"], "f0": f0}
|
||||||
|
|
||||||
def __len__(self):
|
def __len__(self):
|
||||||
return len(self.samples)
|
return len(self.samples)
|
||||||
|
@ -705,8 +712,7 @@ class F0Dataset:
|
||||||
return self.pad_id
|
return self.pad_id
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def create_pitch_file_path(wav_file, cache_path):
|
def create_pitch_file_path(file_name, cache_path):
|
||||||
file_name = os.path.splitext(os.path.basename(wav_file))[0]
|
|
||||||
pitch_file = os.path.join(cache_path, file_name + "_pitch.npy")
|
pitch_file = os.path.join(cache_path, file_name + "_pitch.npy")
|
||||||
return pitch_file
|
return pitch_file
|
||||||
|
|
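A sketch of the pitch-cache path produced now that pitch files are keyed by the base64-encoded `audio_unique_name` rather than the wav basename (cache directory and key are hypothetical):

    file_name = string2filename("ljspeech#wavs/LJ001-0001")  # URL-safe base64 of the unique name
    F0Dataset.create_pitch_file_path(file_name, "/tmp/f0_cache")
    # -> "/tmp/f0_cache/" + file_name + "_pitch.npy"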
||||||
|
@ -744,11 +750,11 @@ class F0Dataset:
|
||||||
pitch[zero_idxs] = 0.0
|
pitch[zero_idxs] = 0.0
|
||||||
return pitch
|
return pitch
|
||||||
|
|
||||||
def compute_or_load(self, wav_file):
|
def compute_or_load(self, wav_file, audio_unique_name):
|
||||||
"""
|
"""
|
||||||
compute pitch and return a numpy array of pitch values
|
compute pitch and return a numpy array of pitch values
|
||||||
"""
|
"""
|
||||||
pitch_file = self.create_pitch_file_path(wav_file, self.cache_path)
|
pitch_file = self.create_pitch_file_path(audio_unique_name, self.cache_path)
|
||||||
if not os.path.exists(pitch_file):
|
if not os.path.exists(pitch_file):
|
||||||
pitch = self._compute_and_save_pitch(self.ap, wav_file, pitch_file)
|
pitch = self._compute_and_save_pitch(self.ap, wav_file, pitch_file)
|
||||||
else:
|
else:
|
||||||
|
@ -756,14 +762,14 @@ class F0Dataset:
|
||||||
return pitch.astype(np.float32)
|
return pitch.astype(np.float32)
|
||||||
|
|
||||||
def collate_fn(self, batch):
|
def collate_fn(self, batch):
|
||||||
audio_file = [item["audio_file"] for item in batch]
|
audio_unique_name = [item["audio_unique_name"] for item in batch]
|
||||||
f0s = [item["f0"] for item in batch]
|
f0s = [item["f0"] for item in batch]
|
||||||
f0_lens = [len(item["f0"]) for item in batch]
|
f0_lens = [len(item["f0"]) for item in batch]
|
||||||
f0_lens_max = max(f0_lens)
|
f0_lens_max = max(f0_lens)
|
||||||
f0s_torch = torch.LongTensor(len(f0s), f0_lens_max).fill_(self.get_pad_id())
|
f0s_torch = torch.LongTensor(len(f0s), f0_lens_max).fill_(self.get_pad_id())
|
||||||
for i, f0_len in enumerate(f0_lens):
|
for i, f0_len in enumerate(f0_lens):
|
||||||
f0s_torch[i, :f0_len] = torch.LongTensor(f0s[i])
|
f0s_torch[i, :f0_len] = torch.LongTensor(f0s[i])
|
||||||
return {"audio_file": audio_file, "f0": f0s_torch, "f0_lens": f0_lens}
|
return {"audio_unique_name": audio_unique_name, "f0": f0s_torch, "f0_lens": f0_lens}
|
||||||
|
|
||||||
def print_logs(self, level: int = 0) -> None:
|
def print_logs(self, level: int = 0) -> None:
|
||||||
indent = "\t" * level
|
indent = "\t" * level
|
||||||
|
|
|
@ -15,6 +15,15 @@ from tqdm import tqdm
|
||||||
|
|
||||||
def coqui(root_path, meta_file, ignored_speakers=None):
|
def coqui(root_path, meta_file, ignored_speakers=None):
|
||||||
"""Interal dataset formatter."""
|
"""Interal dataset formatter."""
|
||||||
|
filepath = os.path.join(root_path, meta_file)
|
||||||
|
# ensure there are 4 columns for every line
|
||||||
|
with open(filepath, "r", encoding="utf8") as f:
|
||||||
|
lines = f.readlines()
|
||||||
|
num_cols = len(lines[0].split("|")) # take the first row as reference
|
||||||
|
for idx, line in enumerate(lines[1:]):
|
||||||
|
if len(line.split("|")) != num_cols:
|
||||||
|
print(f" > Missing column in line {idx + 1} -> {line.strip()}")
|
||||||
|
# load metadata
|
||||||
metadata = pd.read_csv(os.path.join(root_path, meta_file), sep="|")
|
metadata = pd.read_csv(os.path.join(root_path, meta_file), sep="|")
|
||||||
assert all(x in metadata.columns for x in ["audio_file", "text"])
|
assert all(x in metadata.columns for x in ["audio_file", "text"])
|
||||||
speaker_name = None if "speaker_name" in metadata.columns else "coqui"
|
speaker_name = None if "speaker_name" in metadata.columns else "coqui"
|
||||||
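For reference, a hypothetical `metadata.csv` in the pipe-separated layout this formatter reads (`audio_file` and `text` columns are required, `speaker_name` is optional):

    audio_file|text|speaker_name
    wavs/clip_0001.wav|Hello there.|speaker_a
    wavs/clip_0002.wav|This row is missing its speaker column

The second data row has only two columns, so the new column check above would print a missing-column warning for it before `pd.read_csv` loads the file.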
|
@ -97,9 +106,9 @@ def mailabs(root_path, meta_files=None, ignored_speakers=None):
|
||||||
meta_files (str): list of meta files to be used in the training. If None, finds all the csv files
|
meta_files (str): list of meta files to be used in the training. If None, finds all the csv files
|
||||||
recursively. Defaults to None
|
recursively. Defaults to None
|
||||||
"""
|
"""
|
||||||
speaker_regex = re.compile("by_book/(male|female)/(?P<speaker_name>[^/]+)/")
|
speaker_regex = re.compile(f"by_book{os.sep}(male|female){os.sep}(?P<speaker_name>[^{os.sep}]+){os.sep}")
|
||||||
if not meta_files:
|
if not meta_files:
|
||||||
csv_files = glob(root_path + "/**/metadata.csv", recursive=True)
|
csv_files = glob(root_path + f"{os.sep}**{os.sep}metadata.csv", recursive=True)
|
||||||
else:
|
else:
|
||||||
csv_files = meta_files
|
csv_files = meta_files
|
||||||
|
|
||||||
|
@ -578,3 +587,17 @@ def kokoro(root_path, meta_file, **kwargs): # pylint: disable=unused-argument
|
||||||
text = cols[2].replace(" ", "")
|
text = cols[2].replace(" ", "")
|
||||||
items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path})
|
items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path})
|
||||||
return items
|
return items
|
||||||
|
|
||||||
|
|
||||||
|
def kss(root_path, meta_file, **kwargs): # pylint: disable=unused-argument
|
||||||
|
"""Korean single-speaker dataset from https://www.kaggle.com/datasets/bryanpark/korean-single-speaker-speech-dataset"""
|
||||||
|
txt_file = os.path.join(root_path, meta_file)
|
||||||
|
items = []
|
||||||
|
speaker_name = "kss"
|
||||||
|
with open(txt_file, "r", encoding="utf-8") as ttf:
|
||||||
|
for line in ttf:
|
||||||
|
cols = line.split("|")
|
||||||
|
wav_file = os.path.join(root_path, cols[0])
|
||||||
|
text = cols[2] # cols[1] => 6월, cols[2] => 유월
|
||||||
|
items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name})
|
||||||
|
return items
|
||||||
|
|
|
@ -398,9 +398,9 @@ class AlignTTS(BaseTTS):
|
||||||
logger.eval_audios(steps, audios, self.ap.sample_rate)
|
logger.eval_audios(steps, audios, self.ap.sample_rate)
|
||||||
|
|
||||||
def load_checkpoint(
|
def load_checkpoint(
|
||||||
self, config, checkpoint_path, eval=False
|
self, config, checkpoint_path, eval=False, cache=False
|
||||||
): # pylint: disable=unused-argument, redefined-builtin
|
): # pylint: disable=unused-argument, redefined-builtin
|
||||||
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"))
|
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
|
||||||
self.load_state_dict(state["model"])
|
self.load_state_dict(state["model"])
|
||||||
if eval:
|
if eval:
|
||||||
self.eval()
|
self.eval()
|
||||||
|
|
|
@ -92,16 +92,17 @@ class BaseTacotron(BaseTTS):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def load_checkpoint(
|
def load_checkpoint(
|
||||||
self, config, checkpoint_path, eval=False
|
self, config, checkpoint_path, eval=False, cache=False
|
||||||
): # pylint: disable=unused-argument, redefined-builtin
|
): # pylint: disable=unused-argument, redefined-builtin
|
||||||
"""Load model checkpoint and set up internals.
|
"""Load model checkpoint and set up internals.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
config (Coqpit): model configuration.
|
config (Coqpit): model configuration.
|
||||||
checkpoint_path (str): path to checkpoint file.
|
checkpoint_path (str): path to checkpoint file.
|
||||||
eval (bool): whether to load model for evaluation.
|
eval (bool, optional): whether to load model for evaluation.
|
||||||
|
cache (bool, optional): If True, cache the file locally for subsequent calls. It is cached under `get_user_data_dir()/tts_cache`. Defaults to False.
|
||||||
"""
|
"""
|
||||||
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"))
|
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
|
||||||
self.load_state_dict(state["model"])
|
self.load_state_dict(state["model"])
|
||||||
# TODO: set r in run-time by taking it from the new config
|
# TODO: set r in run-time by taking it from the new config
|
||||||
if "r" in state:
|
if "r" in state:
|
||||||
|
|
|
@ -144,11 +144,11 @@ class BaseTTS(BaseTrainerModel):
|
||||||
if speaker_name is None:
|
if speaker_name is None:
|
||||||
speaker_id = self.speaker_manager.get_random_id()
|
speaker_id = self.speaker_manager.get_random_id()
|
||||||
else:
|
else:
|
||||||
speaker_id = self.speaker_manager.ids[speaker_name]
|
speaker_id = self.speaker_manager.name_to_id[speaker_name]
|
||||||
|
|
||||||
# get language id
|
# get language id
|
||||||
if hasattr(self, "language_manager") and config.use_language_embedding and language_name is not None:
|
if hasattr(self, "language_manager") and config.use_language_embedding and language_name is not None:
|
||||||
language_id = self.language_manager.ids[language_name]
|
language_id = self.language_manager.name_to_id[language_name]
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"text": text,
|
"text": text,
|
||||||
|
@ -288,11 +288,13 @@ class BaseTTS(BaseTrainerModel):
|
||||||
# setup multi-speaker attributes
|
# setup multi-speaker attributes
|
||||||
if hasattr(self, "speaker_manager") and self.speaker_manager is not None:
|
if hasattr(self, "speaker_manager") and self.speaker_manager is not None:
|
||||||
if hasattr(config, "model_args"):
|
if hasattr(config, "model_args"):
|
||||||
speaker_id_mapping = self.speaker_manager.ids if config.model_args.use_speaker_embedding else None
|
speaker_id_mapping = (
|
||||||
|
self.speaker_manager.name_to_id if config.model_args.use_speaker_embedding else None
|
||||||
|
)
|
||||||
d_vector_mapping = self.speaker_manager.embeddings if config.model_args.use_d_vector_file else None
|
d_vector_mapping = self.speaker_manager.embeddings if config.model_args.use_d_vector_file else None
|
||||||
config.use_d_vector_file = config.model_args.use_d_vector_file
|
config.use_d_vector_file = config.model_args.use_d_vector_file
|
||||||
else:
|
else:
|
||||||
speaker_id_mapping = self.speaker_manager.ids if config.use_speaker_embedding else None
|
speaker_id_mapping = self.speaker_manager.name_to_id if config.use_speaker_embedding else None
|
||||||
d_vector_mapping = self.speaker_manager.embeddings if config.use_d_vector_file else None
|
d_vector_mapping = self.speaker_manager.embeddings if config.use_d_vector_file else None
|
||||||
else:
|
else:
|
||||||
speaker_id_mapping = None
|
speaker_id_mapping = None
|
||||||
|
@ -300,7 +302,7 @@ class BaseTTS(BaseTrainerModel):
|
||||||
|
|
||||||
# setup multi-lingual attributes
|
# setup multi-lingual attributes
|
||||||
if hasattr(self, "language_manager") and self.language_manager is not None:
|
if hasattr(self, "language_manager") and self.language_manager is not None:
|
||||||
language_id_mapping = self.language_manager.ids if self.args.use_language_embedding else None
|
language_id_mapping = self.language_manager.name_to_id if self.args.use_language_embedding else None
|
||||||
else:
|
else:
|
||||||
language_id_mapping = None
|
language_id_mapping = None
|
||||||
|
|
||||||
|
@ -342,7 +344,7 @@ class BaseTTS(BaseTrainerModel):
|
||||||
loader = DataLoader(
|
loader = DataLoader(
|
||||||
dataset,
|
dataset,
|
||||||
batch_size=config.eval_batch_size if is_eval else config.batch_size,
|
batch_size=config.eval_batch_size if is_eval else config.batch_size,
|
||||||
shuffle=False, # shuffle is done in the dataset.
|
shuffle=True, # if there is no other sampler
|
||||||
collate_fn=dataset.collate_fn,
|
collate_fn=dataset.collate_fn,
|
||||||
drop_last=False, # setting this False might cause issues in AMP training.
|
drop_last=False, # setting this False might cause issues in AMP training.
|
||||||
sampler=sampler,
|
sampler=sampler,
|
||||||
|
@ -363,7 +365,7 @@ class BaseTTS(BaseTrainerModel):
|
||||||
aux_inputs = {
|
aux_inputs = {
|
||||||
"speaker_id": None
|
"speaker_id": None
|
||||||
if not self.config.use_speaker_embedding
|
if not self.config.use_speaker_embedding
|
||||||
else random.sample(sorted(self.speaker_manager.ids.values()), 1),
|
else random.sample(sorted(self.speaker_manager.name_to_id.values()), 1),
|
||||||
"d_vector": d_vector,
|
"d_vector": d_vector,
|
||||||
"style_wav": None, # TODO: handle GST style input
|
"style_wav": None, # TODO: handle GST style input
|
||||||
}
|
}
|
||||||
|
|
|
@ -16,6 +16,7 @@ from TTS.tts.utils.helpers import average_over_durations, generate_path, maximum
|
||||||
from TTS.tts.utils.speakers import SpeakerManager
|
from TTS.tts.utils.speakers import SpeakerManager
|
||||||
from TTS.tts.utils.text.tokenizer import TTSTokenizer
|
from TTS.tts.utils.text.tokenizer import TTSTokenizer
|
||||||
from TTS.tts.utils.visual import plot_alignment, plot_avg_pitch, plot_spectrogram
|
from TTS.tts.utils.visual import plot_alignment, plot_avg_pitch, plot_spectrogram
|
||||||
|
from TTS.utils.io import load_fsspec
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
|
@ -707,9 +708,9 @@ class ForwardTTS(BaseTTS):
|
||||||
logger.eval_audios(steps, audios, self.ap.sample_rate)
|
logger.eval_audios(steps, audios, self.ap.sample_rate)
|
||||||
|
|
||||||
def load_checkpoint(
|
def load_checkpoint(
|
||||||
self, config, checkpoint_path, eval=False
|
self, config, checkpoint_path, eval=False, cache=False
|
||||||
): # pylint: disable=unused-argument, redefined-builtin
|
): # pylint: disable=unused-argument, redefined-builtin
|
||||||
state = torch.load(checkpoint_path, map_location=torch.device("cpu"))
|
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
|
||||||
self.load_state_dict(state["model"])
|
self.load_state_dict(state["model"])
|
||||||
if eval:
|
if eval:
|
||||||
self.eval()
|
self.eval()
|
||||||
|
|
|
@ -284,6 +284,7 @@ class VitsDataset(TTSDataset):
|
||||||
"wav_file": wav_filename,
|
"wav_file": wav_filename,
|
||||||
"speaker_name": item["speaker_name"],
|
"speaker_name": item["speaker_name"],
|
||||||
"language_name": item["language"],
|
"language_name": item["language"],
|
||||||
|
"audio_unique_name": item["audio_unique_name"],
|
||||||
}
|
}
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
@ -308,6 +309,7 @@ class VitsDataset(TTSDataset):
|
||||||
- language_names: :math:`[B]`
|
- language_names: :math:`[B]`
|
||||||
- audiofile_paths: :math:`[B]`
|
- audiofile_paths: :math:`[B]`
|
||||||
- raw_texts: :math:`[B]`
|
- raw_texts: :math:`[B]`
|
||||||
|
- audio_unique_names: :math:`[B]`
|
||||||
"""
|
"""
|
||||||
# convert list of dicts to dict of lists
|
# convert list of dicts to dict of lists
|
||||||
B = len(batch)
|
B = len(batch)
|
||||||
|
@ -348,6 +350,7 @@ class VitsDataset(TTSDataset):
|
||||||
"language_names": batch["language_name"],
|
"language_names": batch["language_name"],
|
||||||
"audio_files": batch["wav_file"],
|
"audio_files": batch["wav_file"],
|
||||||
"raw_text": batch["raw_text"],
|
"raw_text": batch["raw_text"],
|
||||||
|
"audio_unique_names": batch["audio_unique_name"],
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -718,6 +721,10 @@ class Vits(BaseTTS):
|
||||||
use_spectral_norm=self.args.use_spectral_norm_disriminator,
|
use_spectral_norm=self.args.use_spectral_norm_disriminator,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@property
|
||||||
|
def device(self):
|
||||||
|
return next(self.parameters()).device
|
||||||
|
|
||||||
def init_multispeaker(self, config: Coqpit):
|
def init_multispeaker(self, config: Coqpit):
|
||||||
"""Initialize multi-speaker modules of a model. A model can be trained either with a speaker embedding layer
|
"""Initialize multi-speaker modules of a model. A model can be trained either with a speaker embedding layer
|
||||||
or with external `d_vectors` computed from a speaker encoder model.
|
or with external `d_vectors` computed from a speaker encoder model.
|
||||||
|
@ -755,17 +762,12 @@ class Vits(BaseTTS):
|
||||||
|
|
||||||
if (
|
if (
|
||||||
hasattr(self.speaker_manager.encoder, "audio_config")
|
hasattr(self.speaker_manager.encoder, "audio_config")
|
||||||
and self.config.audio["sample_rate"] != self.speaker_manager.encoder.audio_config["sample_rate"]
|
and self.config.audio.sample_rate != self.speaker_manager.encoder.audio_config["sample_rate"]
|
||||||
):
|
):
|
||||||
self.audio_transform = torchaudio.transforms.Resample(
|
self.audio_transform = torchaudio.transforms.Resample(
|
||||||
orig_freq=self.audio_config["sample_rate"],
|
orig_freq=self.config.audio.sample_rate,
|
||||||
new_freq=self.speaker_manager.encoder.audio_config["sample_rate"],
|
new_freq=self.speaker_manager.encoder.audio_config["sample_rate"],
|
||||||
)
|
)
|
||||||
# pylint: disable=W0101,W0105
|
|
||||||
self.audio_transform = torchaudio.transforms.Resample(
|
|
||||||
orig_freq=self.config.audio.sample_rate,
|
|
||||||
new_freq=self.speaker_manager.encoder.audio_config["sample_rate"],
|
|
||||||
)
|
|
||||||
|
|
||||||
def _init_speaker_embedding(self):
|
def _init_speaker_embedding(self):
|
||||||
# pylint: disable=attribute-defined-outside-init
|
# pylint: disable=attribute-defined-outside-init
|
||||||
|
@ -808,6 +810,13 @@ class Vits(BaseTTS):
|
||||||
orig_freq=self.config.audio["sample_rate"], new_freq=self.args.encoder_sample_rate
|
orig_freq=self.config.audio["sample_rate"], new_freq=self.args.encoder_sample_rate
|
||||||
) # pylint: disable=W0201
|
) # pylint: disable=W0201
|
||||||
|
|
||||||
|
def on_epoch_start(self, trainer): # pylint: disable=W0613
|
||||||
|
"""Freeze layers at the beginning of an epoch"""
|
||||||
|
self._freeze_layers()
|
||||||
|
# set the device of speaker encoder
|
||||||
|
if self.args.use_speaker_encoder_as_loss:
|
||||||
|
self.speaker_manager.encoder = self.speaker_manager.encoder.to(self.device)
|
||||||
|
|
||||||
def on_init_end(self, trainer): # pylint: disable=W0613
|
def on_init_end(self, trainer): # pylint: disable=W0613
|
||||||
"""Reinit layes if needed"""
|
"""Reinit layes if needed"""
|
||||||
if self.args.reinit_DP:
|
if self.args.reinit_DP:
|
||||||
|
@ -1185,7 +1194,6 @@ class Vits(BaseTTS):
|
||||||
y_lengths = torch.tensor([y.size(-1)]).to(y.device)
|
y_lengths = torch.tensor([y.size(-1)]).to(y.device)
|
||||||
speaker_cond_src = reference_speaker_id if reference_speaker_id is not None else reference_d_vector
|
speaker_cond_src = reference_speaker_id if reference_speaker_id is not None else reference_d_vector
|
||||||
speaker_cond_tgt = speaker_id if speaker_id is not None else d_vector
|
speaker_cond_tgt = speaker_id if speaker_id is not None else d_vector
|
||||||
# print(y.shape, y_lengths.shape)
|
|
||||||
wav, _, _ = self.voice_conversion(y, y_lengths, speaker_cond_src, speaker_cond_tgt)
|
wav, _, _ = self.voice_conversion(y, y_lengths, speaker_cond_src, speaker_cond_tgt)
|
||||||
return wav
|
return wav
|
||||||
|
|
||||||
|
@ -1229,8 +1237,6 @@ class Vits(BaseTTS):
|
||||||
Tuple[Dict, Dict]: Model outputs and computed losses.
|
Tuple[Dict, Dict]: Model outputs and computed losses.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
self._freeze_layers()
|
|
||||||
|
|
||||||
spec_lens = batch["spec_lens"]
|
spec_lens = batch["spec_lens"]
|
||||||
|
|
||||||
if optimizer_idx == 0:
|
if optimizer_idx == 0:
|
||||||
|
@ -1402,11 +1408,11 @@ class Vits(BaseTTS):
|
||||||
if speaker_name is None:
|
if speaker_name is None:
|
||||||
speaker_id = self.speaker_manager.get_random_id()
|
speaker_id = self.speaker_manager.get_random_id()
|
||||||
else:
|
else:
|
||||||
speaker_id = self.speaker_manager.ids[speaker_name]
|
speaker_id = self.speaker_manager.name_to_id[speaker_name]
|
||||||
|
|
||||||
# get language id
|
# get language id
|
||||||
if hasattr(self, "language_manager") and config.use_language_embedding and language_name is not None:
|
if hasattr(self, "language_manager") and config.use_language_embedding and language_name is not None:
|
||||||
language_id = self.language_manager.ids[language_name]
|
language_id = self.language_manager.name_to_id[language_name]
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"text": text,
|
"text": text,
|
||||||
|
@ -1461,8 +1467,8 @@ class Vits(BaseTTS):
|
||||||
d_vectors = None
|
d_vectors = None
|
||||||
|
|
||||||
# get numerical speaker ids from speaker names
|
# get numerical speaker ids from speaker names
|
||||||
if self.speaker_manager is not None and self.speaker_manager.ids and self.args.use_speaker_embedding:
|
if self.speaker_manager is not None and self.speaker_manager.name_to_id and self.args.use_speaker_embedding:
|
||||||
speaker_ids = [self.speaker_manager.ids[sn] for sn in batch["speaker_names"]]
|
speaker_ids = [self.speaker_manager.name_to_id[sn] for sn in batch["speaker_names"]]
|
||||||
|
|
||||||
if speaker_ids is not None:
|
if speaker_ids is not None:
|
||||||
speaker_ids = torch.LongTensor(speaker_ids)
|
speaker_ids = torch.LongTensor(speaker_ids)
|
||||||
|
@ -1471,12 +1477,12 @@ class Vits(BaseTTS):
|
||||||
# get d_vectors from audio file names
|
# get d_vectors from audio file names
|
||||||
if self.speaker_manager is not None and self.speaker_manager.embeddings and self.args.use_d_vector_file:
|
if self.speaker_manager is not None and self.speaker_manager.embeddings and self.args.use_d_vector_file:
|
||||||
d_vector_mapping = self.speaker_manager.embeddings
|
d_vector_mapping = self.speaker_manager.embeddings
|
||||||
d_vectors = [d_vector_mapping[w]["embedding"] for w in batch["audio_files"]]
|
d_vectors = [d_vector_mapping[w]["embedding"] for w in batch["audio_unique_names"]]
|
||||||
d_vectors = torch.FloatTensor(d_vectors)
|
d_vectors = torch.FloatTensor(d_vectors)
|
||||||
|
|
||||||
# get language ids from language names
|
# get language ids from language names
|
||||||
if self.language_manager is not None and self.language_manager.ids and self.args.use_language_embedding:
|
if self.language_manager is not None and self.language_manager.name_to_id and self.args.use_language_embedding:
|
||||||
language_ids = [self.language_manager.ids[ln] for ln in batch["language_names"]]
|
language_ids = [self.language_manager.name_to_id[ln] for ln in batch["language_names"]]
|
||||||
|
|
||||||
if language_ids is not None:
|
if language_ids is not None:
|
||||||
language_ids = torch.LongTensor(language_ids)
|
language_ids = torch.LongTensor(language_ids)
|
||||||
|
@ -1680,14 +1686,10 @@ class Vits(BaseTTS):
|
||||||
return [VitsDiscriminatorLoss(self.config), VitsGeneratorLoss(self.config)]
|
return [VitsDiscriminatorLoss(self.config), VitsGeneratorLoss(self.config)]
|
||||||
|
|
||||||
def load_checkpoint(
|
def load_checkpoint(
|
||||||
self,
|
self, config, checkpoint_path, eval=False, strict=True, cache=False
|
||||||
config,
|
|
||||||
checkpoint_path,
|
|
||||||
eval=False,
|
|
||||||
strict=True,
|
|
||||||
): # pylint: disable=unused-argument, redefined-builtin
|
): # pylint: disable=unused-argument, redefined-builtin
|
||||||
"""Load the model checkpoint and setup for training or inference"""
|
"""Load the model checkpoint and setup for training or inference"""
|
||||||
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"))
|
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
|
||||||
# compat band-aid for the pre-trained models to not use the encoder baked into the model
|
# compat band-aid for the pre-trained models to not use the encoder baked into the model
|
||||||
# TODO: consider baking the speaker encoder into the model and call it from there.
|
# TODO: consider baking the speaker encoder into the model and call it from there.
|
||||||
# as it is probably easier for model distribution.
|
# as it is probably easier for model distribution.
|
||||||
|
|
|
@ -37,11 +37,11 @@ class LanguageManager(BaseIDManager):
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def num_languages(self) -> int:
|
def num_languages(self) -> int:
|
||||||
return len(list(self.ids.keys()))
|
return len(list(self.name_to_id.keys()))
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def language_names(self) -> List:
|
def language_names(self) -> List:
|
||||||
return list(self.ids.keys())
|
return list(self.name_to_id.keys())
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def parse_language_ids_from_config(c: Coqpit) -> Dict:
|
def parse_language_ids_from_config(c: Coqpit) -> Dict:
|
||||||
|
@ -67,7 +67,7 @@ class LanguageManager(BaseIDManager):
|
||||||
Args:
|
Args:
|
||||||
c (Coqpit): Config.
|
c (Coqpit): Config.
|
||||||
"""
|
"""
|
||||||
self.ids = self.parse_language_ids_from_config(c)
|
self.name_to_id = self.parse_language_ids_from_config(c)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def parse_ids_from_data(items: List, parse_key: str) -> Any:
|
def parse_ids_from_data(items: List, parse_key: str) -> Any:
|
||||||
|
@ -82,7 +82,7 @@ class LanguageManager(BaseIDManager):
|
||||||
Args:
|
Args:
|
||||||
file_path (str): Path to the output file.
|
file_path (str): Path to the output file.
|
||||||
"""
|
"""
|
||||||
self._save_json(file_path, self.ids)
|
self._save_json(file_path, self.name_to_id)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def init_from_config(config: Coqpit) -> "LanguageManager":
|
def init_from_config(config: Coqpit) -> "LanguageManager":
|
||||||
|
|
|
@ -39,7 +39,7 @@ class BaseIDManager:
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, id_file_path: str = ""):
|
def __init__(self, id_file_path: str = ""):
|
||||||
self.ids = {}
|
self.name_to_id = {}
|
||||||
|
|
||||||
if id_file_path:
|
if id_file_path:
|
||||||
self.load_ids_from_file(id_file_path)
|
self.load_ids_from_file(id_file_path)
|
||||||
|
@ -60,7 +60,7 @@ class BaseIDManager:
|
||||||
Args:
|
Args:
|
||||||
items (List): Data samples returned by `load_tts_samples()`.
|
items (List): Data samples returned by `load_tts_samples()`.
|
||||||
"""
|
"""
|
||||||
self.ids = self.parse_ids_from_data(items, parse_key=parse_key)
|
self.name_to_id = self.parse_ids_from_data(items, parse_key=parse_key)
|
||||||
|
|
||||||
def load_ids_from_file(self, file_path: str) -> None:
|
def load_ids_from_file(self, file_path: str) -> None:
|
||||||
"""Set IDs from a file.
|
"""Set IDs from a file.
|
||||||
|
@ -68,7 +68,7 @@ class BaseIDManager:
|
||||||
Args:
|
Args:
|
||||||
file_path (str): Path to the file.
|
file_path (str): Path to the file.
|
||||||
"""
|
"""
|
||||||
self.ids = load_file(file_path)
|
self.name_to_id = load_file(file_path)
|
||||||
|
|
||||||
def save_ids_to_file(self, file_path: str) -> None:
|
def save_ids_to_file(self, file_path: str) -> None:
|
||||||
"""Save IDs to a json file.
|
"""Save IDs to a json file.
|
||||||
|
@ -76,7 +76,7 @@ class BaseIDManager:
|
||||||
Args:
|
Args:
|
||||||
file_path (str): Path to the output file.
|
file_path (str): Path to the output file.
|
||||||
"""
|
"""
|
||||||
save_file(self.ids, file_path)
|
save_file(self.name_to_id, file_path)
|
||||||
|
|
||||||
def get_random_id(self) -> Any:
|
def get_random_id(self) -> Any:
|
||||||
"""Get a random embedding.
|
"""Get a random embedding.
|
||||||
|
@ -86,8 +86,8 @@ class BaseIDManager:
|
||||||
Returns:
|
Returns:
|
||||||
np.ndarray: embedding.
|
np.ndarray: embedding.
|
||||||
"""
|
"""
|
||||||
if self.ids:
|
if self.name_to_id:
|
||||||
return self.ids[random.choices(list(self.ids.keys()))[0]]
|
return self.name_to_id[random.choices(list(self.name_to_id.keys()))[0]]
|
||||||
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
@ -109,11 +109,27 @@ class BaseIDManager:
|
||||||
class EmbeddingManager(BaseIDManager):
|
class EmbeddingManager(BaseIDManager):
|
||||||
"""Base `Embedding` Manager class. Every new `Embedding` manager must inherit this.
|
"""Base `Embedding` Manager class. Every new `Embedding` manager must inherit this.
|
||||||
It defines common `Embedding` manager specific functions.
|
It defines common `Embedding` manager specific functions.
|
||||||
|
|
||||||
|
It expects embeddings files in the following format:
|
||||||
|
|
||||||
|
::
|
||||||
|
|
||||||
|
{
|
||||||
|
'audio_file_key':{
|
||||||
|
'name': 'category_name',
|
||||||
|
'embedding': [<embedding_values>]
|
||||||
|
},
|
||||||
|
...
|
||||||
|
}
|
||||||
|
|
||||||
|
`audio_file_key` is a unique key to the audio file in the dataset. It can be the path to the file or any other unique key.
|
||||||
|
`embedding` is the embedding vector of the audio file.
|
||||||
|
`name` can be the name of the speaker of the audio file.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
embedding_file_path: str = "",
|
embedding_file_path: Union[str, List[str]] = "",
|
||||||
id_file_path: str = "",
|
id_file_path: str = "",
|
||||||
encoder_model_path: str = "",
|
encoder_model_path: str = "",
|
||||||
encoder_config_path: str = "",
|
encoder_config_path: str = "",
|
||||||
|
@ -129,11 +145,24 @@ class EmbeddingManager(BaseIDManager):
|
||||||
self.use_cuda = use_cuda
|
self.use_cuda = use_cuda
|
||||||
|
|
||||||
if embedding_file_path:
|
if embedding_file_path:
|
||||||
self.load_embeddings_from_file(embedding_file_path)
|
if isinstance(embedding_file_path, list):
|
||||||
|
self.load_embeddings_from_list_of_files(embedding_file_path)
|
||||||
|
else:
|
||||||
|
self.load_embeddings_from_file(embedding_file_path)
|
||||||
|
|
||||||
if encoder_model_path and encoder_config_path:
|
if encoder_model_path and encoder_config_path:
|
||||||
self.init_encoder(encoder_model_path, encoder_config_path, use_cuda)
|
self.init_encoder(encoder_model_path, encoder_config_path, use_cuda)
|
||||||
|
|
||||||
|
@property
|
||||||
|
def num_embeddings(self):
|
||||||
|
"""Get number of embeddings."""
|
||||||
|
return len(self.embeddings)
|
||||||
|
|
||||||
|
@property
|
||||||
|
def num_names(self):
|
||||||
|
"""Get number of embeddings."""
|
||||||
|
return len(self.embeddings_by_names)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def embedding_dim(self):
|
def embedding_dim(self):
|
||||||
"""Dimensionality of embeddings. If embeddings are not loaded, returns zero."""
|
"""Dimensionality of embeddings. If embeddings are not loaded, returns zero."""
|
||||||
|
@ -141,6 +170,11 @@ class EmbeddingManager(BaseIDManager):
|
||||||
return len(self.embeddings[list(self.embeddings.keys())[0]]["embedding"])
|
return len(self.embeddings[list(self.embeddings.keys())[0]]["embedding"])
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
@property
|
||||||
|
def embedding_names(self):
|
||||||
|
"""Get embedding names."""
|
||||||
|
return list(self.embeddings_by_names.keys())
|
||||||
|
|
||||||
def save_embeddings_to_file(self, file_path: str) -> None:
|
def save_embeddings_to_file(self, file_path: str) -> None:
|
||||||
"""Save embeddings to a json file.
|
"""Save embeddings to a json file.
|
||||||
|
|
||||||
|
@ -149,20 +183,57 @@ class EmbeddingManager(BaseIDManager):
|
||||||
"""
|
"""
|
||||||
save_file(self.embeddings, file_path)
|
save_file(self.embeddings, file_path)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def read_embeddings_from_file(file_path: str):
|
||||||
|
"""Load embeddings from a json file.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
file_path (str): Path to the file.
|
||||||
|
"""
|
||||||
|
embeddings = load_file(file_path)
|
||||||
|
speakers = sorted({x["name"] for x in embeddings.values()})
|
||||||
|
name_to_id = {name: i for i, name in enumerate(speakers)}
|
||||||
|
clip_ids = list(set(sorted(clip_name for clip_name in embeddings.keys())))
|
||||||
|
# cache embeddings_by_names for fast inference using a bigger speakers.json
|
||||||
|
embeddings_by_names = {}
|
||||||
|
for x in embeddings.values():
|
||||||
|
if x["name"] not in embeddings_by_names.keys():
|
||||||
|
embeddings_by_names[x["name"]] = [x["embedding"]]
|
||||||
|
else:
|
||||||
|
embeddings_by_names[x["name"]].append(x["embedding"])
|
||||||
|
return name_to_id, clip_ids, embeddings, embeddings_by_names
|
||||||
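A small sketch of what `read_embeddings_from_file` derives from an embeddings file shaped like the docstring above (two hypothetical entries with toy embedding values):

    embeddings = {
        "ljspeech#wavs/LJ001-0001": {"name": "ljspeech", "embedding": [0.1, 0.2]},
        "vctk#p225/p225_001": {"name": "p225", "embedding": [0.3, 0.4]},
    }
    # name_to_id          -> {"ljspeech": 0, "p225": 1}   (ids follow the sorted speaker names)
    # clip_ids            -> the two clip keys above (built via set(), so ordering is not guaranteed)
    # embeddings_by_names -> {"ljspeech": [[0.1, 0.2]], "p225": [[0.3, 0.4]]}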
|
|
||||||
def load_embeddings_from_file(self, file_path: str) -> None:
|
def load_embeddings_from_file(self, file_path: str) -> None:
|
||||||
"""Load embeddings from a json file.
|
"""Load embeddings from a json file.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
file_path (str): Path to the target json file.
|
file_path (str): Path to the target json file.
|
||||||
"""
|
"""
|
||||||
self.embeddings = load_file(file_path)
|
self.name_to_id, self.clip_ids, self.embeddings, self.embeddings_by_names = self.read_embeddings_from_file(
|
||||||
|
file_path
|
||||||
|
)
|
||||||
|
|
||||||
speakers = sorted({x["name"] for x in self.embeddings.values()})
|
def load_embeddings_from_list_of_files(self, file_paths: List[str]) -> None:
|
||||||
self.ids = {name: i for i, name in enumerate(speakers)}
|
"""Load embeddings from a list of json files and don't allow duplicate keys.
|
||||||
|
|
||||||
self.clip_ids = list(set(sorted(clip_name for clip_name in self.embeddings.keys())))
|
Args:
|
||||||
# cache embeddings_by_names for fast inference using a bigger speakers.json
|
file_paths (List[str]): List of paths to the target json files.
|
||||||
self.embeddings_by_names = self.get_embeddings_by_names()
|
"""
|
||||||
|
self.name_to_id = {}
|
||||||
|
self.clip_ids = []
|
||||||
|
self.embeddings_by_names = {}
|
||||||
|
self.embeddings = {}
|
||||||
|
for file_path in file_paths:
|
||||||
|
ids, clip_ids, embeddings, embeddings_by_names = self.read_embeddings_from_file(file_path)
|
||||||
|
# check colliding keys
|
||||||
|
duplicates = set(self.embeddings.keys()) & set(embeddings.keys())
|
||||||
|
if duplicates:
|
||||||
|
raise ValueError(f" [!] Duplicate embedding names <{duplicates}> in {file_path}")
|
||||||
|
# store values
|
||||||
|
self.name_to_id.update(ids)
|
||||||
|
self.clip_ids.extend(clip_ids)
|
||||||
|
self.embeddings_by_names.update(embeddings_by_names)
|
||||||
|
self.embeddings.update(embeddings)
|
||||||
|
|
||||||
def get_embedding_by_clip(self, clip_idx: str) -> List:
|
def get_embedding_by_clip(self, clip_idx: str) -> List:
|
||||||
"""Get embedding by clip ID.
|
"""Get embedding by clip ID.
|
||||||
|
|
|
@ -73,14 +73,14 @@ class SpeakerManager(EmbeddingManager):
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def num_speakers(self):
|
def num_speakers(self):
|
||||||
return len(self.ids)
|
return len(self.name_to_id)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def speaker_names(self):
|
def speaker_names(self):
|
||||||
return list(self.ids.keys())
|
return list(self.name_to_id.keys())
|
||||||
|
|
||||||
def get_speakers(self) -> List:
|
def get_speakers(self) -> List:
|
||||||
return self.ids
|
return self.name_to_id
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def init_from_config(config: "Coqpit", samples: Union[List[List], List[Dict]] = None) -> "SpeakerManager":
|
def init_from_config(config: "Coqpit", samples: Union[List[List], List[Dict]] = None) -> "SpeakerManager":
|
||||||
|
@ -182,10 +182,10 @@ def get_speaker_manager(c: Coqpit, data: List = None, restore_path: str = None,
|
||||||
speaker_manager.load_embeddings_from_file(c.d_vector_file)
|
speaker_manager.load_embeddings_from_file(c.d_vector_file)
|
||||||
speaker_manager.load_embeddings_from_file(speakers_file)
|
speaker_manager.load_embeddings_from_file(speakers_file)
|
||||||
elif not c.use_d_vector_file: # restore speaker manager with speaker ID file.
|
elif not c.use_d_vector_file: # restore speaker manager with speaker ID file.
|
||||||
speaker_ids_from_data = speaker_manager.ids
|
speaker_ids_from_data = speaker_manager.name_to_id
|
||||||
speaker_manager.load_ids_from_file(speakers_file)
|
speaker_manager.load_ids_from_file(speakers_file)
|
||||||
assert all(
|
assert all(
|
||||||
speaker in speaker_manager.ids for speaker in speaker_ids_from_data
|
speaker in speaker_manager.name_to_id for speaker in speaker_ids_from_data
|
||||||
), " [!] You cannot introduce new speakers to a pre-trained model."
|
), " [!] You cannot introduce new speakers to a pre-trained model."
|
||||||
elif c.use_d_vector_file and c.d_vector_file:
|
elif c.use_d_vector_file and c.d_vector_file:
|
||||||
# new speaker manager with external speaker embeddings.
|
# new speaker manager with external speaker embeddings.
|
||||||
|
@ -199,7 +199,7 @@ def get_speaker_manager(c: Coqpit, data: List = None, restore_path: str = None,
|
||||||
if speaker_manager.num_speakers > 0:
|
if speaker_manager.num_speakers > 0:
|
||||||
print(
|
print(
|
||||||
" > Speaker manager is loaded with {} speakers: {}".format(
|
" > Speaker manager is loaded with {} speakers: {}".format(
|
||||||
speaker_manager.num_speakers, ", ".join(speaker_manager.ids)
|
speaker_manager.num_speakers, ", ".join(speaker_manager.name_to_id)
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
|
@ -295,7 +295,12 @@ def transfer_voice(
|
||||||
reference_d_vector = embedding_to_torch(reference_d_vector, cuda=use_cuda)
|
reference_d_vector = embedding_to_torch(reference_d_vector, cuda=use_cuda)
|
||||||
|
|
||||||
# load reference_wav audio
|
# load reference_wav audio
|
||||||
reference_wav = embedding_to_torch(model.ap.load_wav(reference_wav, sr=model.ap.sample_rate), cuda=use_cuda)
|
reference_wav = embedding_to_torch(
|
||||||
|
model.ap.load_wav(
|
||||||
|
reference_wav, sr=model.args.encoder_sample_rate if model.args.encoder_sample_rate else model.ap.sample_rate
|
||||||
|
),
|
||||||
|
cuda=use_cuda,
|
||||||
|
)
|
||||||
|
|
||||||
if hasattr(model, "module"):
|
if hasattr(model, "module"):
|
||||||
_func = model.module.inference_voice_conversion
|
_func = model.module.inference_voice_conversion
|
||||||
|
|
|
@ -0,0 +1,44 @@
|
||||||
|
# coding: utf-8
|
||||||
|
# Add the word you want to the dictionary.
|
||||||
|
etc_dictionary = {"1+1": "원플러스원", "2+1": "투플러스원"}
|
||||||
|
|
||||||
|
|
||||||
|
english_dictionary = {
|
||||||
|
"KOREA": "코리아",
|
||||||
|
"IDOL": "아이돌",
|
||||||
|
"IT": "아이티",
|
||||||
|
"IQ": "아이큐",
|
||||||
|
"UP": "업",
|
||||||
|
"DOWN": "다운",
|
||||||
|
"PC": "피씨",
|
||||||
|
"CCTV": "씨씨티비",
|
||||||
|
"SNS": "에스엔에스",
|
||||||
|
"AI": "에이아이",
|
||||||
|
"CEO": "씨이오",
|
||||||
|
"A": "에이",
|
||||||
|
"B": "비",
|
||||||
|
"C": "씨",
|
||||||
|
"D": "디",
|
||||||
|
"E": "이",
|
||||||
|
"F": "에프",
|
||||||
|
"G": "지",
|
||||||
|
"H": "에이치",
|
||||||
|
"I": "아이",
|
||||||
|
"J": "제이",
|
||||||
|
"K": "케이",
|
||||||
|
"L": "엘",
|
||||||
|
"M": "엠",
|
||||||
|
"N": "엔",
|
||||||
|
"O": "오",
|
||||||
|
"P": "피",
|
||||||
|
"Q": "큐",
|
||||||
|
"R": "알",
|
||||||
|
"S": "에스",
|
||||||
|
"T": "티",
|
||||||
|
"U": "유",
|
||||||
|
"V": "브이",
|
||||||
|
"W": "더블유",
|
||||||
|
"X": "엑스",
|
||||||
|
"Y": "와이",
|
||||||
|
"Z": "제트",
|
||||||
|
}
|
|
@ -0,0 +1,32 @@
|
||||||
|
# coding: utf-8
|
||||||
|
# Code based on https://github.com/carpedm20/multi-speaker-tacotron-tensorflow/blob/master/text/korean.py
|
||||||
|
import re
|
||||||
|
|
||||||
|
from TTS.tts.utils.text.korean.ko_dictionary import english_dictionary, etc_dictionary
|
||||||
|
|
||||||
|
|
||||||
|
def normalize(text):
|
||||||
|
text = text.strip()
|
||||||
|
text = re.sub("[⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]", "", text)
|
||||||
|
text = normalize_with_dictionary(text, etc_dictionary)
|
||||||
|
text = normalize_english(text)
|
||||||
|
text = text.lower()
|
||||||
|
return text
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_with_dictionary(text, dic):
|
||||||
|
if any(key in text for key in dic.keys()):
|
||||||
|
pattern = re.compile("|".join(re.escape(key) for key in dic.keys()))
|
||||||
|
return pattern.sub(lambda x: dic[x.group()], text)
|
||||||
|
return text
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_english(text):
|
||||||
|
def fn(m):
|
||||||
|
word = m.group()
|
||||||
|
if word in english_dictionary:
|
||||||
|
return english_dictionary.get(word)
|
||||||
|
return word
|
||||||
|
|
||||||
|
text = re.sub("([A-Za-z]+)", fn, text)
|
||||||
|
return text
|
|
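A few illustrative calls against the dictionaries above; the outputs follow directly from the substitution rules shown (sketch only):

    >>> normalize_with_dictionary("1+1 이벤트", etc_dictionary)
    '원플러스원 이벤트'
    >>> normalize_english("AI 스피커")
    '에이아이 스피커'
    >>> normalize("IT 뉴스")
    '아이티 뉴스'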
@ -0,0 +1,36 @@
|
||||||
|
from jamo import hangul_to_jamo
|
||||||
|
|
||||||
|
from TTS.tts.utils.text.korean.korean import normalize
|
||||||
|
|
||||||
|
g2p = None
|
||||||
|
|
||||||
|
|
||||||
|
def korean_text_to_phonemes(text, character: str = "hangeul") -> str:
|
||||||
|
"""
|
||||||
|
|
||||||
|
The input and output values look the same, but they are different in Unicode.
|
||||||
|
|
||||||
|
example :
|
||||||
|
|
||||||
|
input = '하늘' (Unicode : \ud558\ub298), (하 + 늘)
|
||||||
|
output = '하늘' (Unicode :\u1112\u1161\u1102\u1173\u11af), (ᄒ + ᅡ + ᄂ + ᅳ + ᆯ)
|
||||||
|
|
||||||
|
"""
|
||||||
|
global g2p # pylint: disable=global-statement
|
||||||
|
if g2p is None:
|
||||||
|
from g2pkk import G2p
|
||||||
|
|
||||||
|
g2p = G2p()
|
||||||
|
|
||||||
|
if character == "english":
|
||||||
|
from anyascii import anyascii
|
||||||
|
|
||||||
|
text = normalize(text)
|
||||||
|
text = g2p(text)
|
||||||
|
text = anyascii(text)
|
||||||
|
return text
|
||||||
|
|
||||||
|
text = normalize(text)
|
||||||
|
text = g2p(text)
|
||||||
|
text = list(hangul_to_jamo(text)) # '하늘' --> ['ᄒ', 'ᅡ', 'ᄂ', 'ᅳ', 'ᆯ']
|
||||||
|
return "".join(text)
|
|
@ -2,6 +2,7 @@ from TTS.tts.utils.text.phonemizers.base import BasePhonemizer
|
||||||
from TTS.tts.utils.text.phonemizers.espeak_wrapper import ESpeak
|
from TTS.tts.utils.text.phonemizers.espeak_wrapper import ESpeak
|
||||||
from TTS.tts.utils.text.phonemizers.gruut_wrapper import Gruut
|
from TTS.tts.utils.text.phonemizers.gruut_wrapper import Gruut
|
||||||
from TTS.tts.utils.text.phonemizers.ja_jp_phonemizer import JA_JP_Phonemizer
|
from TTS.tts.utils.text.phonemizers.ja_jp_phonemizer import JA_JP_Phonemizer
|
||||||
|
from TTS.tts.utils.text.phonemizers.ko_kr_phonemizer import KO_KR_Phonemizer
|
||||||
from TTS.tts.utils.text.phonemizers.zh_cn_phonemizer import ZH_CN_Phonemizer
|
from TTS.tts.utils.text.phonemizers.zh_cn_phonemizer import ZH_CN_Phonemizer
|
||||||
|
|
||||||
PHONEMIZERS = {b.name(): b for b in (ESpeak, Gruut, JA_JP_Phonemizer)}
|
PHONEMIZERS = {b.name(): b for b in (ESpeak, Gruut, JA_JP_Phonemizer)}
|
||||||
|
@ -26,6 +27,7 @@ DEF_LANG_TO_PHONEMIZER.update(_new_dict)
|
||||||
DEF_LANG_TO_PHONEMIZER["en"] = DEF_LANG_TO_PHONEMIZER["en-us"]
|
DEF_LANG_TO_PHONEMIZER["en"] = DEF_LANG_TO_PHONEMIZER["en-us"]
|
||||||
DEF_LANG_TO_PHONEMIZER["ja-jp"] = JA_JP_Phonemizer.name()
|
DEF_LANG_TO_PHONEMIZER["ja-jp"] = JA_JP_Phonemizer.name()
|
||||||
DEF_LANG_TO_PHONEMIZER["zh-cn"] = ZH_CN_Phonemizer.name()
|
DEF_LANG_TO_PHONEMIZER["zh-cn"] = ZH_CN_Phonemizer.name()
|
||||||
|
DEF_LANG_TO_PHONEMIZER["ko-kr"] = KO_KR_Phonemizer.name()
|
||||||
|
|
||||||
|
|
||||||
def get_phonemizer_by_name(name: str, **kwargs) -> BasePhonemizer:
|
def get_phonemizer_by_name(name: str, **kwargs) -> BasePhonemizer:
|
||||||
|
@ -46,6 +48,8 @@ def get_phonemizer_by_name(name: str, **kwargs) -> BasePhonemizer:
|
||||||
return ZH_CN_Phonemizer(**kwargs)
|
return ZH_CN_Phonemizer(**kwargs)
|
||||||
if name == "ja_jp_phonemizer":
|
if name == "ja_jp_phonemizer":
|
||||||
return JA_JP_Phonemizer(**kwargs)
|
return JA_JP_Phonemizer(**kwargs)
|
||||||
|
if name == "ko_kr_phonemizer":
|
||||||
|
return KO_KR_Phonemizer(**kwargs)
|
||||||
raise ValueError(f"Phonemizer {name} not found")
|
raise ValueError(f"Phonemizer {name} not found")
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -94,6 +94,8 @@ class ESpeak(BasePhonemizer):
|
||||||
# band-aid for backwards compatibility
|
# band-aid for backwards compatibility
|
||||||
if language == "en":
|
if language == "en":
|
||||||
language = "en-us"
|
language = "en-us"
|
||||||
|
if language == "zh-cn":
|
||||||
|
language = "cmn"
|
||||||
|
|
||||||
super().__init__(language, punctuations=punctuations, keep_puncs=keep_puncs)
|
super().__init__(language, punctuations=punctuations, keep_puncs=keep_puncs)
|
||||||
if backend is not None:
|
if backend is not None:
|
||||||
|
|
|
@@ -0,0 +1,65 @@
+from typing import Dict
+
+from TTS.tts.utils.text.korean.phonemizer import korean_text_to_phonemes
+from TTS.tts.utils.text.phonemizers.base import BasePhonemizer
+
+_DEF_KO_PUNCS = "、.,[]()?!〽~『』「」【】"
+
+
+class KO_KR_Phonemizer(BasePhonemizer):
+    """🐸TTS ko_kr_phonemizer using functions in `TTS.tts.utils.text.korean.phonemizer`
+
+    TODO: Add Korean to character (ᄀᄁᄂᄃᄄᄅᄆᄇᄈᄉᄊᄋᄌᄍᄎᄏᄐᄑ하ᅢᅣᅤᅥᅦᅧᅨᅩᅪᅫᅬᅭᅮᅯᅰᅱᅲᅳᅴᅵᆨᆩᆪᆫᆬᆭᆮᆯᆰᆱᆲᆳᆴᆵᆶᆷᆸᆹᆺᆻᆼᆽᆾᆿᇀᇁᇂ)
+
+    Example:
+
+        >>> from TTS.tts.utils.text.phonemizers import KO_KR_Phonemizer
+        >>> phonemizer = KO_KR_Phonemizer()
+        >>> phonemizer.phonemize("이 문장은 음성합성 테스트를 위한 문장입니다.", separator="|")
+        'ᄋ|ᅵ| |ᄆ|ᅮ|ᆫ|ᄌ|ᅡ|ᆼ|ᄋ|ᅳ| |ᄂ|ᅳ|ᆷ|ᄉ|ᅥ|ᆼ|ᄒ|ᅡ|ᆸ|ᄊ|ᅥ|ᆼ| |ᄐ|ᅦ|ᄉ|ᅳ|ᄐ|ᅳ|ᄅ|ᅳ| |ᄅ|ᅱ|ᄒ|ᅡ|ᆫ| |ᄆ|ᅮ|ᆫ|ᄌ|ᅡ|ᆼ|ᄋ|ᅵ|ᆷ|ᄂ|ᅵ|ᄃ|ᅡ|.'
+
+        >>> from TTS.tts.utils.text.phonemizers import KO_KR_Phonemizer
+        >>> phonemizer = KO_KR_Phonemizer()
+        >>> phonemizer.phonemize("이 문장은 음성합성 테스트를 위한 문장입니다.", separator="|", character='english')
+        'I| |M|u|n|J|a|n|g|E|u| |N|e|u|m|S|e|o|n|g|H|a|b|S|s|e|o|n|g| |T|e|S|e|u|T|e|u|L|e|u| |L|w|i|H|a|n| |M|u|n|J|a|n|g|I|m|N|i|D|a|.'
+    """
+
+    language = "ko-kr"
+
+    def __init__(self, punctuations=_DEF_KO_PUNCS, keep_puncs=True, **kwargs):  # pylint: disable=unused-argument
+        super().__init__(self.language, punctuations=punctuations, keep_puncs=keep_puncs)
+
+    @staticmethod
+    def name():
+        return "ko_kr_phonemizer"
+
+    def _phonemize(self, text: str, separator: str = "", character: str = "hangeul") -> str:
+        ph = korean_text_to_phonemes(text, character=character)
+        if separator is not None or separator != "":
+            return separator.join(ph)
+        return ph
+
+    def phonemize(self, text: str, separator: str = "", character: str = "hangeul") -> str:
+        return self._phonemize(text, separator, character)
+
+    @staticmethod
+    def supported_languages() -> Dict:
+        return {"ko-kr": "hangeul(korean)"}
+
+    def version(self) -> str:
+        return "0.0.2"
+
+    def is_available(self) -> bool:
+        return True
+
+
+if __name__ == "__main__":
+    texts = "이 문장은 음성합성 테스트를 위한 문장입니다."
+    e = KO_KR_Phonemizer()
+    print(e.supported_languages())
+    print(e.version())
+    print(e.language)
+    print(e.name())
+    print(e.is_available())
+    print(e.phonemize(texts))
@@ -42,7 +42,7 @@ class ZH_CN_Phonemizer(BasePhonemizer):

     @staticmethod
     def supported_languages() -> Dict:
-        return {"zh-cn": "Japanese (Japan)"}
+        return {"zh-cn": "Chinese (China)"}

     def version(self) -> str:
         return "0.0.1"

@@ -2,9 +2,9 @@ from typing import Tuple

 import librosa
 import numpy as np
-import pyworld as pw
 import scipy
 import soundfile as sf
+from librosa import pyin

 # For using kwargs
 # pylint: disable=unused-argument

@@ -242,12 +242,28 @@ def compute_stft_paddings(


 def compute_f0(
-    *, x: np.ndarray = None, pitch_fmax: float = None, hop_length: int = None, sample_rate: int = None, **kwargs
+    *,
+    x: np.ndarray = None,
+    pitch_fmax: float = None,
+    pitch_fmin: float = None,
+    hop_length: int = None,
+    win_length: int = None,
+    sample_rate: int = None,
+    stft_pad_mode: str = "reflect",
+    center: bool = True,
+    **kwargs,
 ) -> np.ndarray:
     """Compute pitch (f0) of a waveform using the same parameters used for computing melspectrogram.

     Args:
         x (np.ndarray): Waveform. Shape :math:`[T_wav,]`
+        pitch_fmax (float): Pitch max value.
+        pitch_fmin (float): Pitch min value.
+        hop_length (int): Number of frames between STFT columns.
+        win_length (int): STFT window length.
+        sample_rate (int): Audio sampling rate.
+        stft_pad_mode (str): Padding mode for STFT.
+        center (bool): Centered padding.

     Returns:
         np.ndarray: Pitch. Shape :math:`[T_pitch,]`. :math:`T_pitch == T_wav / hop_length`

@@ -255,20 +271,35 @@ def compute_f0(

     Examples:
         >>> WAV_FILE = filename = librosa.util.example_audio_file()
         >>> from TTS.config import BaseAudioConfig
-        >>> from TTS.utils.audio.processor import AudioProcessor >>> conf = BaseAudioConfig(pitch_fmax=8000)
+        >>> from TTS.utils.audio import AudioProcessor
+        >>> conf = BaseAudioConfig(pitch_fmax=640, pitch_fmin=1)
         >>> ap = AudioProcessor(**conf)
-        >>> wav = ap.load_wav(WAV_FILE, sr=22050)[:5 * 22050]
+        >>> wav = ap.load_wav(WAV_FILE, sr=ap.sample_rate)[:5 * ap.sample_rate]
         >>> pitch = ap.compute_f0(wav)
     """
     assert pitch_fmax is not None, " [!] Set `pitch_fmax` before caling `compute_f0`."
+    assert pitch_fmin is not None, " [!] Set `pitch_fmin` before caling `compute_f0`."

-    f0, t = pw.dio(
-        x.astype(np.double),
-        fs=sample_rate,
-        f0_ceil=pitch_fmax,
-        frame_period=1000 * hop_length / sample_rate,
+    f0, voiced_mask, _ = pyin(
+        y=x.astype(np.double),
+        fmin=pitch_fmin,
+        fmax=pitch_fmax,
+        sr=sample_rate,
+        frame_length=win_length,
+        win_length=win_length // 2,
+        hop_length=hop_length,
+        pad_mode=stft_pad_mode,
+        center=center,
+        n_thresholds=100,
+        beta_parameters=(2, 18),
+        boltzmann_parameter=2,
+        resolution=0.1,
+        max_transition_rate=35.92,
+        switch_prob=0.01,
+        no_trough_prob=0.01,
     )
-    f0 = pw.stonemask(x.astype(np.double), f0, t, sample_rate)
+    f0[~voiced_mask] = 0.0

     return f0

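Note: the hunk above swaps the `pyworld` DIO/StoneMask pitch tracker for `librosa`'s `pyin` and makes `compute_f0` keyword-only. Below is a minimal sketch of calling the new function; the file name and parameter values are illustrative assumptions, not values taken from this patch.

```python
import numpy as np
import soundfile as sf

from TTS.utils.audio.numpy_transforms import compute_f0

# Illustrative input and parameters; match them to your own feature-extraction config.
wav, sr = sf.read("example.wav")  # hypothetical file path
f0 = compute_f0(
    x=wav.astype(np.float32),
    pitch_fmin=1.0,
    pitch_fmax=640.0,
    hop_length=256,
    win_length=1024,
    sample_rate=sr,
)
# pyin flags unvoiced frames; the wrapper zeroes them, so f0 is 0.0 where no pitch was found.
print(f0.shape)
```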
@@ -2,12 +2,12 @@ from typing import Dict, Tuple

 import librosa
 import numpy as np
-import pyworld as pw
 import scipy.io.wavfile
 import scipy.signal
 import soundfile as sf

 from TTS.tts.utils.helpers import StandardScaler
+from TTS.utils.audio.numpy_transforms import compute_f0

 # pylint: disable=too-many-public-methods

@@ -573,23 +573,28 @@ class AudioProcessor(object):
            >>> WAV_FILE = filename = librosa.util.example_audio_file()
            >>> from TTS.config import BaseAudioConfig
            >>> from TTS.utils.audio import AudioProcessor
-           >>> conf = BaseAudioConfig(pitch_fmax=8000)
+           >>> conf = BaseAudioConfig(pitch_fmax=640, pitch_fmin=1)
            >>> ap = AudioProcessor(**conf)
-           >>> wav = ap.load_wav(WAV_FILE, sr=22050)[:5 * 22050]
+           >>> wav = ap.load_wav(WAV_FILE, sr=ap.sample_rate)[:5 * ap.sample_rate]
            >>> pitch = ap.compute_f0(wav)
        """
        assert self.pitch_fmax is not None, " [!] Set `pitch_fmax` before caling `compute_f0`."
+       assert self.pitch_fmin is not None, " [!] Set `pitch_fmin` before caling `compute_f0`."
        # align F0 length to the spectrogram length
        if len(x) % self.hop_length == 0:
-           x = np.pad(x, (0, self.hop_length // 2), mode="reflect")
+           x = np.pad(x, (0, self.hop_length // 2), mode=self.stft_pad_mode)

-       f0, t = pw.dio(
-           x.astype(np.double),
-           fs=self.sample_rate,
-           f0_ceil=self.pitch_fmax,
-           frame_period=1000 * self.hop_length / self.sample_rate,
+       f0 = compute_f0(
+           x=x,
+           pitch_fmax=self.pitch_fmax,
+           pitch_fmin=self.pitch_fmin,
+           hop_length=self.hop_length,
+           win_length=self.win_length,
+           sample_rate=self.sample_rate,
+           stft_pad_mode=self.stft_pad_mode,
+           center=True,
        )
-       f0 = pw.stonemask(x.astype(np.double), f0, t, self.sample_rate)

        return f0

    ### Audio Processing ###

@@ -38,9 +38,9 @@ class CapacitronOptimizer:
        self.param_groups = self.primary_optimizer.param_groups
        self.primary_optimizer.step()

-   def zero_grad(self):
-       self.primary_optimizer.zero_grad()
-       self.secondary_optimizer.zero_grad()
+   def zero_grad(self, set_to_none=False):
+       self.primary_optimizer.zero_grad(set_to_none)
+       self.secondary_optimizer.zero_grad(set_to_none)

    def load_state_dict(self, state_dict):
        self.primary_optimizer.load_state_dict(state_dict[0])

@@ -9,6 +9,8 @@ import fsspec
 import torch
 from coqpit import Coqpit

+from TTS.utils.generic_utils import get_user_data_dir
+

 class RenamingUnpickler(pickle_tts.Unpickler):
     """Overload default pickler to solve module renaming problem"""

@@ -57,6 +59,7 @@ def copy_model_files(config: Coqpit, out_path, new_fields=None):
 def load_fsspec(
     path: str,
     map_location: Union[str, Callable, torch.device, Dict[Union[str, torch.device], Union[str, torch.device]]] = None,
+    cache: bool = True,
     **kwargs,
 ) -> Any:
     """Like torch.load but can load from other locations (e.g. s3:// , gs://).

@@ -64,21 +67,33 @@ def load_fsspec(
     Args:
         path: Any path or url supported by fsspec.
         map_location: torch.device or str.
+        cache: If True, cache a remote file locally for subsequent calls. It is cached under `get_user_data_dir()/tts_cache`. Defaults to True.
         **kwargs: Keyword arguments forwarded to torch.load.

     Returns:
         Object stored in path.
     """
-    with fsspec.open(path, "rb") as f:
-        return torch.load(f, map_location=map_location, **kwargs)
+    is_local = os.path.isdir(path) or os.path.isfile(path)
+    if cache and not is_local:
+        with fsspec.open(
+            f"filecache::{path}",
+            filecache={"cache_storage": str(get_user_data_dir("tts_cache"))},
+            mode="rb",
+        ) as f:
+            return torch.load(f, map_location=map_location, **kwargs)
+    else:
+        with fsspec.open(path, "rb") as f:
+            return torch.load(f, map_location=map_location, **kwargs)


-def load_checkpoint(model, checkpoint_path, use_cuda=False, eval=False):  # pylint: disable=redefined-builtin
+def load_checkpoint(
+    model, checkpoint_path, use_cuda=False, eval=False, cache=False
+):  # pylint: disable=redefined-builtin
     try:
-        state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"))
+        state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
     except ModuleNotFoundError:
         pickle_tts.Unpickler = RenamingUnpickler
-        state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), pickle_module=pickle_tts)
+        state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), pickle_module=pickle_tts, cache=cache)
     model.load_state_dict(state["model"])
     if use_cuda:
         model.cuda()

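For context on the hunks above: `load_fsspec` now reads local files directly and wraps remote URLs in fsspec's `filecache`, so the same checkpoint is not re-downloaded on every call. A hedged usage sketch, assuming the function lives in `TTS.utils.io` as in the upstream layout; the URL below is a placeholder, not one taken from this patch.

```python
import torch

from TTS.utils.io import load_fsspec

# Local path: opened directly, no cache layer involved.
state = load_fsspec("/tmp/model_file.pth", map_location=torch.device("cpu"))

# Remote path: cached under get_user_data_dir()/tts_cache after the first read.
state = load_fsspec(
    "https://example.com/checkpoints/model_file.pth",  # placeholder URL
    map_location=torch.device("cpu"),
    cache=True,
)
```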
@@ -32,11 +32,14 @@ class ModelManager(object):
        home path.

     Args:
-        models_file (str): path to .model.json
+        models_file (str): path to .model.json file. Defaults to None.
+        output_prefix (str): prefix to `tts` to download models. Defaults to None
+        progress_bar (bool): print a progress bar when donwloading a file. Defaults to False.
     """

-    def __init__(self, models_file=None, output_prefix=None):
+    def __init__(self, models_file=None, output_prefix=None, progress_bar=False):
         super().__init__()
+        self.progress_bar = progress_bar
         if output_prefix is None:
             self.output_prefix = get_user_data_dir("tts")
         else:

@@ -236,7 +239,7 @@ class ModelManager(object):
         os.makedirs(output_path, exist_ok=True)
         print(f" > Downloading model to {output_path}")
         # download from github release
-        self._download_zip_file(model_item["github_rls_url"], output_path)
+        self._download_zip_file(model_item["github_rls_url"], output_path, self.progress_bar)
         self.print_model_license(model_item=model_item)
         # find downloaded files
         output_model_path, output_config_path = self._find_files(output_path)

@@ -334,7 +337,7 @@ class ModelManager(object):
         config.save_json(config_path)

     @staticmethod
-    def _download_zip_file(file_url, output_folder):
+    def _download_zip_file(file_url, output_folder, progress_bar):
         """Download the github releases"""
         # download the file
         r = requests.get(file_url, stream=True)

@@ -342,11 +345,13 @@ class ModelManager(object):
         try:
             total_size_in_bytes = int(r.headers.get("content-length", 0))
             block_size = 1024  # 1 Kibibyte
-            progress_bar = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True)
+            if progress_bar:
+                progress_bar = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True)
             temp_zip_name = os.path.join(output_folder, file_url.split("/")[-1])
             with open(temp_zip_name, "wb") as file:
                 for data in r.iter_content(block_size):
-                    progress_bar.update(len(data))
+                    if progress_bar:
+                        progress_bar.update(len(data))
                     file.write(data)
             with zipfile.ZipFile(temp_zip_name) as z:
                 z.extractall(output_folder)

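The `ModelManager` hunks above thread a `progress_bar` flag from the constructor down to `_download_zip_file`, so the tqdm bar is only created when explicitly requested. A short sketch of the intended call pattern; the model name is illustrative and the download API is assumed to match the current `ModelManager` interface.

```python
from TTS.utils.manage import ModelManager

# Only show a tqdm progress bar when one was asked for.
manager = ModelManager(progress_bar=True)
model_path, config_path, model_item = manager.download_model("tts_models/en/ljspeech/glow-tts")
print(model_path, config_path)
```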
@@ -212,8 +212,13 @@ class Synthesizer(object):
         # handle multi-speaker
         speaker_embedding = None
         speaker_id = None
-        if self.tts_speakers_file or hasattr(self.tts_model.speaker_manager, "ids"):
-            if speaker_name and isinstance(speaker_name, str):
+        if self.tts_speakers_file or hasattr(self.tts_model.speaker_manager, "name_to_id"):
+            # handle Neon models with single speaker.
+            if len(self.tts_model.speaker_manager.name_to_id) == 1:
+                speaker_id = list(self.tts_model.speaker_manager.name_to_id.values())[0]
+
+            elif speaker_name and isinstance(speaker_name, str):
                 if self.tts_config.use_d_vector_file:
                     # get the average speaker embedding from the saved d_vectors.
                     speaker_embedding = self.tts_model.speaker_manager.get_mean_embedding(

@@ -222,7 +227,7 @@ class Synthesizer(object):
                     speaker_embedding = np.array(speaker_embedding)[None, :]  # [1 x embedding_dim]
                 else:
                     # get speaker idx from the speaker name
-                    speaker_id = self.tts_model.speaker_manager.ids[speaker_name]
+                    speaker_id = self.tts_model.speaker_manager.name_to_id[speaker_name]

         elif not speaker_name and not speaker_wav:
             raise ValueError(

@@ -243,8 +248,12 @@ class Synthesizer(object):
         if self.tts_languages_file or (
             hasattr(self.tts_model, "language_manager") and self.tts_model.language_manager is not None
         ):
-            if language_name and isinstance(language_name, str):
-                language_id = self.tts_model.language_manager.ids[language_name]
+            if len(self.tts_model.language_manager.name_to_id) == 1:
+                language_id = list(self.tts_model.language_manager.name_to_id.values())[0]
+
+            elif language_name and isinstance(language_name, str):
+                language_id = self.tts_model.language_manager.name_to_id[language_name]
+
         elif not language_name:
             raise ValueError(

@@ -316,7 +325,7 @@ class Synthesizer(object):
         # get the speaker embedding or speaker id for the reference wav file
         reference_speaker_embedding = None
         reference_speaker_id = None
-        if self.tts_speakers_file or hasattr(self.tts_model.speaker_manager, "ids"):
+        if self.tts_speakers_file or hasattr(self.tts_model.speaker_manager, "name_to_id"):
             if reference_speaker_name and isinstance(reference_speaker_name, str):
                 if self.tts_config.use_d_vector_file:
                     # get the speaker embedding from the saved d_vectors.

@@ -328,12 +337,11 @@ class Synthesizer(object):
                 ]  # [1 x embedding_dim]
                 else:
                     # get speaker idx from the speaker name
-                    reference_speaker_id = self.tts_model.speaker_manager.ids[reference_speaker_name]
+                    reference_speaker_id = self.tts_model.speaker_manager.name_to_id[reference_speaker_name]
             else:
                 reference_speaker_embedding = self.tts_model.speaker_manager.compute_embedding_from_clip(
                     reference_wav
                 )

         outputs = transfer_voice(
             model=self.tts_model,
             CONFIG=self.tts_config,

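The `Synthesizer` hunks above rename the speaker and language lookup tables from `ids` to `name_to_id` and add a fast path for single-speaker models (e.g. converted Neon checkpoints). A hedged sketch of the lookup logic the new code performs; `tts_model` and `speaker_name` stand in for the synthesizer's own attributes and are assumptions, not part of this patch.

```python
# Sketch of the new speaker selection, assuming a loaded model with a speaker_manager.
name_to_id = tts_model.speaker_manager.name_to_id  # e.g. {"p225": 0, "p226": 1}
if len(name_to_id) == 1:
    # single-speaker model: use the only id and ignore the requested name
    speaker_id = next(iter(name_to_id.values()))
elif speaker_name:
    speaker_id = name_to_id[speaker_name]
```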
@@ -1,3 +1,4 @@
+import soundfile as sf
 import torch
 import torchaudio

@@ -48,7 +49,7 @@ def remove_silence(
 ):

     # get the VAD model and utils functions
-    model, get_speech_timestamps, save_audio, collect_chunks = model_and_utils
+    model, get_speech_timestamps, _, collect_chunks = model_and_utils

     # read ground truth wav and resample the audio for the VAD
     wav, gt_sample_rate = read_audio(audio_path)

@@ -73,9 +74,11 @@ def remove_silence(
     # if have speech timestamps else save the wav
     if new_speech_timestamps:
         wav = collect_chunks(new_speech_timestamps, wav)
+        is_speech = True
     else:
         print(f"> The file {audio_path} probably does not have speech please check it !!")
+        is_speech = False

     # save audio
-    save_audio(out_path, wav, sampling_rate=gt_sample_rate)
-    return out_path
+    sf.write(out_path, wav, gt_sample_rate, subtype="PCM_16")
+    return out_path, is_speech

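One note on the VAD hunks above: `remove_silence` now writes the trimmed audio with `soundfile` and returns a `(path, is_speech)` tuple instead of a bare path, so callers can tell when the VAD found no speech at all. A hedged sketch of handling the new return value; the argument list is abbreviated to what the hunk shows and may not be complete.

```python
# model_and_utils, audio_path and out_path as prepared by the caller (see the hunk above).
out_path, is_speech = remove_silence(model_and_utils, audio_path, out_path)
if not is_speech:
    print(f"Skipping {audio_path}: the VAD found no speech in this file.")
```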
@@ -22,14 +22,12 @@ class HifiganConfig(BaseGANVocoderConfig):
         generator_model_params (dict): Parameters of the generator model. Defaults to
             `
             {
-                "use_mel": True,
-                "sample_rate": 22050,
-                "n_fft": 1024,
-                "hop_length": 256,
-                "win_length": 1024,
-                "n_mels": 80,
-                "mel_fmin": 0.0,
-                "mel_fmax": None,
+                "upsample_factors": [8, 8, 2, 2],
+                "upsample_kernel_sizes": [16, 16, 4, 4],
+                "upsample_initial_channel": 512,
+                "resblock_kernel_sizes": [3, 7, 11],
+                "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
+                "resblock_type": "1",
             }
             `
         batch_size (int):

@@ -231,6 +231,7 @@ class GAN(BaseVocoder):
         config: Coqpit,
         checkpoint_path: str,
         eval: bool = False,  # pylint: disable=unused-argument, redefined-builtin
+        cache: bool = False,
     ) -> None:
         """Load a GAN checkpoint and initialize model parameters.

@@ -239,7 +240,7 @@ class GAN(BaseVocoder):
             checkpoint_path (str): Checkpoint file path.
             eval (bool, optional): If true, load the model for inference. If falseDefaults to False.
         """
-        state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"))
+        state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
         # band-aid for older than v0.0.15 GAN models
         if "model_disc" in state:
             self.model_g.load_checkpoint(config, checkpoint_path, eval)

@@ -290,9 +290,9 @@ class HifiganGenerator(torch.nn.Module):
         remove_weight_norm(self.conv_post)

     def load_checkpoint(
-        self, config, checkpoint_path, eval=False
+        self, config, checkpoint_path, eval=False, cache=False
     ):  # pylint: disable=unused-argument, redefined-builtin
-        state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"))
+        state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
         self.load_state_dict(state["model"])
         if eval:
             self.eval()

@@ -85,9 +85,9 @@ class MelganGenerator(nn.Module):
         layer.remove_weight_norm()

     def load_checkpoint(
-        self, config, checkpoint_path, eval=False
+        self, config, checkpoint_path, eval=False, cache=False
     ):  # pylint: disable=unused-argument, redefined-builtin
-        state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"))
+        state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
         self.load_state_dict(state["model"])
         if eval:
             self.eval()

@@ -153,9 +153,9 @@ class ParallelWaveganGenerator(torch.nn.Module):
         return self._get_receptive_field_size(self.layers, self.stacks, self.kernel_size)

     def load_checkpoint(
-        self, config, checkpoint_path, eval=False
+        self, config, checkpoint_path, eval=False, cache=False
     ):  # pylint: disable=unused-argument, redefined-builtin
-        state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"))
+        state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
         self.load_state_dict(state["model"])
         if eval:
             self.eval()

@@ -218,9 +218,9 @@ class Wavegrad(BaseVocoder):
         self.y_conv = weight_norm(self.y_conv)

     def load_checkpoint(
-        self, config, checkpoint_path, eval=False
+        self, config, checkpoint_path, eval=False, cache=False
     ):  # pylint: disable=unused-argument, redefined-builtin
-        state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"))
+        state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
         self.load_state_dict(state["model"])
         if eval:
             self.eval()

@@ -542,9 +542,9 @@ class Wavernn(BaseVocoder):
         return unfolded

     def load_checkpoint(
-        self, config, checkpoint_path, eval=False
+        self, config, checkpoint_path, eval=False, cache=False
     ):  # pylint: disable=unused-argument, redefined-builtin
-        state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"))
+        state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
         self.load_state_dict(state["model"])
         if eval:
             self.eval()

@@ -0,0 +1,56 @@
+(docker_images)=
+## Docker images
+We provide docker images to be able to test TTS without having to setup your own environment.
+
+### Using premade images
+You can use premade images built automatically from the latest TTS version.
+
+#### CPU version
+```bash
+docker pull ghcr.io/coqui-ai/tts-cpu
+```
+#### GPU version
+```bash
+docker pull ghcr.io/coqui-ai/tts
+```
+
+### Building your own image
+```bash
+docker build -t tts .
+```
+
+## Basic inference
+Basic usage: generating an audio file from a text passed as argument.
+You can pass any tts argument after the image name.
+
+### CPU version
+```bash
+docker run --rm -v ~/tts-output:/root/tts-output ghcr.io/coqui-ai/tts-cpu --text "Hello." --out_path /root/tts-output/hello.wav
+```
+### GPU version
+For the GPU version, you need to have the latest NVIDIA drivers installed.
+With `nvidia-smi` you can check the CUDA version supported, it must be >= 11.8
+
+```bash
+docker run --rm --gpus all -v ~/tts-output:/root/tts-output ghcr.io/coqui-ai/tts --text "Hello." --out_path /root/tts-output/hello.wav --use_cuda true
+```
+
+## Start a server
+Starting a TTS server:
+Start the container and get a shell inside it.
+
+### CPU version
+```bash
+docker run --rm -it -p 5002:5002 --entrypoint /bin/bash ghcr.io/coqui-ai/tts-cpu
+python3 TTS/server/server.py --list_models #To get the list of available models
+python3 TTS/server/server.py --model_name tts_models/en/vctk/vits
+```
+
+### GPU version
+```bash
+docker run --rm -it -p 5002:5002 --gpus all --entrypoint /bin/bash ghcr.io/coqui-ai/tts
+python3 TTS/server/server.py --list_models #To get the list of available models
+python3 TTS/server/server.py --model_name tts_models/en/vctk/vits --use_cuda true
+```
+
+Click [there](http://[::1]:5002/) and have fun with the server!
@@ -53,7 +53,7 @@ We tried to collect common issues and questions we receive about 🐸TTS. It is
         "mixed_precision": false,
         "output_path": "recipes/ljspeech/glow_tts/",
         "test_sentences": ["Test this sentence.", "This test sentence.", "Sentence this test."],
-        "datasets":[{"name": "ljspeech", "meta_file_train":"metadata.csv", "path": "recipes/ljspeech/LJSpeech-1.1/"}]
+        "datasets":[{"formatter": "ljspeech", "meta_file_train":"metadata.csv", "path": "recipes/ljspeech/LJSpeech-1.1/"}]
     }
     ```

@@ -88,7 +88,7 @@ from TTS.tts.datasets import load_tts_samples

 # dataset config for one of the pre-defined datasets
 dataset_config = BaseDatasetConfig(
-    name="vctk", meta_file_train="", language="en-us", path="dataset-path")
+    formatter="vctk", meta_file_train="", language="en-us", path="dataset-path")
 )

 # load training samples

@@ -20,6 +20,7 @@
     :caption: Using 🐸TTS

     inference
+    docker_images
     implementing_a_new_model
     training_a_model
     finetuning

@@ -12,7 +12,7 @@ Currently we provide the following pre-configured architectures:

 - **FastPitch:**

-    It uses the same FastSpeech architecture that us conditioned on fundemental frequency (f0) contours with the
+    It uses the same FastSpeech architecture that is conditioned on fundemental frequency (f0) contours with the
     promise of more expressive speech.

 - **SpeedySpeech:**

@@ -84,7 +84,7 @@ We still support running training from CLI like in the old days. The same traini
     "print_eval": true,
     "mixed_precision": false,
     "output_path": "recipes/ljspeech/glow_tts/",
-    "datasets":[{"name": "ljspeech", "meta_file_train":"metadata.csv", "path": "recipes/ljspeech/LJSpeech-1.1/"}]
+    "datasets":[{"formatter": "ljspeech", "meta_file_train":"metadata.csv", "path": "recipes/ljspeech/LJSpeech-1.1/"}]
 }
 ```

@@ -120,6 +120,3 @@ $ tts-server -h # see the help
 $ tts-server --list_models # list the available models.
 ```
 
-
-
-

@@ -74,7 +74,7 @@
     "<span style=\"color:purple;font-size:15px\">\n",
     "/MyTTSDataset <br /> \n",
     " | <br /> \n",
-    " | -> metadata.txt<br /> \n",
+    " | -> metadata.csv<br /> \n",
     " | -> /wavs<br /> \n",
     "&emsp;| -> audio1.wav<br /> \n",
     "&emsp;| -> audio2.wav<br /> \n",

@@ -15,7 +15,7 @@ output_path = os.path.dirname(os.path.abspath(__file__))
 data_path = "/srv/data/"

 # Using LJSpeech like dataset processing for the blizzard dataset
-dataset_config = BaseDatasetConfig(name="ljspeech", meta_file_train="metadata.csv", path=data_path)
+dataset_config = BaseDatasetConfig(formatter="ljspeech", meta_file_train="metadata.csv", path=data_path)

 audio_config = BaseAudioConfig(
     sample_rate=24000,

@@ -16,7 +16,7 @@ data_path = "/srv/data/blizzard2013/segmented"

 # Using LJSpeech like dataset processing for the blizzard dataset
 dataset_config = BaseDatasetConfig(
-    name="ljspeech",
+    formatter="ljspeech",
     meta_file_train="metadata.csv",
     path=data_path,
 )

@@ -1,7 +1,7 @@
 {
     "datasets": [
         {
-            "name": "kokoro",
+            "formatter": "kokoro",
             "path": "DEFINE THIS",
             "meta_file_train": "metadata.csv",
             "meta_file_val": null

@@ -13,7 +13,7 @@ output_path = os.path.dirname(os.path.abspath(__file__))

 # init configs
 dataset_config = BaseDatasetConfig(
-    name="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/")
+    formatter="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/")
 )
 config = AlignTTSConfig(
     batch_size=32,

@@ -14,7 +14,7 @@ output_path = os.path.dirname(os.path.abspath(__file__))

 # init configs
 dataset_config = BaseDatasetConfig(
-    name="ljspeech",
+    formatter="ljspeech",
     meta_file_train="metadata.csv",
     # meta_file_attn_mask=os.path.join(output_path, "../LJSpeech-1.1/metadata_attn_mask.txt"),
     path=os.path.join(output_path, "../LJSpeech-1.1/"),

@@ -14,7 +14,7 @@ output_path = os.path.dirname(os.path.abspath(__file__))

 # init configs
 dataset_config = BaseDatasetConfig(
-    name="ljspeech",
+    formatter="ljspeech",
     meta_file_train="metadata.csv",
     # meta_file_attn_mask=os.path.join(output_path, "../LJSpeech-1.1/metadata_attn_mask.txt"),
     path=os.path.join(output_path, "../LJSpeech-1.1/"),

@@ -21,7 +21,7 @@ output_path = os.path.dirname(os.path.abspath(__file__))
 # Set LJSpeech as our target dataset and define its path.
 # You can also use a simple Dict to define the dataset and pass it to your custom formatter.
 dataset_config = BaseDatasetConfig(
-    name="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/")
+    formatter="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/")
 )

 # INITIALIZE THE TRAINING CONFIGURATION

@@ -11,7 +11,7 @@ from TTS.utils.audio import AudioProcessor

 output_path = os.path.dirname(os.path.abspath(__file__))
 dataset_config = BaseDatasetConfig(
-    name="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/")
+    formatter="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/")
 )

 audio_config = BaseAudioConfig(

@@ -16,7 +16,7 @@ data_path = "/srv/data/"

 # Using LJSpeech like dataset processing for the blizzard dataset
 dataset_config = BaseDatasetConfig(
-    name="ljspeech",
+    formatter="ljspeech",
     meta_file_train="metadata.csv",
     path=data_path,
 )

@@ -16,7 +16,7 @@ output_path = os.path.dirname(os.path.abspath(__file__))

 # init configs
 dataset_config = BaseDatasetConfig(
-    name="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/")
+    formatter="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/")
 )

 audio_config = BaseAudioConfig(

@@ -16,7 +16,7 @@ output_path = os.path.dirname(os.path.abspath(__file__))

 # init configs
 dataset_config = BaseDatasetConfig(
-    name="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/")
+    formatter="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/")
 )

 audio_config = BaseAudioConfig(
     sample_rate=22050,

@@ -11,7 +11,7 @@ from TTS.utils.audio import AudioProcessor

 output_path = os.path.dirname(os.path.abspath(__file__))
 dataset_config = BaseDatasetConfig(
-    name="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/")
+    formatter="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/")
 )
 audio_config = VitsAudioConfig(
     sample_rate=22050, win_length=1024, hop_length=256, num_mels=80, mel_fmin=0, mel_fmax=None

@@ -17,7 +17,7 @@ output_path = os.path.dirname(os.path.abspath(__file__))
 mailabs_path = "/home/julian/workspace/mailabs/**"
 dataset_paths = glob(mailabs_path)
 dataset_config = [
-    BaseDatasetConfig(name="mailabs", meta_file_train=None, path=path, language=path.split("/")[-1])
+    BaseDatasetConfig(formatter="mailabs", meta_file_train=None, path=path, language=path.split("/")[-1])
     for path in dataset_paths
 ]


@@ -14,7 +14,7 @@ output_path = os.path.dirname(os.path.abspath(__file__))

 # init configs
 dataset_config = BaseDatasetConfig(
-    name="thorsten", meta_file_train="metadata.csv", path=os.path.join(output_path, "../thorsten-de/")
+    formatter="thorsten", meta_file_train="metadata.csv", path=os.path.join(output_path, "../thorsten-de/")
 )

 # download dataset if not already present

@@ -22,7 +22,7 @@ output_path = os.path.dirname(os.path.abspath(__file__))
 # Set LJSpeech as our target dataset and define its path.
 # You can also use a simple Dict to define the dataset and pass it to your custom formatter.
 dataset_config = BaseDatasetConfig(
-    name="thorsten", meta_file_train="metadata.csv", path=os.path.join(output_path, "../thorsten-de/")
+    formatter="thorsten", meta_file_train="metadata.csv", path=os.path.join(output_path, "../thorsten-de/")
 )

 # download dataset if not already present

@@ -12,7 +12,7 @@ from TTS.utils.downloaders import download_thorsten_de

 output_path = os.path.dirname(os.path.abspath(__file__))
 dataset_config = BaseDatasetConfig(
-    name="thorsten", meta_file_train="metadata.csv", path=os.path.join(output_path, "../thorsten-de/")
+    formatter="thorsten", meta_file_train="metadata.csv", path=os.path.join(output_path, "../thorsten-de/")
 )

 # download dataset if not already present

@@ -16,7 +16,7 @@ output_path = os.path.dirname(os.path.abspath(__file__))

 # init configs
 dataset_config = BaseDatasetConfig(
-    name="thorsten", meta_file_train="metadata.csv", path=os.path.join(output_path, "../thorsten-de/")
+    formatter="thorsten", meta_file_train="metadata.csv", path=os.path.join(output_path, "../thorsten-de/")
 )

 # download dataset if not already present

@@ -12,7 +12,7 @@ from TTS.utils.downloaders import download_thorsten_de

 output_path = os.path.dirname(os.path.abspath(__file__))
 dataset_config = BaseDatasetConfig(
-    name="thorsten", meta_file_train="metadata.csv", path=os.path.join(output_path, "../thorsten-de/")
+    formatter="thorsten", meta_file_train="metadata.csv", path=os.path.join(output_path, "../thorsten-de/")
 )

 # download dataset if not already present

@@ -11,7 +11,7 @@ from TTS.tts.utils.text.tokenizer import TTSTokenizer
 from TTS.utils.audio import AudioProcessor

 output_path = os.path.dirname(os.path.abspath(__file__))
-dataset_config = BaseDatasetConfig(name="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/"))
+dataset_config = BaseDatasetConfig(formatter="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/"))

 audio_config = BaseAudioConfig(
     sample_rate=22050,

@@ -11,7 +11,7 @@ from TTS.tts.utils.text.tokenizer import TTSTokenizer
 from TTS.utils.audio import AudioProcessor

 output_path = os.path.dirname(os.path.abspath(__file__))
-dataset_config = BaseDatasetConfig(name="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/"))
+dataset_config = BaseDatasetConfig(formatter="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/"))

 audio_config = BaseAudioConfig(
     sample_rate=22050,

@@ -22,7 +22,7 @@ if not os.path.exists(dataset_path):
     download_vctk(dataset_path)

 # define dataset config
-dataset_config = BaseDatasetConfig(name="vctk", meta_file_train="", path=dataset_path)
+dataset_config = BaseDatasetConfig(formatter="vctk", meta_file_train="", path=dataset_path)

 # define audio config
 # ❗ resample the dataset externally using `TTS/bin/resample.py` and set `resample=False` for faster training

@@ -0,0 +1,139 @@
+import os
+
+from TTS.encoder.configs.speaker_encoder_config import SpeakerEncoderConfig
+
+# from TTS.encoder.configs.emotion_encoder_config import EmotionEncoderConfig
+from TTS.tts.configs.shared_configs import BaseDatasetConfig
+
+CURRENT_PATH = os.getcwd()
+# change the root path to the TTS root path
+os.chdir("../../../")
+
+### Definitions ###
+# dataset
+VCTK_PATH = "/raid/datasets/VCTK_NEW_16khz_removed_silence_silero_vad/"  # download: https://datashare.ed.ac.uk/bitstream/handle/10283/3443/VCTK-Corpus-0.92.zipdddddddddd
+RIR_SIMULATED_PATH = "/raid/datasets/DA/RIRS_NOISES/simulated_rirs/"  # download: https://www.openslr.org/17/
+MUSAN_PATH = "/raid/datasets/DA/musan/"  # download: https://www.openslr.org/17/
+
+# training
+OUTPUT_PATH = os.path.join(
+    CURRENT_PATH, "resnet_speaker_encoder_training_output/"
+)  # path to save the train logs and checkpoint
+CONFIG_OUT_PATH = os.path.join(OUTPUT_PATH, "config_se.json")
+RESTORE_PATH = None  # Checkpoint to use for transfer learning if None ignore
+
+# instance the config
+# to speaker encoder
+config = SpeakerEncoderConfig()
+# to emotion encoder
+# config = EmotionEncoderConfig()
+
+
+#### DATASET CONFIG ####
+# The formatter need to return the key "speaker_name" for the speaker encoder and the "emotion_name" for the emotion encoder
+dataset_config = BaseDatasetConfig(formatter="vctk", meta_file_train="", language="en-us", path=VCTK_PATH)
+
+# add the dataset to the config
+config.datasets = [dataset_config]
+
+
+#### TRAINING CONFIG ####
+# The encoder data loader balancer the dataset item equally to guarantee better training and to attend the losses requirements
+# It have two parameters to control the final batch size the number total of speaker used in each batch and the number of samples for each speaker
+
+# number total of speaker in batch in training
+config.num_classes_in_batch = 100
+# number of utterance per class/speaker in the batch in training
+config.num_utter_per_class = 4
+# final batch size = config.num_classes_in_batch * config.num_utter_per_class
+
+# number total of speaker in batch in evaluation
+config.eval_num_classes_in_batch = 100
+# number of utterance per class/speaker in the batch in evaluation
+config.eval_num_utter_per_class = 4
+
+# number of data loader workers
+config.num_loader_workers = 8
+config.num_val_loader_workers = 8
+
+# number of epochs
+config.epochs = 10000
+# loss to be used in training
+config.loss = "softmaxproto"
+
+# run eval
+config.run_eval = False
+
+# output path for the checkpoints
+config.output_path = OUTPUT_PATH
+
+# Save local checkpoint every save_step steps
+config.save_step = 2000
+
+### Model Config ###
+config.model_params = {
+    "model_name": "resnet",  # supported "lstm" and "resnet"
+    "input_dim": 64,
+    "use_torch_spec": True,
+    "log_input": True,
+    "proj_dim": 512,  # embedding dim
+}
+
+### Audio Config ###
+# To fast train the model divides the audio in small parts. it parameter defines the length in seconds of these "parts"
+config.voice_len = 2.0
+# all others configs
+config.audio = {
+    "fft_size": 512,
+    "win_length": 400,
+    "hop_length": 160,
+    "frame_shift_ms": None,
+    "frame_length_ms": None,
+    "stft_pad_mode": "reflect",
+    "sample_rate": 16000,
+    "resample": False,
+    "preemphasis": 0.97,
+    "ref_level_db": 20,
+    "do_sound_norm": False,
+    "do_trim_silence": False,
+    "trim_db": 60,
+    "power": 1.5,
+    "griffin_lim_iters": 60,
+    "num_mels": 64,
+    "mel_fmin": 0.0,
+    "mel_fmax": 8000.0,
+    "spec_gain": 20,
+    "signal_norm": False,
+    "min_level_db": -100,
+    "symmetric_norm": False,
+    "max_norm": 4.0,
+    "clip_norm": False,
+    "stats_path": None,
+    "do_rms_norm": True,
+    "db_level": -27.0,
+}
+
+
+### Augmentation Config ###
+config.audio_augmentation = {
+    # additive noise and room impulse response (RIR) simulation similar to: https://arxiv.org/pdf/2009.14153.pdf
+    "p": 0.5,  # probability to the use of one of the augmentation - 0 means disabled
+    "rir": {"rir_path": RIR_SIMULATED_PATH, "conv_mode": "full"},  # download: https://www.openslr.org/17/
+    "additive": {
+        "sounds_path": MUSAN_PATH,
+        "speech": {"min_snr_in_db": 13, "max_snr_in_db": 20, "min_num_noises": 1, "max_num_noises": 1},
+        "noise": {"min_snr_in_db": 0, "max_snr_in_db": 15, "min_num_noises": 1, "max_num_noises": 1},
+        "music": {"min_snr_in_db": 5, "max_snr_in_db": 15, "min_num_noises": 1, "max_num_noises": 1},
+    },
+    "gaussian": {"p": 0.7, "min_amplitude": 0.0, "max_amplitude": 1e-05},
+}
+
+config.save_json(CONFIG_OUT_PATH)
+
+print(CONFIG_OUT_PATH)
+if RESTORE_PATH is not None:
+    command = f"python TTS/bin/train_encoder.py --config_path {CONFIG_OUT_PATH} --restore_path {RESTORE_PATH}"
+else:
+    command = f"python TTS/bin/train_encoder.py --config_path {CONFIG_OUT_PATH}"
+
+os.system(command)
@@ -11,7 +11,7 @@ from TTS.tts.utils.text.tokenizer import TTSTokenizer
 from TTS.utils.audio import AudioProcessor

 output_path = os.path.dirname(os.path.abspath(__file__))
-dataset_config = BaseDatasetConfig(name="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/"))
+dataset_config = BaseDatasetConfig(formatter="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/"))

 audio_config = BaseAudioConfig(
     sample_rate=22050,

@@ -12,7 +12,7 @@ from TTS.tts.utils.text.tokenizer import TTSTokenizer
 from TTS.utils.audio import AudioProcessor

 output_path = os.path.dirname(os.path.abspath(__file__))
-dataset_config = BaseDatasetConfig(name="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/"))
+dataset_config = BaseDatasetConfig(formatter="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/"))

 audio_config = BaseAudioConfig(
     sample_rate=22050,

@@ -12,7 +12,7 @@ from TTS.tts.utils.text.tokenizer import TTSTokenizer
 from TTS.utils.audio import AudioProcessor

 output_path = os.path.dirname(os.path.abspath(__file__))
-dataset_config = BaseDatasetConfig(name="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/"))
+dataset_config = BaseDatasetConfig(formatter="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/"))

 audio_config = BaseAudioConfig(
     sample_rate=22050,

@@ -12,7 +12,7 @@ from TTS.tts.utils.text.tokenizer import TTSTokenizer
 from TTS.utils.audio import AudioProcessor

 output_path = os.path.dirname(os.path.abspath(__file__))
-dataset_config = BaseDatasetConfig(name="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/"))
+dataset_config = BaseDatasetConfig(formatter="vctk", meta_file_train="", path=os.path.join(output_path, "../VCTK/"))

 audio_config = BaseAudioConfig(
     sample_rate=22050,

@@ -12,7 +12,7 @@ from TTS.utils.audio import AudioProcessor

 output_path = os.path.dirname(os.path.abspath(__file__))
 dataset_config = BaseDatasetConfig(
-    name="vctk", meta_file_train="", language="en-us", path=os.path.join(output_path, "../VCTK/")
+    formatter="vctk", meta_file_train="", language="en-us", path=os.path.join(output_path, "../VCTK/")
 )

@@ -23,7 +23,6 @@ umap-learn==0.5.1
 pandas
 # deps for training
 matplotlib
-pyworld==0.2.10 # > 0.2.10 is not p3.10.x compatible
 # coqui stack
 trainer
 # config management

@@ -35,4 +34,8 @@ pypinyin
 mecab-python3==1.0.5
 unidic-lite==1.0.8
 # gruut+supported langs
-gruut[cs,de,es,fr,it,nl,pt,ru,sv]==2.2.3
+gruut[de]==2.2.3
+# deps for korean
+jamo
+nltk
+g2pkk>=0.1.1
@@ -33,7 +33,9 @@ def get_tests_data_path():
 
 def get_tests_output_path():
     """Returns the path to the directory for test outputs."""
-    return os.path.join(get_tests_path(), "outputs")
+    path = os.path.join(get_tests_path(), "outputs")
+    os.makedirs(path, exist_ok=True)
+    return path
 
 
 def run_cli(command):
@@ -42,7 +44,7 @@ def run_cli(command):
 
 
 def get_test_data_config():
-    return BaseDatasetConfig(name="ljspeech", path="tests/data/ljspeech/", meta_file_train="metadata.csv")
+    return BaseDatasetConfig(formatter="ljspeech", path="tests/data/ljspeech/", meta_file_train="metadata.csv")
 
 
 def assertHasAttr(test_obj, obj, intendedAttr):
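Note (illustration, not part of the diff): with this change `get_tests_output_path()` creates the output directory on first use, so callers no longer need their own `os.makedirs` for the base folder. A minimal usage sketch, assuming the helpers live in `tests/__init__.py` as the hunk header suggests:

# Sketch: the helper now returns a directory that is guaranteed to exist.
from tests import get_tests_output_path

out_dir = get_tests_output_path()  # e.g. <repo>/tests/outputs, created if missing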
@@ -10,7 +10,7 @@ OUT_PATH = os.path.join(get_tests_output_path(), "audio_tests")
 WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav")
 
 os.makedirs(OUT_PATH, exist_ok=True)
-conf = BaseAudioConfig(mel_fmax=8000)
+conf = BaseAudioConfig(mel_fmax=8000, pitch_fmax=640, pitch_fmin=1)
 
 
 # pylint: disable=protected-access
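Note (illustration, not part of the diff): the new `pitch_fmin=1` / `pitch_fmax=640` values bound the F0 range used by the audio processor. A hedged sketch of how they would be exercised, assuming `AudioProcessor.compute_f0` takes a mono waveform as elsewhere in the test suite; the wav path is a placeholder:

# Sketch only: pitch_fmin/pitch_fmax act as the extraction bounds for compute_f0.
from TTS.config.shared_configs import BaseAudioConfig
from TTS.utils.audio import AudioProcessor

conf = BaseAudioConfig(mel_fmax=8000, pitch_fmax=640, pitch_fmin=1)
ap = AudioProcessor(**conf)
wav = ap.load_wav("example_1.wav")  # any mono wav; path is illustrative
pitch = ap.compute_f0(wav)          # F0 contour, constrained by the configured pitch bounds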
@@ -0,0 +1,92 @@
+import os
+import unittest
+
+import numpy as np
+import torch
+
+from tests import get_tests_input_path
+from TTS.config import load_config
+from TTS.encoder.utils.generic_utils import setup_encoder_model
+from TTS.encoder.utils.io import save_checkpoint
+from TTS.tts.utils.managers import EmbeddingManager
+from TTS.utils.audio import AudioProcessor
+
+encoder_config_path = os.path.join(get_tests_input_path(), "test_speaker_encoder_config.json")
+encoder_model_path = os.path.join(get_tests_input_path(), "checkpoint_0.pth")
+sample_wav_path = os.path.join(get_tests_input_path(), "../data/ljspeech/wavs/LJ001-0001.wav")
+sample_wav_path2 = os.path.join(get_tests_input_path(), "../data/ljspeech/wavs/LJ001-0002.wav")
+embedding_file_path = os.path.join(get_tests_input_path(), "../data/dummy_speakers.json")
+embeddings_file_path2 = os.path.join(get_tests_input_path(), "../data/dummy_speakers2.json")
+embeddings_file_pth_path = os.path.join(get_tests_input_path(), "../data/dummy_speakers.pth")
+
+
+class EmbeddingManagerTest(unittest.TestCase):
+    """Test EmbeddingManager for loading embedding files and computing embeddings from waveforms"""
+
+    @staticmethod
+    def test_speaker_embedding():
+        # load config
+        config = load_config(encoder_config_path)
+        config.audio.resample = True
+
+        # create a dummy speaker encoder
+        model = setup_encoder_model(config)
+        save_checkpoint(model, None, None, get_tests_input_path(), 0)
+
+        # load audio processor and speaker encoder
+        manager = EmbeddingManager(encoder_model_path=encoder_model_path, encoder_config_path=encoder_config_path)
+
+        # load a sample audio and compute embedding
+        ap = AudioProcessor(**config.audio)
+        waveform = ap.load_wav(sample_wav_path)
+        mel = ap.melspectrogram(waveform)
+        embedding = manager.compute_embeddings(mel)
+        assert embedding.shape[1] == 256
+
+        # compute embedding directly from an input file
+        embedding = manager.compute_embedding_from_clip(sample_wav_path)
+        embedding2 = manager.compute_embedding_from_clip(sample_wav_path)
+        embedding = torch.FloatTensor(embedding)
+        embedding2 = torch.FloatTensor(embedding2)
+        assert embedding.shape[0] == 256
+        assert (embedding - embedding2).sum() == 0.0
+
+        # compute embedding from a list of wav files.
+        embedding3 = manager.compute_embedding_from_clip([sample_wav_path, sample_wav_path2])
+        embedding3 = torch.FloatTensor(embedding3)
+        assert embedding3.shape[0] == 256
+        assert (embedding - embedding3).sum() != 0.0
+
+        # remove dummy model
+        os.remove(encoder_model_path)
+
+    def test_embedding_file_processing(self):  # pylint: disable=no-self-use
+        manager = EmbeddingManager(embedding_file_path=embeddings_file_pth_path)
+        # test embedding querying
+        embedding = manager.get_embedding_by_clip(manager.clip_ids[0])
+        assert len(embedding) == 256
+        embeddings = manager.get_embeddings_by_name(manager.embedding_names[0])
+        assert len(embeddings[0]) == 256
+        embedding1 = manager.get_mean_embedding(manager.embedding_names[0], num_samples=2, randomize=True)
+        assert len(embedding1) == 256
+        embedding2 = manager.get_mean_embedding(manager.embedding_names[0], num_samples=2, randomize=False)
+        assert len(embedding2) == 256
+        assert np.sum(np.array(embedding1) - np.array(embedding2)) != 0
+
+    def test_embedding_file_loading(self):
+        # test loading a json file
+        manager = EmbeddingManager(embedding_file_path=embedding_file_path)
+        self.assertEqual(manager.num_embeddings, 384)
+        self.assertEqual(manager.embedding_dim, 256)
+        # test loading a pth file
+        manager = EmbeddingManager(embedding_file_path=embeddings_file_pth_path)
+        self.assertEqual(manager.num_embeddings, 384)
+        self.assertEqual(manager.embedding_dim, 256)
+        # test loading pth files with duplicate embedding keys
+        with self.assertRaises(Exception) as context:
+            manager = EmbeddingManager(embedding_file_path=[embeddings_file_pth_path, embeddings_file_pth_path])
+        self.assertTrue("Duplicate embedding names" in str(context.exception))
+        # test loading embedding files with different embedding keys
+        manager = EmbeddingManager(embedding_file_path=[embeddings_file_pth_path, embeddings_file_path2])
+        self.assertEqual(manager.embedding_dim, 256)
+        self.assertEqual(manager.num_embeddings, 384 * 2)
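Note (illustration, not part of the diff): outside the unit test, the `EmbeddingManager` API added above is driven the same way, e.g. extracting a speaker embedding from a single clip with a trained encoder. A sketch using only calls exercised by the test; paths are placeholders:

# Sketch only: load a trained speaker encoder and embed one utterance.
from TTS.tts.utils.managers import EmbeddingManager

manager = EmbeddingManager(
    encoder_model_path="speaker_encoder.pth",          # trained encoder checkpoint (placeholder)
    encoder_config_path="speaker_encoder_config.json",  # matching config (placeholder)
)
embedding = manager.compute_embedding_from_clip("speaker_sample.wav")  # 1-D vector, 256-dim for this encoder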
@@ -15,7 +15,7 @@ class TestExtractTTSSpectrograms(unittest.TestCase):
     def test_GlowTTS():
         # set paths
         config_path = os.path.join(get_tests_input_path(), "test_glow_tts.json")
-        checkpoint_path = os.path.join(get_tests_output_path(), "checkpoint_test.pth")
+        checkpoint_path = os.path.join(get_tests_output_path(), "glowtts.pth")
         output_path = os.path.join(get_tests_output_path(), "output_extract_tts_spectrograms/")
         # load config
         c = load_config(config_path)
@@ -33,7 +33,7 @@ class TestExtractTTSSpectrograms(unittest.TestCase):
     def test_Tacotron2():
         # set paths
         config_path = os.path.join(get_tests_input_path(), "test_tacotron2_config.json")
-        checkpoint_path = os.path.join(get_tests_output_path(), "checkpoint_test.pth")
+        checkpoint_path = os.path.join(get_tests_output_path(), "tacotron2.pth")
         output_path = os.path.join(get_tests_output_path(), "output_extract_tts_spectrograms/")
         # load config
         c = load_config(config_path)
@@ -51,7 +51,7 @@ class TestExtractTTSSpectrograms(unittest.TestCase):
     def test_Tacotron():
         # set paths
         config_path = os.path.join(get_tests_input_path(), "test_tacotron_config.json")
-        checkpoint_path = os.path.join(get_tests_output_path(), "checkpoint_test.pth")
+        checkpoint_path = os.path.join(get_tests_output_path(), "tacotron.pth")
         output_path = os.path.join(get_tests_output_path(), "output_extract_tts_spectrograms/")
         # load config
         c = load_config(config_path)
@@ -12,20 +12,22 @@ torch.manual_seed(1)
 config_path = os.path.join(get_tests_output_path(), "test_model_config.json")
 
 dataset_config_en = BaseDatasetConfig(
-    name="ljspeech",
+    formatter="ljspeech",
     meta_file_train="metadata.csv",
     meta_file_val="metadata.csv",
     path="tests/data/ljspeech",
     language="en",
 )
 
+"""
 dataset_config_pt = BaseDatasetConfig(
-    name="ljspeech",
+    formatter="ljspeech",
     meta_file_train="metadata.csv",
     meta_file_val="metadata.csv",
     path="tests/data/ljspeech",
     language="pt-br",
 )
+"""
 
 # pylint: disable=protected-access
 class TestFindUniquePhonemes(unittest.TestCase):
@@ -46,7 +48,7 @@ class TestFindUniquePhonemes(unittest.TestCase):
             epochs=1,
             print_step=1,
             print_eval=True,
-            datasets=[dataset_config_en, dataset_config_pt],
+            datasets=[dataset_config_en],
         )
         config.save_json(config_path)
 
@@ -70,7 +72,7 @@ class TestFindUniquePhonemes(unittest.TestCase):
             epochs=1,
             print_step=1,
             print_eval=True,
-            datasets=[dataset_config_en, dataset_config_pt],
+            datasets=[dataset_config_en],
         )
         config.save_json(config_path)
 
@@ -31,7 +31,8 @@ class TestNumpyTransforms(unittest.TestCase):
             mel_fmin: int = 0
             hop_length: int = 256
             win_length: int = 1024
-            pitch_fmax: int = 450
+            pitch_fmax: int = 640
+            pitch_fmin: int = 1
             trim_db: int = -1
             min_silence_sec: float = 0.01
             gain: float = 1.0