Mirror of https://github.com/coqui-ai/TTS.git

Commit b761d488a7: Merge branch 'coqui-ai:main' into main
@@ -18,7 +18,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.7, 3.8, 3.9, "3.10"]
+        python-version: [3.9, "3.10", "3.11"]
         experimental: [false]
     steps:
       - uses: actions/checkout@v3

@@ -18,7 +18,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.7, 3.8, 3.9, "3.10"]
+        python-version: [3.9, "3.10", "3.11"]
         experimental: [false]
     steps:
       - uses: actions/checkout@v3

@@ -18,7 +18,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.7, 3.8, 3.9, "3.10"]
+        python-version: [3.9, "3.10", "3.11"]
         experimental: [false]
     steps:
       - uses: actions/checkout@v3
@@ -21,7 +21,7 @@ jobs:
           fi
       - uses: actions/setup-python@v2
         with:
-          python-version: 3.8
+          python-version: 3.9
       - run: |
           python -m pip install -U pip setuptools wheel build
       - run: |
@@ -36,7 +36,7 @@ jobs:
     runs-on: ubuntu-20.04
     strategy:
       matrix:
-        python-version: ["3.7", "3.8", "3.9", "3.10"]
+        python-version: ["3.9", "3.10", "3.11"]
     steps:
       - uses: actions/checkout@v2
       - uses: actions/setup-python@v2
@@ -64,14 +64,6 @@ jobs:
         with:
          name: "sdist"
          path: "dist/"
-      - uses: actions/download-artifact@v2
-        with:
-          name: "wheel-3.7"
-          path: "dist/"
-      - uses: actions/download-artifact@v2
-        with:
-          name: "wheel-3.8"
-          path: "dist/"
       - uses: actions/download-artifact@v2
         with:
           name: "wheel-3.9"
@@ -80,6 +72,10 @@ jobs:
         with:
           name: "wheel-3.10"
           path: "dist/"
+      - uses: actions/download-artifact@v2
+        with:
+          name: "wheel-3.11"
+          path: "dist/"
       - run: |
           ls -lh dist/
       - name: Setup PyPI config
@@ -91,7 +87,7 @@ jobs:
           EOF
       - uses: actions/setup-python@v2
         with:
-          python-version: 3.8
+          python-version: 3.9
       - run: |
           python -m pip install twine
       - run: |
@@ -42,6 +42,6 @@ jobs:
         run: |
           python3 -m pip install .[all]
           python3 setup.py egg_info
-      - name: Lint check
-        run: |
-          make lint
+      # - name: Lint check
+      #   run: |
+      #     make lint
@@ -18,7 +18,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.7, 3.8, 3.9, "3.10"]
+        python-version: [3.9, "3.10", "3.11"]
         experimental: [false]
     steps:
       - uses: actions/checkout@v3

@@ -18,7 +18,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.7, 3.8, 3.9, "3.10"]
+        python-version: [3.9, "3.10", "3.11"]
         experimental: [false]
     steps:
       - uses: actions/checkout@v3

@@ -18,7 +18,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.7, 3.8, 3.9, "3.10"]
+        python-version: [3.9, "3.10", "3.11"]
         experimental: [false]
     steps:
       - uses: actions/checkout@v3

@@ -18,7 +18,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.7, 3.8, 3.9, "3.10"]
+        python-version: [3.9, "3.10", "3.11"]
         experimental: [false]
     steps:
       - uses: actions/checkout@v3

@@ -18,7 +18,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.7, 3.8, 3.9, "3.10"]
+        python-version: [3.9, "3.10", "3.11"]
         experimental: [false]
     steps:
       - uses: actions/checkout@v3
@@ -43,6 +43,7 @@ jobs:
         run: python3 -m pip install --upgrade pip setuptools wheel
       - name: Replace scarf urls
         run: |
+          sed -i 's/https:\/\/coqui.gateway.scarf.sh\/hf\/bark\//https:\/\/huggingface.co\/erogol\/bark\/resolve\/main\//g' TTS/.models.json
           sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json
       - name: Install TTS
         run: |
@@ -18,7 +18,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.7, 3.8, 3.9, "3.10"]
+        python-version: [3.9, "3.10", "3.11"]
         experimental: [false]
     steps:
       - uses: actions/checkout@v3
@@ -169,7 +169,9 @@ disable=missing-docstring,
         comprehension-escape,
         duplicate-code,
         not-callable,
-        import-outside-toplevel
+        import-outside-toplevel,
+        logging-fstring-interpolation,
+        logging-not-lazy
 
 # Enable the message, report, category or checker with the given id(s). You can
 # either give multiple identifier separated by comma (,) or put this option
@@ -5,14 +5,19 @@
 # Required
 version: 2
 
+# Set the version of Python and other tools you might need
+build:
+  os: ubuntu-22.04
+  tools:
+    python: "3.11"
+
+# Optionally set the version of Python and requirements required to build your docs
+python:
+  install:
+    - requirements: docs/requirements.txt
+    - requirements: requirements.txt
+
 # Build documentation in the docs/ directory with Sphinx
 sphinx:
   builder: html
   configuration: docs/source/conf.py
-
-# Optionally set the version of Python and requirements required to build your docs
-python:
-  version: 3.7
-  install:
-    - requirements: docs/requirements.txt
-    - requirements: requirements.txt
README.md (35 changed lines)
@@ -1,10 +1,14 @@
 
 
 ## 🐸Coqui.ai News
-- 📣 Coqui Studio API is landed on 🐸TTS. You can use the studio voices in combination with 🐸TTS models. [Example](https://github.com/coqui-ai/TTS/blob/dev/README.md#-python-api)
-- 📣 Voice generation with prompts - **Prompt to Voice** - is live on Coqui.ai!! [Blog Post](https://coqui.ai/blog/tts/prompt-to-voice)
-- 📣 Clone your voice with a single click on [🐸Coqui.ai](https://app.coqui.ai/auth/signin)
-<br>
+- 📣 [🐶Bark](https://github.com/suno-ai/bark) is now available for inference with uncontrained voice cloning. [Docs](https://tts.readthedocs.io/en/dev/models/bark.html)
+- 📣 You can use [~1100 Fairseq models](https://github.com/facebookresearch/fairseq/tree/main/examples/mms) with 🐸TTS.
+- 📣 🐸TTS now supports 🐢Tortoise with faster inference. [Docs](https://tts.readthedocs.io/en/dev/models/tortoise.html)
+- 📣 **Coqui Studio API** is landed on 🐸TTS. - [Example](https://github.com/coqui-ai/TTS/blob/dev/README.md#-python-api)
+- 📣 [**Coqui Studio API**](https://docs.coqui.ai/docs) is live.
+- 📣 Voice generation with prompts - **Prompt to Voice** - is live on [**Coqui Studio**](https://app.coqui.ai/auth/signin)!! - [Blog Post](https://coqui.ai/blog/tts/prompt-to-voice)
+- 📣 Voice generation with fusion - **Voice fusion** - is live on [**Coqui Studio**](https://app.coqui.ai/auth/signin).
+- 📣 Voice cloning is live on [**Coqui Studio**](https://app.coqui.ai/auth/signin).
 
 ## <img src="https://raw.githubusercontent.com/coqui-ai/TTS/main/images/coqui-log-green-TTS.png" height="56"/>
 
@@ -185,7 +189,9 @@ from TTS.api import TTS
 model_name = TTS.list_models()[0]
 # Init TTS
 tts = TTS(model_name)
+
 # Run TTS
+
 # ❗ Since this model is multi-speaker and multi-lingual, we must set the target speaker and the language
 # Text to speech with a numpy output
 wav = tts.tts("This is a test! This is also a test!!", speaker=tts.speakers[0], language=tts.languages[0])
@@ -199,7 +205,8 @@ tts = TTS(model_name="tts_models/de/thorsten/tacotron2-DDC", progress_bar=False,
 # Run TTS
 tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path=OUTPUT_PATH)
 
-# Example voice cloning with YourTTS in English, French and Portuguese:
+# Example voice cloning with YourTTS in English, French and Portuguese
+
 tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False, gpu=True)
 tts.tts_to_file("This is voice cloning.", speaker_wav="my/cloning/audio.wav", language="en", file_path="output.wav")
 tts.tts_to_file("C'est le clonage de la voix.", speaker_wav="my/cloning/audio.wav", language="fr-fr", file_path="output.wav")
@@ -221,7 +228,9 @@ tts.tts_with_vc_to_file(
     file_path="ouptut.wav"
 )
 
-# Example text to speech using [🐸Coqui Studio](https://coqui.ai) models. You can use all of your available speakers in the studio.
+# Example text to speech using [🐸Coqui Studio](https://coqui.ai) models.
+
+# You can use all of your available speakers in the studio.
 # [🐸Coqui Studio](https://coqui.ai) API token is required. You can get it from the [account page](https://coqui.ai/account).
 # You should set the `COQUI_STUDIO_TOKEN` environment variable to use the API token.
 
@@ -234,6 +243,20 @@ tts = TTS(model_name="coqui_studio/en/Torcull Diarmuid/coqui_studio", progress_b
 tts.tts_to_file(text="This is a test.", file_path=OUTPUT_PATH)
 # Run TTS with emotion and speed control
 tts.tts_to_file(text="This is a test.", file_path=OUTPUT_PATH, emotion="Happy", speed=1.5)
+
+
+#Example text to speech using **Fairseq models in ~1100 languages** 🤯.
+
+#For these models use the following name format: `tts_models/<lang-iso_code>/fairseq/vits`.
+#You can find the list of language ISO codes [here](https://dl.fbaipublicfiles.com/mms/tts/all-tts-languages.html) and learn about the Fairseq models [here](https://github.com/facebookresearch/fairseq/tree/main/examples/mms).
+
+# TTS with on the fly voice conversion
+api = TTS("tts_models/deu/fairseq/vits")
+api.tts_with_vc_to_file(
+    "Wie sage ich auf Italienisch, dass ich dich liebe?",
+    speaker_wav="target/speaker.wav",
+    file_path="ouptut.wav"
+)
 ```
 
 ### Command line `tts`
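A minimal sketch that ties the Studio notes above together (mine, not part of the commit; the token string is a placeholder):

```python
import os

from TTS.api import TTS

# The Studio wrapper reads the API token from this environment variable,
# as described in the README section above.
os.environ["COQUI_STUDIO_TOKEN"] = "<your-account-token>"

tts = TTS(model_name="coqui_studio/en/Torcull Diarmuid/coqui_studio", progress_bar=False)
tts.tts_to_file(text="This is a test.", file_path="studio_output.wav")
```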
TTS/.models.json (159 changed lines)
@@ -1,20 +1,33 @@
 {
     "tts_models": {
-        "multilingual":{
-            "multi-dataset":{
-                "your_tts":{
+        "multilingual": {
+            "multi-dataset": {
+                "your_tts": {
                     "description": "Your TTS model accompanying the paper https://arxiv.org/abs/2112.02418",
                     "github_rls_url": "https://coqui.gateway.scarf.sh/v0.10.1_models/tts_models--multilingual--multi-dataset--your_tts.zip",
                     "default_vocoder": null,
                     "commit": "e9a1953e",
                     "license": "CC BY-NC-ND 4.0",
                     "contact": "egolge@coqui.ai"
+                },
+                "bark": {
+                    "description": "🐶 Bark TTS model released by suno-ai. You can find the original implementation in https://github.com/suno-ai/bark.",
+                    "hf_url": [
+                        "https://coqui.gateway.scarf.sh/hf/bark/coarse_2.pt",
+                        "https://coqui.gateway.scarf.sh/hf/bark/fine_2.pt",
+                        "https://coqui.gateway.scarf.sh/hf/bark/text_2.pt",
+                        "https://coqui.gateway.scarf.sh/hf/bark/config.json"
+                    ],
+                    "default_vocoder": null,
+                    "commit": "e9a1953e",
+                    "license": "MIT",
+                    "contact": "https://www.suno.ai/"
                 }
             }
         },
         "bg": {
             "cv": {
-                "vits":{
+                "vits": {
                     "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--bg--cv--vits.zip",
                     "default_vocoder": null,
                     "commit": null,
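A minimal sketch of how the new entry is reached from the Python API (mine, not in the commit; the model name is assumed from the JSON nesting, which the model manager flattens to `<type>/<language>/<dataset>/<model>`):

```python
from TTS.api import TTS

# "tts_models" -> "multilingual" -> "multi-dataset" -> "bark" in the entry above.
bark = TTS(model_name="tts_models/multilingual/multi-dataset/bark", progress_bar=False, gpu=True)
bark.tts_to_file(text="Bark synthesis through the model manager.", file_path="bark_output.wav")
```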
@ -25,7 +38,7 @@
|
|||
},
|
||||
"cs": {
|
||||
"cv": {
|
||||
"vits":{
|
||||
"vits": {
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--cs--cv--vits.zip",
|
||||
"default_vocoder": null,
|
||||
"commit": null,
|
||||
|
@ -36,7 +49,7 @@
|
|||
},
|
||||
"da": {
|
||||
"cv": {
|
||||
"vits":{
|
||||
"vits": {
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--da--cv--vits.zip",
|
||||
"default_vocoder": null,
|
||||
"commit": null,
|
||||
|
@ -47,7 +60,7 @@
|
|||
},
|
||||
"et": {
|
||||
"cv": {
|
||||
"vits":{
|
||||
"vits": {
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--et--cv--vits.zip",
|
||||
"default_vocoder": null,
|
||||
"commit": null,
|
||||
|
@ -58,7 +71,7 @@
|
|||
},
|
||||
"ga": {
|
||||
"cv": {
|
||||
"vits":{
|
||||
"vits": {
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--ga--cv--vits.zip",
|
||||
"default_vocoder": null,
|
||||
"commit": null,
|
||||
|
@ -180,7 +193,7 @@
|
|||
"license": "apache 2.0",
|
||||
"contact": "egolge@coqui.ai"
|
||||
},
|
||||
"fast_pitch":{
|
||||
"fast_pitch": {
|
||||
"description": "FastPitch model trained on VCTK dataseset.",
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--vctk--fast_pitch.zip",
|
||||
"default_vocoder": null,
|
||||
|
@ -220,21 +233,21 @@
|
|||
"license": "apache 2.0",
|
||||
"contact": "adamfroghyar@gmail.com"
|
||||
}
|
||||
|
||||
},
|
||||
"multi-dataset":{
|
||||
"tortoise-v2":{
|
||||
"multi-dataset": {
|
||||
"tortoise-v2": {
|
||||
"description": "Tortoise tts model https://github.com/neonbjb/tortoise-tts",
|
||||
"github_rls_url": ["https://coqui.gateway.scarf.sh/v0.14.1_models/autoregressive.pth",
|
||||
"https://coqui.gateway.scarf.sh/v0.14.1_models/clvp2.pth",
|
||||
"https://coqui.gateway.scarf.sh/v0.14.1_models/cvvp.pth",
|
||||
"https://coqui.gateway.scarf.sh/v0.14.1_models/diffusion_decoder.pth",
|
||||
"https://coqui.gateway.scarf.sh/v0.14.1_models/rlg_auto.pth",
|
||||
"https://coqui.gateway.scarf.sh/v0.14.1_models/rlg_diffuser.pth",
|
||||
"https://coqui.gateway.scarf.sh/v0.14.1_models/vocoder.pth",
|
||||
"https://coqui.gateway.scarf.sh/v0.14.1_models/mel_norms.pth",
|
||||
"https://coqui.gateway.scarf.sh/v0.14.1_models/config.json"
|
||||
],
|
||||
"github_rls_url": [
|
||||
"https://coqui.gateway.scarf.sh/v0.14.1_models/autoregressive.pth",
|
||||
"https://coqui.gateway.scarf.sh/v0.14.1_models/clvp2.pth",
|
||||
"https://coqui.gateway.scarf.sh/v0.14.1_models/cvvp.pth",
|
||||
"https://coqui.gateway.scarf.sh/v0.14.1_models/diffusion_decoder.pth",
|
||||
"https://coqui.gateway.scarf.sh/v0.14.1_models/rlg_auto.pth",
|
||||
"https://coqui.gateway.scarf.sh/v0.14.1_models/rlg_diffuser.pth",
|
||||
"https://coqui.gateway.scarf.sh/v0.14.1_models/vocoder.pth",
|
||||
"https://coqui.gateway.scarf.sh/v0.14.1_models/mel_norms.pth",
|
||||
"https://coqui.gateway.scarf.sh/v0.14.1_models/config.json"
|
||||
],
|
||||
"commit": "c1875f6",
|
||||
"default_vocoder": null,
|
||||
"author": "@neonbjb - James Betker, @manmay-nakhashi Manmay Nakhashi",
|
||||
|
@ -242,7 +255,7 @@
|
|||
}
|
||||
},
|
||||
"jenny": {
|
||||
"jenny":{
|
||||
"jenny": {
|
||||
"description": "VITS model trained with Jenny(Dioco) dataset. Named as Jenny as demanded by the license. Original URL for the model https://www.kaggle.com/datasets/noml4u/tts-models--en--jenny-dioco--vits",
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.14.0_models/tts_models--en--jenny--jenny.zip",
|
||||
"default_vocoder": null,
|
||||
|
@ -263,8 +276,8 @@
|
|||
"contact": "egolge@coqui.com"
|
||||
}
|
||||
},
|
||||
"css10":{
|
||||
"vits":{
|
||||
"css10": {
|
||||
"vits": {
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--es--css10--vits.zip",
|
||||
"default_vocoder": null,
|
||||
"commit": null,
|
||||
|
@ -284,8 +297,8 @@
|
|||
"contact": "egolge@coqui.com"
|
||||
}
|
||||
},
|
||||
"css10":{
|
||||
"vits":{
|
||||
"css10": {
|
||||
"vits": {
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--fr--css10--vits.zip",
|
||||
"default_vocoder": null,
|
||||
"commit": null,
|
||||
|
@ -294,17 +307,17 @@
|
|||
}
|
||||
}
|
||||
},
|
||||
"uk":{
|
||||
"uk": {
|
||||
"mai": {
|
||||
"glow-tts": {
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--uk--mai--glow-tts.zip",
|
||||
"author":"@robinhad",
|
||||
"author": "@robinhad",
|
||||
"commit": "bdab788d",
|
||||
"license": "MIT",
|
||||
"contact": "",
|
||||
"default_vocoder": "vocoder_models/uk/mai/multiband-melgan"
|
||||
},
|
||||
"vits":{
|
||||
"vits": {
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--uk--mai--vits.zip",
|
||||
"default_vocoder": null,
|
||||
"commit": null,
|
||||
|
@ -335,8 +348,8 @@
|
|||
"commit": "540d811"
|
||||
}
|
||||
},
|
||||
"css10":{
|
||||
"vits":{
|
||||
"css10": {
|
||||
"vits": {
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--nl--css10--vits.zip",
|
||||
"default_vocoder": null,
|
||||
"commit": null,
|
||||
|
@ -371,7 +384,7 @@
|
|||
}
|
||||
},
|
||||
"css10": {
|
||||
"vits-neon":{
|
||||
"vits-neon": {
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--de--css10--vits.zip",
|
||||
"default_vocoder": null,
|
||||
"author": "@NeonGeckoCom",
|
||||
|
@ -392,9 +405,9 @@
|
|||
}
|
||||
}
|
||||
},
|
||||
"tr":{
|
||||
"tr": {
|
||||
"common-voice": {
|
||||
"glow-tts":{
|
||||
"glow-tts": {
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--tr--common-voice--glow-tts.zip",
|
||||
"default_vocoder": "vocoder_models/tr/common-voice/hifigan",
|
||||
"license": "MIT",
|
||||
|
@ -406,7 +419,7 @@
|
|||
},
|
||||
"it": {
|
||||
"mai_female": {
|
||||
"glow-tts":{
|
||||
"glow-tts": {
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_female--glow-tts.zip",
|
||||
"default_vocoder": null,
|
||||
"description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
|
||||
|
@ -414,7 +427,7 @@
|
|||
"license": "apache 2.0",
|
||||
"commit": null
|
||||
},
|
||||
"vits":{
|
||||
"vits": {
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_female--vits.zip",
|
||||
"default_vocoder": null,
|
||||
"description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
|
||||
|
@ -424,7 +437,7 @@
|
|||
}
|
||||
},
|
||||
"mai_male": {
|
||||
"glow-tts":{
|
||||
"glow-tts": {
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_male--glow-tts.zip",
|
||||
"default_vocoder": null,
|
||||
"description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
|
||||
|
@ -432,7 +445,7 @@
|
|||
"license": "apache 2.0",
|
||||
"commit": null
|
||||
},
|
||||
"vits":{
|
||||
"vits": {
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_male--vits.zip",
|
||||
"default_vocoder": null,
|
||||
"description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
|
||||
|
@ -444,7 +457,7 @@
|
|||
},
|
||||
"ewe": {
|
||||
"openbible": {
|
||||
"vits":{
|
||||
"vits": {
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--ewe--openbible--vits.zip",
|
||||
"default_vocoder": null,
|
||||
"license": "CC-BY-SA 4.0",
|
||||
|
@ -456,7 +469,7 @@
|
|||
},
|
||||
"hau": {
|
||||
"openbible": {
|
||||
"vits":{
|
||||
"vits": {
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--hau--openbible--vits.zip",
|
||||
"default_vocoder": null,
|
||||
"license": "CC-BY-SA 4.0",
|
||||
|
@ -468,7 +481,7 @@
|
|||
},
|
||||
"lin": {
|
||||
"openbible": {
|
||||
"vits":{
|
||||
"vits": {
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--lin--openbible--vits.zip",
|
||||
"default_vocoder": null,
|
||||
"license": "CC-BY-SA 4.0",
|
||||
|
@ -480,7 +493,7 @@
|
|||
},
|
||||
"tw_akuapem": {
|
||||
"openbible": {
|
||||
"vits":{
|
||||
"vits": {
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--tw_akuapem--openbible--vits.zip",
|
||||
"default_vocoder": null,
|
||||
"license": "CC-BY-SA 4.0",
|
||||
|
@ -492,7 +505,7 @@
|
|||
},
|
||||
"tw_asante": {
|
||||
"openbible": {
|
||||
"vits":{
|
||||
"vits": {
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--tw_asante--openbible--vits.zip",
|
||||
"default_vocoder": null,
|
||||
"license": "CC-BY-SA 4.0",
|
||||
|
@ -504,7 +517,7 @@
|
|||
},
|
||||
"yor": {
|
||||
"openbible": {
|
||||
"vits":{
|
||||
"vits": {
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--yor--openbible--vits.zip",
|
||||
"default_vocoder": null,
|
||||
"license": "CC-BY-SA 4.0",
|
||||
|
@ -538,7 +551,7 @@
|
|||
},
|
||||
"fi": {
|
||||
"css10": {
|
||||
"vits":{
|
||||
"vits": {
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--fi--css10--vits.zip",
|
||||
"default_vocoder": null,
|
||||
"commit": null,
|
||||
|
@ -549,7 +562,7 @@
|
|||
},
|
||||
"hr": {
|
||||
"cv": {
|
||||
"vits":{
|
||||
"vits": {
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--hr--cv--vits.zip",
|
||||
"default_vocoder": null,
|
||||
"commit": null,
|
||||
|
@ -560,7 +573,7 @@
|
|||
},
|
||||
"lt": {
|
||||
"cv": {
|
||||
"vits":{
|
||||
"vits": {
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--lt--cv--vits.zip",
|
||||
"default_vocoder": null,
|
||||
"commit": null,
|
||||
|
@ -571,7 +584,7 @@
|
|||
},
|
||||
"lv": {
|
||||
"cv": {
|
||||
"vits":{
|
||||
"vits": {
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--lv--cv--vits.zip",
|
||||
"default_vocoder": null,
|
||||
"commit": null,
|
||||
|
@ -582,7 +595,7 @@
|
|||
},
|
||||
"mt": {
|
||||
"cv": {
|
||||
"vits":{
|
||||
"vits": {
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--mt--cv--vits.zip",
|
||||
"default_vocoder": null,
|
||||
"commit": null,
|
||||
|
@ -593,7 +606,7 @@
|
|||
},
|
||||
"pl": {
|
||||
"mai_female": {
|
||||
"vits":{
|
||||
"vits": {
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--pl--mai_female--vits.zip",
|
||||
"default_vocoder": null,
|
||||
"commit": null,
|
||||
|
@ -604,7 +617,7 @@
|
|||
},
|
||||
"pt": {
|
||||
"cv": {
|
||||
"vits":{
|
||||
"vits": {
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--pt--cv--vits.zip",
|
||||
"default_vocoder": null,
|
||||
"commit": null,
|
||||
|
@ -615,7 +628,7 @@
|
|||
},
|
||||
"ro": {
|
||||
"cv": {
|
||||
"vits":{
|
||||
"vits": {
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--ro--cv--vits.zip",
|
||||
"default_vocoder": null,
|
||||
"commit": null,
|
||||
|
@ -626,7 +639,7 @@
|
|||
},
|
||||
"sk": {
|
||||
"cv": {
|
||||
"vits":{
|
||||
"vits": {
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--sk--cv--vits.zip",
|
||||
"default_vocoder": null,
|
||||
"commit": null,
|
||||
|
@ -637,7 +650,7 @@
|
|||
},
|
||||
"sl": {
|
||||
"cv": {
|
||||
"vits":{
|
||||
"vits": {
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--sl--cv--vits.zip",
|
||||
"default_vocoder": null,
|
||||
"commit": null,
|
||||
|
@ -648,7 +661,7 @@
|
|||
},
|
||||
"sv": {
|
||||
"cv": {
|
||||
"vits":{
|
||||
"vits": {
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--sv--cv--vits.zip",
|
||||
"default_vocoder": null,
|
||||
"commit": null,
|
||||
|
@ -659,7 +672,7 @@
|
|||
},
|
||||
"ca": {
|
||||
"custom": {
|
||||
"vits":{
|
||||
"vits": {
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.10.1_models/tts_models--ca--custom--vits.zip",
|
||||
"default_vocoder": null,
|
||||
"commit": null,
|
||||
|
@ -669,8 +682,8 @@
|
|||
}
|
||||
}
|
||||
},
|
||||
"fa":{
|
||||
"custom":{
|
||||
"fa": {
|
||||
"custom": {
|
||||
"glow-tts": {
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.10.1_models/tts_models--fa--custom--glow-tts.zip",
|
||||
"default_vocoder": null,
|
||||
|
@ -681,18 +694,18 @@
|
|||
}
|
||||
}
|
||||
},
|
||||
"bn":{
|
||||
"custom":{
|
||||
"vits-male":{
|
||||
"github_rls_url":"https://coqui.gateway.scarf.sh/v0.13.3_models/tts_models--bn--custom--vits_male.zip",
|
||||
"bn": {
|
||||
"custom": {
|
||||
"vits-male": {
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.13.3_models/tts_models--bn--custom--vits_male.zip",
|
||||
"default_vocoder": null,
|
||||
"commit": null,
|
||||
"description": "Single speaker Bangla male model. For more information -> https://github.com/mobassir94/comprehensive-bangla-tts",
|
||||
"author": "@mobassir94",
|
||||
"license": "Apache 2.0"
|
||||
},
|
||||
"vits-female":{
|
||||
"github_rls_url":"https://coqui.gateway.scarf.sh/v0.13.3_models/tts_models--bn--custom--vits_female.zip",
|
||||
"vits-female": {
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.13.3_models/tts_models--bn--custom--vits_female.zip",
|
||||
"default_vocoder": null,
|
||||
"commit": null,
|
||||
"description": "Single speaker Bangla female model. For more information -> https://github.com/mobassir94/comprehensive-bangla-tts",
|
||||
|
@ -834,16 +847,16 @@
|
|||
"mai": {
|
||||
"multiband-melgan": {
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--uk--mai--multiband-melgan.zip",
|
||||
"author":"@robinhad",
|
||||
"author": "@robinhad",
|
||||
"commit": "bdab788d",
|
||||
"license": "MIT",
|
||||
"contact": ""
|
||||
}
|
||||
}
|
||||
},
|
||||
"tr":{
|
||||
"tr": {
|
||||
"common-voice": {
|
||||
"hifigan":{
|
||||
"hifigan": {
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--tr--common-voice--hifigan.zip",
|
||||
"description": "HifiGAN model using an unknown speaker from the Common-Voice dataset.",
|
||||
"author": "Fatih Akademi",
|
||||
|
@ -853,10 +866,10 @@
|
|||
}
|
||||
}
|
||||
},
|
||||
"voice_conversion_models":{
|
||||
"multilingual":{
|
||||
"vctk":{
|
||||
"freevc24":{
|
||||
"voice_conversion_models": {
|
||||
"multilingual": {
|
||||
"vctk": {
|
||||
"freevc24": {
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.13.0_models/voice_conversion_models--multilingual--vctk--freevc24.zip",
|
||||
"description": "FreeVC model trained on VCTK dataset from https://github.com/OlaWod/FreeVC",
|
||||
"author": "Jing-Yi Li @OlaWod",
|
||||
|
@ -866,4 +879,4 @@
|
|||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@@ -1 +1 @@
-0.14.0
+0.15.4
TTS/api.py (12 changed lines)
@@ -105,7 +105,7 @@ class CS_API:
         """List built-in Coqui Studio speakers."""
         self._check_token()
         conn = http.client.HTTPSConnection("app.coqui.ai")
-        conn.request("GET", f"{self.api_prefix}/speakers", headers=self.headers)
+        conn.request("GET", f"{self.api_prefix}/speakers?per_page=100", headers=self.headers)
         res = conn.getresponse()
         data = res.read()
         return [Speaker(s) for s in json.loads(data)["result"]]
@@ -130,7 +130,7 @@ class CS_API:
         for speaker in self.speakers:
             if speaker.name == name:
                 return speaker
-        raise ValueError(f"Speaker {name} not found.")
+        raise ValueError(f"Speaker {name} not found in {self.speakers}")
 
     def id_to_speaker(self, speaker_id):
         for speaker in self.speakers:
@@ -264,6 +264,10 @@ class TTS:
         >>> tts.tts_to_file("C'est le clonage de la voix.", speaker_wav="my/cloning/audio.wav", language="fr", file_path="thisisit.wav")
         >>> tts.tts_to_file("Isso é clonagem de voz.", speaker_wav="my/cloning/audio.wav", language="pt", file_path="thisisit.wav")
 
+        Example Fairseq TTS models (uses ISO language codes in https://dl.fbaipublicfiles.com/mms/tts/all-tts-languages.html):
+            >>> tts = TTS(model_name="tts_models/eng/fairseq/vits", progress_bar=False, gpu=True)
+            >>> tts.tts_to_file("This is a test.", file_path="output.wav")
+
         Args:
             model_name (str, optional): Model name to load. You can list models by ```tts.models```. Defaults to None.
             model_path (str, optional): Path to the model checkpoint. Defaults to None.
@@ -342,7 +346,7 @@ class TTS:
 
     def download_model_by_name(self, model_name: str):
         model_path, config_path, model_item = self.manager.download_model(model_name)
-        if isinstance(model_item["github_rls_url"], list):
+        if "fairseq" in model_name or (model_item is not None and isinstance(model_item["model_url"], list)):
             # return model directory if there are multiple files
             # we assume that the model knows how to load itself
             return None, None, None, None, model_path
@@ -580,6 +584,8 @@ class TTS:
                 Speed factor to use for 🐸Coqui Studio models, between 0.0 and 2.0. Defaults to None.
             file_path (str, optional):
                 Output file path. Defaults to "output.wav".
+            kwargs (dict, optional):
+                Additional arguments for the model.
         """
         self._check_arguments(speaker=speaker, language=language, speaker_wav=speaker_wav, **kwargs)
 
@@ -356,7 +356,7 @@ If you don't specify any models, then it uses LJSpeech based English model.
         vc_config_path = config_path
 
     # tts model with multiple files to be loaded from the directory path
-    if isinstance(model_item["github_rls_url"], list):
+    if model_item.get("author", None) == "fairseq" or isinstance(model_item["model_url"], list):
         model_dir = model_path
         tts_path = None
         tts_config_path = None
@@ -23,7 +23,7 @@ colormap = (
             [0, 0, 0],
             [183, 183, 183],
         ],
-        dtype=np.float,
+        dtype=float,
     )
     / 255
 )
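A note on this one-line change (mine, not part of the diff): the `np.float` alias was deprecated in NumPy 1.20 and removed in 1.24, and the builtin `float` produces the same float64 arrays, so the colormap values are unchanged. A tiny check:

```python
import numpy as np

# Builtin float maps to float64, exactly what the old np.float alias meant.
colors = np.array([[0, 0, 0], [183, 183, 183]], dtype=float) / 255
assert colors.dtype == np.float64
```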
@@ -0,0 +1,105 @@
+import os
+from dataclasses import dataclass, field
+from typing import Dict
+
+from TTS.tts.configs.shared_configs import BaseTTSConfig
+from TTS.tts.layers.bark.model import GPTConfig
+from TTS.tts.layers.bark.model_fine import FineGPTConfig
+from TTS.tts.models.bark import BarkAudioConfig
+from TTS.utils.generic_utils import get_user_data_dir
+
+
+@dataclass
+class BarkConfig(BaseTTSConfig):
+    """Bark TTS configuration
+
+    Args:
+        model (str): model name that registers the model.
+        audio (BarkAudioConfig): audio configuration. Defaults to BarkAudioConfig().
+        num_chars (int): number of characters in the alphabet. Defaults to 0.
+        semantic_config (GPTConfig): semantic configuration. Defaults to GPTConfig().
+        fine_config (FineGPTConfig): fine configuration. Defaults to FineGPTConfig().
+        coarse_config (GPTConfig): coarse configuration. Defaults to GPTConfig().
+        CONTEXT_WINDOW_SIZE (int): GPT context window size. Defaults to 1024.
+        SEMANTIC_RATE_HZ (float): semantic tokens rate in Hz. Defaults to 49.9.
+        SEMANTIC_VOCAB_SIZE (int): semantic vocabulary size. Defaults to 10_000.
+        CODEBOOK_SIZE (int): encodec codebook size. Defaults to 1024.
+        N_COARSE_CODEBOOKS (int): number of coarse codebooks. Defaults to 2.
+        N_FINE_CODEBOOKS (int): number of fine codebooks. Defaults to 8.
+        COARSE_RATE_HZ (int): coarse tokens rate in Hz. Defaults to 75.
+        SAMPLE_RATE (int): sample rate. Defaults to 24_000.
+        USE_SMALLER_MODELS (bool): use smaller models. Defaults to False.
+        TEXT_ENCODING_OFFSET (int): text encoding offset. Defaults to 10_048.
+        SEMANTIC_PAD_TOKEN (int): semantic pad token. Defaults to 10_000.
+        TEXT_PAD_TOKEN ([type]): text pad token. Defaults to 10_048.
+        TEXT_EOS_TOKEN ([type]): text end of sentence token. Defaults to 10_049.
+        TEXT_SOS_TOKEN ([type]): text start of sentence token. Defaults to 10_050.
+        SEMANTIC_INFER_TOKEN (int): semantic infer token. Defaults to 10_051.
+        COARSE_SEMANTIC_PAD_TOKEN (int): coarse semantic pad token. Defaults to 12_048.
+        COARSE_INFER_TOKEN (int): coarse infer token. Defaults to 12_050.
+        REMOTE_BASE_URL ([type]): remote base url. Defaults to "https://huggingface.co/erogol/bark/tree".
+        REMOTE_MODEL_PATHS (Dict): remote model paths. Defaults to None.
+        LOCAL_MODEL_PATHS (Dict): local model paths. Defaults to None.
+        SMALL_REMOTE_MODEL_PATHS (Dict): small remote model paths. Defaults to None.
+        CACHE_DIR (str): local cache directory. Defaults to get_user_data_dir().
+        DEF_SPEAKER_DIR (str): default speaker directory to stoke speaker values for voice cloning. Defaults to get_user_data_dir().
+    """
+
+    model: str = "bark"
+    audio: BarkAudioConfig = field(default_factory=BarkAudioConfig)
+    num_chars: int = 0
+    semantic_config: GPTConfig = field(default_factory=GPTConfig)
+    fine_config: FineGPTConfig = field(default_factory=FineGPTConfig)
+    coarse_config: GPTConfig = field(default_factory=GPTConfig)
+    CONTEXT_WINDOW_SIZE: int = 1024
+    SEMANTIC_RATE_HZ: float = 49.9
+    SEMANTIC_VOCAB_SIZE: int = 10_000
+    CODEBOOK_SIZE: int = 1024
+    N_COARSE_CODEBOOKS: int = 2
+    N_FINE_CODEBOOKS: int = 8
+    COARSE_RATE_HZ: int = 75
+    SAMPLE_RATE: int = 24_000
+    USE_SMALLER_MODELS: bool = False
+
+    TEXT_ENCODING_OFFSET: int = 10_048
+    SEMANTIC_PAD_TOKEN: int = 10_000
+    TEXT_PAD_TOKEN: int = 129_595
+    SEMANTIC_INFER_TOKEN: int = 129_599
+    COARSE_SEMANTIC_PAD_TOKEN: int = 12_048
+    COARSE_INFER_TOKEN: int = 12_050
+
+    REMOTE_BASE_URL = "https://huggingface.co/erogol/bark/tree/main/"
+    REMOTE_MODEL_PATHS: Dict = None
+    LOCAL_MODEL_PATHS: Dict = None
+    SMALL_REMOTE_MODEL_PATHS: Dict = None
+    CACHE_DIR: str = str(get_user_data_dir("tts/suno/bark_v0"))
+    DEF_SPEAKER_DIR: str = str(get_user_data_dir("tts/bark_v0/speakers"))
+
+    def __post_init__(self):
+        self.REMOTE_MODEL_PATHS = {
+            "text": {
+                "path": os.path.join(self.REMOTE_BASE_URL, "text_2.pt"),
+                "checksum": "54afa89d65e318d4f5f80e8e8799026a",
+            },
+            "coarse": {
+                "path": os.path.join(self.REMOTE_BASE_URL, "coarse_2.pt"),
+                "checksum": "8a98094e5e3a255a5c9c0ab7efe8fd28",
+            },
+            "fine": {
+                "path": os.path.join(self.REMOTE_BASE_URL, "fine_2.pt"),
+                "checksum": "59d184ed44e3650774a2f0503a48a97b",
+            },
+        }
+        self.LOCAL_MODEL_PATHS = {
+            "text": os.path.join(self.CACHE_DIR, "text_2.pt"),
+            "coarse": os.path.join(self.CACHE_DIR, "coarse_2.pt"),
+            "fine": os.path.join(self.CACHE_DIR, "fine_2.pt"),
+            "hubert_tokenizer": os.path.join(self.CACHE_DIR, "tokenizer.pth"),
+            "hubert": os.path.join(self.CACHE_DIR, "hubert.pt"),
+        }
+        self.SMALL_REMOTE_MODEL_PATHS = {
+            "text": {"path": os.path.join(self.REMOTE_BASE_URL, "text.pt")},
+            "coarse": {"path": os.path.join(self.REMOTE_BASE_URL, "coarse.pt")},
+            "fine": {"path": os.path.join(self.REMOTE_BASE_URL, "fine.pt")},
+        }
+        self.sample_rate = self.SAMPLE_RATE  # pylint: disable=attribute-defined-outside-init
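A minimal usage sketch for the new config (mine, not in the commit; the import path is assumed from the class name): constructing `BarkConfig` runs `__post_init__`, which fills the remote/local checkpoint tables and mirrors `SAMPLE_RATE` into the lowercase `sample_rate` the rest of 🐸TTS expects.

```python
# Sketch only; the module path is an assumption, not shown in the diff.
from TTS.tts.configs.bark_config import BarkConfig

config = BarkConfig()
print(config.sample_rate)                            # 24000, copied from SAMPLE_RATE
print(config.LOCAL_MODEL_PATHS["hubert_tokenizer"])  # <cache dir>/tokenizer.pth
print(config.REMOTE_MODEL_PATHS["text"]["path"])     # URL built from REMOTE_BASE_URL
```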
@@ -113,7 +113,7 @@ class FastPitchConfig(BaseTTSConfig):
     base_model: str = "forward_tts"
 
     # model specific params
-    model_args: ForwardTTSArgs = ForwardTTSArgs()
+    model_args: ForwardTTSArgs = field(default_factory=ForwardTTSArgs)
 
     # multi-speaker settings
     num_speakers: int = 0
@@ -107,7 +107,7 @@ class FastSpeechConfig(BaseTTSConfig):
     base_model: str = "forward_tts"
 
     # model specific params
-    model_args: ForwardTTSArgs = ForwardTTSArgs(use_pitch=False)
+    model_args: ForwardTTSArgs = field(default_factory=lambda: ForwardTTSArgs(use_pitch=False))
 
     # multi-speaker settings
     num_speakers: int = 0
@@ -123,7 +123,7 @@ class Fastspeech2Config(BaseTTSConfig):
     base_model: str = "forward_tts"
 
     # model specific params
-    model_args: ForwardTTSArgs = ForwardTTSArgs(use_pitch=True, use_energy=True)
+    model_args: ForwardTTSArgs = field(default_factory=lambda: ForwardTTSArgs(use_pitch=True, use_energy=True))
 
     # multi-speaker settings
     num_speakers: int = 0
@@ -103,26 +103,28 @@ class SpeedySpeechConfig(BaseTTSConfig):
     base_model: str = "forward_tts"
 
     # set model args as SpeedySpeech
-    model_args: ForwardTTSArgs = ForwardTTSArgs(
-        use_pitch=False,
-        encoder_type="residual_conv_bn",
-        encoder_params={
-            "kernel_size": 4,
-            "dilations": 4 * [1, 2, 4] + [1],
-            "num_conv_blocks": 2,
-            "num_res_blocks": 13,
-        },
-        decoder_type="residual_conv_bn",
-        decoder_params={
-            "kernel_size": 4,
-            "dilations": 4 * [1, 2, 4, 8] + [1],
-            "num_conv_blocks": 2,
-            "num_res_blocks": 17,
-        },
-        out_channels=80,
-        hidden_channels=128,
-        positional_encoding=True,
-        detach_duration_predictor=True,
+    model_args: ForwardTTSArgs = field(
+        default_factory=lambda: ForwardTTSArgs(
+            use_pitch=False,
+            encoder_type="residual_conv_bn",
+            encoder_params={
+                "kernel_size": 4,
+                "dilations": 4 * [1, 2, 4] + [1],
+                "num_conv_blocks": 2,
+                "num_res_blocks": 13,
+            },
+            decoder_type="residual_conv_bn",
+            decoder_params={
+                "kernel_size": 4,
+                "dilations": 4 * [1, 2, 4, 8] + [1],
+                "num_conv_blocks": 2,
+                "num_res_blocks": 17,
+            },
+            out_channels=80,
+            hidden_channels=128,
+            positional_encoding=True,
+            detach_duration_predictor=True,
+        )
     )
 
     # multi-speaker settings
@@ -70,7 +70,7 @@ class TortoiseConfig(BaseTTSConfig):
     model: str = "tortoise"
     # model specific params
     model_args: TortoiseArgs = field(default_factory=TortoiseArgs)
-    audio: TortoiseAudioConfig = TortoiseAudioConfig()
+    audio: TortoiseAudioConfig = field(default_factory=TortoiseAudioConfig)
    model_dir: str = None
 
     # settings
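The five config hunks above all make the same change, so a short note with a sketch of my own (not from the commit): a class-level default like `ForwardTTSArgs()` is built once at class-definition time and shared by every config instance, and Python 3.11's dataclasses reject such unhashable defaults with a ValueError, which matters now that the CI matrix includes 3.11. `field(default_factory=...)` creates a fresh object per instance instead.

```python
from dataclasses import dataclass, field


@dataclass
class EncoderArgs:  # stand-in for ForwardTTSArgs, purely for illustration
    hidden_channels: int = 128


# On Python >= 3.11 the commented-out form raises
# "ValueError: mutable default <class 'EncoderArgs'> for field args is not allowed";
# on older versions it silently shares one EncoderArgs across all instances.
#
# @dataclass
# class BadConfig:
#     args: EncoderArgs = EncoderArgs()


@dataclass
class GoodConfig:
    # default_factory builds a new EncoderArgs for every GoodConfig instance
    args: EncoderArgs = field(default_factory=EncoderArgs)


a, b = GoodConfig(), GoodConfig()
assert a.args is not b.args
```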
@@ -0,0 +1,35 @@
+# From https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer
+
+import os.path
+import shutil
+import urllib.request
+
+import huggingface_hub
+
+
+class HubertManager:
+    @staticmethod
+    def make_sure_hubert_installed(
+        download_url: str = "https://dl.fbaipublicfiles.com/hubert/hubert_base_ls960.pt", model_path: str = ""
+    ):
+        if not os.path.isfile(model_path):
+            print("Downloading HuBERT base model")
+            urllib.request.urlretrieve(download_url, model_path)
+            print("Downloaded HuBERT")
+            return model_path
+        return None
+
+    @staticmethod
+    def make_sure_tokenizer_installed(
+        model: str = "quantifier_hubert_base_ls960_14.pth",
+        repo: str = "GitMylo/bark-voice-cloning",
+        model_path: str = "",
+    ):
+        model_dir = os.path.dirname(model_path)
+        if not os.path.isfile(model_path):
+            print("Downloading HuBERT custom tokenizer")
+            huggingface_hub.hf_hub_download(repo, model, local_dir=model_dir, local_dir_use_symlinks=False)
+            shutil.move(os.path.join(model_dir, model), model_path)
+            print("Downloaded tokenizer")
+            return model_path
+        return None
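A small sketch of how this manager is meant to be driven (mine, not in the commit), mirroring the call made later in the Bark inference helpers; the tokenizer lands at the cache path that `BarkConfig.__post_init__` defines, and the config import path is an assumption:

```python
from TTS.tts.configs.bark_config import BarkConfig  # path assumed
from TTS.tts.layers.bark.hubert.hubert_manager import HubertManager

config = BarkConfig()
# Downloads quantifier_hubert_base_ls960_14.pth from the GitMylo repo on first use
# and returns the local path; returns None if the file is already cached.
HubertManager.make_sure_tokenizer_installed(model_path=config.LOCAL_MODEL_PATHS["hubert_tokenizer"])
```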
@@ -0,0 +1,80 @@
+"""
+Modified HuBERT model without kmeans.
+Original author: https://github.com/lucidrains/
+Modified by: https://www.github.com/gitmylo/
+License: MIT
+"""
+
+# Modified code from https://github.com/lucidrains/audiolm-pytorch/blob/main/audiolm_pytorch/hubert_kmeans.py
+
+import logging
+from pathlib import Path
+
+import torch
+from einops import pack, unpack
+from torch import nn
+from torchaudio.functional import resample
+from transformers import HubertModel
+def round_down_nearest_multiple(num, divisor):
+    return num // divisor * divisor
+
+
+def curtail_to_multiple(t, mult, from_left=False):
+    data_len = t.shape[-1]
+    rounded_seq_len = round_down_nearest_multiple(data_len, mult)
+    seq_slice = slice(None, rounded_seq_len) if not from_left else slice(-rounded_seq_len, None)
+    return t[..., seq_slice]
+
+
+def exists(val):
+    return val is not None
+
+
+def default(val, d):
+    return val if exists(val) else d
+
+
+class CustomHubert(nn.Module):
+    """
+    checkpoint and kmeans can be downloaded at https://github.com/facebookresearch/fairseq/tree/main/examples/hubert
+    or you can train your own
+    """
+
+    def __init__(self, checkpoint_path, target_sample_hz=16000, seq_len_multiple_of=None, output_layer=9, device=None):
+        super().__init__()
+        self.target_sample_hz = target_sample_hz
+        self.seq_len_multiple_of = seq_len_multiple_of
+        self.output_layer = output_layer
+        if device is not None:
+            self.to(device)
+        self.model = HubertModel.from_pretrained("facebook/hubert-base-ls960")
+        if device is not None:
+            self.model.to(device)
+        self.model.eval()
+
+    @property
+    def groups(self):
+        return 1
+
+    @torch.no_grad()
+    def forward(self, wav_input, flatten=True, input_sample_hz=None):
+        device = wav_input.device
+
+        if exists(input_sample_hz):
+            wav_input = resample(wav_input, input_sample_hz, self.target_sample_hz)
+
+        if exists(self.seq_len_multiple_of):
+            wav_input = curtail_to_multiple(wav_input, self.seq_len_multiple_of)
+
+        outputs = self.model.forward(
+            wav_input,
+            output_hidden_states=True,
+        )
+        embed = outputs["hidden_states"][self.output_layer]
+        embed, packed_shape = pack([embed], "* d")
+        codebook_indices = torch.from_numpy(embed.cpu().detach().numpy()).to(device)
+        if flatten:
+            return codebook_indices
+
+        (codebook_indices,) = unpack(codebook_indices, packed_shape, "*")
+        return codebook_indices
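A short sketch of the intended call pattern (mine, not in the commit), following how the new inference helpers feed audio into this wrapper; the audio file name is a placeholder:

```python
import torchaudio

from TTS.tts.layers.bark.hubert.kmeans_hubert import CustomHubert

# checkpoint_path is accepted for compatibility; this class loads its weights
# from the "facebook/hubert-base-ls960" checkpoint on the Hugging Face hub.
hubert = CustomHubert(checkpoint_path="hubert.pt", device="cpu")
wav, sr = torchaudio.load("speaker.wav")             # (channels, samples)
features = hubert.forward(wav[:1], input_sample_hz=sr)  # first channel, resampled to 16 kHz inside forward
print(features.shape)                                # roughly (n_frames, 768) layer-9 hidden states
```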
@ -0,0 +1,196 @@
|
|||
"""
|
||||
Custom tokenizer model.
|
||||
Author: https://www.github.com/gitmylo/
|
||||
License: MIT
|
||||
"""
|
||||
|
||||
import json
|
||||
import os.path
|
||||
from zipfile import ZipFile
|
||||
|
||||
import numpy
|
||||
import torch
|
||||
from torch import nn, optim
|
||||
from torch.serialization import MAP_LOCATION
|
||||
|
||||
|
||||
class HubertTokenizer(nn.Module):
|
||||
def __init__(self, hidden_size=1024, input_size=768, output_size=10000, version=0):
|
||||
super().__init__()
|
||||
next_size = input_size
|
||||
if version == 0:
|
||||
self.lstm = nn.LSTM(input_size, hidden_size, 2, batch_first=True)
|
||||
next_size = hidden_size
|
||||
if version == 1:
|
||||
self.lstm = nn.LSTM(input_size, hidden_size, 2, batch_first=True)
|
||||
self.intermediate = nn.Linear(hidden_size, 4096)
|
||||
next_size = 4096
|
||||
|
||||
self.fc = nn.Linear(next_size, output_size)
|
||||
self.softmax = nn.LogSoftmax(dim=1)
|
||||
self.optimizer: optim.Optimizer = None
|
||||
self.lossfunc = nn.CrossEntropyLoss()
|
||||
self.input_size = input_size
|
||||
self.hidden_size = hidden_size
|
||||
self.output_size = output_size
|
||||
self.version = version
|
||||
|
||||
def forward(self, x):
|
||||
x, _ = self.lstm(x)
|
||||
if self.version == 1:
|
||||
x = self.intermediate(x)
|
||||
x = self.fc(x)
|
||||
x = self.softmax(x)
|
||||
return x
|
||||
|
||||
@torch.no_grad()
|
||||
def get_token(self, x):
|
||||
"""
|
||||
Used to get the token for the first
|
||||
:param x: An array with shape (N, input_size) where N is a whole number greater or equal to 1, and input_size is the input size used when creating the model.
|
||||
:return: An array with shape (N,) where N is the same as N from the input. Every number in the array is a whole number in range 0...output_size - 1 where output_size is the output size used when creating the model.
|
||||
"""
|
||||
return torch.argmax(self(x), dim=1)
|
||||
|
||||
def prepare_training(self):
|
||||
self.optimizer = optim.Adam(self.parameters(), 0.001)
|
||||
|
||||
def train_step(self, x_train, y_train, log_loss=False):
|
||||
# y_train = y_train[:-1]
|
||||
# y_train = y_train[1:]
|
||||
|
||||
optimizer = self.optimizer
|
||||
lossfunc = self.lossfunc
|
||||
# Zero the gradients
|
||||
self.zero_grad()
|
||||
|
||||
# Forward pass
|
||||
y_pred = self(x_train)
|
||||
|
||||
y_train_len = len(y_train)
|
||||
y_pred_len = y_pred.shape[0]
|
||||
|
||||
if y_train_len > y_pred_len:
|
||||
diff = y_train_len - y_pred_len
|
||||
y_train = y_train[diff:]
|
||||
elif y_train_len < y_pred_len:
|
||||
diff = y_pred_len - y_train_len
|
||||
y_pred = y_pred[:-diff, :]
|
||||
|
||||
y_train_hot = torch.zeros(len(y_train), self.output_size)
|
||||
y_train_hot[range(len(y_train)), y_train] = 1
|
||||
y_train_hot = y_train_hot.to("cuda")
|
||||
|
||||
# Calculate the loss
|
||||
loss = lossfunc(y_pred, y_train_hot)
|
||||
|
||||
# Print loss
|
||||
if log_loss:
|
||||
print("Loss", loss.item())
|
||||
|
||||
# Backward pass
|
||||
loss.backward()
|
||||
|
||||
# Update the weights
|
||||
optimizer.step()
|
||||
|
||||
def save(self, path):
|
||||
info_path = ".".join(os.path.basename(path).split(".")[:-1]) + "/.info"
|
||||
torch.save(self.state_dict(), path)
|
||||
data_from_model = Data(self.input_size, self.hidden_size, self.output_size, self.version)
|
||||
with ZipFile(path, "a") as model_zip:
|
||||
model_zip.writestr(info_path, data_from_model.save())
|
||||
model_zip.close()
|
||||
|
||||
@staticmethod
|
||||
def load_from_checkpoint(path, map_location: MAP_LOCATION = None):
|
||||
old = True
|
||||
with ZipFile(path) as model_zip:
|
||||
filesMatch = [file for file in model_zip.namelist() if file.endswith("/.info")]
|
||||
file = filesMatch[0] if filesMatch else None
|
||||
if file:
|
||||
old = False
|
||||
data_from_model = Data.load(model_zip.read(file).decode("utf-8"))
|
||||
model_zip.close()
|
||||
if old:
|
||||
model = HubertTokenizer()
|
||||
else:
|
||||
model = HubertTokenizer(
|
||||
data_from_model.hidden_size,
|
||||
data_from_model.input_size,
|
||||
data_from_model.output_size,
|
||||
data_from_model.version,
|
||||
)
|
||||
model.load_state_dict(torch.load(path))
|
||||
if map_location:
|
||||
model = model.to(map_location)
|
||||
return model
|
||||
|
||||
|
||||
class Data:
|
||||
input_size: int
|
||||
hidden_size: int
|
||||
output_size: int
|
||||
version: int
|
||||
|
||||
def __init__(self, input_size=768, hidden_size=1024, output_size=10000, version=0):
|
||||
self.input_size = input_size
|
||||
self.hidden_size = hidden_size
|
||||
self.output_size = output_size
|
||||
self.version = version
|
||||
|
||||
@staticmethod
|
||||
def load(string):
|
||||
data = json.loads(string)
|
||||
return Data(data["input_size"], data["hidden_size"], data["output_size"], data["version"])
|
||||
|
||||
def save(self):
|
||||
data = {
|
||||
"input_size": self.input_size,
|
||||
"hidden_size": self.hidden_size,
|
||||
"output_size": self.output_size,
|
||||
"version": self.version,
|
||||
}
|
||||
return json.dumps(data)
|
||||
|
||||
|
||||
def auto_train(data_path, save_path="model.pth", load_model: str = None, save_epochs=1):
|
||||
data_x, data_y = [], []
|
||||
|
||||
if load_model and os.path.isfile(load_model):
|
||||
print("Loading model from", load_model)
|
||||
model_training = HubertTokenizer.load_from_checkpoint(load_model, "cuda")
|
||||
else:
|
||||
print("Creating new model.")
|
||||
model_training = HubertTokenizer(version=1).to("cuda") # Settings for the model to run without lstm
|
||||
save_path = os.path.join(data_path, save_path)
|
||||
base_save_path = ".".join(save_path.split(".")[:-1])
|
||||
|
||||
sem_string = "_semantic.npy"
|
||||
feat_string = "_semantic_features.npy"
|
||||
|
||||
ready = os.path.join(data_path, "ready")
|
||||
for input_file in os.listdir(ready):
|
||||
full_path = os.path.join(ready, input_file)
|
||||
if input_file.endswith(sem_string):
|
||||
data_y.append(numpy.load(full_path))
|
||||
elif input_file.endswith(feat_string):
|
||||
data_x.append(numpy.load(full_path))
|
||||
model_training.prepare_training()
|
||||
|
||||
epoch = 1
|
||||
|
||||
while 1:
|
||||
for _ in range(save_epochs):
|
||||
j = 0
|
||||
for x, y in zip(data_x, data_y):
|
||||
model_training.train_step(
|
||||
torch.tensor(x).to("cuda"), torch.tensor(y).to("cuda"), j % 50 == 0
|
||||
) # Print loss every 50 steps
|
||||
j += 1
|
||||
save_p = save_path
|
||||
save_p_2 = f"{base_save_path}_epoch_{epoch}.pth"
|
||||
model_training.save(save_p)
|
||||
model_training.save(save_p_2)
|
||||
print(f"Epoch {epoch} completed")
|
||||
epoch += 1
|
|
@ -0,0 +1,558 @@
|
|||
import logging
|
||||
import os
|
||||
import re
|
||||
from glob import glob
|
||||
from typing import Dict, List
|
||||
|
||||
import librosa
|
||||
import numpy as np
|
||||
import torch
|
||||
import torchaudio
|
||||
import tqdm
|
||||
from encodec.utils import convert_audio
|
||||
from scipy.special import softmax
|
||||
from torch.nn import functional as F
|
||||
|
||||
from TTS.tts.layers.bark.hubert.hubert_manager import HubertManager
|
||||
from TTS.tts.layers.bark.hubert.kmeans_hubert import CustomHubert
|
||||
from TTS.tts.layers.bark.hubert.tokenizer import HubertTokenizer
|
||||
from TTS.tts.layers.bark.load_model import clear_cuda_cache, inference_mode
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _tokenize(tokenizer, text):
|
||||
return tokenizer.encode(text, add_special_tokens=False)
|
||||
|
||||
|
||||
def _detokenize(tokenizer, enc_text):
|
||||
return tokenizer.decode(enc_text)
|
||||
|
||||
|
||||
def _normalize_whitespace(text):
|
||||
return re.sub(r"\s+", " ", text).strip()
|
||||
|
||||
|
||||
def get_voices(extra_voice_dirs: List[str] = []): # pylint: disable=dangerous-default-value
|
||||
dirs = extra_voice_dirs
|
||||
voices: Dict[str, List[str]] = {}
|
||||
for d in dirs:
|
||||
subs = os.listdir(d)
|
||||
for sub in subs:
|
||||
subj = os.path.join(d, sub)
|
||||
if os.path.isdir(subj):
|
||||
voices[sub] = list(glob(f"{subj}/*.npz"))
|
||||
# fetch audio files if no npz files are found
|
||||
if len(voices[sub]) == 0:
|
||||
voices[sub] = list(glob(f"{subj}/*.wav")) + list(glob(f"{subj}/*.mp3"))
|
||||
return voices
|
||||
|
||||
|
||||
def load_npz(npz_file):
|
||||
x_history = np.load(npz_file)
|
||||
semantic = x_history["semantic_prompt"]
|
||||
coarse = x_history["coarse_prompt"]
|
||||
fine = x_history["fine_prompt"]
|
||||
return semantic, coarse, fine
|
||||
|
||||
|
||||
def load_voice(model, voice: str, extra_voice_dirs: List[str] = []): # pylint: disable=dangerous-default-value
|
||||
if voice == "random":
|
||||
return None, None, None
|
||||
|
||||
voices = get_voices(extra_voice_dirs)
|
||||
paths = voices[voice]
|
||||
|
||||
# bark only uses a single sample for cloning
|
||||
if len(paths) > 1:
|
||||
raise ValueError(f"Voice {voice} has multiple paths: {paths}")
|
||||
|
||||
try:
|
||||
path = voices[voice]
|
||||
except KeyError as e:
|
||||
raise KeyError(f"Voice {voice} not found in {extra_voice_dirs}") from e
|
||||
|
||||
if len(paths) == 1 and paths[0].endswith(".npz"):
|
||||
return load_npz(path[0])
|
||||
|
||||
audio_path = paths[0]
|
||||
# replace the file extension with .npz
|
||||
output_path = os.path.splitext(audio_path)[0] + ".npz"
|
||||
generate_voice(audio=audio_path, model=model, output_path=output_path)
|
||||
return load_voice(model, voice, extra_voice_dirs)
|
||||
|
||||
|
||||
def zero_crossing_rate(audio, frame_length=1024, hop_length=512):
|
||||
zero_crossings = np.sum(np.abs(np.diff(np.sign(audio))) / 2)
|
||||
total_frames = 1 + int((len(audio) - frame_length) / hop_length)
|
||||
return zero_crossings / total_frames
|
||||
|
||||
|
||||
def compute_spectral_contrast(audio_data, sample_rate, n_bands=6, fmin=200.0):
|
||||
spectral_contrast = librosa.feature.spectral_contrast(y=audio_data, sr=sample_rate, n_bands=n_bands, fmin=fmin)
|
||||
return np.mean(spectral_contrast)
|
||||
|
||||
|
||||
def compute_average_bass_energy(audio_data, sample_rate, max_bass_freq=250):
|
||||
stft = librosa.stft(audio_data)
|
||||
power_spectrogram = np.abs(stft) ** 2
|
||||
frequencies = librosa.fft_frequencies(sr=sample_rate, n_fft=stft.shape[0])
|
||||
bass_mask = frequencies <= max_bass_freq
|
||||
bass_energy = power_spectrogram[np.ix_(bass_mask, np.arange(power_spectrogram.shape[1]))].mean()
|
||||
return bass_energy
|
||||
|
||||
|
||||
def generate_voice(
|
||||
audio,
|
||||
model,
|
||||
output_path,
|
||||
):
|
||||
"""Generate a new voice from a given audio and text prompt.
|
||||
|
||||
Args:
|
||||
audio (np.ndarray): The audio to use as a base for the new voice.
|
||||
text (str): Transcription of the audio you are clonning.
|
||||
model (BarkModel): The BarkModel to use for generating the new voice.
|
||||
output_path (str): The path to save the generated voice to.
|
||||
"""
|
||||
if isinstance(audio, str):
|
||||
audio, sr = torchaudio.load(audio)
|
||||
audio = convert_audio(audio, sr, model.config.sample_rate, model.encodec.channels)
|
||||
audio = audio.unsqueeze(0).to(model.device)
|
||||
|
||||
with torch.no_grad():
|
||||
encoded_frames = model.encodec.encode(audio)
|
||||
codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1).squeeze() # [n_q, T]
|
||||
|
||||
# move codes to cpu
|
||||
codes = codes.cpu().numpy()
|
||||
|
||||
# generate semantic tokens
|
||||
# Load the HuBERT model
|
||||
hubert_manager = HubertManager()
|
||||
# hubert_manager.make_sure_hubert_installed(model_path=model.config.LOCAL_MODEL_PATHS["hubert"])
|
||||
hubert_manager.make_sure_tokenizer_installed(model_path=model.config.LOCAL_MODEL_PATHS["hubert_tokenizer"])
|
||||
|
||||
hubert_model = CustomHubert(checkpoint_path=model.config.LOCAL_MODEL_PATHS["hubert"]).to(model.device)
|
||||
|
||||
# Load the CustomTokenizer model
|
||||
tokenizer = HubertTokenizer.load_from_checkpoint(model.config.LOCAL_MODEL_PATHS["hubert_tokenizer"]).to(
|
||||
model.device
|
||||
) # Automatically uses
|
||||
# semantic_tokens = model.text_to_semantic(
|
||||
# text, max_gen_duration_s=seconds, top_k=50, top_p=0.95, temp=0.7
|
||||
# ) # not 100%
|
||||
semantic_vectors = hubert_model.forward(audio[0], input_sample_hz=model.config.sample_rate)
|
||||
semantic_tokens = tokenizer.get_token(semantic_vectors)
|
||||
semantic_tokens = semantic_tokens.cpu().numpy()
|
||||
|
||||
np.savez(output_path, fine_prompt=codes, coarse_prompt=codes[:2, :], semantic_prompt=semantic_tokens)
|
||||
|
||||
|
||||
def generate_text_semantic(
|
||||
text,
|
||||
model,
|
||||
history_prompt=None,
|
||||
temp=0.7,
|
||||
top_k=None,
|
||||
top_p=None,
|
||||
silent=False,
|
||||
min_eos_p=0.2,
|
||||
max_gen_duration_s=None,
|
||||
allow_early_stop=True,
|
||||
base=None,
|
||||
use_kv_caching=True,
|
||||
):
|
||||
"""Generate semantic tokens from text."""
|
||||
print(f"history_prompt in gen: {history_prompt}")
|
||||
assert isinstance(text, str)
|
||||
text = _normalize_whitespace(text)
|
||||
assert len(text.strip()) > 0
|
||||
if all(v is not None for v in history_prompt) or base is not None:
|
||||
if history_prompt is not None:
|
||||
semantic_history = history_prompt[0]
|
||||
if base is not None:
|
||||
semantic_history = base[0]
|
||||
assert (
|
||||
isinstance(semantic_history, np.ndarray)
|
||||
and len(semantic_history.shape) == 1
|
||||
and len(semantic_history) > 0
|
||||
and semantic_history.min() >= 0
|
||||
and semantic_history.max() <= model.config.SEMANTIC_VOCAB_SIZE - 1
|
||||
)
|
||||
else:
|
||||
semantic_history = None
|
||||
encoded_text = np.array(_tokenize(model.tokenizer, text)) + model.config.TEXT_ENCODING_OFFSET
|
||||
if len(encoded_text) > 256:
|
||||
p = round((len(encoded_text) - 256) / len(encoded_text) * 100, 1)
|
||||
logger.warning(f"warning, text too long, lopping of last {p}%")
|
||||
encoded_text = encoded_text[:256]
|
||||
encoded_text = np.pad(
|
||||
encoded_text,
|
||||
(0, 256 - len(encoded_text)),
|
||||
constant_values=model.config.TEXT_PAD_TOKEN,
|
||||
mode="constant",
|
||||
)
|
||||
if semantic_history is not None:
|
||||
semantic_history = semantic_history.astype(np.int64)
|
||||
# lop off if history is too long, pad if needed
|
||||
semantic_history = semantic_history[-256:]
|
||||
semantic_history = np.pad(
|
||||
semantic_history,
|
||||
(0, 256 - len(semantic_history)),
|
||||
constant_values=model.config.SEMANTIC_PAD_TOKEN,
|
||||
mode="constant",
|
||||
)
|
||||
else:
|
||||
semantic_history = np.array([model.config.SEMANTIC_PAD_TOKEN] * 256)
|
||||
x = torch.from_numpy(
|
||||
np.hstack([encoded_text, semantic_history, np.array([model.config.SEMANTIC_INFER_TOKEN])]).astype(np.int64)
|
||||
)[None]
|
||||
assert x.shape[1] == 256 + 256 + 1
|
||||
with inference_mode():
|
||||
x = x.to(model.device)
|
||||
n_tot_steps = 768
|
||||
# custom tqdm updates since we don't know when eos will occur
|
||||
pbar = tqdm.tqdm(disable=silent, total=100)
|
||||
pbar_state = 0
|
||||
tot_generated_duration_s = 0
|
||||
kv_cache = None
|
||||
for n in range(n_tot_steps):
|
||||
if use_kv_caching and kv_cache is not None:
|
||||
x_input = x[:, [-1]]
|
||||
else:
|
||||
x_input = x
|
||||
logits, kv_cache = model.semantic_model(
|
||||
x_input, merge_context=True, use_cache=use_kv_caching, past_kv=kv_cache
|
||||
)
|
||||
relevant_logits = logits[0, 0, : model.config.SEMANTIC_VOCAB_SIZE]
|
||||
if allow_early_stop:
|
||||
relevant_logits = torch.hstack(
|
||||
(relevant_logits, logits[0, 0, [model.config.SEMANTIC_PAD_TOKEN]])
|
||||
) # eos
|
||||
if top_p is not None:
|
||||
# faster to convert to numpy
|
||||
logits_device = relevant_logits.device
|
||||
logits_dtype = relevant_logits.type()
|
||||
relevant_logits = relevant_logits.detach().cpu().type(torch.float32).numpy()
|
||||
sorted_indices = np.argsort(relevant_logits)[::-1]
|
||||
sorted_logits = relevant_logits[sorted_indices]
|
||||
cumulative_probs = np.cumsum(softmax(sorted_logits))
|
||||
sorted_indices_to_remove = cumulative_probs > top_p
|
||||
sorted_indices_to_remove[1:] = sorted_indices_to_remove[:-1].copy()
|
||||
sorted_indices_to_remove[0] = False
|
||||
relevant_logits[sorted_indices[sorted_indices_to_remove]] = -np.inf
|
||||
relevant_logits = torch.from_numpy(relevant_logits)
|
||||
relevant_logits = relevant_logits.to(logits_device).type(logits_dtype)
|
||||
if top_k is not None:
|
||||
v, _ = torch.topk(relevant_logits, min(top_k, relevant_logits.size(-1)))
|
||||
relevant_logits[relevant_logits < v[-1]] = -float("Inf")
|
||||
probs = torch.softmax(relevant_logits / temp, dim=-1)
|
||||
item_next = torch.multinomial(probs, num_samples=1)
|
||||
if allow_early_stop and (
|
||||
item_next == model.config.SEMANTIC_VOCAB_SIZE or (min_eos_p is not None and probs[-1] >= min_eos_p)
|
||||
):
|
||||
# eos found, so break
|
||||
pbar.update(100 - pbar_state)
|
||||
break
|
||||
x = torch.cat((x, item_next[None]), dim=1)
|
||||
tot_generated_duration_s += 1 / model.config.SEMANTIC_RATE_HZ
|
||||
if max_gen_duration_s is not None and tot_generated_duration_s > max_gen_duration_s:
|
||||
pbar.update(100 - pbar_state)
|
||||
break
|
||||
if n == n_tot_steps - 1:
|
||||
pbar.update(100 - pbar_state)
|
||||
break
|
||||
del logits, relevant_logits, probs, item_next
|
||||
req_pbar_state = np.min([100, int(round(100 * n / n_tot_steps))])
|
||||
if req_pbar_state > pbar_state:
|
||||
pbar.update(req_pbar_state - pbar_state)
|
||||
pbar_state = req_pbar_state
|
||||
pbar.close()
|
||||
out = x.detach().cpu().numpy().squeeze()[256 + 256 + 1 :]
|
||||
assert all(out >= 0) and all(out < model.config.SEMANTIC_VOCAB_SIZE)
|
||||
clear_cuda_cache()
|
||||
return out
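# Example call (sketch): the sampling knobs map directly onto the signature above, e.g.
#   generate_text_semantic(text, model, history_prompt=(None, None, None),
#                          temp=0.7, top_k=50, top_p=0.95, min_eos_p=0.05)
# A lower `min_eos_p` makes early stopping more likely; `max_gen_duration_s` caps the
# generated length in seconds. `(None, None, None)` stands for "no voice prompt".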
def _flatten_codebooks(arr, offset_size):
|
||||
assert len(arr.shape) == 2
|
||||
arr = arr.copy()
|
||||
if offset_size is not None:
|
||||
for n in range(1, arr.shape[0]):
|
||||
arr[n, :] += offset_size * n
|
||||
flat_arr = arr.ravel("F")
|
||||
return flat_arr
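# Worked example (sketch): rows are shifted into disjoint id ranges and then interleaved
# column by column (frame by frame):
#   _flatten_codebooks(np.array([[0, 1, 2], [3, 4, 5]]), offset_size=1024)
#   -> array([0, 1027, 1, 1028, 2, 1029])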
def generate_coarse(
|
||||
x_semantic,
|
||||
model,
|
||||
history_prompt=None,
|
||||
temp=0.7,
|
||||
top_k=None,
|
||||
top_p=None,
|
||||
silent=False,
|
||||
max_coarse_history=630, # min 60 (faster), max 630 (more context)
|
||||
sliding_window_len=60,
|
||||
base=None,
|
||||
use_kv_caching=True,
|
||||
):
|
||||
"""Generate coarse audio codes from semantic tokens."""
|
||||
assert (
|
||||
isinstance(x_semantic, np.ndarray)
|
||||
and len(x_semantic.shape) == 1
|
||||
and len(x_semantic) > 0
|
||||
and x_semantic.min() >= 0
|
||||
and x_semantic.max() <= model.config.SEMANTIC_VOCAB_SIZE - 1
|
||||
)
|
||||
assert 60 <= max_coarse_history <= 630
|
||||
assert max_coarse_history + sliding_window_len <= 1024 - 256
|
||||
semantic_to_coarse_ratio = (
|
||||
model.config.COARSE_RATE_HZ / model.config.SEMANTIC_RATE_HZ * model.config.N_COARSE_CODEBOOKS
|
||||
)
|
||||
max_semantic_history = int(np.floor(max_coarse_history / semantic_to_coarse_ratio))
|
||||
if all(v is not None for v in history_prompt) or base is not None:
|
||||
if history_prompt is not None:
|
||||
x_history = history_prompt
|
||||
x_semantic_history = x_history[0]
|
||||
x_coarse_history = x_history[1]
|
||||
if base is not None:
|
||||
x_semantic_history = base[0]
|
||||
x_coarse_history = base[1]
|
||||
assert (
|
||||
isinstance(x_semantic_history, np.ndarray)
|
||||
and len(x_semantic_history.shape) == 1
|
||||
and len(x_semantic_history) > 0
|
||||
and x_semantic_history.min() >= 0
|
||||
and x_semantic_history.max() <= model.config.SEMANTIC_VOCAB_SIZE - 1
|
||||
and isinstance(x_coarse_history, np.ndarray)
|
||||
and len(x_coarse_history.shape) == 2
|
||||
and x_coarse_history.shape[0] == model.config.N_COARSE_CODEBOOKS
|
||||
and x_coarse_history.shape[-1] >= 0
|
||||
and x_coarse_history.min() >= 0
|
||||
and x_coarse_history.max() <= model.config.CODEBOOK_SIZE - 1
|
||||
and (
|
||||
round(x_coarse_history.shape[-1] / len(x_semantic_history), 1)
|
||||
== round(semantic_to_coarse_ratio / model.config.N_COARSE_CODEBOOKS, 1)
|
||||
)
|
||||
)
|
||||
x_coarse_history = (
|
||||
_flatten_codebooks(x_coarse_history, model.config.CODEBOOK_SIZE) + model.config.SEMANTIC_VOCAB_SIZE
|
||||
)
|
||||
# trim histories correctly
|
||||
n_semantic_hist_provided = np.min(
|
||||
[
|
||||
max_semantic_history,
|
||||
len(x_semantic_history) - len(x_semantic_history) % 2,
|
||||
int(np.floor(len(x_coarse_history) / semantic_to_coarse_ratio)),
|
||||
]
|
||||
)
|
||||
n_coarse_hist_provided = int(round(n_semantic_hist_provided * semantic_to_coarse_ratio))
|
||||
x_semantic_history = x_semantic_history[-n_semantic_hist_provided:].astype(np.int32)
|
||||
x_coarse_history = x_coarse_history[-n_coarse_hist_provided:].astype(np.int32)
|
||||
# TODO: bit of a hack for time alignment (sounds better)
|
||||
x_coarse_history = x_coarse_history[:-2]
|
||||
else:
|
||||
x_semantic_history = np.array([], dtype=np.int32)
|
||||
x_coarse_history = np.array([], dtype=np.int32)
|
||||
# start loop
|
||||
n_steps = int(
|
||||
round(
|
||||
np.floor(len(x_semantic) * semantic_to_coarse_ratio / model.config.N_COARSE_CODEBOOKS)
|
||||
* model.config.N_COARSE_CODEBOOKS
|
||||
)
|
||||
)
|
||||
assert n_steps > 0 and n_steps % model.config.N_COARSE_CODEBOOKS == 0
|
||||
x_semantic = np.hstack([x_semantic_history, x_semantic]).astype(np.int32)
|
||||
x_coarse = x_coarse_history.astype(np.int32)
|
||||
base_semantic_idx = len(x_semantic_history)
|
||||
with inference_mode():
|
||||
x_semantic_in = torch.from_numpy(x_semantic)[None].to(model.device)
|
||||
x_coarse_in = torch.from_numpy(x_coarse)[None].to(model.device)
|
||||
n_window_steps = int(np.ceil(n_steps / sliding_window_len))
|
||||
n_step = 0
|
||||
for _ in tqdm.tqdm(range(n_window_steps), total=n_window_steps, disable=silent):
|
||||
semantic_idx = base_semantic_idx + int(round(n_step / semantic_to_coarse_ratio))
|
||||
# pad from right side
|
||||
x_in = x_semantic_in[:, np.max([0, semantic_idx - max_semantic_history]) :]
|
||||
x_in = x_in[:, :256]
|
||||
x_in = F.pad(
|
||||
x_in,
|
||||
(0, 256 - x_in.shape[-1]),
|
||||
"constant",
|
||||
model.config.COARSE_SEMANTIC_PAD_TOKEN,
|
||||
)
|
||||
x_in = torch.hstack(
|
||||
[
|
||||
x_in,
|
||||
torch.tensor([model.config.COARSE_INFER_TOKEN])[None].to(model.device),
|
||||
x_coarse_in[:, -max_coarse_history:],
|
||||
]
|
||||
)
|
||||
kv_cache = None
|
||||
for _ in range(sliding_window_len):
|
||||
if n_step >= n_steps:
|
||||
continue
|
||||
is_major_step = n_step % model.config.N_COARSE_CODEBOOKS == 0
|
||||
|
||||
if use_kv_caching and kv_cache is not None:
|
||||
x_input = x_in[:, [-1]]
|
||||
else:
|
||||
x_input = x_in
|
||||
|
||||
logits, kv_cache = model.coarse_model(x_input, use_cache=use_kv_caching, past_kv=kv_cache)
|
||||
logit_start_idx = (
|
||||
model.config.SEMANTIC_VOCAB_SIZE + (1 - int(is_major_step)) * model.config.CODEBOOK_SIZE
|
||||
)
|
||||
logit_end_idx = model.config.SEMANTIC_VOCAB_SIZE + (2 - int(is_major_step)) * model.config.CODEBOOK_SIZE
|
||||
relevant_logits = logits[0, 0, logit_start_idx:logit_end_idx]
|
||||
if top_p is not None:
|
||||
# faster to convert to numpy
|
||||
logits_device = relevant_logits.device
|
||||
logits_dtype = relevant_logits.type()
|
||||
relevant_logits = relevant_logits.detach().cpu().type(torch.float32).numpy()
|
||||
sorted_indices = np.argsort(relevant_logits)[::-1]
|
||||
sorted_logits = relevant_logits[sorted_indices]
|
||||
cumulative_probs = np.cumsum(torch.nn.functional.softmax(sorted_logits))
|
||||
sorted_indices_to_remove = cumulative_probs > top_p
|
||||
sorted_indices_to_remove[1:] = sorted_indices_to_remove[:-1].copy()
|
||||
sorted_indices_to_remove[0] = False
|
||||
relevant_logits[sorted_indices[sorted_indices_to_remove]] = -np.inf
|
||||
relevant_logits = torch.from_numpy(relevant_logits)
|
||||
relevant_logits = relevant_logits.to(logits_device).type(logits_dtype)
|
||||
if top_k is not None:
|
||||
v, _ = torch.topk(relevant_logits, min(top_k, relevant_logits.size(-1)))
|
||||
relevant_logits[relevant_logits < v[-1]] = -float("Inf")
|
||||
probs = torch.nn.functional.softmax(relevant_logits / temp, dim=-1)
|
||||
item_next = torch.multinomial(probs, num_samples=1)
|
||||
item_next += logit_start_idx
|
||||
x_coarse_in = torch.cat((x_coarse_in, item_next[None]), dim=1)
|
||||
x_in = torch.cat((x_in, item_next[None]), dim=1)
|
||||
del logits, relevant_logits, probs, item_next
|
||||
n_step += 1
|
||||
del x_in
|
||||
del x_semantic_in
|
||||
gen_coarse_arr = x_coarse_in.detach().cpu().numpy().squeeze()[len(x_coarse_history) :]
|
||||
del x_coarse_in
|
||||
assert len(gen_coarse_arr) == n_steps
|
||||
gen_coarse_audio_arr = (
|
||||
gen_coarse_arr.reshape(-1, model.config.N_COARSE_CODEBOOKS).T - model.config.SEMANTIC_VOCAB_SIZE
|
||||
)
|
||||
for n in range(1, model.config.N_COARSE_CODEBOOKS):
|
||||
gen_coarse_audio_arr[n, :] -= n * model.config.CODEBOOK_SIZE
|
||||
clear_cuda_cache()
|
||||
return gen_coarse_audio_arr
|
||||
|
||||
|
||||
def generate_fine(
|
||||
x_coarse_gen,
|
||||
model,
|
||||
history_prompt=None,
|
||||
temp=0.5,
|
||||
silent=True,
|
||||
base=None,
|
||||
):
|
||||
"""Generate full audio codes from coarse audio codes."""
|
||||
assert (
|
||||
isinstance(x_coarse_gen, np.ndarray)
|
||||
and len(x_coarse_gen.shape) == 2
|
||||
and 1 <= x_coarse_gen.shape[0] <= model.config.N_FINE_CODEBOOKS - 1
|
||||
and x_coarse_gen.shape[1] > 0
|
||||
and x_coarse_gen.min() >= 0
|
||||
and x_coarse_gen.max() <= model.config.CODEBOOK_SIZE - 1
|
||||
)
|
||||
if all(v is not None for v in history_prompt) or base is not None:
|
||||
if history_prompt is not None:
|
||||
x_fine_history = history_prompt[2]
|
||||
if base is not None:
|
||||
x_fine_history = base[2]
|
||||
assert (
|
||||
isinstance(x_fine_history, np.ndarray)
|
||||
and len(x_fine_history.shape) == 2
|
||||
and x_fine_history.shape[0] == model.config.N_FINE_CODEBOOKS
|
||||
and x_fine_history.shape[1] >= 0
|
||||
and x_fine_history.min() >= 0
|
||||
and x_fine_history.max() <= model.config.CODEBOOK_SIZE - 1
|
||||
)
|
||||
else:
|
||||
x_fine_history = None
|
||||
n_coarse = x_coarse_gen.shape[0]
|
||||
# make input arr
|
||||
in_arr = np.vstack(
|
||||
[
|
||||
x_coarse_gen,
|
||||
np.zeros((model.config.N_FINE_CODEBOOKS - n_coarse, x_coarse_gen.shape[1]))
|
||||
+ model.config.CODEBOOK_SIZE, # padding
|
||||
]
|
||||
).astype(np.int32)
|
||||
# prepend history if available (max 512)
|
||||
if x_fine_history is not None:
|
||||
x_fine_history = x_fine_history.astype(np.int32)
|
||||
in_arr = np.hstack(
|
||||
[
|
||||
x_fine_history[:, -512:].astype(np.int32),
|
||||
in_arr,
|
||||
]
|
||||
)
|
||||
n_history = x_fine_history[:, -512:].shape[1]
|
||||
else:
|
||||
n_history = 0
|
||||
n_remove_from_end = 0
|
||||
# need to pad if too short (since non-causal model)
|
||||
if in_arr.shape[1] < 1024:
|
||||
n_remove_from_end = 1024 - in_arr.shape[1]
|
||||
in_arr = np.hstack(
|
||||
[
|
||||
in_arr,
|
||||
np.zeros((model.config.N_FINE_CODEBOOKS, n_remove_from_end), dtype=np.int32)
|
||||
+ model.config.CODEBOOK_SIZE,
|
||||
]
|
||||
)
|
||||
# we can be lazy about fractional loop and just keep overwriting codebooks
|
||||
n_loops = np.max([0, int(np.ceil((x_coarse_gen.shape[1] - (1024 - n_history)) / 512))]) + 1
|
||||
with inference_mode():
|
||||
in_arr = torch.tensor(in_arr.T).to(model.device)
|
||||
for n in tqdm.tqdm(range(n_loops), disable=silent):
|
||||
start_idx = np.min([n * 512, in_arr.shape[0] - 1024])
|
||||
start_fill_idx = np.min([n_history + n * 512, in_arr.shape[0] - 512])
|
||||
rel_start_fill_idx = start_fill_idx - start_idx
|
||||
in_buffer = in_arr[start_idx : start_idx + 1024, :][None]
|
||||
for nn in range(n_coarse, model.config.N_FINE_CODEBOOKS):
|
||||
logits = model.fine_model(nn, in_buffer)
|
||||
if temp is None:
|
||||
relevant_logits = logits[0, rel_start_fill_idx:, : model.config.CODEBOOK_SIZE]
|
||||
codebook_preds = torch.argmax(relevant_logits, -1)
|
||||
else:
|
||||
relevant_logits = logits[0, :, : model.config.CODEBOOK_SIZE] / temp
|
||||
probs = F.softmax(relevant_logits, dim=-1)
|
||||
codebook_preds = torch.hstack(
|
||||
[torch.multinomial(probs[n], num_samples=1) for n in range(rel_start_fill_idx, 1024)]
|
||||
)
|
||||
in_buffer[0, rel_start_fill_idx:, nn] = codebook_preds
|
||||
del logits, codebook_preds
|
||||
# transfer over info into model_in and convert to numpy
|
||||
for nn in range(n_coarse, model.config.N_FINE_CODEBOOKS):
|
||||
in_arr[start_fill_idx : start_fill_idx + (1024 - rel_start_fill_idx), nn] = in_buffer[
|
||||
0, rel_start_fill_idx:, nn
|
||||
]
|
||||
del in_buffer
|
||||
gen_fine_arr = in_arr.detach().cpu().numpy().squeeze().T
|
||||
del in_arr
|
||||
gen_fine_arr = gen_fine_arr[:, n_history:]
|
||||
if n_remove_from_end > 0:
|
||||
gen_fine_arr = gen_fine_arr[:, :-n_remove_from_end]
|
||||
assert gen_fine_arr.shape[-1] == x_coarse_gen.shape[-1]
|
||||
clear_cuda_cache()
|
||||
return gen_fine_arr
|
||||
|
||||
|
||||
def codec_decode(fine_tokens, model):
|
||||
"""Turn quantized audio codes into audio array using encodec."""
|
||||
arr = torch.from_numpy(fine_tokens)[None]
|
||||
arr = arr.to(model.device)
|
||||
arr = arr.transpose(0, 1)
|
||||
emb = model.encodec.quantizer.decode(arr)
|
||||
out = model.encodec.decoder(emb)
|
||||
audio_arr = out.detach().cpu().numpy().squeeze()
|
||||
return audio_arr
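# A sketch of how the three generators above chain into a waveform. The helper below is
# purely illustrative: its name is made up, `bark_model` is assumed to be a Bark instance
# with all checkpoints loaded, and `(None, None, None)` means "no voice prompt".
def _example_text_to_waveform(bark_model, text="Hello world."):
    prompt = (None, None, None)
    semantic = generate_text_semantic(text, bark_model, history_prompt=prompt)
    coarse = generate_coarse(semantic, bark_model, history_prompt=prompt)  # [2, T] codes
    fine = generate_fine(coarse, bark_model, history_prompt=prompt)  # [n_q, T] codes
    return codec_decode(fine, bark_model)  # numpy waveform at model.config.sample_rate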
@@ -0,0 +1,160 @@
|
|||
import contextlib
|
||||
import functools
|
||||
import hashlib
|
||||
import logging
|
||||
import os
|
||||
|
||||
import requests
|
||||
import torch
|
||||
import tqdm
|
||||
|
||||
from TTS.tts.layers.bark.model import GPT, GPTConfig
|
||||
from TTS.tts.layers.bark.model_fine import FineGPT, FineGPTConfig
|
||||
|
||||
if (
|
||||
torch.cuda.is_available()
|
||||
and hasattr(torch.cuda, "amp")
|
||||
and hasattr(torch.cuda.amp, "autocast")
|
||||
and torch.cuda.is_bf16_supported()
|
||||
):
|
||||
autocast = functools.partial(torch.cuda.amp.autocast, dtype=torch.bfloat16)
|
||||
else:
|
||||
|
||||
@contextlib.contextmanager
|
||||
def autocast():
|
||||
yield
|
||||
|
||||
|
||||
# hold models in global scope to lazy load
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
if not hasattr(torch.nn.functional, "scaled_dot_product_attention"):
|
||||
logger.warning(
|
||||
"torch version does not support flash attention. You will get significantly faster"
|
||||
+ " inference speed by upgrade torch to newest version / nightly."
|
||||
)
|
||||
|
||||
|
||||
def _md5(fname):
|
||||
hash_md5 = hashlib.md5()
|
||||
with open(fname, "rb") as f:
|
||||
for chunk in iter(lambda: f.read(4096), b""):
|
||||
hash_md5.update(chunk)
|
||||
return hash_md5.hexdigest()
|
||||
|
||||
|
||||
def _download(from_s3_path, to_local_path, CACHE_DIR):
|
||||
os.makedirs(CACHE_DIR, exist_ok=True)
|
||||
response = requests.get(from_s3_path, stream=True)
|
||||
total_size_in_bytes = int(response.headers.get("content-length", 0))
|
||||
block_size = 1024 # 1 Kibibyte
|
||||
progress_bar = tqdm.tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True)
|
||||
with open(to_local_path, "wb") as file:
|
||||
for data in response.iter_content(block_size):
|
||||
progress_bar.update(len(data))
|
||||
file.write(data)
|
||||
progress_bar.close()
|
||||
if total_size_in_bytes not in [0, progress_bar.n]:
|
||||
raise ValueError("ERROR, something went wrong")
|
||||
|
||||
|
||||
class InferenceContext:
|
||||
def __init__(self, benchmark=False):
|
||||
# we can't expect inputs to be the same length, so disable benchmarking by default
|
||||
self._chosen_cudnn_benchmark = benchmark
|
||||
self._cudnn_benchmark = None
|
||||
|
||||
def __enter__(self):
|
||||
self._cudnn_benchmark = torch.backends.cudnn.benchmark
|
||||
torch.backends.cudnn.benchmark = self._chosen_cudnn_benchmark
|
||||
|
||||
def __exit__(self, exc_type, exc_value, exc_traceback):
|
||||
torch.backends.cudnn.benchmark = self._cudnn_benchmark
|
||||
|
||||
|
||||
if torch.cuda.is_available():
|
||||
torch.backends.cuda.matmul.allow_tf32 = True
|
||||
torch.backends.cudnn.allow_tf32 = True
|
||||
|
||||
|
||||
@contextlib.contextmanager
|
||||
def inference_mode():
|
||||
with InferenceContext(), torch.inference_mode(), torch.no_grad(), autocast():
|
||||
yield
|
||||
|
||||
|
||||
def clear_cuda_cache():
|
||||
if torch.cuda.is_available():
|
||||
torch.cuda.empty_cache()
|
||||
torch.cuda.synchronize()
|
||||
|
||||
|
||||
def load_model(ckpt_path, device, config, model_type="text"):
|
||||
logger.info(f"loading {model_type} model from {ckpt_path}...")
|
||||
|
||||
if device == "cpu":
|
||||
logger.warning("No GPU being used. Careful, Inference might be extremely slow!")
|
||||
if model_type == "text":
|
||||
ConfigClass = GPTConfig
|
||||
ModelClass = GPT
|
||||
elif model_type == "coarse":
|
||||
ConfigClass = GPTConfig
|
||||
ModelClass = GPT
|
||||
elif model_type == "fine":
|
||||
ConfigClass = FineGPTConfig
|
||||
ModelClass = FineGPT
|
||||
else:
|
||||
raise NotImplementedError()
|
||||
if (
|
||||
not config.USE_SMALLER_MODELS
|
||||
and os.path.exists(ckpt_path)
|
||||
and _md5(ckpt_path) != config.REMOTE_MODEL_PATHS[model_type]["checksum"]
|
||||
):
|
||||
logger.warning(f"found outdated {model_type} model, removing...")
|
||||
os.remove(ckpt_path)
|
||||
if not os.path.exists(ckpt_path):
|
||||
logger.info(f"{model_type} model not found, downloading...")
|
||||
_download(config.REMOTE_MODEL_PATHS[model_type]["path"], ckpt_path, config.CACHE_DIR)
|
||||
|
||||
checkpoint = torch.load(ckpt_path, map_location=device)
|
||||
# this is a hack
|
||||
model_args = checkpoint["model_args"]
|
||||
if "input_vocab_size" not in model_args:
|
||||
model_args["input_vocab_size"] = model_args["vocab_size"]
|
||||
model_args["output_vocab_size"] = model_args["vocab_size"]
|
||||
del model_args["vocab_size"]
|
||||
|
||||
gptconf = ConfigClass(**checkpoint["model_args"])
|
||||
if model_type == "text":
|
||||
config.semantic_config = gptconf
|
||||
elif model_type == "coarse":
|
||||
config.coarse_config = gptconf
|
||||
elif model_type == "fine":
|
||||
config.fine_config = gptconf
|
||||
|
||||
model = ModelClass(gptconf)
|
||||
state_dict = checkpoint["model"]
|
||||
# fixup checkpoint
|
||||
unwanted_prefix = "_orig_mod."
|
||||
for k, _ in list(state_dict.items()):
|
||||
if k.startswith(unwanted_prefix):
|
||||
state_dict[k[len(unwanted_prefix) :]] = state_dict.pop(k)
|
||||
extra_keys = set(state_dict.keys()) - set(model.state_dict().keys())
|
||||
extra_keys = set(k for k in extra_keys if not k.endswith(".attn.bias"))
|
||||
missing_keys = set(model.state_dict().keys()) - set(state_dict.keys())
|
||||
missing_keys = set(k for k in missing_keys if not k.endswith(".attn.bias"))
|
||||
if len(extra_keys) != 0:
|
||||
raise ValueError(f"extra keys found: {extra_keys}")
|
||||
if len(missing_keys) != 0:
|
||||
raise ValueError(f"missing keys: {missing_keys}")
|
||||
model.load_state_dict(state_dict, strict=False)
|
||||
n_params = model.get_num_params()
|
||||
val_loss = checkpoint["best_val_loss"].item()
|
||||
logger.info(f"model loaded: {round(n_params/1e6,1)}M params, {round(val_loss,3)} loss")
|
||||
model.eval()
|
||||
model.to(device)
|
||||
del checkpoint, state_dict
|
||||
clear_cuda_cache()
|
||||
return model, config
|
|
@@ -0,0 +1,233 @@
|
|||
"""
|
||||
Much of this code is adapted from Andrej Karpathy's NanoGPT
|
||||
(https://github.com/karpathy/nanoGPT)
|
||||
"""
|
||||
import math
|
||||
from dataclasses import dataclass
|
||||
|
||||
import torch
|
||||
from coqpit import Coqpit
|
||||
from torch import nn
|
||||
from torch.nn import functional as F
|
||||
|
||||
|
||||
class LayerNorm(nn.Module):
|
||||
"""LayerNorm but with an optional bias. PyTorch doesn't support simply bias=False"""
|
||||
|
||||
def __init__(self, ndim, bias):
|
||||
super().__init__()
|
||||
self.weight = nn.Parameter(torch.ones(ndim))
|
||||
self.bias = nn.Parameter(torch.zeros(ndim)) if bias else None
|
||||
|
||||
def forward(self, x):
|
||||
return F.layer_norm(x, self.weight.shape, self.weight, self.bias, 1e-5)
|
||||
|
||||
|
||||
class CausalSelfAttention(nn.Module):
|
||||
def __init__(self, config):
|
||||
super().__init__()
|
||||
assert config.n_embd % config.n_head == 0
|
||||
# key, query, value projections for all heads, but in a batch
|
||||
self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
|
||||
# output projection
|
||||
self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
|
||||
# regularization
|
||||
self.attn_dropout = nn.Dropout(config.dropout)
|
||||
self.resid_dropout = nn.Dropout(config.dropout)
|
||||
self.n_head = config.n_head
|
||||
self.n_embd = config.n_embd
|
||||
self.dropout = config.dropout
|
||||
# flash attention makes the GPU go brrrrr but support is only in PyTorch nightly and still a bit scary
|
||||
self.flash = hasattr(torch.nn.functional, "scaled_dot_product_attention")
|
||||
if not self.flash:
|
||||
# print("WARNING: using slow attention. Flash Attention atm needs PyTorch nightly and dropout=0.0")
|
||||
# causal mask to ensure that attention is only applied to the left in the input sequence
|
||||
self.register_buffer(
|
||||
"bias",
|
||||
torch.tril(torch.ones(config.block_size, config.block_size)).view(
|
||||
1, 1, config.block_size, config.block_size
|
||||
),
|
||||
)
|
||||
|
||||
def forward(self, x, past_kv=None, use_cache=False):
|
||||
B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)
|
||||
|
||||
# calculate query, key, values for all heads in batch and move head forward to be the batch dim
|
||||
q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
|
||||
k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
|
||||
q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
|
||||
v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
|
||||
|
||||
if past_kv is not None:
|
||||
past_key = past_kv[0]
|
||||
past_value = past_kv[1]
|
||||
k = torch.cat((past_key, k), dim=-2)
|
||||
v = torch.cat((past_value, v), dim=-2)
|
||||
|
||||
FULL_T = k.shape[-2]
|
||||
|
||||
if use_cache is True:
|
||||
present = (k, v)
|
||||
else:
|
||||
present = None
|
||||
|
||||
# causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
|
||||
if self.flash:
|
||||
# efficient attention using Flash Attention CUDA kernels
|
||||
if past_kv is not None:
|
||||
# When `past_kv` is provided, we're doing incremental decoding and `q.shape[2] == 1`: q only contains
|
||||
# the query for the last token. scaled_dot_product_attention interprets this as the first token in the
|
||||
# sequence, so if is_causal=True it will mask out all attention from it. This is not what we want, so
|
||||
# to work around this we set is_causal=False.
|
||||
is_causal = False
|
||||
else:
|
||||
is_causal = True
|
||||
|
||||
# efficient attention using Flash Attention CUDA kernels
|
||||
y = torch.nn.functional.scaled_dot_product_attention(q, k, v, dropout_p=self.dropout, is_causal=is_causal)
|
||||
else:
|
||||
# manual implementation of attention
|
||||
att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
|
||||
att = att.masked_fill(self.bias[:, :, FULL_T - T : FULL_T, :FULL_T] == 0, float("-inf"))
|
||||
att = F.softmax(att, dim=-1)
|
||||
att = self.attn_dropout(att)
|
||||
y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
|
||||
y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side
|
||||
|
||||
# output projection
|
||||
y = self.resid_dropout(self.c_proj(y))
|
||||
return (y, present)
|
||||
|
||||
|
||||
class MLP(nn.Module):
|
||||
def __init__(self, config):
|
||||
super().__init__()
|
||||
self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd, bias=config.bias)
|
||||
self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd, bias=config.bias)
|
||||
self.dropout = nn.Dropout(config.dropout)
|
||||
self.gelu = nn.GELU()
|
||||
|
||||
def forward(self, x):
|
||||
x = self.c_fc(x)
|
||||
x = self.gelu(x)
|
||||
x = self.c_proj(x)
|
||||
x = self.dropout(x)
|
||||
return x
|
||||
|
||||
|
||||
class Block(nn.Module):
|
||||
def __init__(self, config, layer_idx):
|
||||
super().__init__()
|
||||
self.ln_1 = LayerNorm(config.n_embd, bias=config.bias)
|
||||
self.attn = CausalSelfAttention(config)
|
||||
self.ln_2 = LayerNorm(config.n_embd, bias=config.bias)
|
||||
self.mlp = MLP(config)
|
||||
self.layer_idx = layer_idx
|
||||
|
||||
def forward(self, x, past_kv=None, use_cache=False):
|
||||
attn_output, prev_kvs = self.attn(self.ln_1(x), past_kv=past_kv, use_cache=use_cache)
|
||||
x = x + attn_output
|
||||
x = x + self.mlp(self.ln_2(x))
|
||||
return (x, prev_kvs)
|
||||
|
||||
|
||||
@dataclass
|
||||
class GPTConfig(Coqpit):
|
||||
block_size: int = 1024
|
||||
input_vocab_size: int = 10_048
|
||||
output_vocab_size: int = 10_048
|
||||
n_layer: int = 12
|
||||
n_head: int = 12
|
||||
n_embd: int = 768
|
||||
dropout: float = 0.0
|
||||
bias: bool = True # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster
|
||||
|
||||
|
||||
class GPT(nn.Module):
|
||||
def __init__(self, config):
|
||||
super().__init__()
|
||||
assert config.input_vocab_size is not None
|
||||
assert config.output_vocab_size is not None
|
||||
assert config.block_size is not None
|
||||
self.config = config
|
||||
|
||||
self.transformer = nn.ModuleDict(
|
||||
dict(
|
||||
wte=nn.Embedding(config.input_vocab_size, config.n_embd),
|
||||
wpe=nn.Embedding(config.block_size, config.n_embd),
|
||||
drop=nn.Dropout(config.dropout),
|
||||
h=nn.ModuleList([Block(config, idx) for idx in range(config.n_layer)]),
|
||||
ln_f=LayerNorm(config.n_embd, bias=config.bias),
|
||||
)
|
||||
)
|
||||
self.lm_head = nn.Linear(config.n_embd, config.output_vocab_size, bias=False)
|
||||
|
||||
def get_num_params(self, non_embedding=True):
|
||||
"""
|
||||
Return the number of parameters in the model.
|
||||
For non-embedding count (default), the position embeddings get subtracted.
|
||||
The token embeddings would too, except due to the parameter sharing these
|
||||
params are actually used as weights in the final layer, so we include them.
|
||||
"""
|
||||
n_params = sum(p.numel() for p in self.parameters())
|
||||
if non_embedding:
|
||||
n_params -= self.transformer.wte.weight.numel()
|
||||
n_params -= self.transformer.wpe.weight.numel()
|
||||
return n_params
|
||||
|
||||
def forward(self, idx, merge_context=False, past_kv=None, position_ids=None, use_cache=False):
|
||||
device = idx.device
|
||||
_, t = idx.size()
|
||||
if past_kv is not None:
|
||||
assert t == 1
|
||||
tok_emb = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd)
|
||||
else:
|
||||
if merge_context:
|
||||
assert idx.shape[1] >= 256 + 256 + 1
|
||||
t = idx.shape[1] - 256
|
||||
else:
|
||||
assert (
|
||||
t <= self.config.block_size
|
||||
), f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}"
|
||||
|
||||
# forward the GPT model itself
|
||||
if merge_context:
|
||||
tok_emb = torch.cat(
|
||||
[
|
||||
self.transformer.wte(idx[:, :256]) + self.transformer.wte(idx[:, 256 : 256 + 256]),
|
||||
self.transformer.wte(idx[:, 256 + 256 :]),
|
||||
],
|
||||
dim=1,
|
||||
)
|
||||
else:
|
||||
tok_emb = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd)
|
||||
|
||||
if past_kv is None:
|
||||
past_length = 0
|
||||
past_kv = tuple([None] * len(self.transformer.h))
|
||||
else:
|
||||
past_length = past_kv[0][0].size(-2)
|
||||
|
||||
if position_ids is None:
|
||||
position_ids = torch.arange(past_length, t + past_length, dtype=torch.long, device=device)
|
||||
position_ids = position_ids.unsqueeze(0) # shape (1, t)
|
||||
assert position_ids.shape == (1, t)
|
||||
|
||||
pos_emb = self.transformer.wpe(position_ids) # position embeddings of shape (1, t, n_embd)
|
||||
|
||||
x = self.transformer.drop(tok_emb + pos_emb)
|
||||
|
||||
new_kv = () if use_cache else None
|
||||
|
||||
for _, (block, past_layer_kv) in enumerate(zip(self.transformer.h, past_kv)):
|
||||
x, kv = block(x, past_kv=past_layer_kv, use_cache=use_cache)
|
||||
|
||||
if use_cache:
|
||||
new_kv = new_kv + (kv,)
|
||||
|
||||
x = self.transformer.ln_f(x)
|
||||
|
||||
# inference-time mini-optimization: only forward the lm_head on the very last position
|
||||
logits = self.lm_head(x[:, [-1], :]) # note: using list [-1] to preserve the time dim
|
||||
|
||||
return (logits, new_kv)
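# A minimal shape-check sketch (hypothetical small hyperparameters, not a trained model):
if __name__ == "__main__":
    cfg = GPTConfig(block_size=128, n_layer=2, n_head=2, n_embd=64)
    gpt = GPT(cfg)
    idx = torch.randint(0, cfg.input_vocab_size, (1, 16))
    logits, kv = gpt(idx, use_cache=True)
    print(logits.shape)  # (1, 1, output_vocab_size); only the last position is projected
    print(len(kv))  # one cached (k, v) pair per layer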
@@ -0,0 +1,142 @@
|
|||
"""
|
||||
Much of this code is adapted from Andrej Karpathy's NanoGPT
|
||||
(https://github.com/karpathy/nanoGPT)
|
||||
"""
|
||||
import math
|
||||
from dataclasses import dataclass
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
from torch.nn import functional as F
|
||||
|
||||
from .model import GPT, MLP, GPTConfig
|
||||
|
||||
|
||||
class NonCausalSelfAttention(nn.Module):
|
||||
def __init__(self, config):
|
||||
super().__init__()
|
||||
assert config.n_embd % config.n_head == 0
|
||||
# key, query, value projections for all heads, but in a batch
|
||||
self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
|
||||
# output projection
|
||||
self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
|
||||
# regularization
|
||||
self.attn_dropout = nn.Dropout(config.dropout)
|
||||
self.resid_dropout = nn.Dropout(config.dropout)
|
||||
self.n_head = config.n_head
|
||||
self.n_embd = config.n_embd
|
||||
self.dropout = config.dropout
|
||||
# flash attention makes the GPU go brrrrr but support is only in PyTorch nightly and still a bit scary
|
||||
self.flash = hasattr(torch.nn.functional, "scaled_dot_product_attention") and self.dropout == 0.0
|
||||
|
||||
def forward(self, x):
|
||||
B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)
|
||||
|
||||
# calculate query, key, values for all heads in batch and move head forward to be the batch dim
|
||||
q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
|
||||
k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
|
||||
q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
|
||||
v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
|
||||
|
||||
# causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
|
||||
if self.flash:
|
||||
# efficient attention using Flash Attention CUDA kernels
|
||||
y = torch.nn.functional.scaled_dot_product_attention(
|
||||
q, k, v, attn_mask=None, dropout_p=self.dropout, is_causal=False
|
||||
)
|
||||
else:
|
||||
# manual implementation of attention
|
||||
att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
|
||||
att = F.softmax(att, dim=-1)
|
||||
att = self.attn_dropout(att)
|
||||
y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
|
||||
y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side
|
||||
|
||||
# output projection
|
||||
y = self.resid_dropout(self.c_proj(y))
|
||||
return y
|
||||
|
||||
|
||||
class FineBlock(nn.Module):
|
||||
def __init__(self, config):
|
||||
super().__init__()
|
||||
self.ln_1 = nn.LayerNorm(config.n_embd)
|
||||
self.attn = NonCausalSelfAttention(config)
|
||||
self.ln_2 = nn.LayerNorm(config.n_embd)
|
||||
self.mlp = MLP(config)
|
||||
|
||||
def forward(self, x):
|
||||
x = x + self.attn(self.ln_1(x))
|
||||
x = x + self.mlp(self.ln_2(x))
|
||||
return x
|
||||
|
||||
|
||||
class FineGPT(GPT):
|
||||
def __init__(self, config):
|
||||
super().__init__(config)
|
||||
del self.lm_head
|
||||
self.config = config
|
||||
self.n_codes_total = config.n_codes_total
|
||||
self.transformer = nn.ModuleDict(
|
||||
dict(
|
||||
wtes=nn.ModuleList(
|
||||
[nn.Embedding(config.input_vocab_size, config.n_embd) for _ in range(config.n_codes_total)]
|
||||
),
|
||||
wpe=nn.Embedding(config.block_size, config.n_embd),
|
||||
drop=nn.Dropout(config.dropout),
|
||||
h=nn.ModuleList([FineBlock(config) for _ in range(config.n_layer)]),
|
||||
ln_f=nn.LayerNorm(config.n_embd),
|
||||
)
|
||||
)
|
||||
self.lm_heads = nn.ModuleList(
|
||||
[
|
||||
nn.Linear(config.n_embd, config.output_vocab_size, bias=False)
|
||||
for _ in range(config.n_codes_given, self.n_codes_total)
|
||||
]
|
||||
)
|
||||
for i in range(self.n_codes_total - config.n_codes_given):
|
||||
self.transformer.wtes[i + 1].weight = self.lm_heads[i].weight
|
||||
|
||||
def forward(self, pred_idx, idx):
|
||||
device = idx.device
|
||||
b, t, codes = idx.size()
|
||||
assert (
|
||||
t <= self.config.block_size
|
||||
), f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}"
|
||||
assert pred_idx > 0, "cannot predict 0th codebook"
|
||||
assert codes == self.n_codes_total, (b, t, codes)
|
||||
pos = torch.arange(0, t, dtype=torch.long, device=device).unsqueeze(0) # shape (1, t)
|
||||
|
||||
# forward the GPT model itself
|
||||
tok_embs = [
|
||||
wte(idx[:, :, i]).unsqueeze(-1) for i, wte in enumerate(self.transformer.wtes)
|
||||
] # token embeddings of shape (b, t, n_embd)
|
||||
tok_emb = torch.cat(tok_embs, dim=-1)
|
||||
pos_emb = self.transformer.wpe(pos) # position embeddings of shape (1, t, n_embd)
|
||||
x = tok_emb[:, :, :, : pred_idx + 1].sum(dim=-1)
|
||||
x = self.transformer.drop(x + pos_emb)
|
||||
for block in self.transformer.h:
|
||||
x = block(x)
|
||||
x = self.transformer.ln_f(x)
|
||||
logits = self.lm_heads[pred_idx - self.config.n_codes_given](x)
|
||||
return logits
|
||||
|
||||
def get_num_params(self, non_embedding=True):
|
||||
"""
|
||||
Return the number of parameters in the model.
|
||||
For non-embedding count (default), the position embeddings get subtracted.
|
||||
The token embeddings would too, except due to the parameter sharing these
|
||||
params are actually used as weights in the final layer, so we include them.
|
||||
"""
|
||||
n_params = sum(p.numel() for p in self.parameters())
|
||||
if non_embedding:
|
||||
for wte in self.transformer.wtes:
|
||||
n_params -= wte.weight.numel()
|
||||
n_params -= self.transformer.wpe.weight.numel()
|
||||
return n_params
|
||||
|
||||
|
||||
@dataclass
|
||||
class FineGPTConfig(GPTConfig):
|
||||
n_codes_total: int = 8
|
||||
n_codes_given: int = 1
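# A minimal shape-check sketch (hypothetical small hyperparameters, not a trained model):
if __name__ == "__main__":
    cfg = FineGPTConfig(block_size=64, n_layer=2, n_head=2, n_embd=64)
    fine_gpt = FineGPT(cfg)
    codes = torch.randint(0, 1024, (1, 32, cfg.n_codes_total))
    logits = fine_gpt(pred_idx=2, idx=codes)  # predict fine codebook 2
    print(logits.shape)  # (1, 32, output_vocab_size)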
@@ -165,7 +165,7 @@ class BCELossMasked(nn.Module):
|
|||
|
||||
def __init__(self, pos_weight: float = None):
|
||||
super().__init__()
|
||||
self.pos_weight = nn.Parameter(torch.tensor([pos_weight]), requires_grad=False)
|
||||
self.register_buffer("pos_weight", torch.tensor([pos_weight]))
|
||||
|
||||
def forward(self, x, target, length):
|
||||
"""
|
||||
|
@@ -191,10 +191,15 @@ class BCELossMasked(nn.Module):
|
|||
mask = sequence_mask(sequence_length=length, max_len=target.size(1))
|
||||
num_items = mask.sum()
|
||||
loss = functional.binary_cross_entropy_with_logits(
|
||||
x.masked_select(mask), target.masked_select(mask), pos_weight=self.pos_weight, reduction="sum"
|
||||
x.masked_select(mask),
|
||||
target.masked_select(mask),
|
||||
pos_weight=self.pos_weight.to(x.device),
|
||||
reduction="sum",
|
||||
)
|
||||
else:
|
||||
loss = functional.binary_cross_entropy_with_logits(x, target, pos_weight=self.pos_weight, reduction="sum")
|
||||
loss = functional.binary_cross_entropy_with_logits(
|
||||
x, target, pos_weight=self.pos_weight.to(x.device), reduction="sum"
|
||||
)
|
||||
num_items = torch.numel(x)
|
||||
loss = loss / num_items
|
||||
return loss
|
||||
|
|
|
@@ -150,7 +150,7 @@ class ConvFlow(nn.Module):
|
|||
class StochasticDurationPredictor(nn.Module):
|
||||
"""Stochastic duration predictor with Spline Flows.
|
||||
|
||||
It applies Variational Dequantization and Variationsl Data Augmentation.
|
||||
It applies Variational Dequantization and Variational Data Augmentation.
|
||||
|
||||
Paper:
|
||||
SDP: https://arxiv.org/pdf/2106.06103.pdf
|
||||
|
|
|
@@ -0,0 +1,277 @@
|
|||
import os
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional
|
||||
|
||||
import numpy as np
|
||||
from coqpit import Coqpit
|
||||
from encodec import EncodecModel
|
||||
from transformers import BertTokenizer
|
||||
|
||||
from TTS.tts.layers.bark.inference_funcs import (
|
||||
codec_decode,
|
||||
generate_coarse,
|
||||
generate_fine,
|
||||
generate_text_semantic,
|
||||
generate_voice,
|
||||
load_voice,
|
||||
)
|
||||
from TTS.tts.layers.bark.load_model import load_model
|
||||
from TTS.tts.layers.bark.model import GPT
|
||||
from TTS.tts.layers.bark.model_fine import FineGPT
|
||||
from TTS.tts.models.base_tts import BaseTTS
|
||||
|
||||
|
||||
@dataclass
|
||||
class BarkAudioConfig(Coqpit):
|
||||
sample_rate: int = 24000
|
||||
output_sample_rate: int = 24000
|
||||
|
||||
|
||||
class Bark(BaseTTS):
|
||||
def __init__(
|
||||
self,
|
||||
config: Coqpit,
|
||||
tokenizer: BertTokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased"),
|
||||
) -> None:
|
||||
super().__init__(config=config, ap=None, tokenizer=None, speaker_manager=None, language_manager=None)
|
||||
self.config.num_chars = len(tokenizer)
|
||||
self.tokenizer = tokenizer
|
||||
self.semantic_model = GPT(config.semantic_config)
|
||||
self.coarse_model = GPT(config.coarse_config)
|
||||
self.fine_model = FineGPT(config.fine_config)
|
||||
self.encodec = EncodecModel.encodec_model_24khz()
|
||||
self.encodec.set_target_bandwidth(6.0)
|
||||
|
||||
@property
|
||||
def device(self):
|
||||
return next(self.parameters()).device
|
||||
|
||||
def load_bark_models(self):
|
||||
self.semantic_model, self.config = load_model(
|
||||
ckpt_path=self.config.LOCAL_MODEL_PATHS["text"], device=self.device, config=self.config, model_type="text"
|
||||
)
|
||||
self.coarse_model, self.config = load_model(
|
||||
ckpt_path=self.config.LOCAL_MODEL_PATHS["coarse"],
|
||||
device=self.device,
|
||||
config=self.config,
|
||||
model_type="coarse",
|
||||
)
|
||||
self.fine_model, self.config = load_model(
|
||||
ckpt_path=self.config.LOCAL_MODEL_PATHS["fine"], device=self.device, config=self.config, model_type="fine"
|
||||
)
|
||||
|
||||
def train_step(
|
||||
self,
|
||||
):
|
||||
pass
|
||||
|
||||
def text_to_semantic(
|
||||
self,
|
||||
text: str,
|
||||
history_prompt: Optional[str] = None,
|
||||
temp: float = 0.7,
|
||||
base=None,
|
||||
allow_early_stop=True,
|
||||
**kwargs,
|
||||
):
|
||||
"""Generate semantic array from text.
|
||||
|
||||
Args:
|
||||
text: text to be turned into audio
|
||||
history_prompt: history choice for audio cloning
|
||||
temp: generation temperature (1.0 more diverse, 0.0 more conservative)
|
||||
|
||||
Returns:
|
||||
numpy semantic array to be fed into `semantic_to_waveform`
|
||||
"""
|
||||
x_semantic = generate_text_semantic(
|
||||
text,
|
||||
self,
|
||||
history_prompt=history_prompt,
|
||||
temp=temp,
|
||||
base=base,
|
||||
allow_early_stop=allow_early_stop,
|
||||
**kwargs,
|
||||
)
|
||||
return x_semantic
|
||||
|
||||
def semantic_to_waveform(
|
||||
self,
|
||||
semantic_tokens: np.ndarray,
|
||||
history_prompt: Optional[str] = None,
|
||||
temp: float = 0.7,
|
||||
base=None,
|
||||
):
|
||||
"""Generate audio array from semantic input.
|
||||
|
||||
Args:
|
||||
semantic_tokens: semantic token output from `text_to_semantic`
|
||||
history_prompt: history choice for audio cloning
|
||||
temp: generation temperature (1.0 more diverse, 0.0 more conservative)
|
||||
|
||||
Returns:
|
||||
numpy audio array at sample frequency 24khz
|
||||
"""
|
||||
x_coarse_gen = generate_coarse(
|
||||
semantic_tokens,
|
||||
self,
|
||||
history_prompt=history_prompt,
|
||||
temp=temp,
|
||||
base=base,
|
||||
)
|
||||
x_fine_gen = generate_fine(
|
||||
x_coarse_gen,
|
||||
self,
|
||||
history_prompt=history_prompt,
|
||||
temp=0.5,
|
||||
base=base,
|
||||
)
|
||||
audio_arr = codec_decode(x_fine_gen, self)
|
||||
return audio_arr, x_coarse_gen, x_fine_gen
|
||||
|
||||
def generate_audio(
|
||||
self,
|
||||
text: str,
|
||||
history_prompt: Optional[str] = None,
|
||||
text_temp: float = 0.7,
|
||||
waveform_temp: float = 0.7,
|
||||
base=None,
|
||||
allow_early_stop=True,
|
||||
**kwargs,
|
||||
):
|
||||
"""Generate audio array from input text.
|
||||
|
||||
Args:
|
||||
text: text to be turned into audio
|
||||
history_prompt: history choice for audio cloning
|
||||
text_temp: generation temperature (1.0 more diverse, 0.0 more conservative)
|
||||
waveform_temp: generation temperature (1.0 more diverse, 0.0 more conservative)
|
||||
|
||||
Returns:
|
||||
numpy audio array at sample frequency 24khz
|
||||
"""
|
||||
x_semantic = self.text_to_semantic(
|
||||
text,
|
||||
history_prompt=history_prompt,
|
||||
temp=text_temp,
|
||||
base=base,
|
||||
allow_early_stop=allow_early_stop,
|
||||
**kwargs,
|
||||
)
|
||||
audio_arr, c, f = self.semantic_to_waveform(
|
||||
x_semantic, history_prompt=history_prompt, temp=waveform_temp, base=base
|
||||
)
|
||||
return audio_arr, [x_semantic, c, f]
|
||||
|
||||
def generate_voice(self, audio, speaker_id, voice_dir):
|
||||
"""Generate a voice from the given audio and text.
|
||||
|
||||
Args:
|
||||
audio (str): Path to the audio file.
|
||||
speaker_id (str): Speaker name.
|
||||
voice_dir (str): Path to the directory to save the generated voice.
|
||||
"""
|
||||
if voice_dir is not None:
|
||||
voice_dirs = [voice_dir]
|
||||
try:
|
||||
_ = load_voice(speaker_id, voice_dirs)
|
||||
except (KeyError, FileNotFoundError):
|
||||
output_path = os.path.join(voice_dir, speaker_id + ".npz")
|
||||
os.makedirs(voice_dir, exist_ok=True)
|
||||
generate_voice(audio, self, output_path)
|
||||
|
||||
def _set_voice_dirs(self, voice_dirs):
|
||||
def_voice_dir = None
|
||||
if isinstance(self.config.DEF_SPEAKER_DIR, str):
|
||||
os.makedirs(self.config.DEF_SPEAKER_DIR, exist_ok=True)
|
||||
if os.path.isdir(self.config.DEF_SPEAKER_DIR):
|
||||
def_voice_dir = self.config.DEF_SPEAKER_DIR
|
||||
_voice_dirs = [def_voice_dir] if def_voice_dir is not None else []
|
||||
if voice_dirs is not None:
|
||||
if isinstance(voice_dirs, str):
|
||||
voice_dirs = [voice_dirs]
|
||||
_voice_dirs = voice_dirs + _voice_dirs
|
||||
return _voice_dirs
|
||||
|
||||
# TODO: remove config from synthesize
|
||||
def synthesize(
|
||||
self, text, config, speaker_id="random", voice_dirs=None, **kwargs
|
||||
): # pylint: disable=unused-argument
|
||||
"""Synthesize speech with the given input text.
|
||||
|
||||
Args:
|
||||
text (str): Input text.
|
||||
config (BarkConfig): Config with inference parameters.
|
||||
speaker_id (str): One of the available speaker names. If `random`, it generates a random speaker.
|
||||
speaker_wav (str): Path to the speaker audio file for cloning a new voice. It is cloned and saved in
|
||||
`voice_dirs` with the name `speaker_id`. Defaults to None.
|
||||
voice_dirs (List[str]): List of paths that host reference audio files for speakers. Defaults to None.
|
||||
**kwargs: Inference settings. See `inference()`.
|
||||
|
||||
Returns:
|
||||
A dictionary of the output values with `wav` as the output waveform and `text_inputs` as the input text.
|
||||
|
||||
"""
|
||||
voice_dirs = self._set_voice_dirs(voice_dirs)
|
||||
history_prompt = load_voice(self, speaker_id, voice_dirs)
|
||||
outputs = self.generate_audio(text, history_prompt=history_prompt, **kwargs)
|
||||
return_dict = {
|
||||
"wav": outputs[0],
|
||||
"text_inputs": text,
|
||||
}
|
||||
|
||||
return return_dict
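# Example usage (sketch): the config import path and the checkpoint directory are
# assumptions, not something this diff defines.
#   from TTS.tts.configs.bark_config import BarkConfig
#   config = BarkConfig()
#   bark = Bark.init_from_config(config)
#   bark.load_checkpoint(config, checkpoint_dir="/path/to/bark/", eval=True)
#   out = bark.synthesize("Hello world.", config, speaker_id="random")
#   out["wav"]  # numpy waveform at 24 kHz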
def eval_step(self):
|
||||
...
|
||||
|
||||
def forward(self):
|
||||
...
|
||||
|
||||
def inference(self):
|
||||
...
|
||||
|
||||
@staticmethod
|
||||
def init_from_config(config: "BarkConfig", **kwargs): # pylint: disable=unused-argument
|
||||
return Bark(config)
|
||||
|
||||
# pylint: disable=unused-argument, redefined-builtin
|
||||
def load_checkpoint(
|
||||
self,
|
||||
config,
|
||||
checkpoint_dir,
|
||||
text_model_path=None,
|
||||
coarse_model_path=None,
|
||||
fine_model_path=None,
|
||||
eval=False,
|
||||
strict=True,
|
||||
**kwargs,
|
||||
):
|
||||
"""Load a model checkpoints from a directory. This model is with multiple checkpoint files and it
|
||||
expects to have all the files to be under the given `checkpoint_dir` with the rigth names.
|
||||
If eval is True, set the model to eval mode.
|
||||
|
||||
Args:
|
||||
config (BarkConfig): The model config.
checkpoint_dir (str): The directory where the checkpoints are stored.
text_model_path (str, optional): The path to the text (semantic) model checkpoint. Defaults to None.
coarse_model_path (str, optional): The path to the coarse model checkpoint. Defaults to None.
fine_model_path (str, optional): The path to the fine model checkpoint. Defaults to None.
eval (bool, optional): Whether to set the model to eval mode. Defaults to False.
strict (bool, optional): Whether to load the model strictly. Defaults to True.
|
||||
"""
|
||||
text_model_path = text_model_path or os.path.join(checkpoint_dir, "text_2.pt")
|
||||
coarse_model_path = coarse_model_path or os.path.join(checkpoint_dir, "coarse_2.pt")
|
||||
fine_model_path = fine_model_path or os.path.join(checkpoint_dir, "fine_2.pt")
|
||||
|
||||
self.config.LOCAL_MODEL_PATHS["text"] = text_model_path
|
||||
self.config.LOCAL_MODEL_PATHS["coarse"] = coarse_model_path
|
||||
self.config.LOCAL_MODEL_PATHS["fine"] = fine_model_path
|
||||
|
||||
self.load_bark_models()
|
||||
|
||||
if eval:
|
||||
self.eval()
|
|
@@ -1,5 +1,6 @@
|
|||
import os
|
||||
import random
|
||||
import re
|
||||
from contextlib import contextmanager
|
||||
from dataclasses import dataclass
|
||||
from time import time
|
||||
|
@@ -255,7 +256,7 @@ class TortoiseArgs(Coqpit):
|
|||
"""
|
||||
|
||||
autoregressive_batch_size: int = 1
|
||||
enable_redaction: bool = True
|
||||
enable_redaction: bool = False
|
||||
high_vram: bool = False
|
||||
kv_cache: bool = True
|
||||
ar_checkpoint: str = None
|
||||
|
@@ -871,7 +872,16 @@ class Tortoise(BaseTTS):
|
|||
vocoder_checkpoint_path = vocoder_checkpoint_path or os.path.join(checkpoint_dir, "vocoder.pth")
|
||||
|
||||
if os.path.exists(ar_path):
|
||||
self.autoregressive.load_state_dict(torch.load(ar_path), strict=strict)
|
||||
keys_to_ignore = self.autoregressive.gpt._keys_to_ignore_on_load_missing # pylint: disable=protected-access
|
||||
# remove keys from the checkpoint that are not in the model
|
||||
checkpoint = torch.load(ar_path, map_location=torch.device("cpu"))
|
||||
for key in list(checkpoint.keys()):
|
||||
for pat in keys_to_ignore:
|
||||
if re.search(pat, key) is not None:
|
||||
del checkpoint[key]
|
||||
break
|
||||
|
||||
self.autoregressive.load_state_dict(checkpoint, strict=strict)
|
||||
|
||||
if os.path.exists(diff_path):
|
||||
self.diffusion.load_state_dict(torch.load(diff_path), strict=strict)
|
||||
|
|
|
@@ -25,11 +25,12 @@ from TTS.tts.layers.vits.discriminator import VitsDiscriminator
|
|||
from TTS.tts.layers.vits.networks import PosteriorEncoder, ResidualCouplingBlocks, TextEncoder
|
||||
from TTS.tts.layers.vits.stochastic_duration_predictor import StochasticDurationPredictor
|
||||
from TTS.tts.models.base_tts import BaseTTS
|
||||
from TTS.tts.utils.fairseq import rehash_fairseq_vits_checkpoint
|
||||
from TTS.tts.utils.helpers import generate_path, maximum_path, rand_segments, segment, sequence_mask
|
||||
from TTS.tts.utils.languages import LanguageManager
|
||||
from TTS.tts.utils.speakers import SpeakerManager
|
||||
from TTS.tts.utils.synthesis import synthesis
|
||||
from TTS.tts.utils.text.characters import BaseCharacters, _characters, _pad, _phonemes, _punctuations
|
||||
from TTS.tts.utils.text.characters import BaseCharacters, BaseVocabulary, _characters, _pad, _phonemes, _punctuations
|
||||
from TTS.tts.utils.text.tokenizer import TTSTokenizer
|
||||
from TTS.tts.utils.visual import plot_alignment
|
||||
from TTS.utils.io import load_fsspec
|
||||
|
@@ -1769,6 +1770,50 @@ class Vits(BaseTTS):
|
|||
self.eval()
|
||||
assert not self.training
|
||||
|
||||
def load_fairseq_checkpoint(
|
||||
self, config, checkpoint_dir, eval=False
|
||||
): # pylint: disable=unused-argument, redefined-builtin
|
||||
"""Load VITS checkpoints released by fairseq here: https://github.com/facebookresearch/fairseq/tree/main/examples/mms
|
||||
Performs some changes for compatibility.
|
||||
|
||||
Args:
|
||||
config (Coqpit): 🐸TTS model config.
|
||||
checkpoint_dir (str): Path to the checkpoint directory.
|
||||
eval (bool, optional): Set to True for evaluation. Defaults to False.
|
||||
"""
|
||||
import json
|
||||
|
||||
from TTS.tts.utils.text.cleaners import basic_cleaners
|
||||
|
||||
self.disc = None
|
||||
# set paths
|
||||
config_file = os.path.join(checkpoint_dir, "config.json")
|
||||
checkpoint_file = os.path.join(checkpoint_dir, "G_100000.pth")
|
||||
vocab_file = os.path.join(checkpoint_dir, "vocab.txt")
|
||||
# set config params
|
||||
with open(config_file, "r", encoding="utf-8") as file:
|
||||
# Load the JSON data as a dictionary
|
||||
config_org = json.load(file)
|
||||
self.config.audio.sample_rate = config_org["data"]["sampling_rate"]
|
||||
# self.config.add_blank = config['add_blank']
|
||||
# set tokenizer
|
||||
vocab = FairseqVocab(vocab_file)
|
||||
self.text_encoder.emb = nn.Embedding(vocab.num_chars, config.model_args.hidden_channels)
|
||||
self.tokenizer = TTSTokenizer(
|
||||
use_phonemes=False,
|
||||
text_cleaner=basic_cleaners,
|
||||
characters=vocab,
|
||||
phonemizer=None,
|
||||
add_blank=config_org["data"]["add_blank"],
|
||||
use_eos_bos=False,
|
||||
)
|
||||
# load fairseq checkpoint
|
||||
new_chk = rehash_fairseq_vits_checkpoint(checkpoint_file)
|
||||
self.load_state_dict(new_chk)
|
||||
if eval:
|
||||
self.eval()
|
||||
assert not self.training
|
||||
|
||||
@staticmethod
|
||||
def init_from_config(config: "VitsConfig", samples: Union[List[List], List[Dict]] = None, verbose=True):
|
||||
"""Initiate model from config
|
||||
|
@@ -1965,3 +2010,24 @@ class VitsCharacters(BaseCharacters):
|
|||
is_unique=False,
|
||||
is_sorted=True,
|
||||
)
|
||||
|
||||
|
||||
class FairseqVocab(BaseVocabulary):
|
||||
def __init__(self, vocab: str):
|
||||
super().__init__()
|
||||
self.vocab = vocab
|
||||
|
||||
@property
|
||||
def vocab(self):
|
||||
"""Return the vocabulary dictionary."""
|
||||
return self._vocab
|
||||
|
||||
@vocab.setter
|
||||
def vocab(self, vocab_file):
|
||||
with open(vocab_file, encoding="utf-8") as f:
|
||||
self._vocab = [x.replace("\n", "") for x in f.readlines()]
|
||||
self.blank = self._vocab[0]
|
||||
print(self._vocab)
|
||||
self.pad = " "
|
||||
self._char_to_id = {s: i for i, s in enumerate(self._vocab)} # pylint: disable=unnecessary-comprehension
|
||||
self._id_to_char = {i: s for i, s in enumerate(self._vocab)} # pylint: disable=unnecessary-comprehension
|
||||
|
|
|
@@ -0,0 +1,48 @@
|
|||
import torch
|
||||
|
||||
|
||||
def rehash_fairseq_vits_checkpoint(checkpoint_file):
|
||||
chk = torch.load(checkpoint_file, map_location=torch.device("cpu"))["model"]
|
||||
new_chk = {}
|
||||
for k, v in chk.items():
|
||||
if "enc_p." in k:
|
||||
new_chk[k.replace("enc_p.", "text_encoder.")] = v
|
||||
elif "dec." in k:
|
||||
new_chk[k.replace("dec.", "waveform_decoder.")] = v
|
||||
elif "enc_q." in k:
|
||||
new_chk[k.replace("enc_q.", "posterior_encoder.")] = v
|
||||
elif "flow.flows.2." in k:
|
||||
new_chk[k.replace("flow.flows.2.", "flow.flows.1.")] = v
|
||||
elif "flow.flows.4." in k:
|
||||
new_chk[k.replace("flow.flows.4.", "flow.flows.2.")] = v
|
||||
elif "flow.flows.6." in k:
|
||||
new_chk[k.replace("flow.flows.6.", "flow.flows.3.")] = v
|
||||
elif "dp.flows.0.m" in k:
|
||||
new_chk[k.replace("dp.flows.0.m", "duration_predictor.flows.0.translation")] = v
|
||||
elif "dp.flows.0.logs" in k:
|
||||
new_chk[k.replace("dp.flows.0.logs", "duration_predictor.flows.0.log_scale")] = v
|
||||
elif "dp.flows.1" in k:
|
||||
new_chk[k.replace("dp.flows.1", "duration_predictor.flows.1")] = v
|
||||
elif "dp.flows.3" in k:
|
||||
new_chk[k.replace("dp.flows.3", "duration_predictor.flows.2")] = v
|
||||
elif "dp.flows.5" in k:
|
||||
new_chk[k.replace("dp.flows.5", "duration_predictor.flows.3")] = v
|
||||
elif "dp.flows.7" in k:
|
||||
new_chk[k.replace("dp.flows.7", "duration_predictor.flows.4")] = v
|
||||
elif "dp.post_flows.0.m" in k:
|
||||
new_chk[k.replace("dp.post_flows.0.m", "duration_predictor.post_flows.0.translation")] = v
|
||||
elif "dp.post_flows.0.logs" in k:
|
||||
new_chk[k.replace("dp.post_flows.0.logs", "duration_predictor.post_flows.0.log_scale")] = v
|
||||
elif "dp.post_flows.1" in k:
|
||||
new_chk[k.replace("dp.post_flows.1", "duration_predictor.post_flows.1")] = v
|
||||
elif "dp.post_flows.3" in k:
|
||||
new_chk[k.replace("dp.post_flows.3", "duration_predictor.post_flows.2")] = v
|
||||
elif "dp.post_flows.5" in k:
|
||||
new_chk[k.replace("dp.post_flows.5", "duration_predictor.post_flows.3")] = v
|
||||
elif "dp.post_flows.7" in k:
|
||||
new_chk[k.replace("dp.post_flows.7", "duration_predictor.post_flows.4")] = v
|
||||
elif "dp." in k:
|
||||
new_chk[k.replace("dp.", "duration_predictor.")] = v
|
||||
else:
|
||||
new_chk[k] = v
|
||||
return new_chk
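# Worked example (sketch) of the renaming with made-up keys:
#   "enc_p.emb.weight"   -> "text_encoder.emb.weight"
#   "flow.flows.4.enc.w" -> "flow.flows.2.enc.w"
#   "dp.flows.3.logs"    -> "duration_predictor.flows.2.logs"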
@@ -207,7 +207,7 @@ def maximum_path_numpy(value, mask, max_neg_val=None):
|
|||
device = value.device
|
||||
dtype = value.dtype
|
||||
value = value.cpu().detach().numpy()
|
||||
mask = mask.cpu().detach().numpy().astype(np.bool)
|
||||
mask = mask.cpu().detach().numpy().astype(bool)
|
||||
|
||||
b, t_x, t_y = value.shape
|
||||
direction = np.zeros(value.shape, dtype=np.int64)
|
||||
|
|
|
@@ -63,6 +63,18 @@ class BaseVocabulary:
|
|||
the vocabulary."""
|
||||
return self.char_to_id(self.blank) if self.blank else len(self.vocab)
|
||||
|
||||
@property
|
||||
def bos_id(self) -> int:
|
||||
"""Return the index of the bos character. If the bos character is not specified, return the length of the
|
||||
vocabulary."""
|
||||
return self.char_to_id(self.bos) if self.bos else len(self.vocab)
|
||||
|
||||
@property
|
||||
def eos_id(self) -> int:
|
||||
"""Return the index of the eos character. If the eos character is not specified, return the length of the
|
||||
vocabulary."""
|
||||
return self.char_to_id(self.eos) if self.eos else len(self.vocab)
|
||||
|
||||
@property
|
||||
def vocab(self):
|
||||
"""Return the vocabulary dictionary."""
|
||||
|
@@ -71,11 +83,13 @@ class BaseVocabulary:
|
|||
@vocab.setter
|
||||
def vocab(self, vocab):
|
||||
"""Set the vocabulary dictionary and character mapping dictionaries."""
|
||||
self._vocab = vocab
|
||||
self._char_to_id = {char: idx for idx, char in enumerate(self._vocab)}
|
||||
self._id_to_char = {
|
||||
idx: char for idx, char in enumerate(self._vocab) # pylint: disable=unnecessary-comprehension
|
||||
}
|
||||
self._vocab, self._char_to_id, self._id_to_char = None, None, None
|
||||
if vocab is not None:
|
||||
self._vocab = vocab
|
||||
self._char_to_id = {char: idx for idx, char in enumerate(self._vocab)}
|
||||
self._id_to_char = {
|
||||
idx: char for idx, char in enumerate(self._vocab) # pylint: disable=unnecessary-comprehension
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
def init_from_config(config, **kwargs):
|
||||
|
@@ -93,6 +107,17 @@ class BaseVocabulary:
        )
        return BaseVocabulary(**kwargs), config

    def to_config(self) -> "CharactersConfig":
        return CharactersConfig(
            vocab_dict=self._vocab,
            pad=self.pad,
            eos=self.eos,
            bos=self.bos,
            blank=self.blank,
            is_unique=False,
            is_sorted=False,
        )

    @property
    def num_chars(self):
        """Return number of tokens in the vocabulary."""
@@ -174,6 +199,14 @@ class BaseCharacters:
    def blank_id(self) -> int:
        return self.char_to_id(self.blank) if self.blank else len(self.vocab)

    @property
    def eos_id(self) -> int:
        return self.char_to_id(self.eos) if self.eos else len(self.vocab)

    @property
    def bos_id(self) -> int:
        return self.char_to_id(self.bos) if self.bos else len(self.vocab)

    @property
    def characters(self):
        return self._characters
@@ -108,11 +108,12 @@ class TTSTokenizer:
        text = self.text_cleaner(text)
        if self.use_phonemes:
            text = self.phonemizer.phonemize(text, separator="", language=language)
        text = self.encode(text)
        if self.add_blank:
            text = self.intersperse_blank_char(text, True)
        if self.use_eos_bos:
            text = self.pad_with_bos_eos(text)
        return self.encode(text)
        return text

    def ids_to_text(self, id_sequence: List[int]) -> str:
        """Converts a sequence of token IDs to a string of text."""
@@ -120,14 +121,14 @@ class TTSTokenizer:

    def pad_with_bos_eos(self, char_sequence: List[str]):
        """Pads a sequence with the special BOS and EOS characters."""
        return [self.characters.bos] + list(char_sequence) + [self.characters.eos]
        return [self.characters.bos_id] + list(char_sequence) + [self.characters.eos_id]

    def intersperse_blank_char(self, char_sequence: List[str], use_blank_char: bool = False):
        """Intersperses the blank character between characters in a sequence.

        Use the ```blank``` character if defined else use the ```pad``` character.
        """
        char_to_use = self.characters.blank if use_blank_char else self.characters.pad
        char_to_use = self.characters.blank_id if use_blank_char else self.characters.pad
        result = [char_to_use] * (len(char_sequence) * 2 + 1)
        result[1::2] = char_sequence
        return result
@@ -540,7 +540,10 @@ class AudioProcessor(object):

    def _griffin_lim(self, S):
        angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
        S_complex = np.abs(S).astype(np.complex)
        try:
            S_complex = np.abs(S).astype(np.complex)
        except AttributeError:  # np.complex is deprecated since numpy 1.20.0
            S_complex = np.abs(S).astype(complex)
        y = self._istft(S_complex * angles)
        if not np.isfinite(y).all():
            print(" [!] Waveform is not finite everywhere. Skipping the GL.")
@@ -1,5 +1,6 @@
import json
import os
import tarfile
import zipfile
from pathlib import Path
from shutil import copyfile, rmtree
@@ -245,6 +246,55 @@ class ModelManager(object):
        else:
            print(" > Model's license - No license information available")

    def _download_github_model(self, model_item: Dict, output_path: str):
        if isinstance(model_item["github_rls_url"], list):
            self._download_model_files(model_item["github_rls_url"], output_path, self.progress_bar)
        else:
            self._download_zip_file(model_item["github_rls_url"], output_path, self.progress_bar)

    def _download_hf_model(self, model_item: Dict, output_path: str):
        if isinstance(model_item["hf_url"], list):
            self._download_model_files(model_item["hf_url"], output_path, self.progress_bar)
        else:
            self._download_zip_file(model_item["hf_url"], output_path, self.progress_bar)

    def download_fairseq_model(self, model_name, output_path):
        URI_PREFIX = "https://coqui.gateway.scarf.sh/fairseq/"
        _, lang, _, _ = model_name.split("/")
        model_download_uri = os.path.join(URI_PREFIX, f"{lang}.tar.gz")
        self._download_tar_file(model_download_uri, output_path, self.progress_bar)

    @staticmethod
    def set_model_url(model_item: Dict):
        model_item["model_url"] = None
        if "github_rls_url" in model_item:
            model_item["model_url"] = model_item["github_rls_url"]
        elif "hf_url" in model_item:
            model_item["model_url"] = model_item["hf_url"]
        elif "fairseq" in model_item["model_name"]:
            model_item["model_url"] = "https://coqui.gateway.scarf.sh/fairseq/"
        return model_item

    def _set_model_item(self, model_name):
        # fetch model info from the dict
        model_type, lang, dataset, model = model_name.split("/")
        model_full_name = f"{model_type}--{lang}--{dataset}--{model}"
        if "fairseq" in model_name:
            model_item = {
                "model_type": "tts_models",
                "license": "CC BY-NC 4.0",
                "default_vocoder": None,
                "author": "fairseq",
                "description": "this model is released by Meta under Fairseq repo. Visit https://github.com/facebookresearch/fairseq/tree/main/examples/mms for more info.",
            }
            model_item["model_name"] = model_name
        else:
            # get model from models.json
            model_item = self.models_dict[model_type][lang][dataset][model]
            model_item["model_type"] = model_type
        model_item = self.set_model_url(model_item)
        return model_item, model_full_name, model

    def download_model(self, model_name):
        """Download model files given the full model name.
        Model name is in the format
@@ -259,11 +309,7 @@ class ModelManager(object):
        Args:
            model_name (str): model name as explained above.
        """
        # fetch model info from the dict
        model_type, lang, dataset, model = model_name.split("/")
        model_full_name = f"{model_type}--{lang}--{dataset}--{model}"
        model_item = self.models_dict[model_type][lang][dataset][model]
        model_item["model_type"] = model_type
        model_item, model_full_name, model = self._set_model_item(model_name)
        # set the model specific output path
        output_path = os.path.join(self.output_prefix, model_full_name)
        if os.path.exists(output_path):
@@ -271,16 +317,20 @@ class ModelManager(object):
        else:
            os.makedirs(output_path, exist_ok=True)
            print(f" > Downloading model to {output_path}")
            # download from github release
            if isinstance(model_item["github_rls_url"], list):
                self._download_model_files(model_item["github_rls_url"], output_path, self.progress_bar)
            else:
                self._download_zip_file(model_item["github_rls_url"], output_path, self.progress_bar)
            if "fairseq" in model_name:
                self.download_fairseq_model(model_name, output_path)
            elif "github_rls_url" in model_item:
                self._download_github_model(model_item, output_path)
            elif "hf_url" in model_item:
                self._download_hf_model(model_item, output_path)

        self.print_model_license(model_item=model_item)
        # find downloaded files
        output_model_path = output_path
        output_config_path = None
        if model != "tortoise-v2":
        if (
            model not in ["tortoise-v2", "bark"] and "fairseq" not in model_name
        ):  # TODO:This is stupid but don't care for now.
            output_model_path, output_config_path = self._find_files(output_path)
            # update paths in the config.json
            self._update_paths(output_path, output_config_path)
@@ -421,6 +471,39 @@ class ModelManager(object):
        # remove the extracted folder
        rmtree(os.path.join(output_folder, z.namelist()[0]))

    @staticmethod
    def _download_tar_file(file_url, output_folder, progress_bar):
        """Download the github releases"""
        # download the file
        r = requests.get(file_url, stream=True)
        # extract the file
        try:
            total_size_in_bytes = int(r.headers.get("content-length", 0))
            block_size = 1024  # 1 Kibibyte
            if progress_bar:
                progress_bar = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True)
            temp_tar_name = os.path.join(output_folder, file_url.split("/")[-1])
            with open(temp_tar_name, "wb") as file:
                for data in r.iter_content(block_size):
                    if progress_bar:
                        progress_bar.update(len(data))
                    file.write(data)
            with tarfile.open(temp_tar_name) as t:
                t.extractall(output_folder)
                tar_names = t.getnames()
            os.remove(temp_tar_name)  # delete tar after extract
        except tarfile.ReadError:
            print(f" > Error: Bad tar file - {file_url}")
            raise tarfile.ReadError  # pylint: disable=raise-missing-from
        # move the files to the outer path
        for file_path in os.listdir(os.path.join(output_folder, tar_names[0])):
            src_path = os.path.join(output_folder, tar_names[0], file_path)
            dst_path = os.path.join(output_folder, os.path.basename(file_path))
            if src_path != dst_path:
                copyfile(src_path, dst_path)
        # remove the extracted folder
        rmtree(os.path.join(output_folder, tar_names[0]))

    @staticmethod
    def _download_model_files(file_urls, output_folder, progress_bar):
        """Download the github releases"""
@@ -7,7 +7,9 @@ import pysbd
import torch

from TTS.config import load_config
from TTS.tts.configs.vits_config import VitsConfig
from TTS.tts.models import setup_model as setup_tts_model
from TTS.tts.models.vits import Vits

# pylint: disable=unused-wildcard-import
# pylint: disable=wildcard-import
@@ -98,8 +100,12 @@ class Synthesizer(object):
            self.output_sample_rate = self.vc_config.audio["output_sample_rate"]

        if model_dir:
            self._load_tts_from_dir(model_dir, use_cuda)
            self.output_sample_rate = self.tts_config.audio["output_sample_rate"]
            if "fairseq" in model_dir:
                self._load_fairseq_from_dir(model_dir, use_cuda)
                self.output_sample_rate = self.tts_config.audio["sample_rate"]
            else:
                self._load_tts_from_dir(model_dir, use_cuda)
                self.output_sample_rate = self.tts_config.audio["output_sample_rate"]

    @staticmethod
    def _get_segmenter(lang: str):
@@ -133,12 +139,23 @@ class Synthesizer(object):
        if use_cuda:
            self.vc_model.cuda()

    def _load_fairseq_from_dir(self, model_dir: str, use_cuda: bool) -> None:
        """Load the fairseq model from a directory.

        We assume it is VITS and the model knows how to load itself from the directory and there is a config.json file in the directory.
        """
        self.tts_config = VitsConfig()
        self.tts_model = Vits.init_from_config(self.tts_config)
        self.tts_model.load_fairseq_checkpoint(self.tts_config, checkpoint_dir=model_dir, eval=True)
        self.tts_config = self.tts_model.config
        if use_cuda:
            self.tts_model.cuda()

    def _load_tts_from_dir(self, model_dir: str, use_cuda: bool) -> None:
        """Load the TTS model from a directory.

        We assume the model knows how to load itself from the directory and there is a config.json file in the directory.
        """

        config = load_config(os.path.join(model_dir, "config.json"))
        self.tts_config = config
        self.tts_model = setup_tts_model(config)
@@ -260,13 +277,13 @@ class Synthesizer(object):

        Args:
            text (str): input text.
            speaker_name (str, optional): spekaer id for multi-speaker models. Defaults to "".
            speaker_name (str, optional): speaker id for multi-speaker models. Defaults to "".
            language_name (str, optional): language id for multi-language models. Defaults to "".
            speaker_wav (Union[str, List[str]], optional): path to the speaker wav for voice cloning. Defaults to None.
            style_wav ([type], optional): style waveform for GST. Defaults to None.
            style_text ([type], optional): transcription of style_wav for Capacitron. Defaults to None.
            reference_wav ([type], optional): reference waveform for voice conversion. Defaults to None.
            reference_speaker_name ([type], optional): spekaer id of reference waveform. Defaults to None.
            reference_speaker_name ([type], optional): speaker id of reference waveform. Defaults to None.
        Returns:
            List[int]: [description]
        """
@@ -355,7 +372,7 @@ class Synthesizer(object):

        use_gl = self.vocoder_model is None

        if not reference_wav:
        if not reference_wav:  # not voice conversion
            for sen in sens:
                if hasattr(self.tts_model, "synthesize"):
                    sp_name = "random" if speaker_name is None else speaker_name
@@ -363,7 +380,7 @@ class Synthesizer(object):
                        text=sen,
                        config=self.tts_config,
                        speaker_id=sp_name,
                        extra_voice_dirs=self.voice_dir,
                        voice_dirs=self.voice_dir,
                        **kwargs,
                    )
                else:
@@ -794,8 +794,8 @@ class FreeVCConfig(BaseVCConfig):

    model: str = "freevc"
    # model specific params
    model_args: FreeVCArgs = FreeVCArgs()
    audio: FreeVCAudioConfig = FreeVCAudioConfig()
    model_args: FreeVCArgs = field(default_factory=FreeVCArgs)
    audio: FreeVCAudioConfig = field(default_factory=FreeVCAudioConfig)

    # optimizer
    # TODO with training support
@@ -1,6 +1,6 @@
furo
myst-parser == 0.15.1
sphinx == 4.0.2
myst-parser == 2.0.0
sphinx == 7.0.1
sphinx_inline_tabs
sphinx_copybutton
linkify-it-py

@@ -76,7 +76,7 @@ myst_enable_extensions = ['linkify',]
# duplicated section names that are in different documents.
autosectionlabel_prefix_document = True

language = None
language = 'en'

autodoc_inherit_docstrings = False

@@ -52,6 +52,7 @@
    models/tacotron1-2.md
    models/overflow.md
    models/tortoise.md
    models/bark.md

.. toctree::
    :maxdepth: 2
@@ -128,7 +128,7 @@ wav = tts.tts("This is a test! This is also a test!!", speaker=tts.speakers[0],
tts.tts_to_file(text="Hello world!", speaker=tts.speakers[0], language=tts.languages[0], file_path="output.wav")
```

Here is an example for a single speaker model.
#### Here is an example for a single speaker model.

```python
# Init TTS with the target model name

@@ -137,7 +137,7 @@ tts = TTS(model_name="tts_models/de/thorsten/tacotron2-DDC", progress_bar=False,
tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path=OUTPUT_PATH)
```

Example voice cloning with YourTTS in English, French and Portuguese:
#### Example voice cloning with YourTTS in English, French and Portuguese:

```python
tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False, gpu=True)
@@ -146,15 +146,16 @@ tts.tts_to_file("C'est le clonage de la voix.", speaker_wav="my/cloning/audio.wa
tts.tts_to_file("Isso é clonagem de voz.", speaker_wav="my/cloning/audio.wav", language="pt", file_path="output.wav")
```

Example voice conversion converting speaker of the `source_wav` to the speaker of the `target_wav`
#### Example voice conversion converting speaker of the `source_wav` to the speaker of the `target_wav`

```python
tts = TTS(model_name="voice_conversion_models/multilingual/vctk/freevc24", progress_bar=False, gpu=True)
tts.voice_conversion_to_file(source_wav="my/source.wav", target_wav="my/target.wav", file_path="output.wav")
```

Example voice cloning by a single speaker TTS model combining with the voice conversion model. This way, you can
clone voices by using any model in 🐸TTS.
#### Example voice cloning by a single speaker TTS model combining with the voice conversion model.

This way, you can clone voices by using any model in 🐸TTS.

```python
tts = TTS("tts_models/de/thorsten/tacotron2-DDC")

@@ -163,8 +164,11 @@ tts.tts_with_vc_to_file(
    speaker_wav="target/speaker.wav",
    file_path="ouptut.wav"
)
```

Example text to speech using [🐸Coqui Studio](https://coqui.ai) models. You can use all of your available speakers in the studio.
#### Example text to speech using [🐸Coqui Studio](https://coqui.ai) models.

You can use all of your available speakers in the studio.
[🐸Coqui Studio](https://coqui.ai) API token is required. You can get it from the [account page](https://coqui.ai/account).
You should set the `COQUI_STUDIO_TOKEN` environment variable to use the API token.
@@ -193,4 +197,23 @@ api.emotions
api.list_speakers()
api.list_voices()
wav, sample_rate = api.tts(text="This is a test.", speaker=api.speakers[0].name, emotion="Happy", speed=1.5)
```

#### Example text to speech using **Fairseq models in ~1100 languages** 🤯.
For these models use the following name format: `tts_models/<lang-iso_code>/fairseq/vits`.

You can find the list of language ISO codes [here](https://dl.fbaipublicfiles.com/mms/tts/all-tts-languages.html) and learn about the Fairseq models [here](https://github.com/facebookresearch/fairseq/tree/main/examples/mms).

```python
from TTS.api import TTS
api = TTS(model_name="tts_models/eng/fairseq/vits", gpu=True)
api.tts_to_file("This is a test.", file_path="output.wav")

# TTS with on the fly voice conversion
api = TTS("tts_models/deu/fairseq/vits")
api.tts_with_vc_to_file(
    "Wie sage ich auf Italienisch, dass ich dich liebe?",
    speaker_wav="target/speaker.wav",
    file_path="ouptut.wav"
)
```
@@ -0,0 +1,43 @@
# Mary-TTS API Support for Coqui-TTS

## What is Mary-TTS?

[Mary (Modular Architecture for Research in sYnthesis) Text-to-Speech](http://mary.dfki.de/) is an open-source (GNU LGPL license), multilingual Text-to-Speech Synthesis platform written in Java. It was originally developed as a collaborative project of [DFKI’s](http://www.dfki.de/web) Language Technology Lab and the [Institute of Phonetics](http://www.coli.uni-saarland.de/groups/WB/Phonetics/) at Saarland University, Germany. It is now maintained by the Multimodal Speech Processing Group in the [Cluster of Excellence MMCI](https://www.mmci.uni-saarland.de/) and DFKI.
MaryTTS has been around for a very long time: version 3.0 dates back to 2006, long before deep learning was a broadly known term, and the last official release was version 5.2 in 2016.
You can check out this OpenVoice-Tech page to learn more: https://openvoice-tech.net/index.php/MaryTTS

## Why Mary-TTS compatibility is relevant

Due to its open-source nature, relatively high-quality voices and fast synthesis speed, Mary-TTS was a popular choice in the past, and many tools implemented API support for it over the years, such as screen readers (NVDA + SpeechHub), smart-home hubs (openHAB, Home Assistant) and voice assistants (Rhasspy, Mycroft, SEPIA). A compatibility layer for Coqui-TTS ensures that these tools can use Coqui as a drop-in replacement and get even better voices right away.

## API and code examples

Like Coqui-TTS, Mary-TTS can run as an HTTP server to allow access to the API via HTTP GET and POST calls. The best documentation of this API is probably the [web page](https://github.com/marytts/marytts/tree/master/marytts-runtime/src/main/resources/marytts/server/http), which is also available via your self-hosted Mary-TTS server, and the [Java docs page](http://mary.dfki.de/javadoc/marytts/server/http/MaryHttpServer.html).
Mary-TTS offers a large number of endpoints to load styles, audio effects, examples etc., but compatible tools often only require 3 of them to work:
- `/locales` (GET) - Returns a list of supported locales in the format `[locale]\n...`, for example "en_US" or "de_DE" or simply "en" etc.
- `/voices` (GET) - Returns a list of supported voices in the format `[name] [locale] [gender]\n...`, 'name' can be anything without spaces(!) and 'gender' is traditionally `f` or `m`
- `/process?INPUT_TEXT=[my text]&INPUT_TYPE=TEXT&LOCALE=[locale]&VOICE=[name]&OUTPUT_TYPE=AUDIO&AUDIO=WAVE_FILE` (GET/POST) - Processes the input text and returns a wav file. INPUT_TYPE, OUTPUT_TYPE and AUDIO support additional values, but are usually static in compatible tools.

If your Coqui-TTS server is running on `localhost` using `port` 59125 (for classic Mary-TTS compatibility), you can use the following curl requests to test the API:

Return locale of active voice, e.g. "en":
```bash
curl http://localhost:59125/locales
```

Return name of active voice, e.g. "glow-tts en u":
```bash
curl http://localhost:59125/voices
```

Create a wav-file with spoken input text:
```bash
curl http://localhost:59125/process?INPUT_TEXT=this+is+a+test > test.wav
```

You can enter the same URLs in your browser and check out the results there as well.

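For tools that talk to the API programmatically rather than via curl, the same three endpoints can be exercised from Python. The snippet below is a minimal sketch, assuming a Coqui-TTS server listening on `localhost:59125` as in the curl examples above; the output file name is illustrative:

```python
import requests

BASE_URL = "http://localhost:59125"  # assumed server address, as in the curl examples

# List the locale of the active model, e.g. "en"
print(requests.get(f"{BASE_URL}/locales").text.strip())

# List the active voice, e.g. "glow-tts en u"
print(requests.get(f"{BASE_URL}/voices").text.strip())

# Synthesize speech and store the returned WAV data
params = {
    "INPUT_TEXT": "This is a test.",
    "INPUT_TYPE": "TEXT",
    "OUTPUT_TYPE": "AUDIO",
    "AUDIO": "WAVE_FILE",
}
response = requests.get(f"{BASE_URL}/process", params=params)
with open("test.wav", "wb") as f:
    f.write(response.content)
```
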
### How it works and limitations

A classic Mary-TTS server would usually show all installed locales and voices via the corresponding endpoints and accept the parameters `LOCALE` and `VOICE` for processing. For Coqui-TTS we usually start the server with one specific locale and model, and thus cannot return all available options. Instead we return the active locale and use the model name as the "voice". Since we only have one active model and always want to return a WAV file, we currently ignore all other processing parameters except `INPUT_TEXT`. Since gender is not defined for models in Coqui-TTS, we always return `u` (undefined).
We think that this is an acceptable compromise, since users are often only interested in one specific voice anyway, but the API might get extended in the future to support multiple languages and voices at the same time.
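Because `/voices` always reports lines in the `[name] [locale] [gender]` format described above (and, with Coqui-TTS, currently exactly one line with gender `u`), a compatible client only needs a trivial parser. A minimal sketch; the server address and variable names are illustrative:

```python
import requests

# Each line follows the documented "[name] [locale] [gender]" format;
# names contain no spaces, so a plain split() is sufficient.
for line in requests.get("http://localhost:59125/voices").text.strip().splitlines():
    name, locale, gender = line.split()[:3]
    print(f"voice={name} locale={locale} gender={gender}")
```
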
@@ -0,0 +1,103 @@
# Bark 🐶

Bark is a multi-lingual TTS model created by [Suno-AI](https://www.suno.ai/). It can generate conversational speech as well as music and sound effects.
It is architecturally very similar to Google's [AudioLM](https://arxiv.org/abs/2209.03143). For more information, please refer to [Suno-AI's repo](https://github.com/suno-ai/bark).


## Acknowledgements
- 👑[Suno-AI](https://www.suno.ai/) for training and open-sourcing this model.
- 👑[serp-ai](https://github.com/serp-ai/bark-with-voice-clone) for controlled voice cloning.


## Example Use

```python
text = "Hello, my name is Manmay , how are you?"

from TTS.tts.configs.bark_config import BarkConfig
from TTS.tts.models.bark import Bark

config = BarkConfig()
model = Bark.init_from_config(config)
model.load_checkpoint(config, checkpoint_dir="path/to/model/dir/", eval=True)

# with random speaker
output_dict = model.synthesize(text, config, speaker_id="random", voice_dirs=None)

# cloning a speaker.
# It assumes that you have a speaker file in `bark_voices/speaker_n/speaker.wav` or `bark_voices/speaker_n/speaker.npz`
output_dict = model.synthesize(text, config, speaker_id="ljspeech", voice_dirs="bark_voices/")
```

Using 🐸TTS API:

```python
from TTS.api import TTS

# Load the model to GPU
# Bark is really slow on CPU, so we recommend using GPU.
tts = TTS("tts_models/multilingual/multi-dataset/bark", gpu=True)


# Cloning a new speaker
# This expects to find an mp3 or wav file like `bark_voices/new_speaker/speaker.wav`
# It computes the cloning values and stores them in `bark_voices/new_speaker/speaker.npz`
tts.tts_to_file(text="Hello, my name is Manmay , how are you?",
                file_path="output.wav",
                voice_dir="bark_voices/",
                speaker="ljspeech")


# When you run it again it uses the stored values to generate the voice.
tts.tts_to_file(text="Hello, my name is Manmay , how are you?",
                file_path="output.wav",
                voice_dir="bark_voices/",
                speaker="ljspeech")


# random speaker
tts = TTS("tts_models/multilingual/multi-dataset/bark", gpu=True)
tts.tts_to_file("hello world", file_path="out.wav")
```

Using 🐸TTS Command line:

```console
# cloning the `ljspeech` voice
tts --model_name tts_models/multilingual/multi-dataset/bark \
    --text "This is an example." \
    --out_path "output.wav" \
    --voice_dir bark_voices/ \
    --speaker_idx "ljspeech" \
    --progress_bar True

# Random voice generation
tts --model_name tts_models/multilingual/multi-dataset/bark \
    --text "This is an example." \
    --out_path "output.wav" \
    --progress_bar True
```


## Important resources & papers
- Original Repo: https://github.com/suno-ai/bark
- Cloning implementation: https://github.com/serp-ai/bark-with-voice-clone
- AudioLM: https://arxiv.org/abs/2209.03143

## BarkConfig
```{eval-rst}
.. autoclass:: TTS.tts.configs.bark_config.BarkConfig
    :members:
```

## BarkArgs
```{eval-rst}
.. autoclass:: TTS.tts.models.bark.BarkArgs
    :members:
```

## Bark Model
```{eval-rst}
.. autoclass:: TTS.tts.models.bark.Bark
    :members:
```
@@ -1,7 +1,7 @@
# Tortoise 🐢
Tortoise is a very expressive TTS system with impressive voice cloning capabilities. It is based on a GPT-like autoregressive acoustic model that converts input
text to discretized acoustic tokens, a diffusion model that converts these tokens to mel-spectrogram frames, and a UnivNet vocoder to convert the spectrograms to
the final audio signal. The important downside is that Tortoise is very slow compared to the parallel TTS models like VITS.
the final audio signal. The important downside is that Tortoise is very slow compared to the parallel TTS models like VITS.

Big thanks to 👑[@manmay-nakhashi](https://github.com/manmay-nakhashi) who helped us implement Tortoise in 🐸TTS.

@@ -12,7 +12,7 @@ from TTS.tts.configs.tortoise_config import TortoiseConfig
from TTS.tts.models.tortoise import Tortoise

config = TortoiseConfig()
model = Tortoise.inif_from_config(config)
model = Tortoise.init_from_config(config)
model.load_checkpoint(config, checkpoint_dir="paths/to/models_dir/", eval=True)

# with random speaker
@@ -29,23 +29,23 @@ from TTS.api import TTS
tts = TTS("tts_models/en/multi-dataset/tortoise-v2")

# cloning `lj` voice from `TTS/tts/utils/assets/tortoise/voices/lj`
# with custom inference settings overriding defaults.
tts.tts_to_file(text="Hello, my name is Manmay , how are you?",
# with custom inference settings overriding defaults.
tts.tts_to_file(text="Hello, my name is Manmay , how are you?",
                file_path="output.wav",
                voice_dir="TTS/tts/utils/assets/tortoise/voices/",
                voice_dir="path/to/tortoise/voices/dir/",
                speaker="lj",
                num_autoregressive_samples=1,
                diffusion_iterations=10)

# Using presets with the same voice
tts.tts_to_file(text="Hello, my name is Manmay , how are you?",
tts.tts_to_file(text="Hello, my name is Manmay , how are you?",
                file_path="output.wav",
                voice_dir="TTS/tts/utils/assets/tortoise/voices/",
                voice_dir="path/to/tortoise/voices/dir/",
                speaker="lj",
                preset="ultra_fast")

# Random voice generation
tts.tts_to_file(text="Hello, my name is Manmay , how are you?",
tts.tts_to_file(text="Hello, my name is Manmay , how are you?",
                file_path="output.wav")
```

@@ -54,16 +54,16 @@ Using 🐸TTS Command line:
```console
# cloning the `lj` voice
tts --model_name tts_models/en/multi-dataset/tortoise-v2 \
    --text "This is an example." \
    --out_path "/data/speech_synth/coqui-tts/TTS/tests/outputs/output.wav" \
    --voice_dir TTS/tts/utils/assets/tortoise/voices/ \
    --text "This is an example." \
    --out_path "output.wav" \
    --voice_dir path/to/tortoise/voices/dir/ \
    --speaker_idx "lj" \
    --progress_bar True

# Random voice generation
tts --model_name tts_models/en/multi-dataset/tortoise-v2 \
    --text "This is an example." \
    --out_path "/data/speech_synth/coqui-tts/TTS/tests/outputs/output.wav" \
    --out_path "output.wav" \
    --progress_bar True
```

@@ -1,5 +1,5 @@
[build-system]
requires = ["setuptools", "wheel", "cython==0.29.28", "numpy==1.21.6", "packaging"]
requires = ["setuptools", "wheel", "cython==0.29.30", "numpy==1.22.0", "packaging"]

[flake8]
max-line-length=120
@@ -1,14 +1,14 @@
# core deps
numpy==1.21.6;python_version<"3.10"
numpy;python_version=="3.10"
cython==0.29.28
numpy==1.22.0;python_version<="3.10"
numpy==1.24.3;python_version>"3.10"
cython==0.29.30
scipy>=1.4.0
torch>=1.7
torchaudio
soundfile
librosa==0.10.0.*
numba==0.55.1;python_version<"3.9"
numba==0.56.4;python_version>="3.9"
numba==0.57.0;python_version>="3.9"
inflect==5.6.0
tqdm
anyascii
@@ -26,14 +26,14 @@ pandas
# deps for training
matplotlib
# coqui stack
trainer==0.0.20
trainer
# config management
coqpit>=0.0.16
# chinese g2p deps
jieba
pypinyin
# japanese g2p deps
mecab-python3==1.0.5
mecab-python3==1.0.6
unidic-lite==1.0.8
# gruut+supported langs
gruut[de,es,fr]==2.2.3
@@ -45,8 +45,9 @@ g2pkk>=0.1.1
bangla==0.0.2
bnnumerizer
bnunicodenormalizer==0.1.1

#deps for tortoise
k_diffusion
einops
transformers
transformers
#deps for bark
encodec
@@ -1,8 +1,8 @@
[build_py]
build-lib=temp_build
build_lib=temp_build

[bdist_wheel]
bdist-dir=temp_build
bdist_dir=temp_build

[install_lib]
build-dir=temp_build
build_dir=temp_build

setup.py
@@ -32,8 +32,8 @@ from Cython.Build import cythonize
from setuptools import Extension, find_packages, setup

python_version = sys.version.split()[0]
if Version(python_version) < Version("3.7") or Version(python_version) >= Version("3.11"):
    raise RuntimeError("TTS requires python >= 3.7 and < 3.11 " "but your Python version is {}".format(sys.version))
if Version(python_version) < Version("3.9") or Version(python_version) >= Version("3.12"):
    raise RuntimeError("TTS requires python >= 3.9 and < 3.12 " "but your Python version is {}".format(sys.version))


cwd = os.path.dirname(os.path.abspath(__file__))
@@ -114,15 +114,14 @@ setup(
        "dev": requirements_dev,
        "notebooks": requirements_notebooks,
    },
    python_requires=">=3.7.0, <3.11",
    python_requires=">=3.9.0, <3.12",
    entry_points={"console_scripts": ["tts=TTS.bin.synthesize:main", "tts-server = TTS.server.server:main"]},
    classifiers=[
        "Programming Language :: Python",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.7",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
        "Development Status :: 3 - Alpha",
        "Intended Audience :: Science/Research",
        "Intended Audience :: Developers",
@@ -60,7 +60,7 @@ if is_coqui_available:
            self.assertIsNone(tts.languages)

        def test_studio_model(self):
            tts = TTS(model_name="coqui_studio/en/Torcull Diarmuid/coqui_studio")
            tts = TTS(model_name="coqui_studio/en/Zacharie Aimilios/coqui_studio")
            tts.tts_to_file(text="This is a test.")

            # check speed > 2.0 raises error
@@ -83,6 +83,10 @@ if is_coqui_available:
            wav = tts.tts(text="This is a test.", speed=2.0, emotion="Sad")
            self.assertGreater(len(wav), 0)

        def test_fairseq_model(self):  # pylint: disable=no-self-use
            tts = TTS(model_name="tts_models/eng/fairseq/vits")
            tts.tts_to_file(text="This is a test.")

        def test_multi_speaker_multi_lingual_model(self):
            tts = TTS()
            tts.load_tts_model_by_name(tts.models[0])  # YourTTS
@@ -1,5 +1,5 @@
import unittest
from dataclasses import dataclass
from dataclasses import dataclass, field

from coqpit import Coqpit

@@ -86,11 +86,11 @@ class TestTTSTokenizer(unittest.TestCase):
            enable_eos_bos_chars: bool = True
            use_phonemes: bool = True
            add_blank: bool = False
            characters: str = Characters()
            characters: str = field(default_factory=Characters)
            phonemizer: str = "espeak"
            phoneme_language: str = "tr"
            text_cleaner: str = "phoneme_cleaners"
            characters = Characters()
            characters = field(default_factory=Characters)

        tokenizer_ph, _ = TTSTokenizer.init_from_config(TokenizerConfig())
        tokenizer_ph.phonemizer.backend = "espeak"
@@ -16,7 +16,7 @@ from TTS.utils.audio import AudioProcessor

torch.manual_seed(1)
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device = torch.device("cuda" if use_cuda else "cpu")

config_global = TacotronConfig(num_chars=32, num_speakers=5, out_channels=513, decoder_output_dim=80)
@@ -288,7 +288,6 @@ class TacotronCapacitronTrainTest(unittest.TestCase):
            batch["text_input"].shape[0], batch["stop_targets"].size(1) // config.r, -1
        )
        batch["stop_targets"] = (batch["stop_targets"].sum(2) > 0.0).unsqueeze(2).float().squeeze()

        model = Tacotron(config).to(device)
        criterion = model.get_criterion()
        optimizer = model.get_optimizer()
@@ -15,7 +15,7 @@ def run_models(offset=0, step=1):
    print(" > Run synthesizer with all the models.")
    output_path = os.path.join(get_tests_output_path(), "output.wav")
    manager = ModelManager(output_prefix=get_tests_output_path(), progress_bar=False)
    model_names = manager.list_models()
    model_names = [name for name in manager.list_models() if "bark" not in name]
    for model_name in model_names[offset::step]:
        print(f"\n > Run - {model_name}")
        model_path, _, _ = manager.download_model(model_name)
@@ -79,6 +79,15 @@ def test_models_offset_2_step_3():
    run_models(offset=2, step=3)


def test_bark():
    """Bark is too big to run on github actions. We need to test it locally"""
    output_path = os.path.join(get_tests_output_path(), "output.wav")
    run_cli(
        f" tts --model_name tts_models/multilingual/multi-dataset/bark "
        f'--text "This is an example." --out_path "{output_path}" --progress_bar False'
    )


def test_voice_conversion():
    print(" > Run voice conversion inference using YourTTS model.")
    model_name = "tts_models/multilingual/multi-dataset/your_tts"