Merge branch 'coqui-ai:main' into main

Jindrich Matousek 2023-07-03 08:45:07 +02:00 committed by GitHub
commit b761d488a7
62 changed files with 2504 additions and 222 deletions

View File

@ -18,7 +18,7 @@ jobs:
strategy:
fail-fast: false
matrix:
python-version: [3.7, 3.8, 3.9, "3.10"]
python-version: [3.9, "3.10", "3.11"]
experimental: [false]
steps:
- uses: actions/checkout@v3

View File

@ -18,7 +18,7 @@ jobs:
strategy:
fail-fast: false
matrix:
python-version: [3.7, 3.8, 3.9, "3.10"]
python-version: [3.9, "3.10", "3.11"]
experimental: [false]
steps:
- uses: actions/checkout@v3

View File

@ -18,7 +18,7 @@ jobs:
strategy:
fail-fast: false
matrix:
python-version: [3.7, 3.8, 3.9, "3.10"]
python-version: [3.9, "3.10", "3.11"]
experimental: [false]
steps:
- uses: actions/checkout@v3

View File

@ -21,7 +21,7 @@ jobs:
fi
- uses: actions/setup-python@v2
with:
python-version: 3.8
python-version: 3.9
- run: |
python -m pip install -U pip setuptools wheel build
- run: |
@ -36,7 +36,7 @@ jobs:
runs-on: ubuntu-20.04
strategy:
matrix:
python-version: ["3.7", "3.8", "3.9", "3.10"]
python-version: ["3.9", "3.10", "3.11"]
steps:
- uses: actions/checkout@v2
- uses: actions/setup-python@v2
@ -64,14 +64,6 @@ jobs:
with:
name: "sdist"
path: "dist/"
- uses: actions/download-artifact@v2
with:
name: "wheel-3.7"
path: "dist/"
- uses: actions/download-artifact@v2
with:
name: "wheel-3.8"
path: "dist/"
- uses: actions/download-artifact@v2
with:
name: "wheel-3.9"
@ -80,6 +72,10 @@ jobs:
with:
name: "wheel-3.10"
path: "dist/"
- uses: actions/download-artifact@v2
with:
name: "wheel-3.11"
path: "dist/"
- run: |
ls -lh dist/
- name: Setup PyPI config
@ -91,7 +87,7 @@ jobs:
EOF
- uses: actions/setup-python@v2
with:
python-version: 3.8
python-version: 3.9
- run: |
python -m pip install twine
- run: |

View File

@ -42,6 +42,6 @@ jobs:
run: |
python3 -m pip install .[all]
python3 setup.py egg_info
- name: Lint check
run: |
make lint
# - name: Lint check
# run: |
# make lint

View File

@ -18,7 +18,7 @@ jobs:
strategy:
fail-fast: false
matrix:
python-version: [3.7, 3.8, 3.9, "3.10"]
python-version: [3.9, "3.10", "3.11"]
experimental: [false]
steps:
- uses: actions/checkout@v3

View File

@ -18,7 +18,7 @@ jobs:
strategy:
fail-fast: false
matrix:
python-version: [3.7, 3.8, 3.9, "3.10"]
python-version: [3.9, "3.10", "3.11"]
experimental: [false]
steps:
- uses: actions/checkout@v3

View File

@ -18,7 +18,7 @@ jobs:
strategy:
fail-fast: false
matrix:
python-version: [3.7, 3.8, 3.9, "3.10"]
python-version: [3.9, "3.10", "3.11"]
experimental: [false]
steps:
- uses: actions/checkout@v3

View File

@ -18,7 +18,7 @@ jobs:
strategy:
fail-fast: false
matrix:
python-version: [3.7, 3.8, 3.9, "3.10"]
python-version: [3.9, "3.10", "3.11"]
experimental: [false]
steps:
- uses: actions/checkout@v3

View File

@ -18,7 +18,7 @@ jobs:
strategy:
fail-fast: false
matrix:
python-version: [3.7, 3.8, 3.9, "3.10"]
python-version: [3.9, "3.10", "3.11"]
experimental: [false]
steps:
- uses: actions/checkout@v3
@ -43,6 +43,7 @@ jobs:
run: python3 -m pip install --upgrade pip setuptools wheel
- name: Replace scarf urls
run: |
sed -i 's/https:\/\/coqui.gateway.scarf.sh\/hf\/bark\//https:\/\/huggingface.co\/erogol\/bark\/resolve\/main\//g' TTS/.models.json
sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json
- name: Install TTS
run: |

View File

@ -18,7 +18,7 @@ jobs:
strategy:
fail-fast: false
matrix:
python-version: [3.7, 3.8, 3.9, "3.10"]
python-version: [3.9, "3.10", "3.11"]
experimental: [false]
steps:
- uses: actions/checkout@v3

View File

@ -169,7 +169,9 @@ disable=missing-docstring,
comprehension-escape,
duplicate-code,
not-callable,
import-outside-toplevel
import-outside-toplevel,
logging-fstring-interpolation,
logging-not-lazy
# Enable the message, report, category or checker with the given id(s). You can
# either give multiple identifier separated by comma (,) or put this option
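For context, the two newly disabled checks both flag eager string formatting in logging calls. A minimal sketch of what each one warns about (illustrative, not from this repo):

```python
import logging

logger = logging.getLogger(__name__)
model_name = "bark"

# logging-fstring-interpolation: the f-string is built even when the
# message is filtered out by the log level.
logger.info(f"Loading {model_name}")

# logging-not-lazy: explicit concatenation has the same problem.
logger.info("Loading " + model_name)

# The lazy form both checks prefer: formatting happens only if emitted.
logger.info("Loading %s", model_name)
```

Disabling them trades that laziness for the more readable f-string style used in the new Bark code later in this diff.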

View File

@ -5,14 +5,19 @@
# Required
version: 2
# Set the version of Python and other tools you might need
build:
os: ubuntu-22.04
tools:
python: "3.11"
# Optionally set the version of Python and requirements required to build your docs
python:
install:
- requirements: docs/requirements.txt
- requirements: requirements.txt
# Build documentation in the docs/ directory with Sphinx
sphinx:
builder: html
configuration: docs/source/conf.py
# Optionally set the version of Python and requirements required to build your docs
python:
version: 3.7
install:
- requirements: docs/requirements.txt
- requirements: requirements.txt

View File

@ -1,10 +1,14 @@
## 🐸Coqui.ai News
- 📣 Coqui Studio API has landed on 🐸TTS. You can use the studio voices in combination with 🐸TTS models. [Example](https://github.com/coqui-ai/TTS/blob/dev/README.md#-python-api)
- 📣 Voice generation with prompts - **Prompt to Voice** - is live on Coqui.ai!! [Blog Post](https://coqui.ai/blog/tts/prompt-to-voice)
- 📣 Clone your voice with a single click on [🐸Coqui.ai](https://app.coqui.ai/auth/signin)
<br>
- 📣 [🐶Bark](https://github.com/suno-ai/bark) is now available for inference with unconstrained voice cloning. [Docs](https://tts.readthedocs.io/en/dev/models/bark.html)
- 📣 You can use [~1100 Fairseq models](https://github.com/facebookresearch/fairseq/tree/main/examples/mms) with 🐸TTS.
- 📣 🐸TTS now supports 🐢Tortoise with faster inference. [Docs](https://tts.readthedocs.io/en/dev/models/tortoise.html)
- 📣 **Coqui Studio API** has landed on 🐸TTS. - [Example](https://github.com/coqui-ai/TTS/blob/dev/README.md#-python-api)
- 📣 [**Coqui Studio API**](https://docs.coqui.ai/docs) is live.
- 📣 Voice generation with prompts - **Prompt to Voice** - is live on [**Coqui Studio**](https://app.coqui.ai/auth/signin)!! - [Blog Post](https://coqui.ai/blog/tts/prompt-to-voice)
- 📣 Voice generation with fusion - **Voice fusion** - is live on [**Coqui Studio**](https://app.coqui.ai/auth/signin).
- 📣 Voice cloning is live on [**Coqui Studio**](https://app.coqui.ai/auth/signin).
## <img src="https://raw.githubusercontent.com/coqui-ai/TTS/main/images/coqui-log-green-TTS.png" height="56"/>
@ -185,7 +189,9 @@ from TTS.api import TTS
model_name = TTS.list_models()[0]
# Init TTS
tts = TTS(model_name)
# Run TTS
# ❗ Since this model is multi-speaker and multi-lingual, we must set the target speaker and the language
# Text to speech with a numpy output
wav = tts.tts("This is a test! This is also a test!!", speaker=tts.speakers[0], language=tts.languages[0])
@ -199,7 +205,8 @@ tts = TTS(model_name="tts_models/de/thorsten/tacotron2-DDC", progress_bar=False,
# Run TTS
tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path=OUTPUT_PATH)
# Example voice cloning with YourTTS in English, French and Portuguese:
# Example voice cloning with YourTTS in English, French and Portuguese
tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False, gpu=True)
tts.tts_to_file("This is voice cloning.", speaker_wav="my/cloning/audio.wav", language="en", file_path="output.wav")
tts.tts_to_file("C'est le clonage de la voix.", speaker_wav="my/cloning/audio.wav", language="fr-fr", file_path="output.wav")
@ -221,7 +228,9 @@ tts.tts_with_vc_to_file(
file_path="ouptut.wav"
)
# Example text to speech using [🐸Coqui Studio](https://coqui.ai) models. You can use all of your available speakers in the studio.
# Example text to speech using [🐸Coqui Studio](https://coqui.ai) models.
# You can use all of your available speakers in the studio.
# [🐸Coqui Studio](https://coqui.ai) API token is required. You can get it from the [account page](https://coqui.ai/account).
# You should set the `COQUI_STUDIO_TOKEN` environment variable to use the API token.
@ -234,6 +243,20 @@ tts = TTS(model_name="coqui_studio/en/Torcull Diarmuid/coqui_studio", progress_b
tts.tts_to_file(text="This is a test.", file_path=OUTPUT_PATH)
# Run TTS with emotion and speed control
tts.tts_to_file(text="This is a test.", file_path=OUTPUT_PATH, emotion="Happy", speed=1.5)
# Example text to speech using **Fairseq models in ~1100 languages** 🤯.
# For these models use the following name format: `tts_models/<lang-iso_code>/fairseq/vits`.
# You can find the list of language ISO codes [here](https://dl.fbaipublicfiles.com/mms/tts/all-tts-languages.html) and learn about the Fairseq models [here](https://github.com/facebookresearch/fairseq/tree/main/examples/mms).
# TTS with on the fly voice conversion
api = TTS("tts_models/deu/fairseq/vits")
api.tts_with_vc_to_file(
"Wie sage ich auf Italienisch, dass ich dich liebe?",
speaker_wav="target/speaker.wav",
file_path="ouptut.wav"
)
```
### Command line `tts`

View File

@ -1,20 +1,33 @@
{
"tts_models": {
"multilingual":{
"multi-dataset":{
"your_tts":{
"multilingual": {
"multi-dataset": {
"your_tts": {
"description": "Your TTS model accompanying the paper https://arxiv.org/abs/2112.02418",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.10.1_models/tts_models--multilingual--multi-dataset--your_tts.zip",
"default_vocoder": null,
"commit": "e9a1953e",
"license": "CC BY-NC-ND 4.0",
"contact": "egolge@coqui.ai"
},
"bark": {
"description": "🐶 Bark TTS model released by suno-ai. You can find the original implementation in https://github.com/suno-ai/bark.",
"hf_url": [
"https://coqui.gateway.scarf.sh/hf/bark/coarse_2.pt",
"https://coqui.gateway.scarf.sh/hf/bark/fine_2.pt",
"https://coqui.gateway.scarf.sh/hf/bark/text_2.pt",
"https://coqui.gateway.scarf.sh/hf/bark/config.json"
],
"default_vocoder": null,
"commit": "e9a1953e",
"license": "MIT",
"contact": "https://www.suno.ai/"
}
}
},
"bg": {
"cv": {
"vits":{
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--bg--cv--vits.zip",
"default_vocoder": null,
"commit": null,
@ -25,7 +38,7 @@
},
"cs": {
"cv": {
"vits":{
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--cs--cv--vits.zip",
"default_vocoder": null,
"commit": null,
@ -36,7 +49,7 @@
},
"da": {
"cv": {
"vits":{
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--da--cv--vits.zip",
"default_vocoder": null,
"commit": null,
@ -47,7 +60,7 @@
},
"et": {
"cv": {
"vits":{
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--et--cv--vits.zip",
"default_vocoder": null,
"commit": null,
@ -58,7 +71,7 @@
},
"ga": {
"cv": {
"vits":{
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--ga--cv--vits.zip",
"default_vocoder": null,
"commit": null,
@ -180,7 +193,7 @@
"license": "apache 2.0",
"contact": "egolge@coqui.ai"
},
"fast_pitch":{
"fast_pitch": {
"description": "FastPitch model trained on VCTK dataseset.",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--vctk--fast_pitch.zip",
"default_vocoder": null,
@ -220,21 +233,21 @@
"license": "apache 2.0",
"contact": "adamfroghyar@gmail.com"
}
},
"multi-dataset":{
"tortoise-v2":{
"multi-dataset": {
"tortoise-v2": {
"description": "Tortoise tts model https://github.com/neonbjb/tortoise-tts",
"github_rls_url": ["https://coqui.gateway.scarf.sh/v0.14.1_models/autoregressive.pth",
"https://coqui.gateway.scarf.sh/v0.14.1_models/clvp2.pth",
"https://coqui.gateway.scarf.sh/v0.14.1_models/cvvp.pth",
"https://coqui.gateway.scarf.sh/v0.14.1_models/diffusion_decoder.pth",
"https://coqui.gateway.scarf.sh/v0.14.1_models/rlg_auto.pth",
"https://coqui.gateway.scarf.sh/v0.14.1_models/rlg_diffuser.pth",
"https://coqui.gateway.scarf.sh/v0.14.1_models/vocoder.pth",
"https://coqui.gateway.scarf.sh/v0.14.1_models/mel_norms.pth",
"https://coqui.gateway.scarf.sh/v0.14.1_models/config.json"
],
"github_rls_url": [
"https://coqui.gateway.scarf.sh/v0.14.1_models/autoregressive.pth",
"https://coqui.gateway.scarf.sh/v0.14.1_models/clvp2.pth",
"https://coqui.gateway.scarf.sh/v0.14.1_models/cvvp.pth",
"https://coqui.gateway.scarf.sh/v0.14.1_models/diffusion_decoder.pth",
"https://coqui.gateway.scarf.sh/v0.14.1_models/rlg_auto.pth",
"https://coqui.gateway.scarf.sh/v0.14.1_models/rlg_diffuser.pth",
"https://coqui.gateway.scarf.sh/v0.14.1_models/vocoder.pth",
"https://coqui.gateway.scarf.sh/v0.14.1_models/mel_norms.pth",
"https://coqui.gateway.scarf.sh/v0.14.1_models/config.json"
],
"commit": "c1875f6",
"default_vocoder": null,
"author": "@neonbjb - James Betker, @manmay-nakhashi Manmay Nakhashi",
@ -242,7 +255,7 @@
}
},
"jenny": {
"jenny":{
"jenny": {
"description": "VITS model trained with Jenny(Dioco) dataset. Named as Jenny as demanded by the license. Original URL for the model https://www.kaggle.com/datasets/noml4u/tts-models--en--jenny-dioco--vits",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.14.0_models/tts_models--en--jenny--jenny.zip",
"default_vocoder": null,
@ -263,8 +276,8 @@
"contact": "egolge@coqui.com"
}
},
"css10":{
"vits":{
"css10": {
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--es--css10--vits.zip",
"default_vocoder": null,
"commit": null,
@ -284,8 +297,8 @@
"contact": "egolge@coqui.com"
}
},
"css10":{
"vits":{
"css10": {
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--fr--css10--vits.zip",
"default_vocoder": null,
"commit": null,
@ -294,17 +307,17 @@
}
}
},
"uk":{
"uk": {
"mai": {
"glow-tts": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--uk--mai--glow-tts.zip",
"author":"@robinhad",
"author": "@robinhad",
"commit": "bdab788d",
"license": "MIT",
"contact": "",
"default_vocoder": "vocoder_models/uk/mai/multiband-melgan"
},
"vits":{
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--uk--mai--vits.zip",
"default_vocoder": null,
"commit": null,
@ -335,8 +348,8 @@
"commit": "540d811"
}
},
"css10":{
"vits":{
"css10": {
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--nl--css10--vits.zip",
"default_vocoder": null,
"commit": null,
@ -371,7 +384,7 @@
}
},
"css10": {
"vits-neon":{
"vits-neon": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--de--css10--vits.zip",
"default_vocoder": null,
"author": "@NeonGeckoCom",
@ -392,9 +405,9 @@
}
}
},
"tr":{
"tr": {
"common-voice": {
"glow-tts":{
"glow-tts": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--tr--common-voice--glow-tts.zip",
"default_vocoder": "vocoder_models/tr/common-voice/hifigan",
"license": "MIT",
@ -406,7 +419,7 @@
},
"it": {
"mai_female": {
"glow-tts":{
"glow-tts": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_female--glow-tts.zip",
"default_vocoder": null,
"description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
@ -414,7 +427,7 @@
"license": "apache 2.0",
"commit": null
},
"vits":{
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_female--vits.zip",
"default_vocoder": null,
"description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
@ -424,7 +437,7 @@
}
},
"mai_male": {
"glow-tts":{
"glow-tts": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_male--glow-tts.zip",
"default_vocoder": null,
"description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
@ -432,7 +445,7 @@
"license": "apache 2.0",
"commit": null
},
"vits":{
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_male--vits.zip",
"default_vocoder": null,
"description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
@ -444,7 +457,7 @@
},
"ewe": {
"openbible": {
"vits":{
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--ewe--openbible--vits.zip",
"default_vocoder": null,
"license": "CC-BY-SA 4.0",
@ -456,7 +469,7 @@
},
"hau": {
"openbible": {
"vits":{
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--hau--openbible--vits.zip",
"default_vocoder": null,
"license": "CC-BY-SA 4.0",
@ -468,7 +481,7 @@
},
"lin": {
"openbible": {
"vits":{
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--lin--openbible--vits.zip",
"default_vocoder": null,
"license": "CC-BY-SA 4.0",
@ -480,7 +493,7 @@
},
"tw_akuapem": {
"openbible": {
"vits":{
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--tw_akuapem--openbible--vits.zip",
"default_vocoder": null,
"license": "CC-BY-SA 4.0",
@ -492,7 +505,7 @@
},
"tw_asante": {
"openbible": {
"vits":{
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--tw_asante--openbible--vits.zip",
"default_vocoder": null,
"license": "CC-BY-SA 4.0",
@ -504,7 +517,7 @@
},
"yor": {
"openbible": {
"vits":{
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--yor--openbible--vits.zip",
"default_vocoder": null,
"license": "CC-BY-SA 4.0",
@ -538,7 +551,7 @@
},
"fi": {
"css10": {
"vits":{
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--fi--css10--vits.zip",
"default_vocoder": null,
"commit": null,
@ -549,7 +562,7 @@
},
"hr": {
"cv": {
"vits":{
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--hr--cv--vits.zip",
"default_vocoder": null,
"commit": null,
@ -560,7 +573,7 @@
},
"lt": {
"cv": {
"vits":{
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--lt--cv--vits.zip",
"default_vocoder": null,
"commit": null,
@ -571,7 +584,7 @@
},
"lv": {
"cv": {
"vits":{
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--lv--cv--vits.zip",
"default_vocoder": null,
"commit": null,
@ -582,7 +595,7 @@
},
"mt": {
"cv": {
"vits":{
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--mt--cv--vits.zip",
"default_vocoder": null,
"commit": null,
@ -593,7 +606,7 @@
},
"pl": {
"mai_female": {
"vits":{
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--pl--mai_female--vits.zip",
"default_vocoder": null,
"commit": null,
@ -604,7 +617,7 @@
},
"pt": {
"cv": {
"vits":{
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--pt--cv--vits.zip",
"default_vocoder": null,
"commit": null,
@ -615,7 +628,7 @@
},
"ro": {
"cv": {
"vits":{
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--ro--cv--vits.zip",
"default_vocoder": null,
"commit": null,
@ -626,7 +639,7 @@
},
"sk": {
"cv": {
"vits":{
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--sk--cv--vits.zip",
"default_vocoder": null,
"commit": null,
@ -637,7 +650,7 @@
},
"sl": {
"cv": {
"vits":{
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--sl--cv--vits.zip",
"default_vocoder": null,
"commit": null,
@ -648,7 +661,7 @@
},
"sv": {
"cv": {
"vits":{
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--sv--cv--vits.zip",
"default_vocoder": null,
"commit": null,
@ -659,7 +672,7 @@
},
"ca": {
"custom": {
"vits":{
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.10.1_models/tts_models--ca--custom--vits.zip",
"default_vocoder": null,
"commit": null,
@ -669,8 +682,8 @@
}
}
},
"fa":{
"custom":{
"fa": {
"custom": {
"glow-tts": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.10.1_models/tts_models--fa--custom--glow-tts.zip",
"default_vocoder": null,
@ -681,18 +694,18 @@
}
}
},
"bn":{
"custom":{
"vits-male":{
"github_rls_url":"https://coqui.gateway.scarf.sh/v0.13.3_models/tts_models--bn--custom--vits_male.zip",
"bn": {
"custom": {
"vits-male": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.13.3_models/tts_models--bn--custom--vits_male.zip",
"default_vocoder": null,
"commit": null,
"description": "Single speaker Bangla male model. For more information -> https://github.com/mobassir94/comprehensive-bangla-tts",
"author": "@mobassir94",
"license": "Apache 2.0"
},
"vits-female":{
"github_rls_url":"https://coqui.gateway.scarf.sh/v0.13.3_models/tts_models--bn--custom--vits_female.zip",
"vits-female": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.13.3_models/tts_models--bn--custom--vits_female.zip",
"default_vocoder": null,
"commit": null,
"description": "Single speaker Bangla female model. For more information -> https://github.com/mobassir94/comprehensive-bangla-tts",
@ -834,16 +847,16 @@
"mai": {
"multiband-melgan": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--uk--mai--multiband-melgan.zip",
"author":"@robinhad",
"author": "@robinhad",
"commit": "bdab788d",
"license": "MIT",
"contact": ""
}
}
},
"tr":{
"tr": {
"common-voice": {
"hifigan":{
"hifigan": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--tr--common-voice--hifigan.zip",
"description": "HifiGAN model using an unknown speaker from the Common-Voice dataset.",
"author": "Fatih Akademi",
@ -853,10 +866,10 @@
}
}
},
"voice_conversion_models":{
"multilingual":{
"vctk":{
"freevc24":{
"voice_conversion_models": {
"multilingual": {
"vctk": {
"freevc24": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.13.0_models/voice_conversion_models--multilingual--vctk--freevc24.zip",
"description": "FreeVC model trained on VCTK dataset from https://github.com/OlaWod/FreeVC",
"author": "Jing-Yi Li @OlaWod",
@ -866,4 +879,4 @@
}
}
}
}
}

View File

@ -1 +1 @@
0.14.0
0.15.4

View File

@ -105,7 +105,7 @@ class CS_API:
"""List built-in Coqui Studio speakers."""
self._check_token()
conn = http.client.HTTPSConnection("app.coqui.ai")
conn.request("GET", f"{self.api_prefix}/speakers", headers=self.headers)
conn.request("GET", f"{self.api_prefix}/speakers?per_page=100", headers=self.headers)
res = conn.getresponse()
data = res.read()
return [Speaker(s) for s in json.loads(data)["result"]]
@ -130,7 +130,7 @@ class CS_API:
for speaker in self.speakers:
if speaker.name == name:
return speaker
raise ValueError(f"Speaker {name} not found.")
raise ValueError(f"Speaker {name} not found in {self.speakers}")
def id_to_speaker(self, speaker_id):
for speaker in self.speakers:
@ -264,6 +264,10 @@ class TTS:
>>> tts.tts_to_file("C'est le clonage de la voix.", speaker_wav="my/cloning/audio.wav", language="fr", file_path="thisisit.wav")
>>> tts.tts_to_file("Isso é clonagem de voz.", speaker_wav="my/cloning/audio.wav", language="pt", file_path="thisisit.wav")
Example Fairseq TTS models (uses ISO language codes in https://dl.fbaipublicfiles.com/mms/tts/all-tts-languages.html):
>>> tts = TTS(model_name="tts_models/eng/fairseq/vits", progress_bar=False, gpu=True)
>>> tts.tts_to_file("This is a test.", file_path="output.wav")
Args:
model_name (str, optional): Model name to load. You can list models by ```tts.models```. Defaults to None.
model_path (str, optional): Path to the model checkpoint. Defaults to None.
@ -342,7 +346,7 @@ class TTS:
def download_model_by_name(self, model_name: str):
model_path, config_path, model_item = self.manager.download_model(model_name)
if isinstance(model_item["github_rls_url"], list):
if "fairseq" in model_name or (model_item is not None and isinstance(model_item["model_url"], list)):
# return model directory if there are multiple files
# we assume that the model knows how to load itself
return None, None, None, None, model_path
@ -580,6 +584,8 @@ class TTS:
Speed factor to use for 🐸Coqui Studio models, between 0.0 and 2.0. Defaults to None.
file_path (str, optional):
Output file path. Defaults to "output.wav".
kwargs (dict, optional):
Additional arguments for the model.
"""
self._check_arguments(speaker=speaker, language=language, speaker_wav=speaker_wav, **kwargs)

View File

@ -356,7 +356,7 @@ If you don't specify any models, then it uses LJSpeech based English model.
vc_config_path = config_path
# tts model with multiple files to be loaded from the directory path
if isinstance(model_item["github_rls_url"], list):
if model_item.get("author", None) == "fairseq" or isinstance(model_item["model_url"], list):
model_dir = model_path
tts_path = None
tts_config_path = None

View File

@ -23,7 +23,7 @@ colormap = (
[0, 0, 0],
[183, 183, 183],
],
dtype=np.float,
dtype=float,
)
/ 255
)

View File

@ -0,0 +1,105 @@
import os
from dataclasses import dataclass, field
from typing import Dict
from TTS.tts.configs.shared_configs import BaseTTSConfig
from TTS.tts.layers.bark.model import GPTConfig
from TTS.tts.layers.bark.model_fine import FineGPTConfig
from TTS.tts.models.bark import BarkAudioConfig
from TTS.utils.generic_utils import get_user_data_dir
@dataclass
class BarkConfig(BaseTTSConfig):
"""Bark TTS configuration
Args:
model (str): model name that registers the model.
audio (BarkAudioConfig): audio configuration. Defaults to BarkAudioConfig().
num_chars (int): number of characters in the alphabet. Defaults to 0.
semantic_config (GPTConfig): semantic configuration. Defaults to GPTConfig().
fine_config (FineGPTConfig): fine configuration. Defaults to FineGPTConfig().
coarse_config (GPTConfig): coarse configuration. Defaults to GPTConfig().
CONTEXT_WINDOW_SIZE (int): GPT context window size. Defaults to 1024.
SEMANTIC_RATE_HZ (float): semantic tokens rate in Hz. Defaults to 49.9.
SEMANTIC_VOCAB_SIZE (int): semantic vocabulary size. Defaults to 10_000.
CODEBOOK_SIZE (int): encodec codebook size. Defaults to 1024.
N_COARSE_CODEBOOKS (int): number of coarse codebooks. Defaults to 2.
N_FINE_CODEBOOKS (int): number of fine codebooks. Defaults to 8.
COARSE_RATE_HZ (int): coarse tokens rate in Hz. Defaults to 75.
SAMPLE_RATE (int): sample rate. Defaults to 24_000.
USE_SMALLER_MODELS (bool): use smaller models. Defaults to False.
TEXT_ENCODING_OFFSET (int): text encoding offset. Defaults to 10_048.
SEMANTIC_PAD_TOKEN (int): semantic pad token. Defaults to 10_000.
TEXT_PAD_TOKEN ([type]): text pad token. Defaults to 10_048.
TEXT_EOS_TOKEN ([type]): text end of sentence token. Defaults to 10_049.
TEXT_SOS_TOKEN ([type]): text start of sentence token. Defaults to 10_050.
SEMANTIC_INFER_TOKEN (int): semantic infer token. Defaults to 10_051.
COARSE_SEMANTIC_PAD_TOKEN (int): coarse semantic pad token. Defaults to 12_048.
COARSE_INFER_TOKEN (int): coarse infer token. Defaults to 12_050.
REMOTE_BASE_URL ([type]): remote base url. Defaults to "https://huggingface.co/erogol/bark/tree".
REMOTE_MODEL_PATHS (Dict): remote model paths. Defaults to None.
LOCAL_MODEL_PATHS (Dict): local model paths. Defaults to None.
SMALL_REMOTE_MODEL_PATHS (Dict): small remote model paths. Defaults to None.
CACHE_DIR (str): local cache directory. Defaults to get_user_data_dir().
        DEF_SPEAKER_DIR (str): default speaker directory to store speaker values for voice cloning. Defaults to get_user_data_dir().
"""
model: str = "bark"
audio: BarkAudioConfig = field(default_factory=BarkAudioConfig)
num_chars: int = 0
semantic_config: GPTConfig = field(default_factory=GPTConfig)
fine_config: FineGPTConfig = field(default_factory=FineGPTConfig)
coarse_config: GPTConfig = field(default_factory=GPTConfig)
CONTEXT_WINDOW_SIZE: int = 1024
SEMANTIC_RATE_HZ: float = 49.9
SEMANTIC_VOCAB_SIZE: int = 10_000
CODEBOOK_SIZE: int = 1024
N_COARSE_CODEBOOKS: int = 2
N_FINE_CODEBOOKS: int = 8
COARSE_RATE_HZ: int = 75
SAMPLE_RATE: int = 24_000
USE_SMALLER_MODELS: bool = False
TEXT_ENCODING_OFFSET: int = 10_048
SEMANTIC_PAD_TOKEN: int = 10_000
TEXT_PAD_TOKEN: int = 129_595
SEMANTIC_INFER_TOKEN: int = 129_599
COARSE_SEMANTIC_PAD_TOKEN: int = 12_048
COARSE_INFER_TOKEN: int = 12_050
REMOTE_BASE_URL = "https://huggingface.co/erogol/bark/tree/main/"
REMOTE_MODEL_PATHS: Dict = None
LOCAL_MODEL_PATHS: Dict = None
SMALL_REMOTE_MODEL_PATHS: Dict = None
CACHE_DIR: str = str(get_user_data_dir("tts/suno/bark_v0"))
DEF_SPEAKER_DIR: str = str(get_user_data_dir("tts/bark_v0/speakers"))
def __post_init__(self):
self.REMOTE_MODEL_PATHS = {
"text": {
"path": os.path.join(self.REMOTE_BASE_URL, "text_2.pt"),
"checksum": "54afa89d65e318d4f5f80e8e8799026a",
},
"coarse": {
"path": os.path.join(self.REMOTE_BASE_URL, "coarse_2.pt"),
"checksum": "8a98094e5e3a255a5c9c0ab7efe8fd28",
},
"fine": {
"path": os.path.join(self.REMOTE_BASE_URL, "fine_2.pt"),
"checksum": "59d184ed44e3650774a2f0503a48a97b",
},
}
self.LOCAL_MODEL_PATHS = {
"text": os.path.join(self.CACHE_DIR, "text_2.pt"),
"coarse": os.path.join(self.CACHE_DIR, "coarse_2.pt"),
"fine": os.path.join(self.CACHE_DIR, "fine_2.pt"),
"hubert_tokenizer": os.path.join(self.CACHE_DIR, "tokenizer.pth"),
"hubert": os.path.join(self.CACHE_DIR, "hubert.pt"),
}
self.SMALL_REMOTE_MODEL_PATHS = {
"text": {"path": os.path.join(self.REMOTE_BASE_URL, "text.pt")},
"coarse": {"path": os.path.join(self.REMOTE_BASE_URL, "coarse.pt")},
"fine": {"path": os.path.join(self.REMOTE_BASE_URL, "fine.pt")},
}
self.sample_rate = self.SAMPLE_RATE # pylint: disable=attribute-defined-outside-init
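A quick usage sketch: instantiating the config is enough to populate the derived path tables in `__post_init__`. The module path is assumed from this commit's layout and requires the package at this revision:

```python
from TTS.tts.configs.bark_config import BarkConfig  # module path assumed

config = BarkConfig()
print(config.SAMPLE_RATE)                         # 24000
print(config.LOCAL_MODEL_PATHS["text"])           # <user data dir>/tts/suno/bark_v0/text_2.pt
print(config.REMOTE_MODEL_PATHS["text"]["path"])  # https://huggingface.co/erogol/bark/tree/main/text_2.pt
```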

View File

@ -113,7 +113,7 @@ class FastPitchConfig(BaseTTSConfig):
base_model: str = "forward_tts"
# model specific params
model_args: ForwardTTSArgs = ForwardTTSArgs()
model_args: ForwardTTSArgs = field(default_factory=ForwardTTSArgs)
# multi-speaker settings
num_speakers: int = 0
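This change, repeated for the FastSpeech, FastSpeech2, SpeedySpeech, and Tortoise configs below, is part of the Python 3.11 support in this merge: since 3.11, `dataclasses` rejects any class-level default whose type is unhashable, which includes instances of ordinary dataclasses such as `ForwardTTSArgs`. A minimal sketch of the failure and the fix:

```python
from dataclasses import dataclass, field

@dataclass
class Args:
    hidden_channels: int = 128

# On Python 3.11+ the following raises at class-definition time:
#   ValueError: mutable default <class 'Args'> for field model_args
#   is not allowed: use default_factory
# @dataclass
# class BadConfig:
#     model_args: Args = Args()

@dataclass
class GoodConfig:
    # default_factory builds a fresh Args per instance instead of
    # sharing one mutable object across all instances.
    model_args: Args = field(default_factory=Args)

print(GoodConfig().model_args.hidden_channels)  # 128
```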

View File

@ -107,7 +107,7 @@ class FastSpeechConfig(BaseTTSConfig):
base_model: str = "forward_tts"
# model specific params
model_args: ForwardTTSArgs = ForwardTTSArgs(use_pitch=False)
model_args: ForwardTTSArgs = field(default_factory=lambda: ForwardTTSArgs(use_pitch=False))
# multi-speaker settings
num_speakers: int = 0

View File

@ -123,7 +123,7 @@ class Fastspeech2Config(BaseTTSConfig):
base_model: str = "forward_tts"
# model specific params
model_args: ForwardTTSArgs = ForwardTTSArgs(use_pitch=True, use_energy=True)
model_args: ForwardTTSArgs = field(default_factory=lambda: ForwardTTSArgs(use_pitch=True, use_energy=True))
# multi-speaker settings
num_speakers: int = 0

View File

@ -103,26 +103,28 @@ class SpeedySpeechConfig(BaseTTSConfig):
base_model: str = "forward_tts"
# set model args as SpeedySpeech
model_args: ForwardTTSArgs = ForwardTTSArgs(
use_pitch=False,
encoder_type="residual_conv_bn",
encoder_params={
"kernel_size": 4,
"dilations": 4 * [1, 2, 4] + [1],
"num_conv_blocks": 2,
"num_res_blocks": 13,
},
decoder_type="residual_conv_bn",
decoder_params={
"kernel_size": 4,
"dilations": 4 * [1, 2, 4, 8] + [1],
"num_conv_blocks": 2,
"num_res_blocks": 17,
},
out_channels=80,
hidden_channels=128,
positional_encoding=True,
detach_duration_predictor=True,
model_args: ForwardTTSArgs = field(
default_factory=lambda: ForwardTTSArgs(
use_pitch=False,
encoder_type="residual_conv_bn",
encoder_params={
"kernel_size": 4,
"dilations": 4 * [1, 2, 4] + [1],
"num_conv_blocks": 2,
"num_res_blocks": 13,
},
decoder_type="residual_conv_bn",
decoder_params={
"kernel_size": 4,
"dilations": 4 * [1, 2, 4, 8] + [1],
"num_conv_blocks": 2,
"num_res_blocks": 17,
},
out_channels=80,
hidden_channels=128,
positional_encoding=True,
detach_duration_predictor=True,
)
)
# multi-speaker settings

View File

@ -70,7 +70,7 @@ class TortoiseConfig(BaseTTSConfig):
model: str = "tortoise"
# model specific params
model_args: TortoiseArgs = field(default_factory=TortoiseArgs)
audio: TortoiseAudioConfig = TortoiseAudioConfig()
audio: TortoiseAudioConfig = field(default_factory=TortoiseAudioConfig)
model_dir: str = None
# settings

View File

@ -0,0 +1,35 @@
# From https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer
import os.path
import shutil
import urllib.request
import huggingface_hub
class HubertManager:
@staticmethod
def make_sure_hubert_installed(
download_url: str = "https://dl.fbaipublicfiles.com/hubert/hubert_base_ls960.pt", model_path: str = ""
):
if not os.path.isfile(model_path):
print("Downloading HuBERT base model")
urllib.request.urlretrieve(download_url, model_path)
print("Downloaded HuBERT")
return model_path
return None
@staticmethod
def make_sure_tokenizer_installed(
model: str = "quantifier_hubert_base_ls960_14.pth",
repo: str = "GitMylo/bark-voice-cloning",
model_path: str = "",
):
model_dir = os.path.dirname(model_path)
if not os.path.isfile(model_path):
print("Downloading HuBERT custom tokenizer")
huggingface_hub.hf_hub_download(repo, model, local_dir=model_dir, local_dir_use_symlinks=False)
shutil.move(os.path.join(model_dir, model), model_path)
print("Downloaded tokenizer")
return model_path
return None
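A hedged usage sketch; the destination paths below are placeholders whose parent directory is assumed to exist, while in this diff the callers pass `BarkConfig.LOCAL_MODEL_PATHS` entries:

```python
from TTS.tts.layers.bark.hubert.hubert_manager import HubertManager  # module path assumed

# Each helper downloads only when the target file is missing; it returns
# the path on a fresh download and None when the file already exists.
HubertManager.make_sure_hubert_installed(model_path="cache/hubert.pt")
HubertManager.make_sure_tokenizer_installed(model_path="cache/tokenizer.pth")
```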

View File

@ -0,0 +1,80 @@
"""
Modified HuBERT model without kmeans.
Original author: https://github.com/lucidrains/
Modified by: https://www.github.com/gitmylo/
License: MIT
"""
# Modified code from https://github.com/lucidrains/audiolm-pytorch/blob/main/audiolm_pytorch/hubert_kmeans.py
import logging
from pathlib import Path
import torch
from einops import pack, unpack
from torch import nn
from torchaudio.functional import resample
from transformers import HubertModel
def round_down_nearest_multiple(num, divisor):
return num // divisor * divisor
def curtail_to_multiple(t, mult, from_left=False):
data_len = t.shape[-1]
rounded_seq_len = round_down_nearest_multiple(data_len, mult)
seq_slice = slice(None, rounded_seq_len) if not from_left else slice(-rounded_seq_len, None)
return t[..., seq_slice]
def exists(val):
return val is not None
def default(val, d):
return val if exists(val) else d
class CustomHubert(nn.Module):
"""
checkpoint and kmeans can be downloaded at https://github.com/facebookresearch/fairseq/tree/main/examples/hubert
or you can train your own
"""
def __init__(self, checkpoint_path, target_sample_hz=16000, seq_len_multiple_of=None, output_layer=9, device=None):
super().__init__()
self.target_sample_hz = target_sample_hz
self.seq_len_multiple_of = seq_len_multiple_of
self.output_layer = output_layer
if device is not None:
self.to(device)
self.model = HubertModel.from_pretrained("facebook/hubert-base-ls960")
if device is not None:
self.model.to(device)
self.model.eval()
@property
def groups(self):
return 1
@torch.no_grad()
def forward(self, wav_input, flatten=True, input_sample_hz=None):
device = wav_input.device
if exists(input_sample_hz):
wav_input = resample(wav_input, input_sample_hz, self.target_sample_hz)
if exists(self.seq_len_multiple_of):
wav_input = curtail_to_multiple(wav_input, self.seq_len_multiple_of)
outputs = self.model.forward(
wav_input,
output_hidden_states=True,
)
embed = outputs["hidden_states"][self.output_layer]
embed, packed_shape = pack([embed], "* d")
codebook_indices = torch.from_numpy(embed.cpu().detach().numpy()).to(device)
if flatten:
return codebook_indices
(codebook_indices,) = unpack(codebook_indices, packed_shape, "*")
return codebook_indices
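Together with the tokenizer defined in the next file, this gives a waveform-to-semantic-token pipeline, which `generate_voice` later in this diff relies on. A hedged sketch with placeholder paths and a mono input; note this modified class loads `facebook/hubert-base-ls960` through `transformers` and keeps `checkpoint_path` only for interface compatibility:

```python
import torchaudio

from TTS.tts.layers.bark.hubert.kmeans_hubert import CustomHubert  # module paths assumed
from TTS.tts.layers.bark.hubert.tokenizer import HubertTokenizer

hubert = CustomHubert(checkpoint_path="cache/hubert.pt")  # checkpoint_path unused here
tokenizer = HubertTokenizer.load_from_checkpoint("cache/tokenizer.pth")

wav, sr = torchaudio.load("speaker.wav")            # mono: shape (1, samples)
features = hubert.forward(wav, input_sample_hz=sr)  # (frames, 768) hidden states
tokens = tokenizer.get_token(features)              # (frames,) discrete semantic ids
```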

View File

@ -0,0 +1,196 @@
"""
Custom tokenizer model.
Author: https://www.github.com/gitmylo/
License: MIT
"""
import json
import os.path
from zipfile import ZipFile
import numpy
import torch
from torch import nn, optim
from torch.serialization import MAP_LOCATION
class HubertTokenizer(nn.Module):
def __init__(self, hidden_size=1024, input_size=768, output_size=10000, version=0):
super().__init__()
next_size = input_size
if version == 0:
self.lstm = nn.LSTM(input_size, hidden_size, 2, batch_first=True)
next_size = hidden_size
if version == 1:
self.lstm = nn.LSTM(input_size, hidden_size, 2, batch_first=True)
self.intermediate = nn.Linear(hidden_size, 4096)
next_size = 4096
self.fc = nn.Linear(next_size, output_size)
self.softmax = nn.LogSoftmax(dim=1)
self.optimizer: optim.Optimizer = None
self.lossfunc = nn.CrossEntropyLoss()
self.input_size = input_size
self.hidden_size = hidden_size
self.output_size = output_size
self.version = version
def forward(self, x):
x, _ = self.lstm(x)
if self.version == 1:
x = self.intermediate(x)
x = self.fc(x)
x = self.softmax(x)
return x
@torch.no_grad()
def get_token(self, x):
"""
        Used to get the discrete token(s) for the given HuBERT feature vectors.
:param x: An array with shape (N, input_size) where N is a whole number greater or equal to 1, and input_size is the input size used when creating the model.
:return: An array with shape (N,) where N is the same as N from the input. Every number in the array is a whole number in range 0...output_size - 1 where output_size is the output size used when creating the model.
"""
return torch.argmax(self(x), dim=1)
def prepare_training(self):
self.optimizer = optim.Adam(self.parameters(), 0.001)
def train_step(self, x_train, y_train, log_loss=False):
# y_train = y_train[:-1]
# y_train = y_train[1:]
optimizer = self.optimizer
lossfunc = self.lossfunc
# Zero the gradients
self.zero_grad()
# Forward pass
y_pred = self(x_train)
y_train_len = len(y_train)
y_pred_len = y_pred.shape[0]
if y_train_len > y_pred_len:
diff = y_train_len - y_pred_len
y_train = y_train[diff:]
elif y_train_len < y_pred_len:
diff = y_pred_len - y_train_len
y_pred = y_pred[:-diff, :]
y_train_hot = torch.zeros(len(y_train), self.output_size)
y_train_hot[range(len(y_train)), y_train] = 1
y_train_hot = y_train_hot.to("cuda")
# Calculate the loss
loss = lossfunc(y_pred, y_train_hot)
# Print loss
if log_loss:
print("Loss", loss.item())
# Backward pass
loss.backward()
# Update the weights
optimizer.step()
def save(self, path):
info_path = ".".join(os.path.basename(path).split(".")[:-1]) + "/.info"
torch.save(self.state_dict(), path)
data_from_model = Data(self.input_size, self.hidden_size, self.output_size, self.version)
with ZipFile(path, "a") as model_zip:
model_zip.writestr(info_path, data_from_model.save())
model_zip.close()
@staticmethod
def load_from_checkpoint(path, map_location: MAP_LOCATION = None):
old = True
with ZipFile(path) as model_zip:
filesMatch = [file for file in model_zip.namelist() if file.endswith("/.info")]
file = filesMatch[0] if filesMatch else None
if file:
old = False
data_from_model = Data.load(model_zip.read(file).decode("utf-8"))
model_zip.close()
if old:
model = HubertTokenizer()
else:
model = HubertTokenizer(
data_from_model.hidden_size,
data_from_model.input_size,
data_from_model.output_size,
data_from_model.version,
)
model.load_state_dict(torch.load(path))
if map_location:
model = model.to(map_location)
return model
class Data:
input_size: int
hidden_size: int
output_size: int
version: int
def __init__(self, input_size=768, hidden_size=1024, output_size=10000, version=0):
self.input_size = input_size
self.hidden_size = hidden_size
self.output_size = output_size
self.version = version
@staticmethod
def load(string):
data = json.loads(string)
return Data(data["input_size"], data["hidden_size"], data["output_size"], data["version"])
def save(self):
data = {
"input_size": self.input_size,
"hidden_size": self.hidden_size,
"output_size": self.output_size,
"version": self.version,
}
return json.dumps(data)
def auto_train(data_path, save_path="model.pth", load_model: str = None, save_epochs=1):
data_x, data_y = [], []
if load_model and os.path.isfile(load_model):
print("Loading model from", load_model)
model_training = HubertTokenizer.load_from_checkpoint(load_model, "cuda")
else:
print("Creating new model.")
model_training = HubertTokenizer(version=1).to("cuda") # Settings for the model to run without lstm
save_path = os.path.join(data_path, save_path)
base_save_path = ".".join(save_path.split(".")[:-1])
sem_string = "_semantic.npy"
feat_string = "_semantic_features.npy"
ready = os.path.join(data_path, "ready")
for input_file in os.listdir(ready):
full_path = os.path.join(ready, input_file)
if input_file.endswith(sem_string):
data_y.append(numpy.load(full_path))
elif input_file.endswith(feat_string):
data_x.append(numpy.load(full_path))
model_training.prepare_training()
epoch = 1
while 1:
for _ in range(save_epochs):
j = 0
for x, y in zip(data_x, data_y):
model_training.train_step(
torch.tensor(x).to("cuda"), torch.tensor(y).to("cuda"), j % 50 == 0
) # Print loss every 50 steps
j += 1
save_p = save_path
save_p_2 = f"{base_save_path}_epoch_{epoch}.pth"
model_training.save(save_p)
model_training.save(save_p_2)
print(f"Epoch {epoch} completed")
epoch += 1
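A sketch of the invocation and the on-disk layout `auto_train` expects, inferred from the suffix constants above; the directory name is hypothetical and a CUDA device is required by the hard-coded `.to("cuda")` calls:

```python
# my_data/ready/0_semantic_features.npy  -> HuBERT features (model input)
# my_data/ready/0_semantic.npy           -> target semantic tokens (labels)
# Trains indefinitely, writing model.pth plus a per-epoch checkpoint
# every `save_epochs` epochs.
auto_train("my_data", save_path="model.pth", save_epochs=1)
```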

View File

@ -0,0 +1,558 @@
import logging
import os
import re
from glob import glob
from typing import Dict, List
import librosa
import numpy as np
import torch
import torchaudio
import tqdm
from encodec.utils import convert_audio
from scipy.special import softmax
from torch.nn import functional as F
from TTS.tts.layers.bark.hubert.hubert_manager import HubertManager
from TTS.tts.layers.bark.hubert.kmeans_hubert import CustomHubert
from TTS.tts.layers.bark.hubert.tokenizer import HubertTokenizer
from TTS.tts.layers.bark.load_model import clear_cuda_cache, inference_mode
logger = logging.getLogger(__name__)
def _tokenize(tokenizer, text):
return tokenizer.encode(text, add_special_tokens=False)
def _detokenize(tokenizer, enc_text):
return tokenizer.decode(enc_text)
def _normalize_whitespace(text):
return re.sub(r"\s+", " ", text).strip()
def get_voices(extra_voice_dirs: List[str] = []): # pylint: disable=dangerous-default-value
dirs = extra_voice_dirs
voices: Dict[str, List[str]] = {}
for d in dirs:
subs = os.listdir(d)
for sub in subs:
subj = os.path.join(d, sub)
if os.path.isdir(subj):
voices[sub] = list(glob(f"{subj}/*.npz"))
# fetch audio files if no npz files are found
if len(voices[sub]) == 0:
voices[sub] = list(glob(f"{subj}/*.wav")) + list(glob(f"{subj}/*.mp3"))
return voices
def load_npz(npz_file):
x_history = np.load(npz_file)
semantic = x_history["semantic_prompt"]
coarse = x_history["coarse_prompt"]
fine = x_history["fine_prompt"]
return semantic, coarse, fine
def load_voice(model, voice: str, extra_voice_dirs: List[str] = []): # pylint: disable=dangerous-default-value
if voice == "random":
return None, None, None
voices = get_voices(extra_voice_dirs)
    try:
        paths = voices[voice]
    except KeyError as e:
        raise KeyError(f"Voice {voice} not found in {extra_voice_dirs}") from e
    # bark only uses a single sample for cloning
    if len(paths) > 1:
        raise ValueError(f"Voice {voice} has multiple paths: {paths}")
    if len(paths) == 1 and paths[0].endswith(".npz"):
        return load_npz(paths[0])
audio_path = paths[0]
# replace the file extension with .npz
output_path = os.path.splitext(audio_path)[0] + ".npz"
generate_voice(audio=audio_path, model=model, output_path=output_path)
return load_voice(model, voice, extra_voice_dirs)
def zero_crossing_rate(audio, frame_length=1024, hop_length=512):
zero_crossings = np.sum(np.abs(np.diff(np.sign(audio))) / 2)
total_frames = 1 + int((len(audio) - frame_length) / hop_length)
return zero_crossings / total_frames
def compute_spectral_contrast(audio_data, sample_rate, n_bands=6, fmin=200.0):
spectral_contrast = librosa.feature.spectral_contrast(y=audio_data, sr=sample_rate, n_bands=n_bands, fmin=fmin)
return np.mean(spectral_contrast)
def compute_average_bass_energy(audio_data, sample_rate, max_bass_freq=250):
stft = librosa.stft(audio_data)
power_spectrogram = np.abs(stft) ** 2
frequencies = librosa.fft_frequencies(sr=sample_rate, n_fft=stft.shape[0])
bass_mask = frequencies <= max_bass_freq
bass_energy = power_spectrogram[np.ix_(bass_mask, np.arange(power_spectrogram.shape[1]))].mean()
return bass_energy
def generate_voice(
audio,
model,
output_path,
):
"""Generate a new voice from a given audio and text prompt.
Args:
audio (np.ndarray): The audio to use as a base for the new voice.
text (str): Transcription of the audio you are clonning.
model (BarkModel): The BarkModel to use for generating the new voice.
output_path (str): The path to save the generated voice to.
"""
if isinstance(audio, str):
audio, sr = torchaudio.load(audio)
audio = convert_audio(audio, sr, model.config.sample_rate, model.encodec.channels)
audio = audio.unsqueeze(0).to(model.device)
with torch.no_grad():
encoded_frames = model.encodec.encode(audio)
codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1).squeeze() # [n_q, T]
# move codes to cpu
codes = codes.cpu().numpy()
# generate semantic tokens
# Load the HuBERT model
hubert_manager = HubertManager()
# hubert_manager.make_sure_hubert_installed(model_path=model.config.LOCAL_MODEL_PATHS["hubert"])
hubert_manager.make_sure_tokenizer_installed(model_path=model.config.LOCAL_MODEL_PATHS["hubert_tokenizer"])
hubert_model = CustomHubert(checkpoint_path=model.config.LOCAL_MODEL_PATHS["hubert"]).to(model.device)
# Load the CustomTokenizer model
tokenizer = HubertTokenizer.load_from_checkpoint(model.config.LOCAL_MODEL_PATHS["hubert_tokenizer"]).to(
model.device
) # Automatically uses
# semantic_tokens = model.text_to_semantic(
# text, max_gen_duration_s=seconds, top_k=50, top_p=0.95, temp=0.7
# ) # not 100%
semantic_vectors = hubert_model.forward(audio[0], input_sample_hz=model.config.sample_rate)
semantic_tokens = tokenizer.get_token(semantic_vectors)
semantic_tokens = semantic_tokens.cpu().numpy()
np.savez(output_path, fine_prompt=codes, coarse_prompt=codes[:2, :], semantic_prompt=semantic_tokens)
def generate_text_semantic(
text,
model,
history_prompt=None,
temp=0.7,
top_k=None,
top_p=None,
silent=False,
min_eos_p=0.2,
max_gen_duration_s=None,
allow_early_stop=True,
base=None,
use_kv_caching=True,
):
"""Generate semantic tokens from text."""
print(f"history_prompt in gen: {history_prompt}")
assert isinstance(text, str)
text = _normalize_whitespace(text)
assert len(text.strip()) > 0
if all(v is not None for v in history_prompt) or base is not None:
if history_prompt is not None:
semantic_history = history_prompt[0]
if base is not None:
semantic_history = base[0]
assert (
isinstance(semantic_history, np.ndarray)
and len(semantic_history.shape) == 1
and len(semantic_history) > 0
and semantic_history.min() >= 0
and semantic_history.max() <= model.config.SEMANTIC_VOCAB_SIZE - 1
)
else:
semantic_history = None
encoded_text = np.array(_tokenize(model.tokenizer, text)) + model.config.TEXT_ENCODING_OFFSET
if len(encoded_text) > 256:
p = round((len(encoded_text) - 256) / len(encoded_text) * 100, 1)
logger.warning(f"warning, text too long, lopping of last {p}%")
encoded_text = encoded_text[:256]
encoded_text = np.pad(
encoded_text,
(0, 256 - len(encoded_text)),
constant_values=model.config.TEXT_PAD_TOKEN,
mode="constant",
)
if semantic_history is not None:
semantic_history = semantic_history.astype(np.int64)
# lop off if history is too long, pad if needed
semantic_history = semantic_history[-256:]
semantic_history = np.pad(
semantic_history,
(0, 256 - len(semantic_history)),
constant_values=model.config.SEMANTIC_PAD_TOKEN,
mode="constant",
)
else:
semantic_history = np.array([model.config.SEMANTIC_PAD_TOKEN] * 256)
x = torch.from_numpy(
np.hstack([encoded_text, semantic_history, np.array([model.config.SEMANTIC_INFER_TOKEN])]).astype(np.int64)
)[None]
assert x.shape[1] == 256 + 256 + 1
with inference_mode():
x = x.to(model.device)
n_tot_steps = 768
# custom tqdm updates since we don't know when eos will occur
pbar = tqdm.tqdm(disable=silent, total=100)
pbar_state = 0
tot_generated_duration_s = 0
kv_cache = None
for n in range(n_tot_steps):
if use_kv_caching and kv_cache is not None:
x_input = x[:, [-1]]
else:
x_input = x
logits, kv_cache = model.semantic_model(
x_input, merge_context=True, use_cache=use_kv_caching, past_kv=kv_cache
)
relevant_logits = logits[0, 0, : model.config.SEMANTIC_VOCAB_SIZE]
if allow_early_stop:
relevant_logits = torch.hstack(
(relevant_logits, logits[0, 0, [model.config.SEMANTIC_PAD_TOKEN]])
) # eos
if top_p is not None:
# faster to convert to numpy
logits_device = relevant_logits.device
logits_dtype = relevant_logits.type()
relevant_logits = relevant_logits.detach().cpu().type(torch.float32).numpy()
sorted_indices = np.argsort(relevant_logits)[::-1]
sorted_logits = relevant_logits[sorted_indices]
cumulative_probs = np.cumsum(softmax(sorted_logits))
sorted_indices_to_remove = cumulative_probs > top_p
sorted_indices_to_remove[1:] = sorted_indices_to_remove[:-1].copy()
sorted_indices_to_remove[0] = False
relevant_logits[sorted_indices[sorted_indices_to_remove]] = -np.inf
relevant_logits = torch.from_numpy(relevant_logits)
relevant_logits = relevant_logits.to(logits_device).type(logits_dtype)
if top_k is not None:
v, _ = torch.topk(relevant_logits, min(top_k, relevant_logits.size(-1)))
relevant_logits[relevant_logits < v[-1]] = -float("Inf")
probs = torch.softmax(relevant_logits / temp, dim=-1)
item_next = torch.multinomial(probs, num_samples=1)
if allow_early_stop and (
item_next == model.config.SEMANTIC_VOCAB_SIZE or (min_eos_p is not None and probs[-1] >= min_eos_p)
):
# eos found, so break
pbar.update(100 - pbar_state)
break
x = torch.cat((x, item_next[None]), dim=1)
tot_generated_duration_s += 1 / model.config.SEMANTIC_RATE_HZ
if max_gen_duration_s is not None and tot_generated_duration_s > max_gen_duration_s:
pbar.update(100 - pbar_state)
break
if n == n_tot_steps - 1:
pbar.update(100 - pbar_state)
break
del logits, relevant_logits, probs, item_next
req_pbar_state = np.min([100, int(round(100 * n / n_tot_steps))])
if req_pbar_state > pbar_state:
pbar.update(req_pbar_state - pbar_state)
pbar_state = req_pbar_state
pbar.close()
out = x.detach().cpu().numpy().squeeze()[256 + 256 + 1 :]
assert all(out >= 0) and all(out < model.config.SEMANTIC_VOCAB_SIZE)
clear_cuda_cache()
return out
def _flatten_codebooks(arr, offset_size):
assert len(arr.shape) == 2
arr = arr.copy()
if offset_size is not None:
for n in range(1, arr.shape[0]):
arr[n, :] += offset_size * n
flat_arr = arr.ravel("F")
return flat_arr
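For intuition, a small worked example of the helper above: each codebook row is shifted into its own id range, then the Fortran-order ravel interleaves the codebooks frame by frame:

```python
import numpy as np

arr = np.array([[0, 1, 2],    # codebook 0
                [0, 1, 2]])   # codebook 1
# offset_size=1024 shifts row 1 to [1024, 1025, 1026]; column-major
# ravel then yields one entry per codebook per frame:
print(_flatten_codebooks(arr, 1024))  # [   0 1024    1 1025    2 1026]
```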
def generate_coarse(
x_semantic,
model,
history_prompt=None,
temp=0.7,
top_k=None,
top_p=None,
silent=False,
max_coarse_history=630, # min 60 (faster), max 630 (more context)
sliding_window_len=60,
base=None,
use_kv_caching=True,
):
"""Generate coarse audio codes from semantic tokens."""
assert (
isinstance(x_semantic, np.ndarray)
and len(x_semantic.shape) == 1
and len(x_semantic) > 0
and x_semantic.min() >= 0
and x_semantic.max() <= model.config.SEMANTIC_VOCAB_SIZE - 1
)
assert 60 <= max_coarse_history <= 630
assert max_coarse_history + sliding_window_len <= 1024 - 256
semantic_to_coarse_ratio = (
model.config.COARSE_RATE_HZ / model.config.SEMANTIC_RATE_HZ * model.config.N_COARSE_CODEBOOKS
)
max_semantic_history = int(np.floor(max_coarse_history / semantic_to_coarse_ratio))
if all(v is not None for v in history_prompt) or base is not None:
if history_prompt is not None:
x_history = history_prompt
x_semantic_history = x_history[0]
x_coarse_history = x_history[1]
if base is not None:
x_semantic_history = base[0]
x_coarse_history = base[1]
assert (
isinstance(x_semantic_history, np.ndarray)
and len(x_semantic_history.shape) == 1
and len(x_semantic_history) > 0
and x_semantic_history.min() >= 0
and x_semantic_history.max() <= model.config.SEMANTIC_VOCAB_SIZE - 1
and isinstance(x_coarse_history, np.ndarray)
and len(x_coarse_history.shape) == 2
and x_coarse_history.shape[0] == model.config.N_COARSE_CODEBOOKS
and x_coarse_history.shape[-1] >= 0
and x_coarse_history.min() >= 0
and x_coarse_history.max() <= model.config.CODEBOOK_SIZE - 1
and (
round(x_coarse_history.shape[-1] / len(x_semantic_history), 1)
== round(semantic_to_coarse_ratio / model.config.N_COARSE_CODEBOOKS, 1)
)
)
x_coarse_history = (
_flatten_codebooks(x_coarse_history, model.config.CODEBOOK_SIZE) + model.config.SEMANTIC_VOCAB_SIZE
)
# trim histories correctly
n_semantic_hist_provided = np.min(
[
max_semantic_history,
len(x_semantic_history) - len(x_semantic_history) % 2,
int(np.floor(len(x_coarse_history) / semantic_to_coarse_ratio)),
]
)
n_coarse_hist_provided = int(round(n_semantic_hist_provided * semantic_to_coarse_ratio))
x_semantic_history = x_semantic_history[-n_semantic_hist_provided:].astype(np.int32)
x_coarse_history = x_coarse_history[-n_coarse_hist_provided:].astype(np.int32)
# TODO: bit of a hack for time alignment (sounds better)
x_coarse_history = x_coarse_history[:-2]
else:
x_semantic_history = np.array([], dtype=np.int32)
x_coarse_history = np.array([], dtype=np.int32)
# start loop
n_steps = int(
round(
np.floor(len(x_semantic) * semantic_to_coarse_ratio / model.config.N_COARSE_CODEBOOKS)
* model.config.N_COARSE_CODEBOOKS
)
)
assert n_steps > 0 and n_steps % model.config.N_COARSE_CODEBOOKS == 0
x_semantic = np.hstack([x_semantic_history, x_semantic]).astype(np.int32)
x_coarse = x_coarse_history.astype(np.int32)
base_semantic_idx = len(x_semantic_history)
with inference_mode():
x_semantic_in = torch.from_numpy(x_semantic)[None].to(model.device)
x_coarse_in = torch.from_numpy(x_coarse)[None].to(model.device)
n_window_steps = int(np.ceil(n_steps / sliding_window_len))
n_step = 0
for _ in tqdm.tqdm(range(n_window_steps), total=n_window_steps, disable=silent):
semantic_idx = base_semantic_idx + int(round(n_step / semantic_to_coarse_ratio))
# pad from right side
x_in = x_semantic_in[:, np.max([0, semantic_idx - max_semantic_history]) :]
x_in = x_in[:, :256]
x_in = F.pad(
x_in,
(0, 256 - x_in.shape[-1]),
"constant",
model.config.COARSE_SEMANTIC_PAD_TOKEN,
)
x_in = torch.hstack(
[
x_in,
torch.tensor([model.config.COARSE_INFER_TOKEN])[None].to(model.device),
x_coarse_in[:, -max_coarse_history:],
]
)
kv_cache = None
for _ in range(sliding_window_len):
if n_step >= n_steps:
continue
is_major_step = n_step % model.config.N_COARSE_CODEBOOKS == 0
if use_kv_caching and kv_cache is not None:
x_input = x_in[:, [-1]]
else:
x_input = x_in
logits, kv_cache = model.coarse_model(x_input, use_cache=use_kv_caching, past_kv=kv_cache)
logit_start_idx = (
model.config.SEMANTIC_VOCAB_SIZE + (1 - int(is_major_step)) * model.config.CODEBOOK_SIZE
)
logit_end_idx = model.config.SEMANTIC_VOCAB_SIZE + (2 - int(is_major_step)) * model.config.CODEBOOK_SIZE
relevant_logits = logits[0, 0, logit_start_idx:logit_end_idx]
if top_p is not None:
# faster to convert to numpy
logits_device = relevant_logits.device
logits_dtype = relevant_logits.type()
relevant_logits = relevant_logits.detach().cpu().type(torch.float32).numpy()
sorted_indices = np.argsort(relevant_logits)[::-1]
sorted_logits = relevant_logits[sorted_indices]
                    cumulative_probs = np.cumsum(softmax(sorted_logits))
sorted_indices_to_remove = cumulative_probs > top_p
sorted_indices_to_remove[1:] = sorted_indices_to_remove[:-1].copy()
sorted_indices_to_remove[0] = False
relevant_logits[sorted_indices[sorted_indices_to_remove]] = -np.inf
relevant_logits = torch.from_numpy(relevant_logits)
relevant_logits = relevant_logits.to(logits_device).type(logits_dtype)
if top_k is not None:
v, _ = torch.topk(relevant_logits, min(top_k, relevant_logits.size(-1)))
relevant_logits[relevant_logits < v[-1]] = -float("Inf")
probs = torch.nn.functional.softmax(relevant_logits / temp, dim=-1)
item_next = torch.multinomial(probs, num_samples=1)
item_next += logit_start_idx
x_coarse_in = torch.cat((x_coarse_in, item_next[None]), dim=1)
x_in = torch.cat((x_in, item_next[None]), dim=1)
del logits, relevant_logits, probs, item_next
n_step += 1
del x_in
del x_semantic_in
gen_coarse_arr = x_coarse_in.detach().cpu().numpy().squeeze()[len(x_coarse_history) :]
del x_coarse_in
assert len(gen_coarse_arr) == n_steps
gen_coarse_audio_arr = (
gen_coarse_arr.reshape(-1, model.config.N_COARSE_CODEBOOKS).T - model.config.SEMANTIC_VOCAB_SIZE
)
for n in range(1, model.config.N_COARSE_CODEBOOKS):
gen_coarse_audio_arr[n, :] -= n * model.config.CODEBOOK_SIZE
clear_cuda_cache()
return gen_coarse_audio_arr
def generate_fine(
x_coarse_gen,
model,
history_prompt=None,
temp=0.5,
silent=True,
base=None,
):
"""Generate full audio codes from coarse audio codes."""
assert (
isinstance(x_coarse_gen, np.ndarray)
and len(x_coarse_gen.shape) == 2
and 1 <= x_coarse_gen.shape[0] <= model.config.N_FINE_CODEBOOKS - 1
and x_coarse_gen.shape[1] > 0
and x_coarse_gen.min() >= 0
and x_coarse_gen.max() <= model.config.CODEBOOK_SIZE - 1
)
if all(v is not None for v in history_prompt) or base is not None:
if history_prompt is not None:
x_fine_history = history_prompt[2]
if base is not None:
x_fine_history = base[2]
assert (
isinstance(x_fine_history, np.ndarray)
and len(x_fine_history.shape) == 2
and x_fine_history.shape[0] == model.config.N_FINE_CODEBOOKS
and x_fine_history.shape[1] >= 0
and x_fine_history.min() >= 0
and x_fine_history.max() <= model.config.CODEBOOK_SIZE - 1
)
else:
x_fine_history = None
n_coarse = x_coarse_gen.shape[0]
# make input arr
in_arr = np.vstack(
[
x_coarse_gen,
np.zeros((model.config.N_FINE_CODEBOOKS - n_coarse, x_coarse_gen.shape[1]))
+ model.config.CODEBOOK_SIZE, # padding
]
).astype(np.int32)
# prepend history if available (max 512)
if x_fine_history is not None:
x_fine_history = x_fine_history.astype(np.int32)
in_arr = np.hstack(
[
x_fine_history[:, -512:].astype(np.int32),
in_arr,
]
)
n_history = x_fine_history[:, -512:].shape[1]
else:
n_history = 0
n_remove_from_end = 0
# need to pad if too short (since non-causal model)
if in_arr.shape[1] < 1024:
n_remove_from_end = 1024 - in_arr.shape[1]
in_arr = np.hstack(
[
in_arr,
np.zeros((model.config.N_FINE_CODEBOOKS, n_remove_from_end), dtype=np.int32)
+ model.config.CODEBOOK_SIZE,
]
)
# we can be lazy about fractional loop and just keep overwriting codebooks
n_loops = np.max([0, int(np.ceil((x_coarse_gen.shape[1] - (1024 - n_history)) / 512))]) + 1
with inference_mode():
in_arr = torch.tensor(in_arr.T).to(model.device)
for n in tqdm.tqdm(range(n_loops), disable=silent):
start_idx = np.min([n * 512, in_arr.shape[0] - 1024])
start_fill_idx = np.min([n_history + n * 512, in_arr.shape[0] - 512])
rel_start_fill_idx = start_fill_idx - start_idx
in_buffer = in_arr[start_idx : start_idx + 1024, :][None]
for nn in range(n_coarse, model.config.N_FINE_CODEBOOKS):
logits = model.fine_model(nn, in_buffer)
if temp is None:
relevant_logits = logits[0, rel_start_fill_idx:, : model.config.CODEBOOK_SIZE]
codebook_preds = torch.argmax(relevant_logits, -1)
else:
relevant_logits = logits[0, :, : model.config.CODEBOOK_SIZE] / temp
probs = F.softmax(relevant_logits, dim=-1)
codebook_preds = torch.hstack(
[torch.multinomial(probs[n], num_samples=1) for n in range(rel_start_fill_idx, 1024)]
)
in_buffer[0, rel_start_fill_idx:, nn] = codebook_preds
del logits, codebook_preds
# transfer over info into model_in and convert to numpy
for nn in range(n_coarse, model.config.N_FINE_CODEBOOKS):
in_arr[start_fill_idx : start_fill_idx + (1024 - rel_start_fill_idx), nn] = in_buffer[
0, rel_start_fill_idx:, nn
]
del in_buffer
gen_fine_arr = in_arr.detach().cpu().numpy().squeeze().T
del in_arr
gen_fine_arr = gen_fine_arr[:, n_history:]
if n_remove_from_end > 0:
gen_fine_arr = gen_fine_arr[:, :-n_remove_from_end]
assert gen_fine_arr.shape[-1] == x_coarse_gen.shape[-1]
clear_cuda_cache()
return gen_fine_arr
def codec_decode(fine_tokens, model):
"""Turn quantized audio codes into audio array using encodec."""
arr = torch.from_numpy(fine_tokens)[None]
arr = arr.to(model.device)
arr = arr.transpose(0, 1)
emb = model.encodec.quantizer.decode(arr)
out = model.encodec.decoder(emb)
audio_arr = out.detach().cpu().numpy().squeeze()
return audio_arr
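# Illustrative end-of-pipeline sketch (names are assumptions): fine tokens of
# shape (n_fine_codebooks, T) come from generate_fine() and decode to a 1-D
# waveform at EnCodec's 24 kHz sample rate.
def _decode_sketch(model, x_coarse_gen, history_prompt=None):
    fine_tokens = generate_fine(x_coarse_gen, model, history_prompt=history_prompt)
    return codec_decode(fine_tokens, model)  # np.ndarray, shape (n_samples,)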


@ -0,0 +1,160 @@
import contextlib
import functools
import hashlib
import logging
import os
import requests
import torch
import tqdm
from TTS.tts.layers.bark.model import GPT, GPTConfig
from TTS.tts.layers.bark.model_fine import FineGPT, FineGPTConfig
if (
torch.cuda.is_available()
and hasattr(torch.cuda, "amp")
and hasattr(torch.cuda.amp, "autocast")
and torch.cuda.is_bf16_supported()
):
autocast = functools.partial(torch.cuda.amp.autocast, dtype=torch.bfloat16)
else:
@contextlib.contextmanager
def autocast():
yield
# hold models in global scope to lazy load
logger = logging.getLogger(__name__)
if not hasattr(torch.nn.functional, "scaled_dot_product_attention"):
logger.warning(
"torch version does not support flash attention. You will get significantly faster"
+ " inference speed by upgrade torch to newest version / nightly."
)
def _md5(fname):
hash_md5 = hashlib.md5()
with open(fname, "rb") as f:
for chunk in iter(lambda: f.read(4096), b""):
hash_md5.update(chunk)
return hash_md5.hexdigest()
def _download(from_s3_path, to_local_path, CACHE_DIR):
os.makedirs(CACHE_DIR, exist_ok=True)
response = requests.get(from_s3_path, stream=True)
total_size_in_bytes = int(response.headers.get("content-length", 0))
block_size = 1024 # 1 Kibibyte
progress_bar = tqdm.tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True)
with open(to_local_path, "wb") as file:
for data in response.iter_content(block_size):
progress_bar.update(len(data))
file.write(data)
progress_bar.close()
if total_size_in_bytes not in [0, progress_bar.n]:
raise ValueError("ERROR, something went wrong")
class InferenceContext:
def __init__(self, benchmark=False):
# we can't expect inputs to be the same length, so disable benchmarking by default
self._chosen_cudnn_benchmark = benchmark
self._cudnn_benchmark = None
def __enter__(self):
self._cudnn_benchmark = torch.backends.cudnn.benchmark
torch.backends.cudnn.benchmark = self._chosen_cudnn_benchmark
def __exit__(self, exc_type, exc_value, exc_traceback):
torch.backends.cudnn.benchmark = self._cudnn_benchmark
if torch.cuda.is_available():
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
@contextlib.contextmanager
def inference_mode():
with InferenceContext(), torch.inference_mode(), torch.no_grad(), autocast():
yield
def clear_cuda_cache():
if torch.cuda.is_available():
torch.cuda.empty_cache()
torch.cuda.synchronize()
def load_model(ckpt_path, device, config, model_type="text"):
logger.info(f"loading {model_type} model from {ckpt_path}...")
if device == "cpu":
logger.warning("No GPU being used. Careful, Inference might be extremely slow!")
if model_type == "text":
ConfigClass = GPTConfig
ModelClass = GPT
elif model_type == "coarse":
ConfigClass = GPTConfig
ModelClass = GPT
elif model_type == "fine":
ConfigClass = FineGPTConfig
ModelClass = FineGPT
else:
raise NotImplementedError()
if (
not config.USE_SMALLER_MODELS
and os.path.exists(ckpt_path)
and _md5(ckpt_path) != config.REMOTE_MODEL_PATHS[model_type]["checksum"]
):
logger.warning(f"found outdated {model_type} model, removing...")
os.remove(ckpt_path)
if not os.path.exists(ckpt_path):
logger.info(f"{model_type} model not found, downloading...")
_download(config.REMOTE_MODEL_PATHS[model_type]["path"], ckpt_path, config.CACHE_DIR)
checkpoint = torch.load(ckpt_path, map_location=device)
# this is a hack
model_args = checkpoint["model_args"]
if "input_vocab_size" not in model_args:
model_args["input_vocab_size"] = model_args["vocab_size"]
model_args["output_vocab_size"] = model_args["vocab_size"]
del model_args["vocab_size"]
gptconf = ConfigClass(**checkpoint["model_args"])
if model_type == "text":
config.semantic_config = gptconf
elif model_type == "coarse":
config.coarse_config = gptconf
elif model_type == "fine":
config.fine_config = gptconf
model = ModelClass(gptconf)
state_dict = checkpoint["model"]
# fixup checkpoint
unwanted_prefix = "_orig_mod."
for k, _ in list(state_dict.items()):
if k.startswith(unwanted_prefix):
state_dict[k[len(unwanted_prefix) :]] = state_dict.pop(k)
extra_keys = set(state_dict.keys()) - set(model.state_dict().keys())
extra_keys = set(k for k in extra_keys if not k.endswith(".attn.bias"))
missing_keys = set(model.state_dict().keys()) - set(state_dict.keys())
missing_keys = set(k for k in missing_keys if not k.endswith(".attn.bias"))
if len(extra_keys) != 0:
raise ValueError(f"extra keys found: {extra_keys}")
if len(missing_keys) != 0:
raise ValueError(f"missing keys: {missing_keys}")
model.load_state_dict(state_dict, strict=False)
n_params = model.get_num_params()
val_loss = checkpoint["best_val_loss"].item()
logger.info(f"model loaded: {round(n_params/1e6,1)}M params, {round(val_loss,3)} loss")
model.eval()
model.to(device)
del checkpoint, state_dict
clear_cuda_cache()
return model, config
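# Illustrative sketch of loading all three Bark sub-models with load_model();
# `config` is assumed to be a BarkConfig-like object exposing LOCAL_MODEL_PATHS.
def _load_all_bark_models_sketch(config, device):
    models = {}
    for model_type in ("text", "coarse", "fine"):
        models[model_type], config = load_model(
            ckpt_path=config.LOCAL_MODEL_PATHS[model_type],
            device=device,
            config=config,
            model_type=model_type,
        )
    return models, config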


@ -0,0 +1,233 @@
"""
Much of this code is adapted from Andrej Karpathy's NanoGPT
(https://github.com/karpathy/nanoGPT)
"""
import math
from dataclasses import dataclass
import torch
from coqpit import Coqpit
from torch import nn
from torch.nn import functional as F
class LayerNorm(nn.Module):
"""LayerNorm but with an optional bias. PyTorch doesn't support simply bias=False"""
def __init__(self, ndim, bias):
super().__init__()
self.weight = nn.Parameter(torch.ones(ndim))
self.bias = nn.Parameter(torch.zeros(ndim)) if bias else None
def forward(self, x):
return F.layer_norm(x, self.weight.shape, self.weight, self.bias, 1e-5)
class CausalSelfAttention(nn.Module):
def __init__(self, config):
super().__init__()
assert config.n_embd % config.n_head == 0
# key, query, value projections for all heads, but in a batch
self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
# output projection
self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
# regularization
self.attn_dropout = nn.Dropout(config.dropout)
self.resid_dropout = nn.Dropout(config.dropout)
self.n_head = config.n_head
self.n_embd = config.n_embd
self.dropout = config.dropout
# flash attention make GPU go brrrrr but support is only in PyTorch nightly and still a bit scary
self.flash = hasattr(torch.nn.functional, "scaled_dot_product_attention")
if not self.flash:
# print("WARNING: using slow attention. Flash Attention atm needs PyTorch nightly and dropout=0.0")
# causal mask to ensure that attention is only applied to the left in the input sequence
self.register_buffer(
"bias",
torch.tril(torch.ones(config.block_size, config.block_size)).view(
1, 1, config.block_size, config.block_size
),
)
def forward(self, x, past_kv=None, use_cache=False):
B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)
# calculate query, key, values for all heads in batch and move head forward to be the batch dim
q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
if past_kv is not None:
past_key = past_kv[0]
past_value = past_kv[1]
k = torch.cat((past_key, k), dim=-2)
v = torch.cat((past_value, v), dim=-2)
FULL_T = k.shape[-2]
if use_cache is True:
present = (k, v)
else:
present = None
# causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
if self.flash:
# efficient attention using Flash Attention CUDA kernels
if past_kv is not None:
# When `past_kv` is provided, we're doing incremental decoding and `q.shape[2] == 1`: q only contains
# the query for the last token. scaled_dot_product_attention interprets this as the first token in the
# sequence, so if is_causal=True it will mask out all attention from it. This is not what we want, so
# to work around this we set is_causal=False.
is_causal = False
else:
is_causal = True
# efficient attention using Flash Attention CUDA kernels
y = torch.nn.functional.scaled_dot_product_attention(q, k, v, dropout_p=self.dropout, is_causal=is_causal)
else:
# manual implementation of attention
att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
att = att.masked_fill(self.bias[:, :, FULL_T - T : FULL_T, :FULL_T] == 0, float("-inf"))
att = F.softmax(att, dim=-1)
att = self.attn_dropout(att)
y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side
# output projection
y = self.resid_dropout(self.c_proj(y))
return (y, present)
class MLP(nn.Module):
def __init__(self, config):
super().__init__()
self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd, bias=config.bias)
self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd, bias=config.bias)
self.dropout = nn.Dropout(config.dropout)
self.gelu = nn.GELU()
def forward(self, x):
x = self.c_fc(x)
x = self.gelu(x)
x = self.c_proj(x)
x = self.dropout(x)
return x
class Block(nn.Module):
def __init__(self, config, layer_idx):
super().__init__()
self.ln_1 = LayerNorm(config.n_embd, bias=config.bias)
self.attn = CausalSelfAttention(config)
self.ln_2 = LayerNorm(config.n_embd, bias=config.bias)
self.mlp = MLP(config)
self.layer_idx = layer_idx
def forward(self, x, past_kv=None, use_cache=False):
attn_output, prev_kvs = self.attn(self.ln_1(x), past_kv=past_kv, use_cache=use_cache)
x = x + attn_output
x = x + self.mlp(self.ln_2(x))
return (x, prev_kvs)
@dataclass
class GPTConfig(Coqpit):
block_size: int = 1024
input_vocab_size: int = 10_048
output_vocab_size: int = 10_048
n_layer: int = 12
n_head: int = 12
n_embd: int = 768
dropout: float = 0.0
bias: bool = True # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster
class GPT(nn.Module):
def __init__(self, config):
super().__init__()
assert config.input_vocab_size is not None
assert config.output_vocab_size is not None
assert config.block_size is not None
self.config = config
self.transformer = nn.ModuleDict(
dict(
wte=nn.Embedding(config.input_vocab_size, config.n_embd),
wpe=nn.Embedding(config.block_size, config.n_embd),
drop=nn.Dropout(config.dropout),
h=nn.ModuleList([Block(config, idx) for idx in range(config.n_layer)]),
ln_f=LayerNorm(config.n_embd, bias=config.bias),
)
)
self.lm_head = nn.Linear(config.n_embd, config.output_vocab_size, bias=False)
def get_num_params(self, non_embedding=True):
"""
Return the number of parameters in the model.
For non-embedding count (default), the position embeddings get subtracted.
The token embeddings would too, except due to the parameter sharing these
params are actually used as weights in the final layer, so we include them.
"""
n_params = sum(p.numel() for p in self.parameters())
if non_embedding:
n_params -= self.transformer.wte.weight.numel()
n_params -= self.transformer.wpe.weight.numel()
return n_params
def forward(self, idx, merge_context=False, past_kv=None, position_ids=None, use_cache=False):
device = idx.device
_, t = idx.size()
if past_kv is not None:
assert t == 1
tok_emb = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd)
else:
if merge_context:
assert idx.shape[1] >= 256 + 256 + 1
t = idx.shape[1] - 256
else:
assert (
t <= self.config.block_size
), f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}"
# forward the GPT model itself
if merge_context:
tok_emb = torch.cat(
[
self.transformer.wte(idx[:, :256]) + self.transformer.wte(idx[:, 256 : 256 + 256]),
self.transformer.wte(idx[:, 256 + 256 :]),
],
dim=1,
)
else:
tok_emb = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd)
if past_kv is None:
past_length = 0
past_kv = tuple([None] * len(self.transformer.h))
else:
past_length = past_kv[0][0].size(-2)
if position_ids is None:
position_ids = torch.arange(past_length, t + past_length, dtype=torch.long, device=device)
position_ids = position_ids.unsqueeze(0) # shape (1, t)
assert position_ids.shape == (1, t)
pos_emb = self.transformer.wpe(position_ids) # position embeddings of shape (1, t, n_embd)
x = self.transformer.drop(tok_emb + pos_emb)
new_kv = () if use_cache else None
for _, (block, past_layer_kv) in enumerate(zip(self.transformer.h, past_kv)):
x, kv = block(x, past_kv=past_layer_kv, use_cache=use_cache)
if use_cache:
new_kv = new_kv + (kv,)
x = self.transformer.ln_f(x)
# inference-time mini-optimization: only forward the lm_head on the very last position
logits = self.lm_head(x[:, [-1], :]) # note: using list [-1] to preserve the time dim
return (logits, new_kv)
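# Minimal greedy-decoding sketch using the KV cache: feed the full prompt once,
# then a single token per step (forward() above asserts t == 1 when a cache is
# passed). `model` is a GPT instance; all names are illustrative.
def _greedy_decode_sketch(model, idx, n_new_tokens):
    past_kv = None
    for _ in range(n_new_tokens):
        inp = idx if past_kv is None else idx[:, -1:]  # full prompt, then 1 token
        logits, past_kv = model(inp, past_kv=past_kv, use_cache=True)
        next_tok = logits[:, -1, :].argmax(dim=-1, keepdim=True)
        idx = torch.cat([idx, next_tok], dim=1)
    return idx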


@ -0,0 +1,142 @@
"""
Much of this code is adapted from Andrej Karpathy's NanoGPT
(https://github.com/karpathy/nanoGPT)
"""
import math
from dataclasses import dataclass
import torch
from torch import nn
from torch.nn import functional as F
from .model import GPT, MLP, GPTConfig
class NonCausalSelfAttention(nn.Module):
def __init__(self, config):
super().__init__()
assert config.n_embd % config.n_head == 0
# key, query, value projections for all heads, but in a batch
self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
# output projection
self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
# regularization
self.attn_dropout = nn.Dropout(config.dropout)
self.resid_dropout = nn.Dropout(config.dropout)
self.n_head = config.n_head
self.n_embd = config.n_embd
self.dropout = config.dropout
# flash attention make GPU go brrrrr but support is only in PyTorch nightly and still a bit scary
self.flash = hasattr(torch.nn.functional, "scaled_dot_product_attention") and self.dropout == 0.0
def forward(self, x):
B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)
# calculate query, key, values for all heads in batch and move head forward to be the batch dim
q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
# causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
if self.flash:
# efficient attention using Flash Attention CUDA kernels
y = torch.nn.functional.scaled_dot_product_attention(
q, k, v, attn_mask=None, dropout_p=self.dropout, is_causal=False
)
else:
# manual implementation of attention
att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
att = F.softmax(att, dim=-1)
att = self.attn_dropout(att)
y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side
# output projection
y = self.resid_dropout(self.c_proj(y))
return y
class FineBlock(nn.Module):
def __init__(self, config):
super().__init__()
self.ln_1 = nn.LayerNorm(config.n_embd)
self.attn = NonCausalSelfAttention(config)
self.ln_2 = nn.LayerNorm(config.n_embd)
self.mlp = MLP(config)
def forward(self, x):
x = x + self.attn(self.ln_1(x))
x = x + self.mlp(self.ln_2(x))
return x
class FineGPT(GPT):
def __init__(self, config):
super().__init__(config)
del self.lm_head
self.config = config
self.n_codes_total = config.n_codes_total
self.transformer = nn.ModuleDict(
dict(
wtes=nn.ModuleList(
[nn.Embedding(config.input_vocab_size, config.n_embd) for _ in range(config.n_codes_total)]
),
wpe=nn.Embedding(config.block_size, config.n_embd),
drop=nn.Dropout(config.dropout),
h=nn.ModuleList([FineBlock(config) for _ in range(config.n_layer)]),
ln_f=nn.LayerNorm(config.n_embd),
)
)
self.lm_heads = nn.ModuleList(
[
nn.Linear(config.n_embd, config.output_vocab_size, bias=False)
for _ in range(config.n_codes_given, self.n_codes_total)
]
)
for i in range(self.n_codes_total - config.n_codes_given):
self.transformer.wtes[i + 1].weight = self.lm_heads[i].weight
def forward(self, pred_idx, idx):
device = idx.device
b, t, codes = idx.size()
assert (
t <= self.config.block_size
), f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}"
assert pred_idx > 0, "cannot predict 0th codebook"
assert codes == self.n_codes_total, (b, t, codes)
pos = torch.arange(0, t, dtype=torch.long, device=device).unsqueeze(0) # shape (1, t)
# forward the GPT model itself
tok_embs = [
wte(idx[:, :, i]).unsqueeze(-1) for i, wte in enumerate(self.transformer.wtes)
] # token embeddings of shape (b, t, n_embd)
tok_emb = torch.cat(tok_embs, dim=-1)
pos_emb = self.transformer.wpe(pos) # position embeddings of shape (1, t, n_embd)
x = tok_emb[:, :, :, : pred_idx + 1].sum(dim=-1)
x = self.transformer.drop(x + pos_emb)
for block in self.transformer.h:
x = block(x)
x = self.transformer.ln_f(x)
logits = self.lm_heads[pred_idx - self.config.n_codes_given](x)
return logits
def get_num_params(self, non_embedding=True):
"""
Return the number of parameters in the model.
For non-embedding count (default), the position embeddings get subtracted.
The token embeddings would too, except due to the parameter sharing these
params are actually used as weights in the final layer, so we include them.
"""
n_params = sum(p.numel() for p in self.parameters())
if non_embedding:
for wte in self.transformer.wtes:
n_params -= wte.weight.numel()
n_params -= self.transformer.wpe.weight.numel()
return n_params
@dataclass
class FineGPTConfig(GPTConfig):
n_codes_total: int = 8
n_codes_given: int = 1
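# Illustrative forward pass (tensor contents are placeholders): predict logits
# for one codebook given all n_codes_total token streams.
def _fine_forward_sketch():
    cfg = FineGPTConfig(n_codes_total=8, n_codes_given=1)
    fine = FineGPT(cfg)
    idx = torch.zeros(1, 1024, cfg.n_codes_total, dtype=torch.long)  # (b, t, codes)
    return fine(pred_idx=2, idx=idx)  # logits: (1, 1024, output_vocab_size)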


@ -165,7 +165,7 @@ class BCELossMasked(nn.Module):
def __init__(self, pos_weight: float = None):
super().__init__()
self.pos_weight = nn.Parameter(torch.tensor([pos_weight]), requires_grad=False)
self.register_buffer("pos_weight", torch.tensor([pos_weight]))
def forward(self, x, target, length):
"""
@ -191,10 +191,15 @@ class BCELossMasked(nn.Module):
mask = sequence_mask(sequence_length=length, max_len=target.size(1))
num_items = mask.sum()
loss = functional.binary_cross_entropy_with_logits(
x.masked_select(mask), target.masked_select(mask), pos_weight=self.pos_weight, reduction="sum"
x.masked_select(mask),
target.masked_select(mask),
pos_weight=self.pos_weight.to(x.device),
reduction="sum",
)
else:
loss = functional.binary_cross_entropy_with_logits(x, target, pos_weight=self.pos_weight, reduction="sum")
loss = functional.binary_cross_entropy_with_logits(
x, target, pos_weight=self.pos_weight.to(x.device), reduction="sum"
)
num_items = torch.numel(x)
loss = loss / num_items
return loss
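# Minimal sketch of why register_buffer (above) beats nn.Parameter here:
# buffers move with .to()/.cuda() and are serialized in state_dict, but they are
# invisible to optimizers, so pos_weight can never be updated by training.
class _BufferedWeightSketch(nn.Module):  # hypothetical module, for illustration
    def __init__(self, pos_weight: float = 5.0):
        super().__init__()
        self.register_buffer("pos_weight", torch.tensor([pos_weight]))

# _BufferedWeightSketch().to("cuda") would move pos_weight to the GPU as well.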


@ -150,7 +150,7 @@ class ConvFlow(nn.Module):
class StochasticDurationPredictor(nn.Module):
"""Stochastic duration predictor with Spline Flows.
It applies Variational Dequantization and Variationsl Data Augmentation.
It applies Variational Dequantization and Variational Data Augmentation.
Paper:
SDP: https://arxiv.org/pdf/2106.06103.pdf

TTS/tts/models/bark.py (new file, 277 lines)

@ -0,0 +1,277 @@
import os
from dataclasses import dataclass
from typing import Optional
import numpy as np
from coqpit import Coqpit
from encodec import EncodecModel
from transformers import BertTokenizer
from TTS.tts.layers.bark.inference_funcs import (
codec_decode,
generate_coarse,
generate_fine,
generate_text_semantic,
generate_voice,
load_voice,
)
from TTS.tts.layers.bark.load_model import load_model
from TTS.tts.layers.bark.model import GPT
from TTS.tts.layers.bark.model_fine import FineGPT
from TTS.tts.models.base_tts import BaseTTS
@dataclass
class BarkAudioConfig(Coqpit):
sample_rate: int = 24000
output_sample_rate: int = 24000
class Bark(BaseTTS):
def __init__(
self,
config: Coqpit,
tokenizer: BertTokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased"),
) -> None:
super().__init__(config=config, ap=None, tokenizer=None, speaker_manager=None, language_manager=None)
self.config.num_chars = len(tokenizer)
self.tokenizer = tokenizer
self.semantic_model = GPT(config.semantic_config)
self.coarse_model = GPT(config.coarse_config)
self.fine_model = FineGPT(config.fine_config)
self.encodec = EncodecModel.encodec_model_24khz()
self.encodec.set_target_bandwidth(6.0)
@property
def device(self):
return next(self.parameters()).device
def load_bark_models(self):
self.semantic_model, self.config = load_model(
ckpt_path=self.config.LOCAL_MODEL_PATHS["text"], device=self.device, config=self.config, model_type="text"
)
self.coarse_model, self.config = load_model(
ckpt_path=self.config.LOCAL_MODEL_PATHS["coarse"],
device=self.device,
config=self.config,
model_type="coarse",
)
self.fine_model, self.config = load_model(
ckpt_path=self.config.LOCAL_MODEL_PATHS["fine"], device=self.device, config=self.config, model_type="fine"
)
def train_step(
self,
):
pass
def text_to_semantic(
self,
text: str,
history_prompt: Optional[str] = None,
temp: float = 0.7,
base=None,
allow_early_stop=True,
**kwargs,
):
"""Generate semantic array from text.
Args:
text: text to be turned into audio
history_prompt: history choice for audio cloning
temp: generation temperature (1.0 more diverse, 0.0 more conservative)
Returns:
numpy semantic array to be fed into `semantic_to_waveform`
"""
x_semantic = generate_text_semantic(
text,
self,
history_prompt=history_prompt,
temp=temp,
base=base,
allow_early_stop=allow_early_stop,
**kwargs,
)
return x_semantic
def semantic_to_waveform(
self,
semantic_tokens: np.ndarray,
history_prompt: Optional[str] = None,
temp: float = 0.7,
base=None,
):
"""Generate audio array from semantic input.
Args:
semantic_tokens: semantic token output from `text_to_semantic`
history_prompt: history choice for audio cloning
temp: generation temperature (1.0 more diverse, 0.0 more conservative)
Returns:
numpy audio array at sample frequency 24khz
"""
x_coarse_gen = generate_coarse(
semantic_tokens,
self,
history_prompt=history_prompt,
temp=temp,
base=base,
)
x_fine_gen = generate_fine(
x_coarse_gen,
self,
history_prompt=history_prompt,
temp=0.5,
base=base,
)
audio_arr = codec_decode(x_fine_gen, self)
return audio_arr, x_coarse_gen, x_fine_gen
def generate_audio(
self,
text: str,
history_prompt: Optional[str] = None,
text_temp: float = 0.7,
waveform_temp: float = 0.7,
base=None,
allow_early_stop=True,
**kwargs,
):
"""Generate audio array from input text.
Args:
text: text to be turned into audio
history_prompt: history choice for audio cloning
text_temp: generation temperature (1.0 more diverse, 0.0 more conservative)
waveform_temp: generation temperature (1.0 more diverse, 0.0 more conservative)
Returns:
numpy audio array at sample frequency 24khz
"""
x_semantic = self.text_to_semantic(
text,
history_prompt=history_prompt,
temp=text_temp,
base=base,
allow_early_stop=allow_early_stop,
**kwargs,
)
audio_arr, c, f = self.semantic_to_waveform(
x_semantic, history_prompt=history_prompt, temp=waveform_temp, base=base
)
return audio_arr, [x_semantic, c, f]
def generate_voice(self, audio, speaker_id, voice_dir):
"""Generate a voice from the given audio and text.
Args:
audio (str): Path to the audio file.
speaker_id (str): Speaker name.
voice_dir (str): Path to the directory to save the generate voice.
"""
if voice_dir is not None:
voice_dirs = [voice_dir]
try:
_ = load_voice(speaker_id, voice_dirs)
except (KeyError, FileNotFoundError):
output_path = os.path.join(voice_dir, speaker_id + ".npz")
os.makedirs(voice_dir, exist_ok=True)
generate_voice(audio, self, output_path)
def _set_voice_dirs(self, voice_dirs):
def_voice_dir = None
if isinstance(self.config.DEF_SPEAKER_DIR, str):
os.makedirs(self.config.DEF_SPEAKER_DIR, exist_ok=True)
if os.path.isdir(self.config.DEF_SPEAKER_DIR):
def_voice_dir = self.config.DEF_SPEAKER_DIR
_voice_dirs = [def_voice_dir] if def_voice_dir is not None else []
if voice_dirs is not None:
if isinstance(voice_dirs, str):
voice_dirs = [voice_dirs]
_voice_dirs = voice_dirs + _voice_dirs
return _voice_dirs
# TODO: remove config from synthesize
def synthesize(
self, text, config, speaker_id="random", voice_dirs=None, **kwargs
): # pylint: disable=unused-argument
"""Synthesize speech with the given input text.
Args:
text (str): Input text.
config (BarkConfig): Config with inference parameters.
speaker_id (str): One of the available speaker names. If `random`, it generates a random speaker.
speaker_wav (str): Path to the speaker audio file for cloning a new voice. It is cloned and saved in
`voice_dirs` with the name `speaker_id`. Defaults to None.
voice_dirs (List[str]): List of paths that host reference audio files for speakers. Defaults to None.
**kwargs: Inference settings. See `inference()`.
Returns:
A dictionary of the output values with `wav` as the output waveform and `text_inputs` as the input text.
"""
voice_dirs = self._set_voice_dirs(voice_dirs)
history_prompt = load_voice(self, speaker_id, voice_dirs)
outputs = self.generate_audio(text, history_prompt=history_prompt, **kwargs)
return_dict = {
"wav": outputs[0],
"text_inputs": text,
}
return return_dict
def eval_step(self):
...
def forward(self):
...
def inference(self):
...
@staticmethod
def init_from_config(config: "BarkConfig", **kwargs): # pylint: disable=unused-argument
return Bark(config)
# pylint: disable=unused-argument, redefined-builtin
def load_checkpoint(
self,
config,
checkpoint_dir,
text_model_path=None,
coarse_model_path=None,
fine_model_path=None,
eval=False,
strict=True,
**kwargs,
):
"""Load a model checkpoints from a directory. This model is with multiple checkpoint files and it
expects to have all the files to be under the given `checkpoint_dir` with the rigth names.
If eval is True, set the model to eval mode.
Args:
config (BarkConfig): The model config.
checkpoint_dir (str): The directory where the checkpoints are stored.
text_model_path (str, optional): The path to the text (semantic) model checkpoint. Defaults to None.
coarse_model_path (str, optional): The path to the coarse model checkpoint. Defaults to None.
fine_model_path (str, optional): The path to the fine model checkpoint. Defaults to None.
eval (bool, optional): Whether to set the model to eval mode. Defaults to False.
strict (bool, optional): Whether to load the model strictly. Defaults to True.
"""
text_model_path = text_model_path or os.path.join(checkpoint_dir, "text_2.pt")
coarse_model_path = coarse_model_path or os.path.join(checkpoint_dir, "coarse_2.pt")
fine_model_path = fine_model_path or os.path.join(checkpoint_dir, "fine_2.pt")
self.config.LOCAL_MODEL_PATHS["text"] = text_model_path
self.config.LOCAL_MODEL_PATHS["coarse"] = coarse_model_path
self.config.LOCAL_MODEL_PATHS["fine"] = fine_model_path
self.load_bark_models()
if eval:
self.eval()


@ -1,5 +1,6 @@
import os
import random
import re
from contextlib import contextmanager
from dataclasses import dataclass
from time import time
@ -255,7 +256,7 @@ class TortoiseArgs(Coqpit):
"""
autoregressive_batch_size: int = 1
enable_redaction: bool = True
enable_redaction: bool = False
high_vram: bool = False
kv_cache: bool = True
ar_checkpoint: str = None
@ -871,7 +872,16 @@ class Tortoise(BaseTTS):
vocoder_checkpoint_path = vocoder_checkpoint_path or os.path.join(checkpoint_dir, "vocoder.pth")
if os.path.exists(ar_path):
self.autoregressive.load_state_dict(torch.load(ar_path), strict=strict)
keys_to_ignore = self.autoregressive.gpt._keys_to_ignore_on_load_missing # pylint: disable=protected-access
# remove keys from the checkpoint that are not in the model
checkpoint = torch.load(ar_path, map_location=torch.device("cpu"))
for key in list(checkpoint.keys()):
for pat in keys_to_ignore:
if re.search(pat, key) is not None:
del checkpoint[key]
break
self.autoregressive.load_state_dict(checkpoint, strict=strict)
if os.path.exists(diff_path):
self.diffusion.load_state_dict(torch.load(diff_path), strict=strict)


@ -25,11 +25,12 @@ from TTS.tts.layers.vits.discriminator import VitsDiscriminator
from TTS.tts.layers.vits.networks import PosteriorEncoder, ResidualCouplingBlocks, TextEncoder
from TTS.tts.layers.vits.stochastic_duration_predictor import StochasticDurationPredictor
from TTS.tts.models.base_tts import BaseTTS
from TTS.tts.utils.fairseq import rehash_fairseq_vits_checkpoint
from TTS.tts.utils.helpers import generate_path, maximum_path, rand_segments, segment, sequence_mask
from TTS.tts.utils.languages import LanguageManager
from TTS.tts.utils.speakers import SpeakerManager
from TTS.tts.utils.synthesis import synthesis
from TTS.tts.utils.text.characters import BaseCharacters, _characters, _pad, _phonemes, _punctuations
from TTS.tts.utils.text.characters import BaseCharacters, BaseVocabulary, _characters, _pad, _phonemes, _punctuations
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.tts.utils.visual import plot_alignment
from TTS.utils.io import load_fsspec
@ -1769,6 +1770,50 @@ class Vits(BaseTTS):
self.eval()
assert not self.training
def load_fairseq_checkpoint(
self, config, checkpoint_dir, eval=False
): # pylint: disable=unused-argument, redefined-builtin
"""Load VITS checkpoints released by fairseq here: https://github.com/facebookresearch/fairseq/tree/main/examples/mms
Performs some changes for compatibility.
Args:
config (Coqpit): 🐸TTS model config.
checkpoint_dir (str): Path to the checkpoint directory.
eval (bool, optional): Set to True for evaluation. Defaults to False.
"""
import json
from TTS.tts.utils.text.cleaners import basic_cleaners
self.disc = None
# set paths
config_file = os.path.join(checkpoint_dir, "config.json")
checkpoint_file = os.path.join(checkpoint_dir, "G_100000.pth")
vocab_file = os.path.join(checkpoint_dir, "vocab.txt")
# set config params
with open(config_file, "r", encoding="utf-8") as file:
# Load the JSON data as a dictionary
config_org = json.load(file)
self.config.audio.sample_rate = config_org["data"]["sampling_rate"]
# self.config.add_blank = config['add_blank']
# set tokenizer
vocab = FairseqVocab(vocab_file)
self.text_encoder.emb = nn.Embedding(vocab.num_chars, config.model_args.hidden_channels)
self.tokenizer = TTSTokenizer(
use_phonemes=False,
text_cleaner=basic_cleaners,
characters=vocab,
phonemizer=None,
add_blank=config_org["data"]["add_blank"],
use_eos_bos=False,
)
# load fairseq checkpoint
new_chk = rehash_fairseq_vits_checkpoint(checkpoint_file)
self.load_state_dict(new_chk)
if eval:
self.eval()
assert not self.training
@staticmethod
def init_from_config(config: "VitsConfig", samples: Union[List[List], List[Dict]] = None, verbose=True):
"""Initiate model from config
@ -1965,3 +2010,24 @@ class VitsCharacters(BaseCharacters):
is_unique=False,
is_sorted=True,
)
class FairseqVocab(BaseVocabulary):
def __init__(self, vocab: str):
super().__init__()
self.vocab = vocab
@property
def vocab(self):
"""Return the vocabulary dictionary."""
return self._vocab
@vocab.setter
def vocab(self, vocab_file):
with open(vocab_file, encoding="utf-8") as f:
self._vocab = [x.replace("\n", "") for x in f.readlines()]
self.blank = self._vocab[0]
print(self._vocab)
self.pad = " "
self._char_to_id = {s: i for i, s in enumerate(self._vocab)} # pylint: disable=unnecessary-comprehension
self._id_to_char = {i: s for i, s in enumerate(self._vocab)} # pylint: disable=unnecessary-comprehension

TTS/tts/utils/fairseq.py (new file, 48 lines)

@ -0,0 +1,48 @@
import torch
def rehash_fairseq_vits_checkpoint(checkpoint_file):
chk = torch.load(checkpoint_file, map_location=torch.device("cpu"))["model"]
new_chk = {}
for k, v in chk.items():
if "enc_p." in k:
new_chk[k.replace("enc_p.", "text_encoder.")] = v
elif "dec." in k:
new_chk[k.replace("dec.", "waveform_decoder.")] = v
elif "enc_q." in k:
new_chk[k.replace("enc_q.", "posterior_encoder.")] = v
elif "flow.flows.2." in k:
new_chk[k.replace("flow.flows.2.", "flow.flows.1.")] = v
elif "flow.flows.4." in k:
new_chk[k.replace("flow.flows.4.", "flow.flows.2.")] = v
elif "flow.flows.6." in k:
new_chk[k.replace("flow.flows.6.", "flow.flows.3.")] = v
elif "dp.flows.0.m" in k:
new_chk[k.replace("dp.flows.0.m", "duration_predictor.flows.0.translation")] = v
elif "dp.flows.0.logs" in k:
new_chk[k.replace("dp.flows.0.logs", "duration_predictor.flows.0.log_scale")] = v
elif "dp.flows.1" in k:
new_chk[k.replace("dp.flows.1", "duration_predictor.flows.1")] = v
elif "dp.flows.3" in k:
new_chk[k.replace("dp.flows.3", "duration_predictor.flows.2")] = v
elif "dp.flows.5" in k:
new_chk[k.replace("dp.flows.5", "duration_predictor.flows.3")] = v
elif "dp.flows.7" in k:
new_chk[k.replace("dp.flows.7", "duration_predictor.flows.4")] = v
elif "dp.post_flows.0.m" in k:
new_chk[k.replace("dp.post_flows.0.m", "duration_predictor.post_flows.0.translation")] = v
elif "dp.post_flows.0.logs" in k:
new_chk[k.replace("dp.post_flows.0.logs", "duration_predictor.post_flows.0.log_scale")] = v
elif "dp.post_flows.1" in k:
new_chk[k.replace("dp.post_flows.1", "duration_predictor.post_flows.1")] = v
elif "dp.post_flows.3" in k:
new_chk[k.replace("dp.post_flows.3", "duration_predictor.post_flows.2")] = v
elif "dp.post_flows.5" in k:
new_chk[k.replace("dp.post_flows.5", "duration_predictor.post_flows.3")] = v
elif "dp.post_flows.7" in k:
new_chk[k.replace("dp.post_flows.7", "duration_predictor.post_flows.4")] = v
elif "dp." in k:
new_chk[k.replace("dp.", "duration_predictor.")] = v
else:
new_chk[k] = v
return new_chk
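# Illustrative usage (file and variable names are assumptions): rehash a fairseq
# MMS checkpoint into 🐸TTS naming, then load it into a Vits instance.
def _load_fairseq_into_vits_sketch(vits_model, checkpoint_file="G_100000.pth"):
    new_state_dict = rehash_fairseq_vits_checkpoint(checkpoint_file)
    vits_model.load_state_dict(new_state_dict)
    return vits_model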


@ -207,7 +207,7 @@ def maximum_path_numpy(value, mask, max_neg_val=None):
device = value.device
dtype = value.dtype
value = value.cpu().detach().numpy()
mask = mask.cpu().detach().numpy().astype(np.bool)
mask = mask.cpu().detach().numpy().astype(bool)
b, t_x, t_y = value.shape
direction = np.zeros(value.shape, dtype=np.int64)


@ -63,6 +63,18 @@ class BaseVocabulary:
the vocabulary."""
return self.char_to_id(self.blank) if self.blank else len(self.vocab)
@property
def bos_id(self) -> int:
"""Return the index of the bos character. If the bos character is not specified, return the length of the
vocabulary."""
return self.char_to_id(self.bos) if self.bos else len(self.vocab)
@property
def eos_id(self) -> int:
"""Return the index of the eos character. If the eos character is not specified, return the length of the
vocabulary."""
return self.char_to_id(self.eos) if self.eos else len(self.vocab)
@property
def vocab(self):
"""Return the vocabulary dictionary."""
@ -71,11 +83,13 @@ class BaseVocabulary:
@vocab.setter
def vocab(self, vocab):
"""Set the vocabulary dictionary and character mapping dictionaries."""
self._vocab = vocab
self._char_to_id = {char: idx for idx, char in enumerate(self._vocab)}
self._id_to_char = {
idx: char for idx, char in enumerate(self._vocab) # pylint: disable=unnecessary-comprehension
}
self._vocab, self._char_to_id, self._id_to_char = None, None, None
if vocab is not None:
self._vocab = vocab
self._char_to_id = {char: idx for idx, char in enumerate(self._vocab)}
self._id_to_char = {
idx: char for idx, char in enumerate(self._vocab) # pylint: disable=unnecessary-comprehension
}
@staticmethod
def init_from_config(config, **kwargs):
@ -93,6 +107,17 @@ class BaseVocabulary:
)
return BaseVocabulary(**kwargs), config
def to_config(self) -> "CharactersConfig":
return CharactersConfig(
vocab_dict=self._vocab,
pad=self.pad,
eos=self.eos,
bos=self.bos,
blank=self.blank,
is_unique=False,
is_sorted=False,
)
@property
def num_chars(self):
"""Return number of tokens in the vocabulary."""
@ -174,6 +199,14 @@ class BaseCharacters:
def blank_id(self) -> int:
return self.char_to_id(self.blank) if self.blank else len(self.vocab)
@property
def eos_id(self) -> int:
return self.char_to_id(self.eos) if self.eos else len(self.vocab)
@property
def bos_id(self) -> int:
return self.char_to_id(self.bos) if self.bos else len(self.vocab)
@property
def characters(self):
return self._characters


@ -108,11 +108,12 @@ class TTSTokenizer:
text = self.text_cleaner(text)
if self.use_phonemes:
text = self.phonemizer.phonemize(text, separator="", language=language)
text = self.encode(text)
if self.add_blank:
text = self.intersperse_blank_char(text, True)
if self.use_eos_bos:
text = self.pad_with_bos_eos(text)
return self.encode(text)
return text
def ids_to_text(self, id_sequence: List[int]) -> str:
"""Converts a sequence of token IDs to a string of text."""
@ -120,14 +121,14 @@ class TTSTokenizer:
def pad_with_bos_eos(self, char_sequence: List[str]):
"""Pads a sequence with the special BOS and EOS characters."""
return [self.characters.bos] + list(char_sequence) + [self.characters.eos]
return [self.characters.bos_id] + list(char_sequence) + [self.characters.eos_id]
def intersperse_blank_char(self, char_sequence: List[str], use_blank_char: bool = False):
"""Intersperses the blank character between characters in a sequence.
Use the ```blank``` character if defined else use the ```pad``` character.
"""
char_to_use = self.characters.blank if use_blank_char else self.characters.pad
char_to_use = self.characters.blank_id if use_blank_char else self.characters.pad
result = [char_to_use] * (len(char_sequence) * 2 + 1)
result[1::2] = char_sequence
return result
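# Worked example (hypothetical IDs): with blank_id == 4, the encoded sequence
# [7, 9] becomes [4, 7, 4, 9, 4]: length 2 * len(sequence) + 1, with the blank
# ID before, between, and after every token ID.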


@ -540,7 +540,10 @@ class AudioProcessor(object):
def _griffin_lim(self, S):
angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
S_complex = np.abs(S).astype(np.complex)
try:
S_complex = np.abs(S).astype(np.complex)
except AttributeError: # np.complex is deprecated since numpy 1.20.0
S_complex = np.abs(S).astype(complex)
y = self._istft(S_complex * angles)
if not np.isfinite(y).all():
print(" [!] Waveform is not finite everywhere. Skipping the GL.")


@ -1,5 +1,6 @@
import json
import os
import tarfile
import zipfile
from pathlib import Path
from shutil import copyfile, rmtree
@ -245,6 +246,55 @@ class ModelManager(object):
else:
print(" > Model's license - No license information available")
def _download_github_model(self, model_item: Dict, output_path: str):
if isinstance(model_item["github_rls_url"], list):
self._download_model_files(model_item["github_rls_url"], output_path, self.progress_bar)
else:
self._download_zip_file(model_item["github_rls_url"], output_path, self.progress_bar)
def _download_hf_model(self, model_item: Dict, output_path: str):
if isinstance(model_item["hf_url"], list):
self._download_model_files(model_item["hf_url"], output_path, self.progress_bar)
else:
self._download_zip_file(model_item["hf_url"], output_path, self.progress_bar)
def download_fairseq_model(self, model_name, output_path):
URI_PREFIX = "https://coqui.gateway.scarf.sh/fairseq/"
_, lang, _, _ = model_name.split("/")
model_download_uri = os.path.join(URI_PREFIX, f"{lang}.tar.gz")
self._download_tar_file(model_download_uri, output_path, self.progress_bar)
@staticmethod
def set_model_url(model_item: Dict):
model_item["model_url"] = None
if "github_rls_url" in model_item:
model_item["model_url"] = model_item["github_rls_url"]
elif "hf_url" in model_item:
model_item["model_url"] = model_item["hf_url"]
elif "fairseq" in model_item["model_name"]:
model_item["model_url"] = "https://coqui.gateway.scarf.sh/fairseq/"
return model_item
def _set_model_item(self, model_name):
# fetch model info from the dict
model_type, lang, dataset, model = model_name.split("/")
model_full_name = f"{model_type}--{lang}--{dataset}--{model}"
if "fairseq" in model_name:
model_item = {
"model_type": "tts_models",
"license": "CC BY-NC 4.0",
"default_vocoder": None,
"author": "fairseq",
"description": "this model is released by Meta under Fairseq repo. Visit https://github.com/facebookresearch/fairseq/tree/main/examples/mms for more info.",
}
model_item["model_name"] = model_name
else:
# get model from models.json
model_item = self.models_dict[model_type][lang][dataset][model]
model_item["model_type"] = model_type
model_item = self.set_model_url(model_item)
return model_item, model_full_name, model
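# Name-format sketch (matching the split above): every model name has four
# slash-separated parts, e.g. for a fairseq model:
#
#     "tts_models/deu/fairseq/vits".split("/")
#     # -> ["tts_models", "deu", "fairseq", "vits"]  (type, lang, dataset, model)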
def download_model(self, model_name):
"""Download model files given the full model name.
Model name is in the format
@ -259,11 +309,7 @@ class ModelManager(object):
Args:
model_name (str): model name as explained above.
"""
# fetch model info from the dict
model_type, lang, dataset, model = model_name.split("/")
model_full_name = f"{model_type}--{lang}--{dataset}--{model}"
model_item = self.models_dict[model_type][lang][dataset][model]
model_item["model_type"] = model_type
model_item, model_full_name, model = self._set_model_item(model_name)
# set the model specific output path
output_path = os.path.join(self.output_prefix, model_full_name)
if os.path.exists(output_path):
@ -271,16 +317,20 @@ class ModelManager(object):
else:
os.makedirs(output_path, exist_ok=True)
print(f" > Downloading model to {output_path}")
# download from github release
if isinstance(model_item["github_rls_url"], list):
self._download_model_files(model_item["github_rls_url"], output_path, self.progress_bar)
else:
self._download_zip_file(model_item["github_rls_url"], output_path, self.progress_bar)
if "fairseq" in model_name:
self.download_fairseq_model(model_name, output_path)
elif "github_rls_url" in model_item:
self._download_github_model(model_item, output_path)
elif "hf_url" in model_item:
self._download_hf_model(model_item, output_path)
self.print_model_license(model_item=model_item)
# find downloaded files
output_model_path = output_path
output_config_path = None
if model != "tortoise-v2":
if (
model not in ["tortoise-v2", "bark"] and "fairseq" not in model_name
): # TODO:This is stupid but don't care for now.
output_model_path, output_config_path = self._find_files(output_path)
# update paths in the config.json
self._update_paths(output_path, output_config_path)
@ -421,6 +471,39 @@ class ModelManager(object):
# remove the extracted folder
rmtree(os.path.join(output_folder, z.namelist()[0]))
@staticmethod
def _download_tar_file(file_url, output_folder, progress_bar):
"""Download the github releases"""
# download the file
r = requests.get(file_url, stream=True)
# extract the file
try:
total_size_in_bytes = int(r.headers.get("content-length", 0))
block_size = 1024 # 1 Kibibyte
if progress_bar:
progress_bar = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True)
temp_tar_name = os.path.join(output_folder, file_url.split("/")[-1])
with open(temp_tar_name, "wb") as file:
for data in r.iter_content(block_size):
if progress_bar:
progress_bar.update(len(data))
file.write(data)
with tarfile.open(temp_tar_name) as t:
t.extractall(output_folder)
tar_names = t.getnames()
os.remove(temp_tar_name) # delete tar after extract
except tarfile.ReadError:
print(f" > Error: Bad tar file - {file_url}")
raise tarfile.ReadError # pylint: disable=raise-missing-from
# move the files to the outer path
for file_path in os.listdir(os.path.join(output_folder, tar_names[0])):
src_path = os.path.join(output_folder, tar_names[0], file_path)
dst_path = os.path.join(output_folder, os.path.basename(file_path))
if src_path != dst_path:
copyfile(src_path, dst_path)
# remove the extracted folder
rmtree(os.path.join(output_folder, tar_names[0]))
@staticmethod
def _download_model_files(file_urls, output_folder, progress_bar):
"""Download the github releases"""


@ -7,7 +7,9 @@ import pysbd
import torch
from TTS.config import load_config
from TTS.tts.configs.vits_config import VitsConfig
from TTS.tts.models import setup_model as setup_tts_model
from TTS.tts.models.vits import Vits
# pylint: disable=unused-wildcard-import
# pylint: disable=wildcard-import
@ -98,8 +100,12 @@ class Synthesizer(object):
self.output_sample_rate = self.vc_config.audio["output_sample_rate"]
if model_dir:
self._load_tts_from_dir(model_dir, use_cuda)
self.output_sample_rate = self.tts_config.audio["output_sample_rate"]
if "fairseq" in model_dir:
self._load_fairseq_from_dir(model_dir, use_cuda)
self.output_sample_rate = self.tts_config.audio["sample_rate"]
else:
self._load_tts_from_dir(model_dir, use_cuda)
self.output_sample_rate = self.tts_config.audio["output_sample_rate"]
@staticmethod
def _get_segmenter(lang: str):
@ -133,12 +139,23 @@ class Synthesizer(object):
if use_cuda:
self.vc_model.cuda()
def _load_fairseq_from_dir(self, model_dir: str, use_cuda: bool) -> None:
"""Load the fairseq model from a directory.
We assume it is VITS, that the model knows how to load itself from the directory, and that there is a config.json file in the directory.
"""
self.tts_config = VitsConfig()
self.tts_model = Vits.init_from_config(self.tts_config)
self.tts_model.load_fairseq_checkpoint(self.tts_config, checkpoint_dir=model_dir, eval=True)
self.tts_config = self.tts_model.config
if use_cuda:
self.tts_model.cuda()
def _load_tts_from_dir(self, model_dir: str, use_cuda: bool) -> None:
"""Load the TTS model from a directory.
We assume the model knows how to load itself from the directory and there is a config.json file in the directory.
"""
config = load_config(os.path.join(model_dir, "config.json"))
self.tts_config = config
self.tts_model = setup_tts_model(config)
@ -260,13 +277,13 @@ class Synthesizer(object):
Args:
text (str): input text.
speaker_name (str, optional): spekaer id for multi-speaker models. Defaults to "".
speaker_name (str, optional): speaker id for multi-speaker models. Defaults to "".
language_name (str, optional): language id for multi-language models. Defaults to "".
speaker_wav (Union[str, List[str]], optional): path to the speaker wav for voice cloning. Defaults to None.
style_wav ([type], optional): style waveform for GST. Defaults to None.
style_text ([type], optional): transcription of style_wav for Capacitron. Defaults to None.
reference_wav ([type], optional): reference waveform for voice conversion. Defaults to None.
reference_speaker_name ([type], optional): spekaer id of reference waveform. Defaults to None.
reference_speaker_name ([type], optional): speaker id of reference waveform. Defaults to None.
Returns:
List[int]: [description]
"""
@ -355,7 +372,7 @@ class Synthesizer(object):
use_gl = self.vocoder_model is None
if not reference_wav:
if not reference_wav: # not voice conversion
for sen in sens:
if hasattr(self.tts_model, "synthesize"):
sp_name = "random" if speaker_name is None else speaker_name
@ -363,7 +380,7 @@ class Synthesizer(object):
text=sen,
config=self.tts_config,
speaker_id=sp_name,
extra_voice_dirs=self.voice_dir,
voice_dirs=self.voice_dir,
**kwargs,
)
else:


@ -794,8 +794,8 @@ class FreeVCConfig(BaseVCConfig):
model: str = "freevc"
# model specific params
model_args: FreeVCArgs = FreeVCArgs()
audio: FreeVCAudioConfig = FreeVCAudioConfig()
model_args: FreeVCArgs = field(default_factory=FreeVCArgs)
audio: FreeVCAudioConfig = field(default_factory=FreeVCAudioConfig)
# optimizer
# TODO with training support


@ -1,6 +1,6 @@
furo
myst-parser == 0.15.1
sphinx == 4.0.2
myst-parser == 2.0.0
sphinx == 7.0.1
sphinx_inline_tabs
sphinx_copybutton
linkify-it-py


@ -76,7 +76,7 @@ myst_enable_extensions = ['linkify',]
# duplicated section names that are in different documents.
autosectionlabel_prefix_document = True
language = None
language = 'en'
autodoc_inherit_docstrings = False


@ -52,6 +52,7 @@
models/tacotron1-2.md
models/overflow.md
models/tortoise.md
models/bark.md
.. toctree::
:maxdepth: 2


@ -128,7 +128,7 @@ wav = tts.tts("This is a test! This is also a test!!", speaker=tts.speakers[0],
tts.tts_to_file(text="Hello world!", speaker=tts.speakers[0], language=tts.languages[0], file_path="output.wav")
```
Here is an example for a single speaker model.
#### Here is an example for a single speaker model.
```python
# Init TTS with the target model name
@ -137,7 +137,7 @@ tts = TTS(model_name="tts_models/de/thorsten/tacotron2-DDC", progress_bar=False,
tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path=OUTPUT_PATH)
```
Example voice cloning with YourTTS in English, French and Portuguese:
#### Example voice cloning with YourTTS in English, French and Portuguese:
```python
tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False, gpu=True)
@ -146,15 +146,16 @@ tts.tts_to_file("C'est le clonage de la voix.", speaker_wav="my/cloning/audio.wa
tts.tts_to_file("Isso é clonagem de voz.", speaker_wav="my/cloning/audio.wav", language="pt", file_path="output.wav")
```
Example voice conversion converting speaker of the `source_wav` to the speaker of the `target_wav`
#### Example voice conversion converting speaker of the `source_wav` to the speaker of the `target_wav`
```python
tts = TTS(model_name="voice_conversion_models/multilingual/vctk/freevc24", progress_bar=False, gpu=True)
tts.voice_conversion_to_file(source_wav="my/source.wav", target_wav="my/target.wav", file_path="output.wav")
```
Example voice cloning by a single speaker TTS model combining with the voice conversion model. This way, you can
clone voices by using any model in 🐸TTS.
#### Example voice cloning with a single-speaker TTS model combined with the voice conversion model.
This way, you can clone voices by using any model in 🐸TTS.
```python
tts = TTS("tts_models/de/thorsten/tacotron2-DDC")
@ -163,8 +164,11 @@ tts.tts_with_vc_to_file(
speaker_wav="target/speaker.wav",
file_path="ouptut.wav"
)
```
Example text to speech using [🐸Coqui Studio](https://coqui.ai) models. You can use all of your available speakers in the studio.
#### Example text to speech using [🐸Coqui Studio](https://coqui.ai) models.
You can use all of your available speakers in the studio.
[🐸Coqui Studio](https://coqui.ai) API token is required. You can get it from the [account page](https://coqui.ai/account).
You should set the `COQUI_STUDIO_TOKEN` environment variable to use the API token.
@ -193,4 +197,23 @@ api.emotions
api.list_speakers()
api.list_voices()
wav, sample_rate = api.tts(text="This is a test.", speaker=api.speakers[0].name, emotion="Happy", speed=1.5)
```
#### Example text to speech using **Fairseq models in ~1100 languages** 🤯.
For these models use the following name format: `tts_models/<lang-iso_code>/fairseq/vits`.
You can find the list of language ISO codes [here](https://dl.fbaipublicfiles.com/mms/tts/all-tts-languages.html) and learn about the Fairseq models [here](https://github.com/facebookresearch/fairseq/tree/main/examples/mms).
```python
from TTS.api import TTS
api = TTS(model_name="tts_models/eng/fairseq/vits", gpu=True)
api.tts_to_file("This is a test.", file_path="output.wav")
# TTS with on the fly voice conversion
api = TTS("tts_models/deu/fairseq/vits")
api.tts_with_vc_to_file(
"Wie sage ich auf Italienisch, dass ich dich liebe?",
speaker_wav="target/speaker.wav",
file_path="ouptut.wav"
)
```


@ -0,0 +1,43 @@
# Mary-TTS API Support for Coqui-TTS
## What is Mary-TTS?
[Mary (Modular Architecture for Research in sYnthesis) Text-to-Speech](http://mary.dfki.de/) is an open-source (GNU LGPL license), multilingual Text-to-Speech Synthesis platform written in Java. It was originally developed as a collaborative project of [DFKI's](http://www.dfki.de/web) Language Technology Lab and the [Institute of Phonetics](http://www.coli.uni-saarland.de/groups/WB/Phonetics/) at Saarland University, Germany. It is now maintained by the Multimodal Speech Processing Group in the [Cluster of Excellence MMCI](https://www.mmci.uni-saarland.de/) and DFKI.
MaryTTS has been around for a very long time. Version 3.0 dates back to 2006, long before deep learning was a broadly known term, and the last official release was version 5.2 in 2016.
You can check out this OpenVoice-Tech page to learn more: https://openvoice-tech.net/index.php/MaryTTS
## Why Mary-TTS compatibility is relevant
Due to its open-source nature, relatively high-quality voices, and fast synthesis speed, Mary-TTS was a popular choice in the past, and many tools implemented API support for it over the years: screen readers (NVDA + SpeechHub), smart-home hubs (openHAB, Home Assistant), and voice assistants (Rhasspy, Mycroft, SEPIA). A compatibility layer for Coqui-TTS ensures that these tools can use Coqui as a drop-in replacement and get even better voices right away.
## API and code examples
Like Coqui-TTS, Mary-TTS can run as an HTTP server to allow access to the API via HTTP GET and POST calls. The best documentation of this API is probably the [web page](https://github.com/marytts/marytts/tree/master/marytts-runtime/src/main/resources/marytts/server/http) available via your self-hosted Mary-TTS server and the [Java docs page](http://mary.dfki.de/javadoc/marytts/server/http/MaryHttpServer.html).
Mary-TTS offers a large number of endpoints for loading styles, audio effects, examples, etc., but compatible tools often only require three of them to work:
- `/locales` (GET) - Returns a list of supported locales in the format `[locale]\n...`, for example "en_US" or "de_DE" or simply "en" etc.
- `/voices` (GET) - Returns a list of supported voices in the format `[name] [locale] [gender]\n...`, 'name' can be anything without spaces(!) and 'gender' is traditionally `f` or `m`
- `/process?INPUT_TEXT=[my text]&INPUT_TYPE=TEXT&LOCALE=[locale]&VOICE=[name]&OUTPUT_TYPE=AUDIO&AUDIO=WAVE_FILE` (GET/POST) - Processes the input text and returns a wav file. INPUT_TYPE, OUTPUT_TYPE and AUDIO support additional values, but are usually static in compatible tools.
If your Coqui-TTS server is running on `localhost` using `port` 59125 (for classic Mary-TTS compatibility), you can use the following curl requests to test the API:
Return locale of active voice, e.g. "en":
```bash
curl http://localhost:59125/locales
```
Return name of active voice, e.g. "glow-tts en u"
```bash
curl http://localhost:59125/voices
```
Create a wav-file with spoken input text:
```bash
curl http://localhost:59125/process?INPUT_TEXT=this+is+a+test > test.wav
```
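The same `/process` request from Python, as a minimal sketch using the `requests` package (host, port, and output file name are assumptions based on the defaults above):
```python
import requests

params = {
    "INPUT_TEXT": "this is a test",
    "INPUT_TYPE": "TEXT",
    "OUTPUT_TYPE": "AUDIO",
    "AUDIO": "WAVE_FILE",
}
response = requests.get("http://localhost:59125/process", params=params)
response.raise_for_status()

with open("test.wav", "wb") as f:
    f.write(response.content)  # WAV bytes, same as the curl example above
```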
You can enter the same URLs in your browser and check out the results there as well.
### How it works and limitations
A classic Mary-TTS server would usually show all installed locales and voices via the corresponding endpoints and accept the parameters `LOCALE` and `VOICE` for processing. For Coqui-TTS we usually start the server with one specific locale and model and thus cannot return all available options. Instead we return the active locale and use the model name as "voice". Since we only have one active model and always want to return a WAV-file, we currently ignore all other processing parameters except `INPUT_TEXT`. Since the gender is not defined for models in Coqui-TTS we always return `u` (undefined).
We think that this is an acceptable compromise, since users are often only interested in one specific voice anyway, but the API might be extended in the future to support multiple languages and voices at the same time.

docs/source/models/bark.md (new file, 103 lines)

@ -0,0 +1,103 @@
# Bark 🐶
Bark is a multi-lingual TTS model created by [Suno-AI](https://www.suno.ai/). It can generate conversational speech as well as music and sound effects.
It is architecturally very similar to Google's [AudioLM](https://arxiv.org/abs/2209.03143). For more information, please refer to the [Suno-AI's repo](https://github.com/suno-ai/bark).
## Acknowledgements
- 👑[Suno-AI](https://www.suno.ai/) for training and open-sourcing this model.
- 👑[serp-ai](https://github.com/serp-ai/bark-with-voice-clone) for controlled voice cloning.
## Example Use
```python
from TTS.tts.configs.bark_config import BarkConfig
from TTS.tts.models.bark import Bark

# Load the model from a local checkpoint directory
config = BarkConfig()
model = Bark.init_from_config(config)
model.load_checkpoint(config, checkpoint_dir="path/to/model/dir/", eval=True)

text = "Hello, my name is Manmay, how are you?"

# Synthesize with a random speaker
output_dict = model.synthesize(text, config, speaker_id="random", voice_dirs=None)

# Clone a speaker. This assumes a speaker file at
# `bark_voices/speaker_n/speaker.wav` or `bark_voices/speaker_n/speaker.npz`.
output_dict = model.synthesize(text, config, speaker_id="ljspeech", voice_dirs="bark_voices/")
```
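To write the result to disk, you can save the returned waveform, e.g. with `soundfile`. This is a hedged sketch: it assumes the dictionary exposes the waveform under a `wav` key and that audio comes out at Bark's native 24 kHz sample rate; check the return value of `synthesize()` in your installed version:
```python
import soundfile as sf

# Assumed: output_dict["wav"] holds the waveform; Bark outputs 24 kHz audio
sf.write("bark_output.wav", output_dict["wav"], 24000)
```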
Using 🐸TTS API:
```python
from TTS.api import TTS
# Load the model to GPU
# Bark is really slow on CPU, so we recommend using GPU.
tts = TTS("tts_models/multilingual/multi-dataset/bark", gpu=True)
# Clone a new speaker.
# This expects to find an mp3 or wav file like `bark_voices/new_speaker/speaker.wav`.
# It computes the cloning values and stores them in `bark_voices/new_speaker/speaker.npz`.
tts.tts_to_file(text="Hello, my name is Manmay, how are you?",
                file_path="output.wav",
                voice_dir="bark_voices/",
                speaker="ljspeech")

# When you run it again, it uses the stored values to generate the same voice.
tts.tts_to_file(text="Hello, my name is Manmay, how are you?",
                file_path="output.wav",
                voice_dir="bark_voices/",
                speaker="ljspeech")

# Generate with a random speaker
tts = TTS("tts_models/multilingual/multi-dataset/bark", gpu=True)
tts.tts_to_file("hello world", file_path="out.wav")
```
Using 🐸TTS Command line:
```console
# cloning the `ljspeech` voice
tts --model_name tts_models/multilingual/multi-dataset/bark \
--text "This is an example." \
--out_path "output.wav" \
--voice_dir bark_voices/ \
--speaker_idx "ljspeech" \
--progress_bar True
# Random voice generation
tts --model_name tts_models/multilingual/multi-dataset/bark \
--text "This is an example." \
--out_path "output.wav" \
--progress_bar True
```
## Important resources & papers
- Original Repo: https://github.com/suno-ai/bark
- Cloning implementation: https://github.com/serp-ai/bark-with-voice-clone
- AudioLM: https://arxiv.org/abs/2209.03143
## BarkConfig
```{eval-rst}
.. autoclass:: TTS.tts.configs.bark_config.BarkConfig
:members:
```
## BarkArgs
```{eval-rst}
.. autoclass:: TTS.tts.models.bark.BarkArgs
:members:
```
## Bark Model
```{eval-rst}
.. autoclass:: TTS.tts.models.bark.Bark
:members:
```

View File

@ -1,7 +1,7 @@
# Tortoise 🐢
Tortoise is a very expressive TTS system with impressive voice cloning capabilities. It is based on a GPT-like autoregressive acoustic model that converts input text to discretized acoustic tokens, a diffusion model that converts these tokens to mel-spectrogram frames, and a UnivNet vocoder to convert the spectrograms to the final audio signal. The important downside is that Tortoise is very slow compared to parallel TTS models like VITS.
Big thanks to 👑[@manmay-nakhashi](https://github.com/manmay-nakhashi) who helped us implement Tortoise in 🐸TTS.
@ -12,7 +12,7 @@ from TTS.tts.configs.tortoise_config import TortoiseConfig
from TTS.tts.models.tortoise import Tortoise
config = TortoiseConfig()
model = Tortoise.inif_from_config(config)
model = Tortoise.init_from_config(config)
model.load_checkpoint(config, checkpoint_dir="paths/to/models_dir/", eval=True)
# with random speaker
@ -29,23 +29,23 @@ from TTS.api import TTS
tts = TTS("tts_models/en/multi-dataset/tortoise-v2")
# cloning `lj` voice from `TTS/tts/utils/assets/tortoise/voices/lj`
# with custom inference settings overriding defaults.
tts.tts_to_file(text="Hello, my name is Manmay, how are you?",
file_path="output.wav",
voice_dir="TTS/tts/utils/assets/tortoise/voices/",
voice_dir="path/to/tortoise/voices/dir/",
speaker="lj",
num_autoregressive_samples=1,
diffusion_iterations=10)
# Using presets with the same voice
tts.tts_to_file(text="Hello, my name is Manmay, how are you?",
file_path="output.wav",
voice_dir="TTS/tts/utils/assets/tortoise/voices/",
voice_dir="path/to/tortoise/voices/dir/",
speaker="lj",
preset="ultra_fast")
# Random voice generation
tts.tts_to_file(text="Hello, my name is Manmay, how are you?",
file_path="output.wav")
```
@ -54,16 +54,16 @@ Using 🐸TTS Command line:
```console
# cloning the `lj` voice
tts --model_name tts_models/en/multi-dataset/tortoise-v2 \
--text "This is an example." \
--out_path "/data/speech_synth/coqui-tts/TTS/tests/outputs/output.wav" \
--voice_dir TTS/tts/utils/assets/tortoise/voices/ \
--text "This is an example." \
--out_path "output.wav" \
--voice_dir path/to/tortoise/voices/dir/ \
--speaker_idx "lj" \
--progress_bar True
# Random voice generation
tts --model_name tts_models/en/multi-dataset/tortoise-v2 \
--text "This is an example." \
--out_path "/data/speech_synth/coqui-tts/TTS/tests/outputs/output.wav" \
--out_path "output.wav" \
--progress_bar True
```

View File

@ -1,5 +1,5 @@
[build-system]
requires = ["setuptools", "wheel", "cython==0.29.28", "numpy==1.21.6", "packaging"]
requires = ["setuptools", "wheel", "cython==0.29.30", "numpy==1.22.0", "packaging"]
[flake8]
max-line-length=120

View File

@ -1,14 +1,14 @@
# core deps
numpy==1.21.6;python_version<"3.10"
numpy;python_version=="3.10"
cython==0.29.28
numpy==1.22.0;python_version<="3.10"
numpy==1.24.3;python_version>"3.10"
cython==0.29.30
scipy>=1.4.0
torch>=1.7
torchaudio
soundfile
librosa==0.10.0.*
numba==0.55.1;python_version<"3.9"
numba==0.56.4;python_version>="3.9"
numba==0.57.0;python_version>="3.9"
inflect==5.6.0
tqdm
anyascii
@ -26,14 +26,14 @@ pandas
# deps for training
matplotlib
# coqui stack
trainer==0.0.20
trainer
# config management
coqpit>=0.0.16
# chinese g2p deps
jieba
pypinyin
# japanese g2p deps
mecab-python3==1.0.5
mecab-python3==1.0.6
unidic-lite==1.0.8
# gruut+supported langs
gruut[de,es,fr]==2.2.3
@ -45,8 +45,9 @@ g2pkk>=0.1.1
bangla==0.0.2
bnnumerizer
bnunicodenormalizer==0.1.1
#deps for tortoise
k_diffusion
einops
transformers
#deps for bark
encodec

View File

@ -1,8 +1,8 @@
[build_py]
build-lib=temp_build
build_lib=temp_build
[bdist_wheel]
bdist-dir=temp_build
bdist_dir=temp_build
[install_lib]
build-dir=temp_build
build_dir=temp_build

View File

@ -32,8 +32,8 @@ from Cython.Build import cythonize
from setuptools import Extension, find_packages, setup
python_version = sys.version.split()[0]
if Version(python_version) < Version("3.7") or Version(python_version) >= Version("3.11"):
raise RuntimeError("TTS requires python >= 3.7 and < 3.11 " "but your Python version is {}".format(sys.version))
if Version(python_version) < Version("3.9") or Version(python_version) >= Version("3.12"):
raise RuntimeError("TTS requires python >= 3.9 and < 3.12 " "but your Python version is {}".format(sys.version))
cwd = os.path.dirname(os.path.abspath(__file__))
@ -114,15 +114,14 @@ setup(
"dev": requirements_dev,
"notebooks": requirements_notebooks,
},
python_requires=">=3.7.0, <3.11",
python_requires=">=3.9.0, <3.12",
entry_points={"console_scripts": ["tts=TTS.bin.synthesize:main", "tts-server = TTS.server.server:main"]},
classifiers=[
"Programming Language :: Python",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.7",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Development Status :: 3 - Alpha",
"Intended Audience :: Science/Research",
"Intended Audience :: Developers",

View File

@ -60,7 +60,7 @@ if is_coqui_available:
self.assertIsNone(tts.languages)
def test_studio_model(self):
tts = TTS(model_name="coqui_studio/en/Torcull Diarmuid/coqui_studio")
tts = TTS(model_name="coqui_studio/en/Zacharie Aimilios/coqui_studio")
tts.tts_to_file(text="This is a test.")
# check speed > 2.0 raises error
@ -83,6 +83,10 @@ if is_coqui_available:
wav = tts.tts(text="This is a test.", speed=2.0, emotion="Sad")
self.assertGreater(len(wav), 0)
def test_fairseq_model(self): # pylint: disable=no-self-use
tts = TTS(model_name="tts_models/eng/fairseq/vits")
tts.tts_to_file(text="This is a test.")
def test_multi_speaker_multi_lingual_model(self):
tts = TTS()
tts.load_tts_model_by_name(tts.models[0]) # YourTTS

View File

@ -1,5 +1,5 @@
import unittest
from dataclasses import dataclass
from dataclasses import dataclass, field
from coqpit import Coqpit
@ -86,11 +86,11 @@ class TestTTSTokenizer(unittest.TestCase):
enable_eos_bos_chars: bool = True
use_phonemes: bool = True
add_blank: bool = False
characters: str = Characters()
characters: str = field(default_factory=Characters)
phonemizer: str = "espeak"
phoneme_language: str = "tr"
text_cleaner: str = "phoneme_cleaners"
characters = Characters()
characters = field(default_factory=Characters)
tokenizer_ph, _ = TTSTokenizer.init_from_config(TokenizerConfig())
tokenizer_ph.phonemizer.backend = "espeak"

View File

@ -16,7 +16,7 @@ from TTS.utils.audio import AudioProcessor
torch.manual_seed(1)
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device = torch.device("cuda" if use_cuda else "cpu")
config_global = TacotronConfig(num_chars=32, num_speakers=5, out_channels=513, decoder_output_dim=80)
@ -288,7 +288,6 @@ class TacotronCapacitronTrainTest(unittest.TestCase):
batch["text_input"].shape[0], batch["stop_targets"].size(1) // config.r, -1
)
batch["stop_targets"] = (batch["stop_targets"].sum(2) > 0.0).unsqueeze(2).float().squeeze()
model = Tacotron(config).to(device)
criterion = model.get_criterion()
optimizer = model.get_optimizer()

View File

@ -15,7 +15,7 @@ def run_models(offset=0, step=1):
print(" > Run synthesizer with all the models.")
output_path = os.path.join(get_tests_output_path(), "output.wav")
manager = ModelManager(output_prefix=get_tests_output_path(), progress_bar=False)
model_names = manager.list_models()
model_names = [name for name in manager.list_models() if "bark" not in name]
for model_name in model_names[offset::step]:
print(f"\n > Run - {model_name}")
model_path, _, _ = manager.download_model(model_name)
@ -79,6 +79,15 @@ def test_models_offset_2_step_3():
run_models(offset=2, step=3)
def test_bark():
"""Bark is too big to run on github actions. We need to test it locally"""
output_path = os.path.join(get_tests_output_path(), "output.wav")
run_cli(
f" tts --model_name tts_models/multilingual/multi-dataset/bark "
f'--text "This is an example." --out_path "{output_path}" --progress_bar False'
)
def test_voice_conversion():
print(" > Run voice conversion inference using YourTTS model.")
model_name = "tts_models/multilingual/multi-dataset/your_tts"