mirror of https://github.com/coqui-ai/TTS.git
Merge branch 'dev' into reuben/docs-studio-refs
commit 3991d83b2c
.github/workflows/api_tests.yml (deleted)
@@ -1,53 +0,0 @@
-name: api_tests
-
-on:
-  push:
-    branches:
-      - main
-jobs:
-  check_skip:
-    runs-on: ubuntu-latest
-    if: "! contains(github.event.head_commit.message, '[ci skip]')"
-    steps:
-      - run: echo "${{ github.event.head_commit.message }}"
-
-  test:
-    runs-on: ubuntu-latest
-    strategy:
-      fail-fast: false
-      matrix:
-        python-version: [3.9, "3.10", "3.11"]
-        experimental: [false]
-    steps:
-      - uses: actions/checkout@v3
-      - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v4
-        with:
-          python-version: ${{ matrix.python-version }}
-          architecture: x64
-          cache: 'pip'
-          cache-dependency-path: 'requirements*'
-      - name: check OS
-        run: cat /etc/os-release
-      - name: set ENV
-        run: |
-          export TRAINER_TELEMETRY=0
-      - name: Install dependencies
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y --no-install-recommends git make gcc
-          sudo apt-get install espeak-ng
-          make system-deps
-      - name: Install/upgrade Python setup deps
-        run: python3 -m pip install --upgrade pip setuptools wheel
-      - name: Replace scarf urls
-        run: |
-          sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json
-      - name: Install TTS
-        run: |
-          python3 -m pip install .[all]
-          python3 setup.py egg_info
-      - name: Unit tests
-        run: make api_tests
-        env:
-          COQUI_STUDIO_TOKEN: ${{ secrets.COQUI_STUDIO_TOKEN }}
.github/workflows/zoo-tests-tortoise.yml (deleted)
@@ -1,52 +0,0 @@
-name: zoo-tests-tortoise
-
-on:
-  push:
-    branches:
-      - main
-  pull_request:
-    types: [opened, synchronize, reopened]
-jobs:
-  check_skip:
-    runs-on: ubuntu-latest
-    if: "! contains(github.event.head_commit.message, '[ci skip]')"
-    steps:
-      - run: echo "${{ github.event.head_commit.message }}"
-
-  test:
-    runs-on: ubuntu-latest
-    strategy:
-      fail-fast: false
-      matrix:
-        python-version: [3.9, "3.10", "3.11"]
-        experimental: [false]
-    steps:
-      - uses: actions/checkout@v3
-      - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v4
-        with:
-          python-version: ${{ matrix.python-version }}
-          architecture: x64
-          cache: 'pip'
-          cache-dependency-path: 'requirements*'
-      - name: check OS
-        run: cat /etc/os-release
-      - name: set ENV
-        run: export TRAINER_TELEMETRY=0
-      - name: Install dependencies
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y git make gcc
-          sudo apt-get install espeak espeak-ng
-          make system-deps
-      - name: Install/upgrade Python setup deps
-        run: python3 -m pip install --upgrade pip setuptools wheel
-      - name: Replace scarf urls
-        run: |
-          sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json
-      - name: Install TTS
-        run: |
-          python3 -m pip install .[all]
-          python3 setup.py egg_info
-      - name: Unit tests
-        run: nose2 -F -v -B --with-coverage --coverage TTS tests.zoo_tests.test_models.test_tortoise
Makefile
@@ -35,9 +35,6 @@ test_zoo: ## run zoo tests.
 inference_tests: ## run inference tests.
 	nose2 -F -v -B --with-coverage --coverage TTS tests.inference_tests

-api_tests: ## run api tests.
-	nose2 -F -v -B --with-coverage --coverage TTS tests.api_tests
-
 data_tests: ## run data tests.
 	nose2 -F -v -B --with-coverage --coverage TTS tests.data_tests
TTS/.models.json
@@ -3,12 +3,13 @@
     "multilingual": {
         "multi-dataset": {
             "xtts_v2": {
-                "description": "XTTS-v2.0.2 by Coqui with 16 languages.",
+                "description": "XTTS-v2.0.3 by Coqui with 17 languages.",
                 "hf_url": [
                     "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/model.pth",
                     "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/config.json",
                     "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/vocab.json",
-                    "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/hash.md5"
+                    "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/hash.md5",
+                    "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/speakers_xtts.pth"
                 ],
                 "model_hash": "10f92b55c512af7a8d39d650547a15a7",
                 "default_vocoder": null,
@@ -45,7 +46,7 @@
                 "hf_url": [
                     "https://coqui.gateway.scarf.sh/hf/bark/coarse_2.pt",
                     "https://coqui.gateway.scarf.sh/hf/bark/fine_2.pt",
-                    "https://app.coqui.ai/tts_model/text_2.pt",
+                    "https://coqui.gateway.scarf.sh/hf/text_2.pt",
                     "https://coqui.gateway.scarf.sh/hf/bark/config.json",
                     "https://coqui.gateway.scarf.sh/hf/bark/hubert.pt",
                     "https://coqui.gateway.scarf.sh/hf/bark/tokenizer.pth"
@@ -270,7 +271,7 @@
         "tortoise-v2": {
             "description": "Tortoise tts model https://github.com/neonbjb/tortoise-tts",
             "github_rls_url": [
-                "https://app.coqui.ai/tts_model/autoregressive.pth",
+                "https://coqui.gateway.scarf.sh/v0.14.1_models/autoregressive.pth",
                 "https://coqui.gateway.scarf.sh/v0.14.1_models/clvp2.pth",
                 "https://coqui.gateway.scarf.sh/v0.14.1_models/cvvp.pth",
                 "https://coqui.gateway.scarf.sh/v0.14.1_models/diffusion_decoder.pth",
TTS/VERSION
@@ -1 +1 @@
-0.21.3
+0.22.0
TTS/api.py
@@ -6,7 +6,6 @@ from typing import Union
 import numpy as np
 from torch import nn

-from TTS.cs_api import CS_API
 from TTS.utils.audio.numpy_transforms import save_wav
 from TTS.utils.manage import ModelManager
 from TTS.utils.synthesizer import Synthesizer
@@ -24,7 +23,6 @@ class TTS(nn.Module):
         vocoder_path: str = None,
         vocoder_config_path: str = None,
         progress_bar: bool = True,
-        cs_api_model: str = "XTTS",
         gpu=False,
     ):
         """🐸TTS python interface that allows to load and use the released models.
@@ -60,9 +58,6 @@
             vocoder_path (str, optional): Path to the vocoder checkpoint. Defaults to None.
             vocoder_config_path (str, optional): Path to the vocoder config. Defaults to None.
             progress_bar (bool, optional): Whether to print a progress bar while downloading a model. Defaults to True.
-            cs_api_model (str, optional): Name of the model to use for the Coqui Studio API. Available models are
-                "XTTS", "V1". You can also use `TTS.cs_api.CS_API` for more control.
-                Defaults to "XTTS".
             gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
         """
         super().__init__()
@@ -70,14 +65,12 @@
         self.config = load_config(config_path) if config_path else None
         self.synthesizer = None
         self.voice_converter = None
-        self.csapi = None
-        self.cs_api_model = cs_api_model
         self.model_name = ""
         if gpu:
             warnings.warn("`gpu` will be deprecated. Please use `tts.to(device)` instead.")

         if model_name is not None and len(model_name) > 0:
-            if "tts_models" in model_name or "coqui_studio" in model_name:
+            if "tts_models" in model_name:
                 self.load_tts_model_by_name(model_name, gpu)
             elif "voice_conversion_models" in model_name:
                 self.load_vc_model_by_name(model_name, gpu)
@@ -99,12 +92,6 @@
             return self.synthesizer.tts_model.speaker_manager.num_speakers > 1
         return False

-    @property
-    def is_coqui_studio(self):
-        if self.model_name is None:
-            return False
-        return "coqui_studio" in self.model_name
-
     @property
     def is_multi_lingual(self):
         # Not sure what sets this to None, but applied a fix to prevent crashing.
@@ -136,14 +123,7 @@
         return Path(__file__).parent / ".models.json"

     def list_models(self):
-        try:
-            csapi = CS_API(model=self.cs_api_model)
-            models = csapi.list_speakers_as_tts_models()
-        except ValueError as e:
-            print(e)
-            models = []
-        manager = ModelManager(models_file=TTS.get_models_file_path(), progress_bar=False, verbose=False)
-        return manager.list_tts_models() + models
+        return ModelManager(models_file=TTS.get_models_file_path(), progress_bar=False, verbose=False)

     def download_model_by_name(self, model_name: str):
         model_path, config_path, model_item = self.manager.download_model(model_name)
@@ -186,30 +166,26 @@
         TODO: Add tests
         """
         self.synthesizer = None
-        self.csapi = None
         self.model_name = model_name

-        if "coqui_studio" in model_name:
-            self.csapi = CS_API()
-        else:
-            model_path, config_path, vocoder_path, vocoder_config_path, model_dir = self.download_model_by_name(
-                model_name
-            )
+        model_path, config_path, vocoder_path, vocoder_config_path, model_dir = self.download_model_by_name(
+            model_name
+        )

         # init synthesizer
         # None values are fetched from the model
         self.synthesizer = Synthesizer(
             tts_checkpoint=model_path,
             tts_config_path=config_path,
             tts_speakers_file=None,
             tts_languages_file=None,
             vocoder_checkpoint=vocoder_path,
             vocoder_config=vocoder_config_path,
             encoder_checkpoint=None,
             encoder_config=None,
             model_dir=model_dir,
             use_cuda=gpu,
         )

     def load_tts_model_by_path(
         self, model_path: str, config_path: str, vocoder_path: str = None, vocoder_config: str = None, gpu: bool = False
@@ -246,77 +222,17 @@ class TTS(nn.Module):
         **kwargs,
     ) -> None:
         """Check if the arguments are valid for the model."""
-        if not self.is_coqui_studio:
-            # check for the coqui tts models
-            if self.is_multi_speaker and (speaker is None and speaker_wav is None):
-                raise ValueError("Model is multi-speaker but no `speaker` is provided.")
-            if self.is_multi_lingual and language is None:
-                raise ValueError("Model is multi-lingual but no `language` is provided.")
-            if not self.is_multi_speaker and speaker is not None and "voice_dir" not in kwargs:
-                raise ValueError("Model is not multi-speaker but `speaker` is provided.")
-            if not self.is_multi_lingual and language is not None:
-                raise ValueError("Model is not multi-lingual but `language` is provided.")
-            if not emotion is None and not speed is None:
-                raise ValueError("Emotion and speed can only be used with Coqui Studio models.")
-        else:
-            if emotion is None:
-                emotion = "Neutral"
-            if speed is None:
-                speed = 1.0
-            # check for the studio models
-            if speaker_wav is not None:
-                raise ValueError("Coqui Studio models do not support `speaker_wav` argument.")
-            if speaker is not None:
-                raise ValueError("Coqui Studio models do not support `speaker` argument.")
-            if language is not None and language != "en":
-                raise ValueError("Coqui Studio models currently support only `language=en` argument.")
-            if emotion not in ["Neutral", "Happy", "Sad", "Angry", "Dull"]:
-                raise ValueError(f"Emotion - `{emotion}` - must be one of `Neutral`, `Happy`, `Sad`, `Angry`, `Dull`.")
-
-    def tts_coqui_studio(
-        self,
-        text: str,
-        speaker_name: str = None,
-        language: str = None,
-        emotion: str = None,
-        speed: float = 1.0,
-        pipe_out=None,
-        file_path: str = None,
-    ) -> Union[np.ndarray, str]:
-        """Convert text to speech using Coqui Studio models. Use `CS_API` class if you are only interested in the API.
-
-        Args:
-            text (str):
-                Input text to synthesize.
-            speaker_name (str, optional):
-                Speaker name from Coqui Studio. Defaults to None.
-            language (str): Language of the text. If None, the default language of the speaker is used. Language is only
-                supported by `XTTS` model.
-            emotion (str, optional):
-                Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull". Emotions are only available
-                with "V1" model. Defaults to None.
-            speed (float, optional):
-                Speed of the speech. Defaults to 1.0.
-            pipe_out (BytesIO, optional):
-                Flag to stdout the generated TTS wav file for shell pipe.
-            file_path (str, optional):
-                Path to save the output file. When None it returns the `np.ndarray` of waveform. Defaults to None.
-
-        Returns:
-            Union[np.ndarray, str]: Waveform of the synthesized speech or path to the output file.
-        """
-        speaker_name = self.model_name.split("/")[2]
-        if file_path is not None:
-            return self.csapi.tts_to_file(
-                text=text,
-                speaker_name=speaker_name,
-                language=language,
-                speed=speed,
-                pipe_out=pipe_out,
-                emotion=emotion,
-                file_path=file_path,
-            )[0]
-        return self.csapi.tts(text=text, speaker_name=speaker_name, language=language, speed=speed, emotion=emotion)[0]
+        # check for the coqui tts models
+        if self.is_multi_speaker and (speaker is None and speaker_wav is None):
+            raise ValueError("Model is multi-speaker but no `speaker` is provided.")
+        if self.is_multi_lingual and language is None:
+            raise ValueError("Model is multi-lingual but no `language` is provided.")
+        if not self.is_multi_speaker and speaker is not None and "voice_dir" not in kwargs:
+            raise ValueError("Model is not multi-speaker but `speaker` is provided.")
+        if not self.is_multi_lingual and language is not None:
+            raise ValueError("Model is not multi-lingual but `language` is provided.")
+        if not emotion is None and not speed is None:
+            raise ValueError("Emotion and speed can only be used with Coqui Studio models. Which is discontinued.")

     def tts(
         self,
@@ -357,10 +273,6 @@
         self._check_arguments(
             speaker=speaker, language=language, speaker_wav=speaker_wav, emotion=emotion, speed=speed, **kwargs
         )
-        if self.csapi is not None:
-            return self.tts_coqui_studio(
-                text=text, speaker_name=speaker, language=language, emotion=emotion, speed=speed
-            )
         wav = self.synthesizer.tts(
             text=text,
             speaker_name=speaker,
@@ -419,16 +331,6 @@
         """
         self._check_arguments(speaker=speaker, language=language, speaker_wav=speaker_wav, **kwargs)

-        if self.csapi is not None:
-            return self.tts_coqui_studio(
-                text=text,
-                speaker_name=speaker,
-                language=language,
-                emotion=emotion,
-                speed=speed,
-                file_path=file_path,
-                pipe_out=pipe_out,
-            )
         wav = self.tts(
             text=text,
             speaker=speaker,
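One behavioral consequence of the api.py hunks above: `TTS.list_models()` no longer merges Studio speakers into a list; it now hands back the `ModelManager` itself. A minimal usage sketch (not part of the commit, written against the calls visible in this diff):

```python
from TTS.api import TTS

# list_models() now returns a ModelManager rather than a plain list of names,
# so callers enumerate models through the manager's own listing method.
manager = TTS().list_models()
model_names = manager.list_models()  # the available `tts_models/...` names
```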
TTS/bin/synthesize.py
@@ -66,12 +66,6 @@ If you don't specify any models, then it uses LJSpeech based English model.
   $ tts --text "Text for TTS" --pipe_out --out_path output/path/speech.wav | aplay
   ```

-- Run TTS and define speed factor to use for 🐸Coqui Studio models, between 0.0 and 2.0:
-
-  ```
-  $ tts --text "Text for TTS" --model_name "coqui_studio/<language>/<dataset>/<model_name>" --speed 1.2 --out_path output/path/speech.wav
-  ```
-
 - Run a TTS model with its default vocoder model:

   ```
@@ -222,25 +216,6 @@ def main():
         default=None,
     )
     parser.add_argument("--encoder_config_path", type=str, help="Path to speaker encoder config file.", default=None)

-    # args for coqui studio
-    parser.add_argument(
-        "--cs_model",
-        type=str,
-        help="Name of the 🐸Coqui Studio model. Available models are `XTTS`, `V1`.",
-    )
-    parser.add_argument(
-        "--emotion",
-        type=str,
-        help="Emotion to condition the model with. Only available for 🐸Coqui Studio `V1` model.",
-        default=None,
-    )
-    parser.add_argument(
-        "--language",
-        type=str,
-        help="Language to condition the model with. Only available for 🐸Coqui Studio `XTTS` model.",
-        default=None,
-    )
     parser.add_argument(
         "--pipe_out",
         help="stdout the generated TTS wav file for shell pipe.",
@@ -249,12 +224,6 @@ def main():
         const=True,
         default=False,
     )
-    parser.add_argument(
-        "--speed",
-        type=float,
-        help="Speed factor to use for 🐸Coqui Studio models, between 0.0 and 2.0.",
-        default=None,
-    )

     # args for multi-speaker synthesis
     parser.add_argument("--speakers_file_path", type=str, help="JSON file for multi-speaker model.", default=None)
@@ -389,7 +358,6 @@ def main():

     # CASE1 #list : list pre-trained TTS models
     if args.list_models:
-        manager.add_cs_api_models(api.list_models())
         manager.list_models()
         sys.exit()

@@ -404,29 +372,7 @@ def main():
         manager.model_info_by_full_name(model_query_full_name)
         sys.exit()

-    # CASE3: TTS with coqui studio models
-    if "coqui_studio" in args.model_name:
-        print(" > Using 🐸Coqui Studio model: ", args.model_name)
-        api = TTS(model_name=args.model_name, cs_api_model=args.cs_model)
-        api.tts_to_file(
-            text=args.text,
-            emotion=args.emotion,
-            file_path=args.out_path,
-            language=args.language,
-            speed=args.speed,
-            pipe_out=pipe_out,
-        )
-        print(" > Saving output to ", args.out_path)
-        return
-
-    if args.language_idx is None and args.language is not None:
-        msg = (
-            "--language is only supported for Coqui Studio models. "
-            "Use --language_idx to specify the target language for multilingual models."
-        )
-        raise ValueError(msg)
-
-    # CASE4: load pre-trained model paths
+    # CASE3: load pre-trained model paths
     if args.model_name is not None and not args.model_path:
         model_path, config_path, model_item = manager.download_model(args.model_name)
         # tts model
@@ -454,7 +400,7 @@ def main():
     if args.vocoder_name is not None and not args.vocoder_path:
         vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name)

-    # CASE5: set custom model paths
+    # CASE4: set custom model paths
     if args.model_path is not None:
         tts_path = args.model_path
         tts_config_path = args.config_path
TTS/config/__init__.py
@@ -16,12 +16,9 @@ def read_json_with_comments(json_path):
     # fallback to json
     with fsspec.open(json_path, "r", encoding="utf-8") as f:
         input_str = f.read()
-    # handle comments
-    input_str = re.sub(r"\\\n", "", input_str)
-    input_str = re.sub(r"//.*\n", "\n", input_str)
-    data = json.loads(input_str)
-    return data
+    # handle comments but not urls with //
+    input_str = re.sub(r"(\"(?:[^\"\\]|\\.)*\")|(/\*(?:.|[\\n\\r])*?\*/)|(//.*)", lambda m: m.group(1) or m.group(2) or "", input_str)
+    return json.loads(input_str)


 def register_config(model_name: str) -> Coqpit:
     """Find the right config for the given model name.
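The rewritten substitution works because regex alternatives are tried left to right: a complete JSON string literal (group 1) is matched first and re-emitted unchanged, so the `//` inside URLs is never treated as a comment, while a bare `//` comment falls through to group 3 and is dropped. A standalone sketch of that behavior (stdlib only; not part of the commit):

```python
import json
import re

def strip_json_comments(input_str: str) -> str:
    # Same substitution as the new read_json_with_comments: groups 1 and 2
    # are written back as matched, group 3 (a bare // comment) becomes "".
    return re.sub(
        r"(\"(?:[^\"\\]|\\.)*\")|(/\*(?:.|[\n\r])*?\*/)|(//.*)",
        lambda m: m.group(1) or m.group(2) or "",
        input_str,
    )

raw = '{"url": "https://coqui.gateway.scarf.sh/x"  // the old regex ate this URL\n}'
print(json.loads(strip_json_comments(raw)))
# -> {'url': 'https://coqui.gateway.scarf.sh/x'}
```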
TTS/cs_api.py (deleted)
@@ -1,317 +0,0 @@
-import http.client
-import json
-import os
-import tempfile
-import urllib.request
-from typing import Tuple
-
-import numpy as np
-import requests
-from scipy.io import wavfile
-
-from TTS.utils.audio.numpy_transforms import save_wav
-
-
-class Speaker(object):
-    """Convert dict to object."""
-
-    def __init__(self, d, is_voice=False):
-        self.is_voice = is_voice
-        for k, v in d.items():
-            if isinstance(k, (list, tuple)):
-                setattr(self, k, [Speaker(x) if isinstance(x, dict) else x for x in v])
-            else:
-                setattr(self, k, Speaker(v) if isinstance(v, dict) else v)
-
-    def __repr__(self):
-        return str(self.__dict__)
-
-
-class CS_API:
-    """🐸Coqui Studio API Wrapper.
-
-    🐸Coqui Studio is the most advanced voice generation platform. You can generate new voices by voice cloning, voice
-    interpolation, or our unique prompt to voice technology. It also provides a set of built-in voices with different
-    characteristics. You can use these voices to generate new audio files or use them in your applications.
-    You can use all the built-in and your own 🐸Coqui Studio speakers with this API with an API token.
-    You can signup to 🐸Coqui Studio from https://app.coqui.ai/auth/signup and get an API token from
-    https://app.coqui.ai/account. We can either enter the token as an environment variable as
-    `export COQUI_STUDIO_TOKEN=<token>` or pass it as `CS_API(api_token=<token>)`.
-    Visit https://app.coqui.ai/api for more information.
-
-    Args:
-        api_token (str): 🐸Coqui Studio API token. If not provided, it will be read from the environment variable
-            `COQUI_STUDIO_TOKEN`.
-        model (str): 🐸Coqui Studio model. It can be either `V1` or `XTTS`. Default is `XTTS`.
-
-    Example listing all available speakers:
-        >>> from TTS.api import CS_API
-        >>> tts = CS_API()
-        >>> tts.speakers
-
-    Example listing all emotions:
-        >>> # emotions are only available for `V1` model
-        >>> from TTS.api import CS_API
-        >>> tts = CS_API(model="V1")
-        >>> tts.emotions
-
-    Example with a built-in 🐸 speaker:
-        >>> from TTS.api import CS_API
-        >>> tts = CS_API()
-        >>> wav, sr = api.tts("Hello world", speaker_name=tts.speakers[0].name)
-        >>> filepath = tts.tts_to_file(text="Hello world!", speaker_name=tts.speakers[0].name, file_path="output.wav")
-
-    Example with multi-language model:
-        >>> from TTS.api import CS_API
-        >>> tts = CS_API(model="XTTS")
-        >>> wav, sr = api.tts("Hello world", speaker_name=tts.speakers[0].name, language="en")
-    """
-
-    MODEL_ENDPOINTS = {
-        "V1": {
-            "list_speakers": "https://app.coqui.ai/api/v2/speakers",
-            "synthesize": "https://app.coqui.ai/api/v2/samples",
-            "list_voices": "https://app.coqui.ai/api/v2/voices",
-        },
-        "XTTS": {
-            "list_speakers": "https://app.coqui.ai/api/v2/speakers",
-            "synthesize": "https://app.coqui.ai/api/v2/samples/xtts/render/",
-            "list_voices": "https://app.coqui.ai/api/v2/voices/xtts",
-        },
-    }
-
-    SUPPORTED_LANGUAGES = ["en", "es", "de", "fr", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn", "ja"]
-
-    def __init__(self, api_token=None, model="XTTS"):
-        self.api_token = api_token
-        self.model = model
-        self.headers = None
-        self._speakers = None
-        self._check_token()
-
-    @staticmethod
-    def ping_api():
-        URL = "https://coqui.gateway.scarf.sh/tts/api"
-        _ = requests.get(URL)
-
-    @property
-    def speakers(self):
-        if self._speakers is None:
-            self._speakers = self.list_all_speakers()
-        return self._speakers
-
-    @property
-    def emotions(self):
-        """Return a list of available emotions.
-
-        TODO: Get this from the API endpoint.
-        """
-        if self.model == "V1":
-            return ["Neutral", "Happy", "Sad", "Angry", "Dull"]
-        else:
-            raise ValueError(f"❗ Emotions are not available for {self.model}.")
-
-    def _check_token(self):
-        if self.api_token is None:
-            self.api_token = os.environ.get("COQUI_STUDIO_TOKEN")
-            self.headers = {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_token}"}
-        if not self.api_token:
-            raise ValueError(
-                "No API token found for 🐸Coqui Studio voices - https://coqui.ai \n"
-                "Visit 🔗https://app.coqui.ai/account to get one.\n"
-                "Set it as an environment variable `export COQUI_STUDIO_TOKEN=<token>`\n"
-                ""
-            )
-
-    def list_all_speakers(self):
-        """Return both built-in Coqui Studio speakers and custom voices created by the user."""
-        return self.list_speakers() + self.list_voices()
-
-    def list_speakers(self):
-        """List built-in Coqui Studio speakers."""
-        self._check_token()
-        conn = http.client.HTTPSConnection("app.coqui.ai")
-        url = self.MODEL_ENDPOINTS[self.model]["list_speakers"]
-        conn.request("GET", f"{url}?page=1&per_page=100", headers=self.headers)
-        res = conn.getresponse()
-        data = res.read()
-        return [Speaker(s) for s in json.loads(data)["result"]]
-
-    def list_voices(self):
-        """List custom voices created by the user."""
-        conn = http.client.HTTPSConnection("app.coqui.ai")
-        url = self.MODEL_ENDPOINTS[self.model]["list_voices"]
-        conn.request("GET", f"{url}?page=1&per_page=100", headers=self.headers)
-        res = conn.getresponse()
-        data = res.read()
-        return [Speaker(s, True) for s in json.loads(data)["result"]]
-
-    def list_speakers_as_tts_models(self):
-        """List speakers in ModelManager format."""
-        models = []
-        for speaker in self.speakers:
-            model = f"coqui_studio/multilingual/{speaker.name}/{self.model}"
-            models.append(model)
-        return models
-
-    def name_to_speaker(self, name):
-        for speaker in self.speakers:
-            if speaker.name == name:
-                return speaker
-        raise ValueError(f"Speaker {name} not found in {self.speakers}")
-
-    def id_to_speaker(self, speaker_id):
-        for speaker in self.speakers:
-            if speaker.id == speaker_id:
-                return speaker
-        raise ValueError(f"Speaker {speaker_id} not found.")
-
-    @staticmethod
-    def url_to_np(url):
-        tmp_file, _ = urllib.request.urlretrieve(url)
-        rate, data = wavfile.read(tmp_file)
-        return data, rate
-
-    @staticmethod
-    def _create_payload(model, text, speaker, speed, emotion, language):
-        payload = {}
-        # if speaker.is_voice:
-        payload["voice_id"] = speaker.id
-        # else:
-        payload["speaker_id"] = speaker.id
-
-        if model == "V1":
-            payload.update(
-                {
-                    "emotion": emotion,
-                    "name": speaker.name,
-                    "text": text,
-                    "speed": speed,
-                }
-            )
-        elif model == "XTTS":
-            payload.update(
-                {
-                    "name": speaker.name,
-                    "text": text,
-                    "speed": speed,
-                    "language": language,
-                }
-            )
-        else:
-            raise ValueError(f"❗ Unknown model {model}")
-        return payload
-
-    def _check_tts_args(self, text, speaker_name, speaker_id, emotion, speed, language):
-        assert text is not None, "❗ text is required for V1 model."
-        assert speaker_name is not None, "❗ speaker_name is required for V1 model."
-        if self.model == "V1":
-            if emotion is None:
-                emotion = "Neutral"
-            assert language is None, "❗ language is not supported for V1 model."
-        elif self.model == "XTTS":
-            assert emotion is None, f"❗ Emotions are not supported for XTTS model. Use V1 model."
-            assert language is not None, "❗ Language is required for XTTS model."
-            assert (
-                language in self.SUPPORTED_LANGUAGES
-            ), f"❗ Language {language} is not yet supported. Check https://docs.coqui.ai/reference/samples_xtts_create."
-        return text, speaker_name, speaker_id, emotion, speed, language
-
-    def tts(
-        self,
-        text: str,
-        speaker_name: str = None,
-        speaker_id=None,
-        emotion=None,
-        speed=1.0,
-        language=None,  # pylint: disable=unused-argument
-    ) -> Tuple[np.ndarray, int]:
-        """Synthesize speech from text.
-
-        Args:
-            text (str): Text to synthesize.
-            speaker_name (str): Name of the speaker. You can get the list of speakers with `list_speakers()` and
-                voices (user generated speakers) with `list_voices()`.
-            speaker_id (str): Speaker ID. If None, the speaker name is used.
-            emotion (str): Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull". Emotions are only
-                supported by `V1` model. Defaults to None.
-            speed (float): Speed of the speech. 1.0 is normal speed.
-            language (str): Language of the text. If None, the default language of the speaker is used. Language is only
-                supported by `XTTS` model. See https://docs.coqui.ai/reference/samples_xtts_create for supported languages.
-        """
-        self._check_token()
-        self.ping_api()
-
-        if speaker_name is None and speaker_id is None:
-            raise ValueError(" [!] Please provide either a `speaker_name` or a `speaker_id`.")
-        if speaker_id is None:
-            speaker = self.name_to_speaker(speaker_name)
-        else:
-            speaker = self.id_to_speaker(speaker_id)
-
-        text, speaker_name, speaker_id, emotion, speed, language = self._check_tts_args(
-            text, speaker_name, speaker_id, emotion, speed, language
-        )
-
-        conn = http.client.HTTPSConnection("app.coqui.ai")
-        payload = self._create_payload(self.model, text, speaker, speed, emotion, language)
-        url = self.MODEL_ENDPOINTS[self.model]["synthesize"]
-        conn.request("POST", url, json.dumps(payload), self.headers)
-        res = conn.getresponse()
-        data = res.read()
-        try:
-            wav, sr = self.url_to_np(json.loads(data)["audio_url"])
-        except KeyError as e:
-            raise ValueError(f" [!] 🐸 API returned error: {data}") from e
-        return wav, sr
-
-    def tts_to_file(
-        self,
-        text: str,
-        speaker_name: str,
-        speaker_id=None,
-        emotion=None,
-        speed=1.0,
-        pipe_out=None,
-        language=None,
-        file_path: str = None,
-    ) -> str:
-        """Synthesize speech from text and save it to a file.
-
-        Args:
-            text (str): Text to synthesize.
-            speaker_name (str): Name of the speaker. You can get the list of speakers with `list_speakers()` and
-                voices (user generated speakers) with `list_voices()`.
-            speaker_id (str): Speaker ID. If None, the speaker name is used.
-            emotion (str): Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull".
-            speed (float): Speed of the speech. 1.0 is normal speed.
-            pipe_out (BytesIO, optional): Flag to stdout the generated TTS wav file for shell pipe.
-            language (str): Language of the text. If None, the default language of the speaker is used. Language is only
-                supported by `XTTS` model. Currently supports en, de, es, fr, it, pt, pl. Defaults to "en".
-            file_path (str): Path to save the file. If None, a temporary file is created.
-        """
-        if file_path is None:
-            file_path = tempfile.mktemp(".wav")
-        wav, sr = self.tts(text, speaker_name, speaker_id, emotion, speed, language)
-        save_wav(wav=wav, path=file_path, sample_rate=sr, pipe_out=pipe_out)
-        return file_path
-
-
-if __name__ == "__main__":
-    import time
-
-    api = CS_API()
-    print(api.speakers)
-    print(api.list_speakers_as_tts_models())
-
-    ts = time.time()
-    wav, sr = api.tts(
-        "It took me quite a long time to develop a voice.", language="en", speaker_name=api.speakers[0].name
-    )
-    print(f" [i] XTTS took {time.time() - ts:.2f}s")
-
-    filepath = api.tts_to_file(
-        text="Hello world!", speaker_name=api.speakers[0].name, language="en", file_path="output.wav"
-    )
TTS/tts/layers/xtts/xtts_manager.py (new file)
@@ -0,0 +1,34 @@
+import torch
+
+
+class SpeakerManager():
+    def __init__(self, speaker_file_path=None):
+        self.speakers = torch.load(speaker_file_path)
+
+    @property
+    def name_to_id(self):
+        return self.speakers.keys()
+
+    @property
+    def num_speakers(self):
+        return len(self.name_to_id)
+
+    @property
+    def speaker_names(self):
+        return list(self.name_to_id.keys())
+
+
+class LanguageManager():
+    def __init__(self, config):
+        self.langs = config["languages"]
+
+    @property
+    def name_to_id(self):
+        return self.langs
+
+    @property
+    def num_languages(self):
+        return len(self.name_to_id)
+
+    @property
+    def language_names(self):
+        return list(self.name_to_id)
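`speakers_xtts.pth` itself is not shown in the diff, but the way xtts.py consumes it below (`self.speaker_manager.speakers[speaker_id].values()`) implies a dict of per-speaker tensors stored with `gpt_cond_latent` before `speaker_embedding`. A hedged sketch of that layout (shapes are illustrative assumptions; the sketch avoids the `speaker_names` property, which as committed calls `.keys()` on a `dict_keys` view):

```python
import torch

from TTS.tts.layers.xtts.xtts_manager import SpeakerManager

# Illustrative stand-in for speakers_xtts.pth; the real file ships with the
# model release and its exact tensor shapes are not part of this diff.
speakers = {
    "Ana Florence": {
        "gpt_cond_latent": torch.zeros(1, 32, 1024),  # assumed shape
        "speaker_embedding": torch.zeros(1, 512, 1),  # assumed shape
    },
}
torch.save(speakers, "speakers_xtts.pth")

manager = SpeakerManager("speakers_xtts.pth")
print(manager.num_speakers)      # 1
print(list(manager.name_to_id))  # ['Ana Florence']
# Insertion order matters: .values() unpacks as (gpt_cond_latent, speaker_embedding)
gpt_cond_latent, speaker_embedding = manager.speakers["Ana Florence"].values()
```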
TTS/tts/models/xtts.py
@@ -11,6 +11,7 @@ from TTS.tts.layers.xtts.gpt import GPT
 from TTS.tts.layers.xtts.hifigan_decoder import HifiDecoder
 from TTS.tts.layers.xtts.stream_generator import init_stream_support
 from TTS.tts.layers.xtts.tokenizer import VoiceBpeTokenizer, split_sentence
+from TTS.tts.layers.xtts.xtts_manager import SpeakerManager, LanguageManager
 from TTS.tts.models.base_tts import BaseTTS
 from TTS.utils.io import load_fsspec

@@ -378,7 +379,7 @@ class Xtts(BaseTTS):

         return gpt_cond_latents, speaker_embedding

-    def synthesize(self, text, config, speaker_wav, language, **kwargs):
+    def synthesize(self, text, config, speaker_wav, language, speaker_id=None, **kwargs):
         """Synthesize speech with the given input text.

         Args:
@@ -393,12 +394,6 @@ class Xtts(BaseTTS):
             `text_input` as text token IDs after tokenizer, `voice_samples` as samples used for cloning, `conditioning_latents`
             as latents used at inference.

-        """
-        return self.inference_with_config(text, config, ref_audio_path=speaker_wav, language=language, **kwargs)
-
-    def inference_with_config(self, text, config, ref_audio_path, language, **kwargs):
-        """
-        inference with config
         """
         assert (
             "zh-cn" if language == "zh" else language in self.config.languages
@@ -410,13 +405,18 @@ class Xtts(BaseTTS):
             "repetition_penalty": config.repetition_penalty,
             "top_k": config.top_k,
             "top_p": config.top_p,
+        }
+        settings.update(kwargs)  # allow overriding of preset settings with kwargs
+        if speaker_id is not None:
+            gpt_cond_latent, speaker_embedding = self.speaker_manager.speakers[speaker_id].values()
+            return self.inference(text, language, gpt_cond_latent, speaker_embedding, **settings)
+        settings.update({
             "gpt_cond_len": config.gpt_cond_len,
             "gpt_cond_chunk_len": config.gpt_cond_chunk_len,
             "max_ref_len": config.max_ref_len,
             "sound_norm_refs": config.sound_norm_refs,
-        }
-        settings.update(kwargs)  # allow overriding of preset settings with kwargs
-        return self.full_inference(text, ref_audio_path, language, **settings)
+        })
+        return self.full_inference(text, speaker_wav, language, **settings)

     @torch.inference_mode()
     def full_inference(
@@ -520,6 +520,8 @@ class Xtts(BaseTTS):
     ):
         language = language.split("-")[0]  # remove the country code
         length_scale = 1.0 / max(speed, 0.05)
+        gpt_cond_latent = gpt_cond_latent.to(self.device)
+        speaker_embedding = speaker_embedding.to(self.device)
         if enable_text_splitting:
             text = split_sentence(text, language, self.tokenizer.char_limits[language])
         else:
@@ -628,6 +630,8 @@ class Xtts(BaseTTS):
     ):
         language = language.split("-")[0]  # remove the country code
         length_scale = 1.0 / max(speed, 0.05)
+        gpt_cond_latent = gpt_cond_latent.to(self.device)
+        speaker_embedding = speaker_embedding.to(self.device)
         if enable_text_splitting:
             text = split_sentence(text, language, self.tokenizer.char_limits[language])
         else:
@@ -733,6 +737,7 @@ class Xtts(BaseTTS):
         eval=True,
         strict=True,
         use_deepspeed=False,
+        speaker_file_path=None,
     ):
         """
         Loads a checkpoint from disk and initializes the model's state and tokenizer.
@@ -751,6 +756,12 @@ class Xtts(BaseTTS):

         model_path = checkpoint_path or os.path.join(checkpoint_dir, "model.pth")
         vocab_path = vocab_path or os.path.join(checkpoint_dir, "vocab.json")
+        speaker_file_path = speaker_file_path or os.path.join(checkpoint_dir, "speakers_xtts.pth")
+
+        self.language_manager = LanguageManager(config)
+        self.speaker_manager = None
+        if os.path.exists(speaker_file_path):
+            self.speaker_manager = SpeakerManager(speaker_file_path)

         if os.path.exists(vocab_path):
             self.tokenizer = VoiceBpeTokenizer(vocab_file=vocab_path)
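Read together, the xtts.py hunks add a second entry point to `synthesize`: if `speaker_id` names one of the shipped speakers, the reference-audio conditioning step is bypassed and the stored latents go straight to `inference`. A hedged end-to-end sketch (paths and the speaker name are placeholders, not from the diff):

```python
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts

config = XttsConfig()
config.load_json("/path/to/xtts/config.json")  # placeholder path
model = Xtts.init_from_config(config)
# load_checkpoint now also picks up speakers_xtts.pth from the checkpoint dir
model.load_checkpoint(config, checkpoint_dir="/path/to/xtts/", eval=True)

# Built-in speaker: latents come from the new SpeakerManager, no reference wav
out = model.synthesize(
    "Hello world.", config, speaker_wav=None, language="en", speaker_id="Ana Florence"
)

# Voice cloning: without speaker_id the call falls through to full_inference
out = model.synthesize(
    "Hello world.", config, speaker_wav="/path/to/reference.wav", language="en"
)
```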
TTS/utils/manage.py
@@ -11,7 +11,7 @@ import fsspec
 import requests
 from tqdm import tqdm

-from TTS.config import load_config
+from TTS.config import load_config, read_json_with_comments
 from TTS.utils.generic_utils import get_user_data_dir

 LICENSE_URLS = {
@@ -65,30 +65,7 @@ class ModelManager(object):
         Args:
             file_path (str): path to .models.json.
         """
-        with open(file_path, "r", encoding="utf-8") as json_file:
-            self.models_dict = json.load(json_file)
-
-    def add_cs_api_models(self, model_list: List[str]):
-        """Add list of Coqui Studio model names that are returned from the api
-
-        Each has the following format `<coqui_studio_model>/en/<speaker_name>/<coqui_studio_model>`
-        """
-
-        def _add_model(model_name: str):
-            if not "coqui_studio" in model_name:
-                return
-            model_type, lang, dataset, model = model_name.split("/")
-            if model_type not in self.models_dict:
-                self.models_dict[model_type] = {}
-            if lang not in self.models_dict[model_type]:
-                self.models_dict[model_type][lang] = {}
-            if dataset not in self.models_dict[model_type][lang]:
-                self.models_dict[model_type][lang][dataset] = {}
-            if model not in self.models_dict[model_type][lang][dataset]:
-                self.models_dict[model_type][lang][dataset][model] = {}
-
-        for model_name in model_list:
-            _add_model(model_name)
+        self.models_dict = read_json_with_comments(file_path)

     def _list_models(self, model_type, model_count=0):
         if self.verbose:
@@ -315,6 +292,7 @@ class ModelManager(object):
                     f"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/{model_version}/config.json",
                     f"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/{model_version}/vocab.json",
                     f"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/{model_version}/hash.md5",
+                    f"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/{model_version}/speakers_xtts.pth",
                 ],
             }
         else:
TTS/utils/synthesizer.py
@@ -305,7 +305,7 @@ class Synthesizer(nn.Module):
         speaker_embedding = None
         speaker_id = None
         if self.tts_speakers_file or hasattr(self.tts_model.speaker_manager, "name_to_id"):
-            if speaker_name and isinstance(speaker_name, str):
+            if speaker_name and isinstance(speaker_name, str) and not self.tts_config.model == "xtts":
                 if self.tts_config.use_d_vector_file:
                     # get the average speaker embedding from the saved d_vectors.
                     speaker_embedding = self.tts_model.speaker_manager.get_mean_embedding(
@@ -335,7 +335,9 @@ class Synthesizer(nn.Module):
         # handle multi-lingual
         language_id = None
         if self.tts_languages_file or (
-            hasattr(self.tts_model, "language_manager") and self.tts_model.language_manager is not None
+            hasattr(self.tts_model, "language_manager")
+            and self.tts_model.language_manager is not None
+            and not self.tts_config.model == "xtts"
         ):
             if len(self.tts_model.language_manager.name_to_id) == 1:
                 language_id = list(self.tts_model.language_manager.name_to_id.values())[0]
@@ -366,6 +368,7 @@ class Synthesizer(nn.Module):
             if (
                 speaker_wav is not None
                 and self.tts_model.speaker_manager is not None
+                and hasattr(self.tts_model.speaker_manager, "encoder_ap")
                 and self.tts_model.speaker_manager.encoder_ap is not None
             ):
                 speaker_embedding = self.tts_model.speaker_manager.compute_embedding_from_clip(speaker_wav)
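Why these guards exist: the new XTTS `SpeakerManager` is a thin dict wrapper with no `encoder_ap`, and XTTS resolves speaker names and languages itself inside `Xtts.synthesize`, so the Synthesizer must not try to compute d-vectors or look up language IDs for it. A minimal illustration of the `hasattr` guard (the class here is a stand-in, not the real manager):

```python
class FileBasedSpeakerManager:  # stand-in for the new xtts_manager.SpeakerManager
    pass

sm = FileBasedSpeakerManager()
# Without the new hasattr() check, `sm.encoder_ap is not None` would raise
# AttributeError; with it, the condition short-circuits to False.
print(hasattr(sm, "encoder_ap") and sm.encoder_ap is not None)  # False
```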
README.md
@@ -172,48 +172,6 @@ tts.tts_with_vc_to_file(
 )
 ```

-#### Example text to speech using [🐸Coqui Studio](https://coqui.ai) models.
-
-You can use all of your available speakers in the studio.
-[🐸Coqui Studio](https://coqui.ai) API token is required. You can get it from the [account page](https://coqui.ai/account).
-You should set the `COQUI_STUDIO_TOKEN` environment variable to use the API token.
-
-```python
-# If you have a valid API token set you will see the studio speakers as separate models in the list.
-# The name format is coqui_studio/en/<studio_speaker_name>/coqui_studio
-models = TTS().list_models()
-# Init TTS with the target studio speaker
-tts = TTS(model_name="coqui_studio/en/Torcull Diarmuid/coqui_studio", progress_bar=False)
-# Run TTS
-tts.tts_to_file(text="This is a test.", file_path=OUTPUT_PATH)
-# Run TTS with emotion and speed control
-tts.tts_to_file(text="This is a test.", file_path=OUTPUT_PATH, emotion="Happy", speed=1.5)
-```
-
-If you just need 🐸 Coqui Studio speakers, you can use `CS_API`. It is a wrapper around the 🐸 Coqui Studio API.
-
-```python
-from TTS.api import CS_API
-
-# Init 🐸 Coqui Studio API
-# you can either set the API token as an environment variable `COQUI_STUDIO_TOKEN` or pass it as an argument.
-
-# XTTS - Best quality and life-like speech in multiple languages. See https://docs.coqui.ai/reference/samples_xtts_create for supported languages.
-api = CS_API(api_token=<token>, model="XTTS")
-api.speakers # all the speakers are available with all the models.
-api.list_speakers()
-api.list_voices()
-wav, sample_rate = api.tts(text="This is a test.", speaker=api.speakers[0].name, emotion="Happy", language="en", speed=1.5)
-
-# V1 - Fast and lightweight TTS in EN with emotion control.
-api = CS_API(api_token=<token>, model="V1")
-api.speakers
-api.emotions # emotions are only for the V1 model.
-api.list_speakers()
-api.list_voices()
-wav, sample_rate = api.tts(text="This is a test.", speaker=api.speakers[0].name, emotion="Happy", speed=1.5)
-```
-
 #### Example text to speech using **Fairseq models in ~1100 languages** 🤯.
 For these models use the following name format: `tts_models/<lang-iso_code>/fairseq/vits`.
@ -21,7 +21,7 @@ a few tricks to make it faster and support streaming inference.
|
||||||
- Across the board quality improvements.
|
- Across the board quality improvements.
|
||||||
|
|
||||||
### Code
|
### Code
|
||||||
Current implementation only supports inference.
|
Current implementation only supports inference and GPT encoder training.
|
||||||
|
|
||||||
### Languages
|
### Languages
|
||||||
As of now, XTTS-v2 supports 16 languages: English (en), Spanish (es), French (fr), German (de), Italian (it), Portuguese (pt), Polish (pl), Turkish (tr), Russian (ru), Dutch (nl), Czech (cs), Arabic (ar), Chinese (zh-cn), Japanese (ja), Hungarian (hu) and Korean (ko).
|
As of now, XTTS-v2 supports 16 languages: English (en), Spanish (es), French (fr), German (de), Italian (it), Portuguese (pt), Polish (pl), Turkish (tr), Russian (ru), Dutch (nl), Czech (cs), Arabic (ar), Chinese (zh-cn), Japanese (ja), Hungarian (hu) and Korean (ko).
|
||||||
|
@ -36,9 +36,71 @@ Come and join in our 🐸Community. We're active on [Discord](https://discord.gg
|
||||||
You can also mail us at info@coqui.ai.
|
You can also mail us at info@coqui.ai.
|
||||||
|
|
||||||
### Inference
|
### Inference
|
||||||

#### 🐸TTS Command line

You can check all supported languages with the following command:

```console
tts --model_name tts_models/multilingual/multi-dataset/xtts_v2 \
    --list_language_idx
```

You can check all available Coqui speakers with the following command:

```console
tts --model_name tts_models/multilingual/multi-dataset/xtts_v2 \
    --list_speaker_idx
```

##### Coqui speakers

You can run inference with one of the available speakers using the following command:

```console
tts --model_name tts_models/multilingual/multi-dataset/xtts_v2 \
    --text "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent." \
    --speaker_idx "Ana Florence" \
    --language_idx en \
    --use_cuda true
```

##### Clone a voice

You can clone a speaker voice using a single or multiple references:

###### Single reference

```console
tts --model_name tts_models/multilingual/multi-dataset/xtts_v2 \
    --text "Bugün okula gitmek istemiyorum." \
    --speaker_wav /path/to/target/speaker.wav \
    --language_idx tr \
    --use_cuda true
```

###### Multiple references

```console
tts --model_name tts_models/multilingual/multi-dataset/xtts_v2 \
    --text "Bugün okula gitmek istemiyorum." \
    --speaker_wav /path/to/target/speaker.wav /path/to/target/speaker_2.wav /path/to/target/speaker_3.wav \
    --language_idx tr \
    --use_cuda true
```

or, for all wav files in a directory, you can use:

```console
tts --model_name tts_models/multilingual/multi-dataset/xtts_v2 \
    --text "Bugün okula gitmek istemiyorum." \
    --speaker_wav /path/to/target/*.wav \
    --language_idx tr \
    --use_cuda true
```

#### 🐸TTS API

##### Clone a voice

You can clone a speaker voice using a single or multiple references:

###### Single reference

Splits the text into sentences and generates audio for each sentence. The audio files are then concatenated to produce the final audio.
You can optionally disable sentence splitting for better coherence, at the cost of more VRAM and possibly hitting the model's context length limit.
@ -56,7 +118,7 @@ tts.tts_to_file(text="It took me quite a long time to develop a voice, and now t
)
```
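
As a concrete sketch of such a call (placeholder paths; `split_sentences=False` disables the splitting described above):

```python
from TTS.api import TTS

tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=True)

# Keep the whole text in one pass: better coherence, but more VRAM and a
# chance of hitting the model's context length limit.
tts.tts_to_file(
    text="It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
    file_path="output.wav",
    speaker_wav="/path/to/target/speaker.wav",
    language="en",
    split_sentences=False,
)
```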
##### Multiple references
|
###### Multiple references
|
||||||
|
|
||||||
You can pass multiple audio files to the `speaker_wav` argument for better voice cloning.
|
You can pass multiple audio files to the `speaker_wav` argument for better voice cloning.
|
||||||
|
|
||||||
|
@ -81,34 +143,23 @@ tts.tts_to_file(text="It took me quite a long time to develop a voice, and now t
language="en")
```
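
For example, a sketch with placeholder reference paths:

```python
from TTS.api import TTS

tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=True)

# Several reference clips give the model a richer picture of the target voice.
tts.tts_to_file(
    text="It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
    file_path="output.wav",
    speaker_wav=["/path/to/target/speaker.wav", "/path/to/target/speaker_2.wav"],
    language="en",
)
```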

##### Coqui speakers

You can run inference with one of the available speakers using the following code:

```python
from TTS.api import TTS

tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=True)

# generate speech by cloning a voice using default settings
tts.tts_to_file(text="It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
                file_path="output.wav",
                speaker="Ana Florence",
                language="en",
                split_sentences=True
                )
```

#### 🐸TTS Model API

@ -1,113 +0,0 @@
import os
import unittest

from tests import get_tests_data_path, get_tests_output_path
from TTS.api import CS_API, TTS

OUTPUT_PATH = os.path.join(get_tests_output_path(), "test_python_api.wav")
cloning_test_wav_path = os.path.join(get_tests_data_path(), "ljspeech/wavs/LJ001-0028.wav")


is_coqui_available = os.environ.get("COQUI_STUDIO_TOKEN")


if is_coqui_available:

    class CS_APITest(unittest.TestCase):
        def test_speakers(self):
            tts = CS_API()
            self.assertGreater(len(tts.speakers), 1)

        def test_emotions(self):
            tts = CS_API()
            self.assertGreater(len(tts.emotions), 1)

        def test_list_calls(self):
            tts = CS_API()
            self.assertGreater(len(tts.list_voices()), 1)
            self.assertGreater(len(tts.list_speakers()), 1)
            self.assertGreater(len(tts.list_all_speakers()), 1)
            self.assertGreater(len(tts.list_speakers_as_tts_models()), 1)

        def test_name_to_speaker(self):
            tts = CS_API()
            speaker_name = tts.list_speakers_as_tts_models()[0].split("/")[2]
            speaker = tts.name_to_speaker(speaker_name)
            self.assertEqual(speaker.name, speaker_name)

        def test_tts(self):
            tts = CS_API()
            wav, sr = tts.tts(text="This is a test.", speaker_name=tts.list_speakers()[0].name)
            self.assertEqual(sr, 44100)
            self.assertGreater(len(wav), 1)


class TTSTest(unittest.TestCase):
    def test_single_speaker_model(self):
        tts = TTS(model_name="tts_models/de/thorsten/tacotron2-DDC", progress_bar=False, gpu=False)

        # a single-speaker model must reject speaker/language arguments
        error_raised = False
        try:
            tts.tts_to_file(text="Ich bin eine Testnachricht.", speaker="Thorsten", language="de")
        except ValueError:
            error_raised = True

        tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path=OUTPUT_PATH)

        self.assertTrue(error_raised)
        self.assertFalse(tts.is_multi_speaker)
        self.assertFalse(tts.is_multi_lingual)
        self.assertIsNone(tts.speakers)
        self.assertIsNone(tts.languages)

    def test_studio_model(self):
        tts = TTS(model_name="coqui_studio/en/Zacharie Aimilios/coqui_studio")
        tts.tts_to_file(text="This is a test.")

        # check that speed > 2.0 raises an error
        raised_error = False
        try:
            _ = tts.tts(text="This is a test.", speed=4.0, emotion="Sad")  # should raise error with speed > 2.0
        except ValueError:
            raised_error = True
        self.assertTrue(raised_error)

        # check that an invalid emotion raises an error
        raised_error = False
        try:
            _ = tts.tts(text="This is a test.", speed=2.0, emotion="No Emo")  # should raise error with an invalid emotion
        except ValueError:
            raised_error = True
        self.assertTrue(raised_error)

        # check a valid call
        wav = tts.tts(text="This is a test.", speed=2.0, emotion="Sad")
        self.assertGreater(len(wav), 0)

    def test_fairseq_model(self):  # pylint: disable=no-self-use
        tts = TTS(model_name="tts_models/eng/fairseq/vits")
        tts.tts_to_file(text="This is a test.")

    def test_multi_speaker_multi_lingual_model(self):
        tts = TTS()
        tts.load_tts_model_by_name(tts.models[0])  # YourTTS
        tts.tts_to_file(
            text="Hello world!", speaker=tts.speakers[0], language=tts.languages[0], file_path=OUTPUT_PATH
        )

        self.assertTrue(tts.is_multi_speaker)
        self.assertTrue(tts.is_multi_lingual)
        self.assertGreater(len(tts.speakers), 1)
        self.assertGreater(len(tts.languages), 1)

    def test_voice_cloning(self):  # pylint: disable=no-self-use
        tts = TTS()
        tts.load_tts_model_by_name("tts_models/multilingual/multi-dataset/your_tts")
        tts.tts_to_file("Hello world!", speaker_wav=cloning_test_wav_path, language="en", file_path=OUTPUT_PATH)

    def test_voice_conversion(self):  # pylint: disable=no-self-use
        tts = TTS(model_name="voice_conversion_models/multilingual/vctk/freevc24", progress_bar=False, gpu=False)
        tts.voice_conversion_to_file(
            source_wav=cloning_test_wav_path,
            target_wav=cloning_test_wav_path,
            file_path=OUTPUT_PATH,
        )
@ -1,25 +0,0 @@
import os

from tests import get_tests_output_path, run_cli


def test_synthesize():
    """Test synthesize.py with different arguments."""
    output_path = os.path.join(get_tests_output_path(), "output.wav")

    # 🐸 Coqui studio model
    run_cli(
        'tts --model_name "coqui_studio/en/Torcull Diarmuid/coqui_studio" '
        '--text "This is it" '
        f'--out_path "{output_path}"'
    )

    # 🐸 Coqui studio model with speed arg.
    run_cli(
        'tts --model_name "coqui_studio/en/Torcull Diarmuid/coqui_studio" '
        '--text "This is it but slow" --speed 0.1 '
        f'--out_path "{output_path}"'
    )

    # test pipe_out command
    run_cli(f'tts --text "test." --pipe_out --out_path "{output_path}" | aplay')