mirror of https://github.com/coqui-ai/TTS.git
🐸 Coqui Studio API integration (#2484)
* Warn when lang is not avail
* Make style
* Implement Coqui Studio API
* Test
* Update docs
* Set action
* Make style
* Make lint
* Update README
* Make style
* Fix action
* Run actions

parent ce79160576
commit ad8b9bf2be
@@ -32,7 +32,8 @@ jobs:
       - name: check OS
         run: cat /etc/os-release
       - name: set ENV
-        run: export TRAINER_TELEMETRY=0
+        run: |
+          export TRAINER_TELEMETRY=0
       - name: Install dependencies
         run: |
           sudo apt-get update
@@ -49,4 +50,6 @@ jobs:
           python3 -m pip install .[all]
           python3 setup.py egg_info
       - name: Unit tests
-        run: make inference_tests
+        run: |
+          export COQUI_STUDIO_TOKEN=${{ secrets.COQUI_STUDIO_TOKEN }}
+          make inference_tests
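The workflow now exports the Studio token from the repository secrets before running the inference tests. A minimal sketch of how that token is consumed on the Python side (standard library only; `CS_API._check_token`, added below in `TTS/api.py`, does essentially this):

```python
import os

# Read the token the workflow exported; None if the secret was not provided.
token = os.environ.get("COQUI_STUDIO_TOKEN")
if not token:
    raise ValueError("No API token found; set COQUI_STUDIO_TOKEN.")

# The token is then sent as a Bearer credential on every API request.
headers = {"Content-Type": "application/json", "Authorization": f"Bearer {token}"}
```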
README.md (30 lines changed)

@@ -197,6 +197,36 @@ tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_
 tts.tts_to_file("This is voice cloning.", speaker_wav="my/cloning/audio.wav", language="en", file_path="output.wav")
 tts.tts_to_file("C'est le clonage de la voix.", speaker_wav="my/cloning/audio.wav", language="fr-fr", file_path="output.wav")
 tts.tts_to_file("Isso é clonagem de voz.", speaker_wav="my/cloning/audio.wav", language="pt-br", file_path="output.wav")
+
+# Example voice conversion converting the speaker of the `source_wav` to the speaker of the `target_wav`
+tts = TTS(model_name="voice_conversion_models/multilingual/vctk/freevc24", progress_bar=False, gpu=True)
+tts.voice_conversion_to_file(source_wav="my/source.wav", target_wav="my/target.wav", file_path="output.wav")
+
+# Example voice cloning with a single-speaker TTS model combined with the voice conversion model. This way, you can
+# clone voices by using any model in 🐸TTS.
+tts = TTS("tts_models/de/thorsten/tacotron2-DDC")
+tts.tts_with_vc_to_file(
+    "Wie sage ich auf Italienisch, dass ich dich liebe?",
+    speaker_wav="target/speaker.wav",
+    file_path="output.wav"
+)
+
+# Example text to speech using [🐸Coqui Studio](https://coqui.ai) models. You can use all of your available speakers in the studio.
+# A [🐸Coqui Studio](https://coqui.ai) API token is required. You can get it from the [account page](https://coqui.ai/account).
+# You should set the `COQUI_STUDIO_TOKEN` environment variable to use the API token.
+
+# If you have a valid API token set, you will see the studio speakers as separate models in the list.
+# The name format is coqui_studio/en/<studio_speaker_name>/coqui_studio
+models = TTS().list_models()
+# Init TTS with the target studio speaker
+tts = TTS(model_name="coqui_studio/en/Torcull Diarmuid/coqui_studio", progress_bar=False, gpu=False)
+# Run TTS
+tts.tts_to_file(text="This is a test.", file_path=OUTPUT_PATH)
+# Run TTS with emotion and speed control
+tts.tts_to_file(text="This is a test.", file_path=OUTPUT_PATH, emotion="Happy", speed=1.5)
 ```

### Command line `tts`
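The README comment pins down the Studio model-name format. A short sketch (not part of the commit) of using that format to pick Studio speakers out of the combined model list:

```python
from TTS.api import TTS

models = TTS().list_models()
# Studio entries follow "coqui_studio/en/<studio_speaker_name>/coqui_studio",
# so the speaker name is the third path segment.
studio_models = [m for m in models if m.startswith("coqui_studio/")]
speaker_names = [m.split("/")[2] for m in studio_models]
```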
TTS/api.py (380 lines changed)

@@ -1,11 +1,227 @@
+import http.client
+import json
+import os
 import tempfile
+import urllib.request
 from pathlib import Path
+from typing import Tuple
+
+import numpy as np
+from scipy.io import wavfile
+
 from TTS.utils.audio.numpy_transforms import save_wav
 from TTS.utils.manage import ModelManager
 from TTS.utils.synthesizer import Synthesizer
+
+
+class Speaker(object):
+    """Convert dict to object."""
+
+    def __init__(self, d, is_voice=False):
+        self.is_voice = is_voice
+        for k, v in d.items():
+            if isinstance(v, (list, tuple)):
+                setattr(self, k, [Speaker(x) if isinstance(x, dict) else x for x in v])
+            else:
+                setattr(self, k, Speaker(v) if isinstance(v, dict) else v)
+
+    def __repr__(self):
+        return str(self.__dict__)
+
+
+class CS_API:
+    """🐸Coqui Studio API Wrapper.
+
+    🐸Coqui Studio is the most advanced voice generation platform. You can generate new voices by voice cloning, voice
+    interpolation, or our unique prompt-to-voice technology. It also provides a set of built-in voices with different
+    characteristics. You can use these voices to generate new audio files or use them in your applications.
+    You can use all the built-in and your own 🐸Coqui Studio speakers with this API with an API token.
+    You can sign up to 🐸Coqui Studio at https://app.coqui.ai/auth/signup and get an API token from
+    https://app.coqui.ai/account. You can either set the token as an environment variable with
+    `export COQUI_STUDIO_TOKEN=<token>` or pass it as `CS_API(api_token=<token>)`.
+    Visit https://app.coqui.ai/api for more information.
+
+    Example listing all available speakers:
+        >>> from TTS.api import CS_API
+        >>> tts = CS_API()
+        >>> tts.speakers
+
+    Example listing all emotions:
+        >>> from TTS.api import CS_API
+        >>> tts = CS_API()
+        >>> tts.emotions
+
+    Example with a built-in 🐸 speaker:
+        >>> from TTS.api import CS_API
+        >>> tts = CS_API()
+        >>> wav, sr = tts.tts("Hello world", speaker_name="Claribel Dervla")
+        >>> filepath = tts.tts_to_file(text="Hello world!", speaker_name=tts.speakers[0].name, file_path="output.wav")
+    """
+
+    def __init__(self, api_token=None):
+        self.api_token = api_token
+        self.api_prefix = "/api/v2"
+        self.headers = None
+        self._speakers = None
+        self._check_token()
+
+    @property
+    def speakers(self):
+        if self._speakers is None:
+            self._speakers = self.list_all_speakers()
+        return self._speakers
+
+    @property
+    def emotions(self):
+        """Return a list of available emotions.
+
+        TODO: Get this from the API endpoint.
+        """
+        return ["Neutral", "Happy", "Sad", "Angry", "Dull"]
+
+    def _check_token(self):
+        if self.api_token is None:
+            self.api_token = os.environ.get("COQUI_STUDIO_TOKEN")
+        self.headers = {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_token}"}
+        if not self.api_token:
+            raise ValueError(
+                "No API token found for 🐸Coqui Studio voices - https://coqui.ai.\n"
+                "Visit 🔗https://app.coqui.ai/account to get one.\n"
+                "Set it as an environment variable `export COQUI_STUDIO_TOKEN=<token>`\n"
+            )
+
+    def list_all_speakers(self):
+        """Return both built-in Coqui Studio speakers and custom voices created by the user."""
+        return self.list_speakers() + self.list_voices()
+
+    def list_speakers(self):
+        """List built-in Coqui Studio speakers."""
+        self._check_token()
+        conn = http.client.HTTPSConnection("app.coqui.ai")
+        conn.request("GET", f"{self.api_prefix}/speakers", headers=self.headers)
+        res = conn.getresponse()
+        data = res.read()
+        return [Speaker(s) for s in json.loads(data)["result"]]
+
+    def list_voices(self):
+        """List custom voices created by the user."""
+        conn = http.client.HTTPSConnection("app.coqui.ai")
+        conn.request("GET", f"{self.api_prefix}/voices", headers=self.headers)
+        res = conn.getresponse()
+        data = res.read()
+        return [Speaker(s, True) for s in json.loads(data)["result"]]
+
+    def list_speakers_as_tts_models(self):
+        """List speakers in ModelManager format."""
+        models = []
+        for speaker in self.speakers:
+            model = f"coqui_studio/en/{speaker.name}/coqui_studio"
+            models.append(model)
+        return models
+
+    def name_to_speaker(self, name):
+        for speaker in self.speakers:
+            if speaker.name == name:
+                return speaker
+        raise ValueError(f"Speaker {name} not found.")
+
+    def id_to_speaker(self, speaker_id):
+        for speaker in self.speakers:
+            if speaker.id == speaker_id:
+                return speaker
+        raise ValueError(f"Speaker {speaker_id} not found.")
+
+    @staticmethod
+    def url_to_np(url):
+        tmp_file, _ = urllib.request.urlretrieve(url)
+        rate, data = wavfile.read(tmp_file)
+        return data, rate
+
+    @staticmethod
+    def _create_payload(text, speaker, emotion, speed):
+        payload = {}
+        if speaker.is_voice:
+            payload["voice_id"] = speaker.id
+        else:
+            payload["speaker_id"] = speaker.id
+        payload.update(
+            {
+                "emotion": emotion,
+                "name": speaker.name,
+                "text": text,
+                "speed": speed,
+            }
+        )
+        return payload
+
+    def tts(
+        self,
+        text: str,
+        speaker_name: str = None,
+        speaker_id=None,
+        emotion="Neutral",
+        speed=1.0,
+        language=None,  # pylint: disable=unused-argument
+    ) -> Tuple[np.ndarray, int]:
+        """Synthesize speech from text.
+
+        Args:
+            text (str): Text to synthesize.
+            speaker_name (str): Name of the speaker. You can get the list of speakers with `list_speakers()` and
+                voices (user-generated speakers) with `list_voices()`.
+            speaker_id (str): Speaker ID. If None, the speaker name is used.
+            emotion (str): Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull".
+            speed (float): Speed of the speech. 1.0 is normal speed.
+            language (str): Language of the text. If None, the default language of the speaker is used.
+        """
+        self._check_token()
+        if speaker_name is None and speaker_id is None:
+            raise ValueError(" [!] Please provide either a `speaker_name` or a `speaker_id`.")
+        if speaker_id is None:
+            speaker = self.name_to_speaker(speaker_name)
+        else:
+            speaker = self.id_to_speaker(speaker_id)
+        conn = http.client.HTTPSConnection("app.coqui.ai")
+        payload = self._create_payload(text, speaker, emotion, speed)
+        conn.request("POST", "/api/v2/samples", json.dumps(payload), self.headers)
+        res = conn.getresponse()
+        data = res.read()
+        try:
+            wav, sr = self.url_to_np(json.loads(data)["audio_url"])
+        except KeyError as e:
+            raise ValueError(f" [!] 🐸 API returned error: {data}") from e
+        return wav, sr
+
+    def tts_to_file(
+        self,
+        text: str,
+        speaker_name: str,
+        speaker_id=None,
+        emotion="Neutral",
+        speed=1.0,
+        language=None,
+        file_path: str = None,
+    ) -> str:
+        """Synthesize speech from text and save it to a file.
+
+        Args:
+            text (str): Text to synthesize.
+            speaker_name (str): Name of the speaker. You can get the list of speakers with `list_speakers()` and
+                voices (user-generated speakers) with `list_voices()`.
+            speaker_id (str): Speaker ID. If None, the speaker name is used.
+            emotion (str): Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull".
+            speed (float): Speed of the speech. 1.0 is normal speed.
+            language (str): Language of the text. If None, the default language of the speaker is used.
+            file_path (str): Path to save the file. If None, a temporary file is created.
+        """
+        if file_path is None:
+            file_path = tempfile.mktemp(".wav")
+        wav, sr = self.tts(text, speaker_name, speaker_id, emotion, speed, language)
+        wavfile.write(file_path, sr, wav)
+        return file_path
+
+
 class TTS:
     """TODO: Add voice conversion and Capacitron support."""
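`_create_payload` is where built-in speakers and user voices diverge: custom voices are addressed by `voice_id`, built-ins by `speaker_id`. A quick illustration (not from the commit; the ids and names below are made up):

```python
from TTS.api import CS_API, Speaker

# Hypothetical Speaker objects built from API response dicts.
builtin = Speaker({"id": "spk-001", "name": "Claribel Dervla"})
custom = Speaker({"id": "vox-042", "name": "My Clone"}, is_voice=True)

CS_API._create_payload("Hi", builtin, "Neutral", 1.0)
# -> {"speaker_id": "spk-001", "emotion": "Neutral", "name": "Claribel Dervla", "text": "Hi", "speed": 1.0}
CS_API._create_payload("Hi", custom, "Neutral", 1.0)
# -> {"voice_id": "vox-042", "emotion": "Neutral", "name": "My Clone", "text": "Hi", "speed": 1.0}
```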
@@ -54,9 +270,12 @@ class TTS:
         self.synthesizer = None
         self.voice_converter = None
+        self.csapi = None
+        self.model_name = None
+
         if model_name:
             self.load_tts_model_by_name(model_name, gpu)
 
         if model_path:
             self.load_tts_model_by_path(
                 model_path, config_path, vocoder_path=vocoder_path, vocoder_config=vocoder_config_path, gpu=gpu
@@ -72,6 +291,10 @@ class TTS:
             return self.synthesizer.tts_model.speaker_manager.num_speakers > 1
         return False
 
+    @property
+    def is_coqui_studio(self):
+        return "coqui_studio" in self.model_name
+
     @property
     def is_multi_lingual(self):
         if hasattr(self.synthesizer.tts_model, "language_manager") and self.synthesizer.tts_model.language_manager:
@@ -96,8 +319,14 @@ class TTS:
 
     @staticmethod
    def list_models():
+        try:
+            csapi = CS_API()
+            models = csapi.list_speakers_as_tts_models()
+        except ValueError as e:
+            print(e)
+            models = []
         manager = ModelManager(models_file=TTS.get_models_file_path(), progress_bar=False, verbose=False)
-        return manager.list_tts_models()
+        return manager.list_tts_models() + models
 
     def download_model_by_name(self, model_name: str):
         model_path, config_path, model_item = self.manager.download_model(model_name)
@@ -125,22 +354,28 @@ class TTS:
 
         TODO: Add tests
         """
-        model_path, config_path, vocoder_path, vocoder_config_path = self.download_model_by_name(model_name)
+        self.synthesizer = None
+        self.csapi = None
+        self.model_name = model_name
+
+        if "coqui_studio" in model_name:
+            self.csapi = CS_API()
+        else:
+            model_path, config_path, vocoder_path, vocoder_config_path = self.download_model_by_name(model_name)
 
             # init synthesizer
             # None values are fetched from the model
             self.synthesizer = Synthesizer(
                 tts_checkpoint=model_path,
                 tts_config_path=config_path,
                 tts_speakers_file=None,
                 tts_languages_file=None,
                 vocoder_checkpoint=vocoder_path,
                 vocoder_config=vocoder_config_path,
                 encoder_checkpoint=None,
                 encoder_config=None,
                 use_cuda=gpu,
             )
 
     def load_tts_model_by_path(
         self, model_path: str, config_path: str, vocoder_path: str = None, vocoder_config: str = None, gpu: bool = False
@@ -167,17 +402,88 @@ class TTS:
             use_cuda=gpu,
         )
 
-    def _check_arguments(self, speaker: str = None, language: str = None, speaker_wav: str = None):
-        if self.is_multi_speaker and (speaker is None and speaker_wav is None):
-            raise ValueError("Model is multi-speaker but no speaker is provided.")
-        if self.is_multi_lingual and language is None:
-            raise ValueError("Model is multi-lingual but no language is provided.")
-        if not self.is_multi_speaker and speaker is not None:
-            raise ValueError("Model is not multi-speaker but speaker is provided.")
-        if not self.is_multi_lingual and language is not None:
-            raise ValueError("Model is not multi-lingual but language is provided.")
-
-    def tts(self, text: str, speaker: str = None, language: str = None, speaker_wav: str = None):
+    def _check_arguments(
+        self,
+        speaker: str = None,
+        language: str = None,
+        speaker_wav: str = None,
+        emotion: str = None,
+        speed: float = None,
+    ) -> None:
+        """Check if the arguments are valid for the model."""
+        if not self.is_coqui_studio:
+            # check for the coqui tts models
+            if self.is_multi_speaker and (speaker is None and speaker_wav is None):
+                raise ValueError("Model is multi-speaker but no `speaker` is provided.")
+            if self.is_multi_lingual and language is None:
+                raise ValueError("Model is multi-lingual but no `language` is provided.")
+            if not self.is_multi_speaker and speaker is not None:
+                raise ValueError("Model is not multi-speaker but `speaker` is provided.")
+            if not self.is_multi_lingual and language is not None:
+                raise ValueError("Model is not multi-lingual but `language` is provided.")
+            if emotion is not None or speed is not None:
+                raise ValueError("Emotion and speed can only be used with Coqui Studio models.")
+        else:
+            if emotion is None:
+                emotion = "Neutral"
+            if speed is None:
+                speed = 1.0
+            # check for the studio models
+            if speaker_wav is not None:
+                raise ValueError("Coqui Studio models do not support `speaker_wav` argument.")
+            if speaker is not None:
+                raise ValueError("Coqui Studio models do not support `speaker` argument.")
+            if language is not None and language != "en":
+                raise ValueError("Coqui Studio models currently support only `language=en` argument.")
+            if emotion not in ["Neutral", "Happy", "Sad", "Angry", "Dull"]:
+                raise ValueError(f"Emotion - `{emotion}` - must be one of `Neutral`, `Happy`, `Sad`, `Angry`, `Dull`.")
+
+    def tts_coqui_studio(
+        self,
+        text: str,
+        speaker_name: str = None,
+        language: str = None,
+        emotion: str = "Neutral",
+        speed: float = 1.0,
+        file_path: str = None,
+    ):
+        """Convert text to speech using Coqui Studio models. Use the `CS_API` class if you are only interested in the API.
+
+        Args:
+            text (str):
+                Input text to synthesize.
+            speaker_name (str, optional):
+                Speaker name from Coqui Studio. Defaults to None.
+            language (str, optional):
+                Language code. Coqui Studio currently supports only English. Defaults to None.
+            emotion (str, optional):
+                Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull". Defaults to "Neutral".
+            speed (float, optional):
+                Speed of the speech. Defaults to 1.0.
+            file_path (str, optional):
+                Path to save the output file. When None it returns the `np.ndarray` of the waveform. Defaults to None.
+        """
+        speaker_name = self.model_name.split("/")[2]
+        if file_path is not None:
+            return self.csapi.tts_to_file(
+                text=text,
+                speaker_name=speaker_name,
+                language=language,
+                speed=speed,
+                emotion=emotion,
+                file_path=file_path,
+            )
+        return self.csapi.tts(text=text, speaker_name=speaker_name, language=language, speed=speed, emotion=emotion)[0]
+
+    def tts(
+        self,
+        text: str,
+        speaker: str = None,
+        language: str = None,
+        speaker_wav: str = None,
+        emotion: str = None,
+        speed: float = None,
+    ):
         """Convert text to speech.
 
         Args:
@@ -192,8 +498,17 @@ class TTS:
             speaker_wav (str, optional):
                 Path to a reference wav file to use for voice cloning with supporting models like YourTTS.
                 Defaults to None.
+            emotion (str, optional):
+                Emotion to use for 🐸Coqui Studio models. If None, Studio models use "Neutral". Defaults to None.
+            speed (float, optional):
+                Speed factor to use for 🐸Coqui Studio models, between 0.0 and 2.0. If None, Studio models use 1.0.
+                Defaults to None.
         """
-        self._check_arguments(speaker=speaker, language=language, speaker_wav=speaker_wav)
+        self._check_arguments(speaker=speaker, language=language, speaker_wav=speaker_wav, emotion=emotion, speed=speed)
+        if self.csapi is not None:
+            return self.tts_coqui_studio(
+                text=text, speaker_name=speaker, language=language, emotion=emotion, speed=speed
+            )
+
         wav = self.synthesizer.tts(
             text=text,
@@ -213,6 +528,8 @@ class TTS:
         speaker: str = None,
         language: str = None,
         speaker_wav: str = None,
+        emotion: str = "Neutral",
+        speed: float = 1.0,
         file_path: str = "output.wav",
     ):
         """Convert text to speech.
@@ -229,11 +546,22 @@ class TTS:
             speaker_wav (str, optional):
                 Path to a reference wav file to use for voice cloning with supporting models like YourTTS.
                 Defaults to None.
+            emotion (str, optional):
+                Emotion to use for 🐸Coqui Studio models. Defaults to "Neutral".
+            speed (float, optional):
+                Speed factor to use for 🐸Coqui Studio models, between 0.0 and 2.0. Defaults to 1.0.
             file_path (str, optional):
                 Output file path. Defaults to "output.wav".
         """
+        self._check_arguments(speaker=speaker, language=language, speaker_wav=speaker_wav)
+
+        if self.csapi is not None:
+            return self.tts_coqui_studio(
+                text=text, speaker_name=speaker, language=language, emotion=emotion, speed=speed, file_path=file_path
+            )
         wav = self.tts(text=text, speaker=speaker, language=language, speaker_wav=speaker_wav)
         self.synthesizer.save_wav(wav=wav, path=file_path)
+        return file_path
 
     def voice_conversion(
         self,
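With these changes, `TTS.tts` transparently routes Studio models through the HTTP API and everything else through the local synthesizer, while `_check_arguments` enforces which keyword arguments each path accepts. A small sketch of the resulting behavior (model name taken from the README above; the error text is defined in `_check_arguments`):

```python
from TTS.api import TTS

tts = TTS(model_name="tts_models/de/thorsten/tacotron2-DDC", progress_bar=False)
# Local (non-Studio) model: `emotion`/`speed` are rejected by _check_arguments.
try:
    tts.tts(text="Hallo.", emotion="Happy", speed=1.2)
except ValueError as err:
    print(err)  # "Emotion and speed can only be used with Coqui Studio models."
```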
@@ -290,7 +290,14 @@ class Synthesizer(object):
             language_id = list(self.tts_model.language_manager.name_to_id.values())[0]
 
         elif language_name and isinstance(language_name, str):
-            language_id = self.tts_model.language_manager.name_to_id[language_name]
+            try:
+                language_id = self.tts_model.language_manager.name_to_id[language_name]
+            except KeyError as e:
+                raise ValueError(
+                    f" [!] Looks like you are using a multi-lingual model. "
+                    f"Language {language_name} is not in the available languages: "
+                    f"{self.tts_model.language_manager.name_to_id.keys()}."
+                ) from e
 
         elif not language_name:
             raise ValueError(
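This is the "warn when lang is not avail" part of the commit: an unknown language code now fails with a readable `ValueError` instead of a bare `KeyError`. An illustrative (intentionally failing) call, using YourTTS, whose supported languages per the README examples are `en`, `fr-fr`, and `pt-br`:

```python
from TTS.api import TTS

tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False)
# "de" is not a YourTTS language, so the Synthesizer now raises a ValueError
# that lists the languages the model actually supports.
tts.tts_to_file("Hello.", speaker_wav="my/cloning/audio.wav", language="de", file_path="out.wav")
```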
@@ -109,7 +109,7 @@ tts-server --model_name "<type>/<language>/<dataset>/<model_name>" \
     --vocoder_name "<type>/<language>/<dataset>/<model_name>"
 ```
 
-## Python API
+## Python 🐸TTS API
 
 You can run a multi-speaker and multi-lingual model in Python as
 
@@ -163,4 +163,34 @@ tts.tts_with_vc_to_file(
     speaker_wav="target/speaker.wav",
     file_path="output.wav"
 )
 ```
+
+Example text to speech using [🐸Coqui Studio](https://coqui.ai) models. You can use all of your available speakers in the studio.
+A [🐸Coqui Studio](https://coqui.ai) API token is required. You can get it from the [account page](https://coqui.ai/account).
+You should set the `COQUI_STUDIO_TOKEN` environment variable to use the API token.
+
+```python
+# If you have a valid API token set, you will see the studio speakers as separate models in the list.
+# The name format is coqui_studio/en/<studio_speaker_name>/coqui_studio
+models = TTS().list_models()
+# Init TTS with the target studio speaker
+tts = TTS(model_name="coqui_studio/en/Torcull Diarmuid/coqui_studio", progress_bar=False, gpu=False)
+# Run TTS
+tts.tts_to_file(text="This is a test.", file_path=OUTPUT_PATH)
+# Run TTS with emotion and speed control
+tts.tts_to_file(text="This is a test.", file_path=OUTPUT_PATH, emotion="Happy", speed=1.5)
+```
+
+If you just need 🐸 Coqui Studio speakers, you can use `CS_API`. It is a wrapper around the 🐸 Coqui Studio API.
+
+```python
+from TTS.api import CS_API
+
+# Init 🐸 Coqui Studio API
+# You can either set the API token as an environment variable `COQUI_STUDIO_TOKEN` or pass it as an argument.
+api = CS_API(api_token="<token>")
+api.speakers
+api.emotions
+api.list_speakers()
+api.list_voices()
+wav, sample_rate = api.tts(text="This is a test.", speaker_name=api.speakers[0].name, emotion="Happy", speed=1.5)
+```
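The `api.speakers` entries in the snippet above are `Speaker` objects, the thin dict-to-attribute wrapper added in `TTS/api.py`. A small illustration (the field values below are made up; real fields come from the `/api/v2/speakers` response):

```python
from TTS.api import Speaker

s = Speaker({"id": "spk-001", "name": "Claribel Dervla"})
print(s.name)      # "Claribel Dervla" - dict keys become attributes
print(s.is_voice)  # False - True only for user-created voices
print(s)           # __repr__ dumps the underlying __dict__
```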
@@ -2,12 +2,41 @@ import os
 import unittest
 
 from tests import get_tests_data_path, get_tests_output_path
-from TTS.api import TTS
+from TTS.api import CS_API, TTS
 
 OUTPUT_PATH = os.path.join(get_tests_output_path(), "test_python_api.wav")
 cloning_test_wav_path = os.path.join(get_tests_data_path(), "ljspeech/wavs/LJ001-0028.wav")
 
 
+class CS_APITest(unittest.TestCase):
+    def test_speakers(self):
+        tts = CS_API()
+        self.assertGreater(len(tts.speakers), 1)
+
+    def test_emotions(self):
+        tts = CS_API()
+        self.assertGreater(len(tts.emotions), 1)
+
+    def test_list_calls(self):
+        tts = CS_API()
+        self.assertGreater(len(tts.list_voices()), 1)
+        self.assertGreater(len(tts.list_speakers()), 1)
+        self.assertGreater(len(tts.list_all_speakers()), 1)
+        self.assertGreater(len(tts.list_speakers_as_tts_models()), 1)
+
+    def test_name_to_speaker(self):
+        tts = CS_API()
+        speaker_name = tts.list_speakers_as_tts_models()[0].split("/")[2]
+        speaker = tts.name_to_speaker(speaker_name)
+        self.assertEqual(speaker.name, speaker_name)
+
+    def test_tts(self):
+        tts = CS_API()
+        wav, sr = tts.tts(text="This is a test.", speaker_name=tts.list_speakers()[0].name)
+        self.assertEqual(sr, 44100)
+        self.assertGreater(len(wav), 1)
+
+
 class TTSTest(unittest.TestCase):
     def test_single_speaker_model(self):
         tts = TTS(model_name="tts_models/de/thorsten/tacotron2-DDC", progress_bar=False, gpu=False)
@@ -26,6 +55,30 @@ class TTSTest(unittest.TestCase):
         self.assertIsNone(tts.speakers)
         self.assertIsNone(tts.languages)
 
+    def test_studio_model(self):
+        tts = TTS(model_name="coqui_studio/en/Torcull Diarmuid/coqui_studio")
+        tts.tts_to_file(text="This is a test.")
+
+        # check that speed > 2.0 raises an error
+        raised_error = False
+        try:
+            _ = tts.tts(text="This is a test.", speed=4.0, emotion="Sad")  # should raise error with speed > 2.0
+        except ValueError:
+            raised_error = True
+        self.assertTrue(raised_error)
+
+        # check that an invalid emotion raises an error
+        raised_error = False
+        try:
+            _ = tts.tts(text="This is a test.", speed=2.0, emotion="No Emo")  # should raise error with invalid emotion
+        except ValueError:
+            raised_error = True
+        self.assertTrue(raised_error)
+
+        # check a valid call
+        wav = tts.tts(text="This is a test.", speed=2.0, emotion="Sad")
+        self.assertGreater(len(wav), 0)
+
     def test_multi_speaker_multi_lingual_model(self):
         tts = TTS()
         tts.load_tts_model_by_name(tts.models[0])  # YourTTS
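These tests hit the live Studio API, so they only pass when `COQUI_STUDIO_TOKEN` is available (as wired into the CI workflow above). A sketch, not part of the commit, of one way to make such tests skip gracefully in local runs without a token (class name is hypothetical):

```python
import os
import unittest

# Skip the whole suite when no Studio token is configured, instead of
# failing inside CS_API._check_token().
@unittest.skipIf(not os.environ.get("COQUI_STUDIO_TOKEN"), "COQUI_STUDIO_TOKEN is not set")
class StudioOnlyTest(unittest.TestCase):
    def test_placeholder(self):
        self.assertTrue(True)
```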