mirror of https://github.com/coqui-ai/TTS.git
Update Studio API for XTTS (#2861)
* Update Studio API for XTTS
* Update the docs
* Update README.md
* Update README.md
* Update README

parent 37b558ccb9
commit 3a104d5c49
README.md (52 lines changed)

@@ -204,9 +204,11 @@ tts = TTS(model_name)
 wav = tts.tts("This is a test! This is also a test!!", speaker=tts.speakers[0], language=tts.languages[0])
 # Text to speech to a file
 tts.tts_to_file(text="Hello world!", speaker=tts.speakers[0], language=tts.languages[0], file_path="output.wav")
+```

-# Running a single speaker model
+#### Running a single speaker model

+```python
 # Init TTS with the target model name
 tts = TTS(model_name="tts_models/de/thorsten/tacotron2-DDC", progress_bar=False, gpu=False)
 # Run TTS

@@ -218,15 +220,21 @@ tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_
 tts.tts_to_file("This is voice cloning.", speaker_wav="my/cloning/audio.wav", language="en", file_path="output.wav")
 tts.tts_to_file("C'est le clonage de la voix.", speaker_wav="my/cloning/audio.wav", language="fr-fr", file_path="output.wav")
 tts.tts_to_file("Isso é clonagem de voz.", speaker_wav="my/cloning/audio.wav", language="pt-br", file_path="output.wav")
+```

+#### Example voice conversion

-# Example voice conversion converting speaker of the `source_wav` to the speaker of the `target_wav`
+Converting the voice in `source_wav` to the voice of `target_wav`

+```python
 tts = TTS(model_name="voice_conversion_models/multilingual/vctk/freevc24", progress_bar=False, gpu=True)
 tts.voice_conversion_to_file(source_wav="my/source.wav", target_wav="my/target.wav", file_path="output.wav")
+```

-# Example voice cloning by a single speaker TTS model combining with the voice conversion model. This way, you can
-# clone voices by using any model in 🐸TTS.
+#### Example voice cloning together with the voice conversion model.
+This way, you can clone voices by using any model in 🐸TTS.

+```python

 tts = TTS("tts_models/de/thorsten/tacotron2-DDC")
 tts.tts_with_vc_to_file(

@@ -234,29 +242,43 @@ tts.tts_with_vc_to_file(
     speaker_wav="target/speaker.wav",
     file_path="output.wav"
 )
+```

-# Example text to speech using [🐸Coqui Studio](https://coqui.ai) models.
+#### Example using [🐸Coqui Studio](https://coqui.ai) voices.
+You access all of your cloned voices and built-in speakers in [🐸Coqui Studio](https://coqui.ai).
+To do this, you'll need an API token, which you can obtain from the [account page](https://coqui.ai/account).
+After obtaining the API token, you'll need to configure the COQUI_STUDIO_TOKEN environment variable.

-# You can use all of your available speakers in the studio.
-# [🐸Coqui Studio](https://coqui.ai) API token is required. You can get it from the [account page](https://coqui.ai/account).
-# You should set the `COQUI_STUDIO_TOKEN` environment variable to use the API token.
+Once you have a valid API token in place, the studio speakers will be displayed as distinct models within the list.
+These models will follow the naming convention `coqui_studio/en/<studio_speaker_name>/coqui_studio`

-# If you have a valid API token set you will see the studio speakers as separate models in the list.
-# The name format is coqui_studio/en/<studio_speaker_name>/coqui_studio
-models = TTS().list_models()
+```python
+# XTTS model
+models = TTS(cs_api_model="XTTS").list_models()
 # Init TTS with the target studio speaker
 tts = TTS(model_name="coqui_studio/en/Torcull Diarmuid/coqui_studio", progress_bar=False, gpu=False)
 # Run TTS
 tts.tts_to_file(text="This is a test.", file_path=OUTPUT_PATH)

+# V1 model
+models = TTS(cs_api_model="V1").list_models()
 # Run TTS with emotion and speed control
+# Emotion control only works with V1 model
 tts.tts_to_file(text="This is a test.", file_path=OUTPUT_PATH, emotion="Happy", speed=1.5)

+# XTTS-multilingual
+models = TTS(cs_api_model="XTTS-multilingual").list_models()
+# Run TTS with emotion and speed control
+# Emotion control only works with V1 model
+tts.tts_to_file(text="Das ist ein Test.", file_path=OUTPUT_PATH, language="de", speed=1.0)
+```

-#Example text to speech using **Fairseq models in ~1100 languages** 🤯.
-#For these models use the following name format: `tts_models/<lang-iso_code>/fairseq/vits`.
-#You can find the list of language ISO codes [here](https://dl.fbaipublicfiles.com/mms/tts/all-tts-languages.html) and learn about the Fairseq models [here](https://github.com/facebookresearch/fairseq/tree/main/examples/mms).
+#### Example text to speech using **Fairseq models in ~1100 languages** 🤯.
+For Fairseq models, use the following name format: `tts_models/<lang-iso_code>/fairseq/vits`.
+You can find the language ISO codes [here](https://dl.fbaipublicfiles.com/mms/tts/all-tts-languages.html)
+and learn about the Fairseq models [here](https://github.com/facebookresearch/fairseq/tree/main/examples/mms).

+```python
 # TTS with on the fly voice conversion
 api = TTS("tts_models/deu/fairseq/vits")
 api.tts_with_vc_to_file(
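The updated README walks through the new Studio setup: set `COQUI_STUDIO_TOKEN`, and the studio speakers then appear as regular entries in the model list. A minimal sketch of that flow (the token value is a placeholder, and the printed names depend on your account):

    import os

    from TTS.api import TTS

    # Placeholder token; without a valid one the studio models are skipped.
    os.environ["COQUI_STUDIO_TOKEN"] = "<token>"

    # `cs_api_model` picks which Studio backend the listed speakers map to.
    models = TTS(cs_api_model="XTTS").list_models()
    print([m for m in models if m.startswith("coqui_studio/")][:5])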
TTS/api.py (249 lines changed)

@@ -1,234 +1,15 @@
-import http.client
-import json
-import os
 import tempfile
-import urllib.request
 from pathlib import Path
-from typing import Tuple, Union
+from typing import Union

 import numpy as np
-import requests
-from scipy.io import wavfile

+from TTS.cs_api import CS_API
 from TTS.utils.audio.numpy_transforms import save_wav
 from TTS.utils.manage import ModelManager
 from TTS.utils.synthesizer import Synthesizer


-class Speaker(object):
-    """Convert dict to object."""
-
-    def __init__(self, d, is_voice=False):
-        self.is_voice = is_voice
-        for k, v in d.items():
-            if isinstance(k, (list, tuple)):
-                setattr(self, k, [Speaker(x) if isinstance(x, dict) else x for x in v])
-            else:
-                setattr(self, k, Speaker(v) if isinstance(v, dict) else v)
-
-    def __repr__(self):
-        return str(self.__dict__)
-
-
-class CS_API:
-    """🐸Coqui Studio API Wrapper.
-
-    🐸Coqui Studio is the most advanced voice generation platform. You can generate new voices by voice cloning, voice
-    interpolation, or our unique prompt to voice technology. It also provides a set of built-in voices with different
-    characteristics. You can use these voices to generate new audio files or use them in your applications.
-    You can use all the built-in and your own 🐸Coqui Studio speakers with this API with an API token.
-    You can signup to 🐸Coqui Studio from https://app.coqui.ai/auth/signup and get an API token from
-    https://app.coqui.ai/account. We can either enter the token as an environment variable as
-    `export COQUI_STUDIO_TOKEN=<token>` or pass it as `CS_API(api_token=<toke>)`.
-    Visit https://app.coqui.ai/api for more information.
-
-    Example listing all available speakers:
-        >>> from TTS.api import CS_API
-        >>> tts = CS_API()
-        >>> tts.speakers
-
-    Example listing all emotions:
-        >>> from TTS.api import CS_API
-        >>> tts = CS_API()
-        >>> tts.emotions
-
-    Example with a built-in 🐸 speaker:
-        >>> from TTS.api import CS_API
-        >>> tts = CS_API()
-        >>> wav, sr = api.tts("Hello world", speaker_name="Claribel Dervla")
-        >>> filepath = tts.tts_to_file(text="Hello world!", speaker_name=tts.speakers[0].name, file_path="output.wav")
-    """
-
-    def __init__(self, api_token=None):
-        self.api_token = api_token
-        self.api_prefix = "/api/v2"
-        self.headers = None
-        self._speakers = None
-        self._check_token()
-
-    @staticmethod
-    def ping_api():
-        URL = "https://coqui.gateway.scarf.sh/tts/api"
-        _ = requests.get(URL)
-
-    @property
-    def speakers(self):
-        if self._speakers is None:
-            self._speakers = self.list_all_speakers()
-        return self._speakers
-
-    @property
-    def emotions(self):
-        """Return a list of available emotions.
-
-        TODO: Get this from the API endpoint.
-        """
-        return ["Neutral", "Happy", "Sad", "Angry", "Dull"]
-
-    def _check_token(self):
-        if self.api_token is None:
-            self.api_token = os.environ.get("COQUI_STUDIO_TOKEN")
-            self.headers = {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_token}"}
-        if not self.api_token:
-            raise ValueError(
-                "No API token found for 🐸Coqui Studio voices - https://coqui.ai \n"
-                "Visit 🔗https://app.coqui.ai/account to get one.\n"
-                "Set it as an environment variable `export COQUI_STUDIO_TOKEN=<token>`\n"
-                ""
-            )
-
-    def list_all_speakers(self):
-        """Return both built-in Coqui Studio speakers and custom voices created by the user."""
-        return self.list_speakers() + self.list_voices()
-
-    def list_speakers(self):
-        """List built-in Coqui Studio speakers."""
-        self._check_token()
-        conn = http.client.HTTPSConnection("app.coqui.ai")
-        conn.request("GET", f"{self.api_prefix}/speakers?per_page=100", headers=self.headers)
-        res = conn.getresponse()
-        data = res.read()
-        return [Speaker(s) for s in json.loads(data)["result"]]
-
-    def list_voices(self):
-        """List custom voices created by the user."""
-        conn = http.client.HTTPSConnection("app.coqui.ai")
-        conn.request("GET", f"{self.api_prefix}/voices", headers=self.headers)
-        res = conn.getresponse()
-        data = res.read()
-        return [Speaker(s, True) for s in json.loads(data)["result"]]
-
-    def list_speakers_as_tts_models(self):
-        """List speakers in ModelManager format."""
-        models = []
-        for speaker in self.speakers:
-            model = f"coqui_studio/en/{speaker.name}/coqui_studio"
-            models.append(model)
-        return models
-
-    def name_to_speaker(self, name):
-        for speaker in self.speakers:
-            if speaker.name == name:
-                return speaker
-        raise ValueError(f"Speaker {name} not found in {self.speakers}")
-
-    def id_to_speaker(self, speaker_id):
-        for speaker in self.speakers:
-            if speaker.id == speaker_id:
-                return speaker
-        raise ValueError(f"Speaker {speaker_id} not found.")
-
-    @staticmethod
-    def url_to_np(url):
-        tmp_file, _ = urllib.request.urlretrieve(url)
-        rate, data = wavfile.read(tmp_file)
-        return data, rate
-
-    @staticmethod
-    def _create_payload(text, speaker, emotion, speed):
-        payload = {}
-        if speaker.is_voice:
-            payload["voice_id"] = speaker.id
-        else:
-            payload["speaker_id"] = speaker.id
-        payload.update(
-            {
-                "emotion": emotion,
-                "name": speaker.name,
-                "text": text,
-                "speed": speed,
-            }
-        )
-        return payload
-
-    def tts(
-        self,
-        text: str,
-        speaker_name: str = None,
-        speaker_id=None,
-        emotion="Neutral",
-        speed=1.0,
-        language=None,  # pylint: disable=unused-argument
-    ) -> Tuple[np.ndarray, int]:
-        """Synthesize speech from text.
-
-        Args:
-            text (str): Text to synthesize.
-            speaker_name (str): Name of the speaker. You can get the list of speakers with `list_speakers()` and
-                voices (user generated speakers) with `list_voices()`.
-            speaker_id (str): Speaker ID. If None, the speaker name is used.
-            emotion (str): Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull".
-            speed (float): Speed of the speech. 1.0 is normal speed.
-            language (str): Language of the text. If None, the default language of the speaker is used.
-        """
-        self._check_token()
-        self.ping_api()
-        if speaker_name is None and speaker_id is None:
-            raise ValueError(" [!] Please provide either a `speaker_name` or a `speaker_id`.")
-        if speaker_id is None:
-            speaker = self.name_to_speaker(speaker_name)
-        else:
-            speaker = self.id_to_speaker(speaker_id)
-        conn = http.client.HTTPSConnection("app.coqui.ai")
-        payload = self._create_payload(text, speaker, emotion, speed)
-        conn.request("POST", "/api/v2/samples", json.dumps(payload), self.headers)
-        res = conn.getresponse()
-        data = res.read()
-        try:
-            wav, sr = self.url_to_np(json.loads(data)["audio_url"])
-        except KeyError as e:
-            raise ValueError(f" [!] 🐸 API returned error: {data}") from e
-        return wav, sr
-
-    def tts_to_file(
-        self,
-        text: str,
-        speaker_name: str,
-        speaker_id=None,
-        emotion="Neutral",
-        speed=1.0,
-        language=None,
-        file_path: str = None,
-    ) -> str:
-        """Synthesize speech from text and save it to a file.
-
-        Args:
-            text (str): Text to synthesize.
-            speaker_name (str): Name of the speaker. You can get the list of speakers with `list_speakers()` and
-                voices (user generated speakers) with `list_voices()`.
-            speaker_id (str): Speaker ID. If None, the speaker name is used.
-            emotion (str): Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull".
-            speed (float): Speed of the speech. 1.0 is normal speed.
-            language (str): Language of the text. If None, the default language of the speaker is used.
-            file_path (str): Path to save the file. If None, a temporary file is created.
-        """
-        if file_path is None:
-            file_path = tempfile.mktemp(".wav")
-        wav, sr = self.tts(text, speaker_name, speaker_id, emotion, speed, language)
-        wavfile.write(file_path, sr, wav)
-        return file_path


 class TTS:
     """TODO: Add voice conversion and Capacitron support."""

@@ -240,6 +21,7 @@ class TTS:
         vocoder_path: str = None,
         vocoder_config_path: str = None,
         progress_bar: bool = True,
+        cs_api_model: str = "XTTS",
         gpu=False,
     ):
         """🐸TTS python interface that allows to load and use the released models.

@@ -275,6 +57,9 @@ class TTS:
             vocoder_path (str, optional): Path to the vocoder checkpoint. Defaults to None.
             vocoder_config_path (str, optional): Path to the vocoder config. Defaults to None.
             progress_bar (bool, optional): Whether to print a progress bar while downloading a model. Defaults to True.
+            cs_api_model (str, optional): Name of the model to use for the Coqui Studio API. Available models are
+                "XTTS", "XTTS-multilingual", "V1". You can also use `TTS.cs_api.CS_API` for more control.
+                Defaults to "XTTS".
             gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
         """
         self.manager = ModelManager(models_file=self.get_models_file_path(), progress_bar=progress_bar, verbose=False)

@@ -282,6 +67,7 @@ class TTS:
         self.synthesizer = None
         self.voice_converter = None
         self.csapi = None
+        self.cs_api_model = cs_api_model
         self.model_name = None

         if model_name is not None:

@@ -333,10 +119,9 @@ class TTS:
     def get_models_file_path():
         return Path(__file__).parent / ".models.json"

-    @staticmethod
-    def list_models():
+    def list_models(self):
         try:
-            csapi = CS_API()
+            csapi = CS_API(model=self.cs_api_model)
             models = csapi.list_speakers_as_tts_models()
         except ValueError as e:
             print(e)

@@ -468,7 +253,7 @@ class TTS:
         text: str,
         speaker_name: str = None,
         language: str = None,
-        emotion: str = "Neutral",
+        emotion: str = None,
         speed: float = 1.0,
         file_path: str = None,
     ) -> Union[np.ndarray, str]:

@@ -479,10 +264,11 @@ class TTS:
                 Input text to synthesize.
             speaker_name (str, optional):
                 Speaker name from Coqui Studio. Defaults to None.
-            language (str, optional):
-                Language code. Coqui Studio currently supports only English. Defaults to None.
+            language (str): Language of the text. If None, the default language of the speaker is used. Language is only
+                supported by `XTTS-multilang` model. Currently supports en, de, es, fr, it, pt, pl. Defaults to "en".
             emotion (str, optional):
-                Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull". Defaults to "Neutral".
+                Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull". Emotions are only available
+                with "V1" model. Defaults to None.
             speed (float, optional):
                 Speed of the speech. Defaults to 1.0.
             file_path (str, optional):

@@ -521,9 +307,8 @@ class TTS:
             speaker (str, optional):
                 Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by
                 `tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None.
-            language (str, optional):
-                Language code for multi-lingual models. You can check whether loaded model is multi-lingual
-                `tts.is_multi_lingual` and list available languages by `tts.languages`. Defaults to None.
+            language (str): Language of the text. If None, the default language of the speaker is used. Language is only
+                supported by `XTTS-multilang` model. Currently supports en, de, es, fr, it, pt, pl. Defaults to "en".
             speaker_wav (str, optional):
                 Path to a reference wav file to use for voice cloning with supporting models like YourTTS.
                 Defaults to None.

@@ -559,7 +344,7 @@ class TTS:
         speaker: str = None,
         language: str = None,
         speaker_wav: str = None,
-        emotion: str = "Neutral",
+        emotion: str = None,
         speed: float = 1.0,
         file_path: str = "output.wav",
         **kwargs,
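Taken together, the TTS/api.py changes store `cs_api_model` on the TTS instance and forward it to CS_API, which is why `list_models()` becomes an instance method instead of a static one. A short sketch of the resulting call path (the speaker name and output path are illustrative, following the README examples above):

    from TTS.api import TTS

    # The stored `cs_api_model` decides which Studio backend serves the request.
    tts = TTS(model_name="coqui_studio/en/Torcull Diarmuid/coqui_studio",
              cs_api_model="V1", progress_bar=False, gpu=False)

    # `emotion` now defaults to None and is only honored by the V1 backend.
    tts.tts_to_file(text="This is a test.", emotion="Happy", speed=1.5,
                    file_path="output.wav")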
TTS/bin/synthesize.py

@@ -185,11 +185,22 @@ If you don't specify any models, then it uses LJSpeech based English model.
     parser.add_argument("--encoder_config_path", type=str, help="Path to speaker encoder config file.", default=None)

     # args for coqui studio
+    parser.add_argument(
+        "--cs_model",
+        type=str,
+        help="Name of the 🐸Coqui Studio model. Available models are `XTTS`, `XTTS-multilingual`, `V1`.",
+    )
     parser.add_argument(
         "--emotion",
         type=str,
-        help="Emotion to condition the model with. Only available for 🐸Coqui Studio models.",
-        default="Neutral",
+        help="Emotion to condition the model with. Only available for 🐸Coqui Studio `V1` model.",
+        default=None,
+    )
+    parser.add_argument(
+        "--language",
+        type=str,
+        help="Language to condition the model with. Only available for 🐸Coqui Studio `XTTS-multilingual` model.",
+        default=None,
     )

     # args for multi-speaker synthesis

@@ -335,8 +346,8 @@ If you don't specify any models, then it uses LJSpeech based English model.
     # CASE3: TTS with coqui studio models
     if "coqui_studio" in args.model_name:
         print(" > Using 🐸Coqui Studio model: ", args.model_name)
-        api = TTS(model_name=args.model_name)
-        api.tts_to_file(text=args.text, emotion=args.emotion, file_path=args.out_path)
+        api = TTS(model_name=args.model_name, cs_api_model=args.cs_model)
+        api.tts_to_file(text=args.text, emotion=args.emotion, file_path=args.out_path, language=args.language)
         print(" > Saving output to ", args.out_path)
         return
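With the new flags wired through, a Studio synthesis call from the shell might look like this (a hypothetical invocation, assuming the `tts` console script installed by the package):

    tts --model_name "coqui_studio/en/Torcull Diarmuid/coqui_studio" \
        --cs_model "V1" --emotion "Happy" \
        --text "This is a test." --out_path output.wav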
TTS/cs_api.py (new file, 338 lines)

import http.client
import json
import os
import tempfile
import urllib.request
from typing import Tuple

import numpy as np
import requests
from scipy.io import wavfile


class Speaker(object):
    """Convert dict to object."""

    def __init__(self, d, is_voice=False):
        self.is_voice = is_voice
        for k, v in d.items():
            if isinstance(k, (list, tuple)):
                setattr(self, k, [Speaker(x) if isinstance(x, dict) else x for x in v])
            else:
                setattr(self, k, Speaker(v) if isinstance(v, dict) else v)

    def __repr__(self):
        return str(self.__dict__)


class CS_API:
    """🐸Coqui Studio API Wrapper.

    🐸Coqui Studio is the most advanced voice generation platform. You can generate new voices by voice cloning, voice
    interpolation, or our unique prompt to voice technology. It also provides a set of built-in voices with different
    characteristics. You can use these voices to generate new audio files or use them in your applications.
    You can use all the built-in and your own 🐸Coqui Studio speakers with this API with an API token.
    You can signup to 🐸Coqui Studio from https://app.coqui.ai/auth/signup and get an API token from
    https://app.coqui.ai/account. We can either enter the token as an environment variable as
    `export COQUI_STUDIO_TOKEN=<token>` or pass it as `CS_API(api_token=<token>)`.
    Visit https://app.coqui.ai/api for more information.

    Args:
        api_token (str): 🐸Coqui Studio API token. If not provided, it will be read from the environment variable
            `COQUI_STUDIO_TOKEN`.
        model (str): 🐸Coqui Studio model. It can be either `V1`, `XTTS`, or `XTTS-multilang`. Default is `XTTS`.

    Example listing all available speakers:
        >>> from TTS.api import CS_API
        >>> tts = CS_API()
        >>> tts.speakers

    Example listing all emotions:
        >>> # emotions are only available for `V1` model
        >>> from TTS.api import CS_API
        >>> tts = CS_API(model="V1")
        >>> tts.emotions

    Example with a built-in 🐸 speaker:
        >>> from TTS.api import CS_API
        >>> tts = CS_API()
        >>> wav, sr = tts.tts("Hello world", speaker_name=tts.speakers[0].name)
        >>> filepath = tts.tts_to_file(text="Hello world!", speaker_name=tts.speakers[0].name, file_path="output.wav")

    Example with multi-language model:
        >>> from TTS.api import CS_API
        >>> tts = CS_API(model="XTTS-multilang")
        >>> wav, sr = tts.tts("Hello world", speaker_name=tts.speakers[0].name, language="en")
    """

    MODEL_ENDPOINTS = {
        "V1": {
            "list_speakers": "https://app.coqui.ai/api/v2/speakers",
            "synthesize": "https://app.coqui.ai/api/v2/samples",
            "list_voices": "https://app.coqui.ai/api/v2/voices",
        },
        "XTTS": {
            "list_speakers": "https://app.coqui.ai/api/v2/speakers",
            "synthesize": "https://app.coqui.ai/api/v2/samples/xtts/render/",
            "list_voices": "https://app.coqui.ai/api/v2/voices/xtts/",
        },
        "XTTS-multilang": {
            "list_speakers": "https://app.coqui.ai/api/v2/speakers",
            "synthesize": "https://app.coqui.ai/api/v2/samples/multilingual/render/",
            "list_voices": "https://app.coqui.ai/api/v2/voices/xtts/",
        },
    }

    SUPPORTED_LANGUAGES = ["en", "es", "de", "fr", "it", "pt", "pl"]

    def __init__(self, api_token=None, model="XTTS"):
        self.api_token = api_token
        self.model = model
        self.headers = None
        self._speakers = None
        self._check_token()

    @staticmethod
    def ping_api():
        URL = "https://coqui.gateway.scarf.sh/tts/api"
        _ = requests.get(URL)

    @property
    def speakers(self):
        if self._speakers is None:
            self._speakers = self.list_all_speakers()
        return self._speakers

    @property
    def emotions(self):
        """Return a list of available emotions.

        TODO: Get this from the API endpoint.
        """
        if self.model == "V1":
            return ["Neutral", "Happy", "Sad", "Angry", "Dull"]
        else:
            raise ValueError(f"❗ Emotions are not available for {self.model}.")

    def _check_token(self):
        if self.api_token is None:
            self.api_token = os.environ.get("COQUI_STUDIO_TOKEN")
            self.headers = {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_token}"}
        if not self.api_token:
            raise ValueError(
                "No API token found for 🐸Coqui Studio voices - https://coqui.ai \n"
                "Visit 🔗https://app.coqui.ai/account to get one.\n"
                "Set it as an environment variable `export COQUI_STUDIO_TOKEN=<token>`\n"
                ""
            )

    def list_all_speakers(self):
        """Return both built-in Coqui Studio speakers and custom voices created by the user."""
        return self.list_speakers() + self.list_voices()

    def list_speakers(self):
        """List built-in Coqui Studio speakers."""
        self._check_token()
        conn = http.client.HTTPSConnection("app.coqui.ai")
        url = self.MODEL_ENDPOINTS[self.model]["list_speakers"]
        conn.request("GET", f"{url}?per_page=100", headers=self.headers)
        res = conn.getresponse()
        data = res.read()
        return [Speaker(s) for s in json.loads(data)["result"]]

    def list_voices(self):
        """List custom voices created by the user."""
        conn = http.client.HTTPSConnection("app.coqui.ai")
        url = self.MODEL_ENDPOINTS[self.model]["list_voices"]
        conn.request("GET", f"{url}", headers=self.headers)
        res = conn.getresponse()
        data = res.read()
        return [Speaker(s, True) for s in json.loads(data)["result"]]

    def list_speakers_as_tts_models(self):
        """List speakers in ModelManager format."""
        models = []
        for speaker in self.speakers:
            model = f"coqui_studio/multilingual/{speaker.name}/{self.model}"
            models.append(model)
        return models

    def name_to_speaker(self, name):
        for speaker in self.speakers:
            if speaker.name == name:
                return speaker
        raise ValueError(f"Speaker {name} not found in {self.speakers}")

    def id_to_speaker(self, speaker_id):
        for speaker in self.speakers:
            if speaker.id == speaker_id:
                return speaker
        raise ValueError(f"Speaker {speaker_id} not found.")

    @staticmethod
    def url_to_np(url):
        tmp_file, _ = urllib.request.urlretrieve(url)
        rate, data = wavfile.read(tmp_file)
        return data, rate

    @staticmethod
    def _create_payload(model, text, speaker, speed, emotion, language):
        payload = {}
        # if speaker.is_voice:
        payload["voice_id"] = speaker.id
        # else:
        payload["speaker_id"] = speaker.id

        if model == "V1":
            payload.update(
                {
                    "emotion": emotion,
                    "name": speaker.name,
                    "text": text,
                    "speed": speed,
                }
            )
        elif model == "XTTS":
            payload.update(
                {
                    "name": speaker.name,
                    "text": text,
                    "speed": speed,
                }
            )
        elif model == "XTTS-multilang":
            payload.update(
                {
                    "name": speaker.name,
                    "text": text,
                    "speed": speed,
                    "language": language,
                }
            )
        else:
            raise ValueError(f"❗ Unknown model {model}")
        return payload

    def _check_tts_args(self, text, speaker_name, speaker_id, emotion, speed, language):
        assert text is not None, "❗ text is required for V1 model."
        assert speaker_name is not None, "❗ speaker_name is required for V1 model."
        if self.model == "V1":
            if emotion is None:
                emotion = "Neutral"
            assert language is None, "❗ language is not supported for V1 model."
        elif self.model == "XTTS":
            assert emotion is None, f"❗ Emotions are not supported for XTTS model. Use V1 model."
            assert language is None, "❗ Language is not supported for XTTS model. Use XTTS-multilang model."
        elif self.model == "XTTS-multilang":
            assert emotion is None, f"❗ Emotions are not supported for XTTS-multilang model. Use V1 model."
            assert language is not None, "❗ Language is required for XTTS-multilang model."
            assert (
                language in self.SUPPORTED_LANGUAGES
            ), f"❗ Language {language} is not yet supported. Use one of: en, es, de, fr, it, pt, pl"
        return text, speaker_name, speaker_id, emotion, speed, language

    def tts(
        self,
        text: str,
        speaker_name: str = None,
        speaker_id=None,
        emotion=None,
        speed=1.0,
        language=None,  # pylint: disable=unused-argument
    ) -> Tuple[np.ndarray, int]:
        """Synthesize speech from text.

        Args:
            text (str): Text to synthesize.
            speaker_name (str): Name of the speaker. You can get the list of speakers with `list_speakers()` and
                voices (user generated speakers) with `list_voices()`.
            speaker_id (str): Speaker ID. If None, the speaker name is used.
            emotion (str): Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull". Emotions are only
                supported by `V1` model. Defaults to None.
            speed (float): Speed of the speech. 1.0 is normal speed.
            language (str): Language of the text. If None, the default language of the speaker is used. Language is only
                supported by `XTTS-multilang` model. Currently supports en, de, es, fr, it, pt, pl. Defaults to "en".
        """
        self._check_token()
        self.ping_api()

        if speaker_name is None and speaker_id is None:
            raise ValueError(" [!] Please provide either a `speaker_name` or a `speaker_id`.")
        if speaker_id is None:
            speaker = self.name_to_speaker(speaker_name)
        else:
            speaker = self.id_to_speaker(speaker_id)

        text, speaker_name, speaker_id, emotion, speed, language = self._check_tts_args(
            text, speaker_name, speaker_id, emotion, speed, language
        )

        conn = http.client.HTTPSConnection("app.coqui.ai")
        payload = self._create_payload(self.model, text, speaker, speed, emotion, language)
        url = self.MODEL_ENDPOINTS[self.model]["synthesize"]
        conn.request("POST", url, json.dumps(payload), self.headers)
        res = conn.getresponse()
        data = res.read()
        try:
            wav, sr = self.url_to_np(json.loads(data)["audio_url"])
        except KeyError as e:
            raise ValueError(f" [!] 🐸 API returned error: {data}") from e
        return wav, sr

    def tts_to_file(
        self,
        text: str,
        speaker_name: str,
        speaker_id=None,
        emotion=None,
        speed=1.0,
        language=None,
        file_path: str = None,
    ) -> str:
        """Synthesize speech from text and save it to a file.

        Args:
            text (str): Text to synthesize.
            speaker_name (str): Name of the speaker. You can get the list of speakers with `list_speakers()` and
                voices (user generated speakers) with `list_voices()`.
            speaker_id (str): Speaker ID. If None, the speaker name is used.
            emotion (str): Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull".
            speed (float): Speed of the speech. 1.0 is normal speed.
            language (str): Language of the text. If None, the default language of the speaker is used. Language is only
                supported by `XTTS-multilang` model. Currently supports en, de, es, fr, it, pt, pl. Defaults to "en".
            file_path (str): Path to save the file. If None, a temporary file is created.
        """
        if file_path is None:
            file_path = tempfile.mktemp(".wav")
        wav, sr = self.tts(text, speaker_name, speaker_id, emotion, speed, language)
        wavfile.write(file_path, sr, wav)
        return file_path


if __name__ == "__main__":
    import time

    api = CS_API()
    print(api.speakers)
    print(api.list_speakers_as_tts_models())

    ts = time.time()
    wav, sr = api.tts("It took me quite a long time to develop a voice.", speaker_name=api.speakers[0].name)
    print(f" [i] XTTS took {time.time() - ts:.2f}s")

    filepath = api.tts_to_file(text="Hello world!", speaker_name=api.speakers[0].name, file_path="output.wav")

    api = CS_API(model="XTTS-multilang")
    print(api.speakers)

    ts = time.time()
    wav, sr = api.tts(
        "It took me quite a long time to develop a voice.", speaker_name=api.speakers[0].name, language="en"
    )
    print(f" [i] XTTS took {time.time() - ts:.2f}s")

    filepath = api.tts_to_file(
        text="Hello world!", speaker_name=api.speakers[0].name, file_path="output.wav", language="en"
    )
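Because `_create_payload` and `MODEL_ENDPOINTS` are both static/class-level, the per-backend request shapes can be inspected without a token or any network access. A sketch using a hypothetical speaker dict (real ids come from `list_speakers()` / `list_voices()`):

    import json

    from TTS.cs_api import CS_API, Speaker

    # Hypothetical speaker; real ones are fetched from the Studio API.
    speaker = Speaker({"id": "speaker-uuid", "name": "Torcull Diarmuid"})

    for model in ("V1", "XTTS", "XTTS-multilang"):
        payload = CS_API._create_payload(
            model,
            "This is a test.",
            speaker,
            speed=1.0,
            emotion="Happy" if model == "V1" else None,
            language="de" if model == "XTTS-multilang" else None,
        )
        print(model, "->", CS_API.MODEL_ENDPOINTS[model]["synthesize"])
        print(json.dumps(payload, indent=2))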
TTS/tts/models/tortoise.py

@@ -72,7 +72,7 @@ def load_discrete_vocoder_diffuser(
     )


-def format_conditioning(clip, cond_length=132300, device="cuda"):
+def format_conditioning(clip, cond_length=132300, device="cuda", **kwargs):
     """
     Converts the given conditioning signal to a MEL spectrogram and clips it as expected by the models.
     """

@@ -82,7 +82,7 @@ def format_conditioning(clip, cond_length=132300, device="cuda"):
     elif gap > 0:
         rand_start = random.randint(0, gap)
         clip = clip[:, rand_start : rand_start + cond_length]
-    mel_clip = TorchMelSpectrogram()(clip.unsqueeze(0)).squeeze(0)
+    mel_clip = TorchMelSpectrogram(**kwargs)(clip.unsqueeze(0)).squeeze(0)
     return mel_clip.unsqueeze(0).to(device)

@@ -321,6 +321,7 @@ class Tortoise(BaseTTS):

     def __init__(self, config: Coqpit):
         super().__init__(config, ap=None, tokenizer=None)
+        self.mel_norm_path = None
         self.config = config
         self.ar_checkpoint = self.args.ar_checkpoint
         self.diff_checkpoint = self.args.diff_checkpoint  # TODO: check if this is even needed

@@ -429,7 +430,7 @@ class Tortoise(BaseTTS):

             auto_conds = []
             for ls in voice_samples:
-                auto_conds.append(format_conditioning(ls[0], device=self.device))
+                auto_conds.append(format_conditioning(ls[0], device=self.device, mel_norm_file=self.mel_norm_path))
             auto_conds = torch.stack(auto_conds, dim=1)
             with self.temporary_cuda(self.autoregressive) as ar:
                 auto_latent = ar.get_conditioning(auto_conds)

@@ -873,6 +874,7 @@ class Tortoise(BaseTTS):
         diff_path = diff_checkpoint_path or os.path.join(checkpoint_dir, "diffusion_decoder.pth")
         clvp_path = clvp_checkpoint_path or os.path.join(checkpoint_dir, "clvp2.pth")
         vocoder_checkpoint_path = vocoder_checkpoint_path or os.path.join(checkpoint_dir, "vocoder.pth")
+        self.mel_norm_path = os.path.join(checkpoint_dir, "mel_norms.pth")

         if os.path.exists(ar_path):
             # remove keys from the checkpoint that are not in the model
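The net effect of the Tortoise changes is that the `mel_norms.pth` file shipped with a checkpoint is now threaded into conditioning: `format_conditioning` forwards extra kwargs to `TorchMelSpectrogram`, and the model passes `mel_norm_file=self.mel_norm_path`. A minimal sketch of the same plumbing on CPU with random audio, assuming `mel_norm_file=None` is accepted to skip normalization (which mirrors the pre-load default set in `__init__`):

    import torch

    from TTS.tts.models.tortoise import format_conditioning

    clip = torch.randn(1, 132300)  # ~6 s of 22.05 kHz audio, matching cond_length
    cond = format_conditioning(clip, device="cpu", mel_norm_file=None)
    print(cond.shape)  # (1, 80, T) mel conditioning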
TTS/utils/manage.py

@@ -88,7 +88,7 @@ class ModelManager(object):

     def _list_models(self, model_type, model_count=0):
         if self.verbose:
-            print(" Name format: type/language/dataset/model")
+            print("\n Name format: type/language/dataset/model")
         model_list = []
         for lang in self.models_dict[model_type]:
             for dataset in self.models_dict[model_type][lang]:
docs/source/inference.md

@@ -191,9 +191,25 @@ from TTS.api import CS_API

 # Init 🐸 Coqui Studio API
 # you can either set the API token as an environment variable `COQUI_STUDIO_TOKEN` or pass it as an argument.
-api = CS_API(api_token=<token>)
-api.speakers
-api.emotions
+
+# XTTS - Best quality and life-like speech in EN
+api = CS_API(api_token=<token>, model="XTTS")
+api.speakers  # all the speakers are available with all the models.
+api.list_speakers()
+api.list_voices()
+wav, sample_rate = api.tts(text="This is a test.", speaker_name=api.speakers[0].name, speed=1.5)
+
+# XTTS-multilingual - Multilingual XTTS with [en, de, es, fr, it, pt, ...] (more langs coming soon)
+api = CS_API(api_token=<token>, model="XTTS-multilingual")
+api.speakers
+api.list_speakers()
+api.list_voices()
+wav, sample_rate = api.tts(text="This is a test.", speaker_name=api.speakers[0].name, speed=1.5, language="en")
+
+# V1 - Fast and lightweight TTS in EN with emotion control.
+api = CS_API(api_token=<token>, model="V1")
+api.speakers
+api.emotions  # emotions are only for the V1 model.
 api.list_speakers()
 api.list_voices()
 wav, sample_rate = api.tts(text="This is a test.", speaker_name=api.speakers[0].name, emotion="Happy", speed=1.5)
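The docs now show one block per backend, and the backend also enforces which arguments are legal via `_check_tts_args` in TTS/cs_api.py. A sketch of those constraints (requires a valid token, since looking up a speaker hits the API; the failing calls are shown as comments because each would raise):

    from TTS.cs_api import CS_API

    api = CS_API(api_token="<token>", model="XTTS-multilang")  # placeholder token
    name = api.speakers[0].name

    api.tts("Das ist ein Test.", speaker_name=name, language="de")  # ok
    # api.tts("...", speaker_name=name)                  # AssertionError: language is required
    # api.tts("...", speaker_name=name, language="xx")   # AssertionError: language not supported
    # api.tts("...", speaker_name=name, language="de", emotion="Happy")  # AssertionError: emotions are V1-only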