mirror of https://github.com/coqui-ai/TTS.git

Fix coqui api (#3168)

parent ce1a39a9a4 · commit a24ebcd8a6
@@ -267,19 +267,13 @@ models = TTS(cs_api_model="XTTS").list_models()
 # Init TTS with the target studio speaker
 tts = TTS(model_name="coqui_studio/en/Torcull Diarmuid/coqui_studio", progress_bar=False)
 # Run TTS
-tts.tts_to_file(text="This is a test.", file_path=OUTPUT_PATH)
+tts.tts_to_file(text="This is a test.", language="en", file_path=OUTPUT_PATH)

 # V1 model
 models = TTS(cs_api_model="V1").list_models()
 # Run TTS with emotion and speed control
 # Emotion control only works with V1 model
 tts.tts_to_file(text="This is a test.", file_path=OUTPUT_PATH, emotion="Happy", speed=1.5)
-
-# XTTS-multilingual
-models = TTS(cs_api_model="XTTS-multilingual").list_models()
-# Run TTS with emotion and speed control
-# Emotion control only works with V1 model
-tts.tts_to_file(text="Das ist ein Test.", file_path=OUTPUT_PATH, language="de", speed=1.0)
 ```

 #### Example text to speech using **Fairseq models in ~1100 languages** 🤯.
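For readers trying the post-commit API, here is the updated README usage as a self-contained sketch. It assumes a valid Coqui Studio account with `COQUI_STUDIO_TOKEN` exported; `OUTPUT_PATH` is a placeholder filename introduced here, not taken from the diff.

```python
# Sketch: the updated README usage as a runnable script. Assumes the
# COQUI_STUDIO_TOKEN environment variable holds a valid Coqui Studio token;
# OUTPUT_PATH is a placeholder, not part of the original README.
from TTS.api import TTS

OUTPUT_PATH = "studio_output.wav"

# Same studio speaker as in the README example above.
tts = TTS(model_name="coqui_studio/en/Torcull Diarmuid/coqui_studio", progress_bar=False)
# After this commit, XTTS calls pass an explicit language code.
tts.tts_to_file(text="This is a test.", language="en", file_path=OUTPUT_PATH)
```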

@@ -60,7 +60,7 @@ class TTS(nn.Module):
     vocoder_config_path (str, optional): Path to the vocoder config. Defaults to None.
     progress_bar (bool, optional): Whether to print a progress bar while downloading a model. Defaults to True.
     cs_api_model (str, optional): Name of the model to use for the Coqui Studio API. Available models are
-        "XTTS", "XTTS-multilingual", "V1". You can also use `TTS.cs_api.CS_API` for more control.
+        "XTTS", "V1". You can also use `TTS.cs_api.CS_API` for more control.
         Defaults to "XTTS".
     gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
 """

@@ -275,7 +275,7 @@ class TTS(nn.Module):
     speaker_name (str, optional):
         Speaker name from Coqui Studio. Defaults to None.
     language (str): Language of the text. If None, the default language of the speaker is used. Language is only
-        supported by `XTTS-multilang` model. Currently supports en, de, es, fr, it, pt, pl. Defaults to "en".
+        supported by `XTTS` model.
     emotion (str, optional):
         Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull". Emotions are only available
         with "V1" model. Defaults to None.

@@ -321,7 +321,7 @@ class TTS(nn.Module):
         Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by
         `tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None.
     language (str): Language of the text. If None, the default language of the speaker is used. Language is only
-        supported by `XTTS-multilang` model. Currently supports en, de, es, fr, it, pt, pl. Defaults to "en".
+        supported by `XTTS` model.
     speaker_wav (str, optional):
         Path to a reference wav file to use for voice cloning with supporting models like YourTTS.
         Defaults to None.

@@ -227,7 +227,7 @@ def main():
     parser.add_argument(
         "--cs_model",
         type=str,
-        help="Name of the 🐸Coqui Studio model. Available models are `XTTS`, `XTTS-multilingual`, `V1`.",
+        help="Name of the 🐸Coqui Studio model. Available models are `XTTS`, `V1`.",
     )
     parser.add_argument(
         "--emotion",

@@ -238,7 +238,7 @@ def main():
     parser.add_argument(
         "--language",
         type=str,
-        help="Language to condition the model with. Only available for 🐸Coqui Studio `XTTS-multilingual` model.",
+        help="Language to condition the model with. Only available for 🐸Coqui Studio `XTTS` model.",
         default=None,
     )
     parser.add_argument(

@@ -43,7 +43,7 @@ class CS_API:
     Args:
         api_token (str): 🐸Coqui Studio API token. If not provided, it will be read from the environment variable
             `COQUI_STUDIO_TOKEN`.
-        model (str): 🐸Coqui Studio model. It can be either `V1`, `XTTS`, or `XTTS-multilang`. Default is `XTTS`.
+        model (str): 🐸Coqui Studio model. It can be either `V1` or `XTTS`. Default is `XTTS`.

     Example listing all available speakers:

@@ -65,7 +65,7 @@ class CS_API:

     Example with multi-language model:
         >>> from TTS.api import CS_API
-        >>> tts = CS_API(model="XTTS-multilang")
+        >>> tts = CS_API(model="XTTS")
         >>> wav, sr = tts.tts("Hello world", speaker_name=tts.speakers[0].name, language="en")
     """

@@ -78,16 +78,12 @@ class CS_API:
         "XTTS": {
             "list_speakers": "https://app.coqui.ai/api/v2/speakers",
             "synthesize": "https://app.coqui.ai/api/v2/samples/xtts/render/",
-            "list_voices": "https://app.coqui.ai/api/v2/voices/xtts/",
-        },
-        "XTTS-multilang": {
-            "list_speakers": "https://app.coqui.ai/api/v2/speakers",
-            "synthesize": "https://app.coqui.ai/api/v2/samples/multilingual/render/",
-            "list_voices": "https://app.coqui.ai/api/v2/voices/xtts/",
+            "list_voices": "https://app.coqui.ai/api/v2/voices/xtts",
         },
     }

-    SUPPORTED_LANGUAGES = ["en", "es", "de", "fr", "it", "pt", "pl"]
+    SUPPORTED_LANGUAGES = ["en", "es", "de", "fr", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn", "ja"]

    def __init__(self, api_token=None, model="XTTS"):
        self.api_token = api_token

@@ -139,7 +135,7 @@ class CS_API:
         self._check_token()
         conn = http.client.HTTPSConnection("app.coqui.ai")
         url = self.MODEL_ENDPOINTS[self.model]["list_speakers"]
-        conn.request("GET", f"{url}?per_page=100", headers=self.headers)
+        conn.request("GET", f"{url}?page=1&per_page=100", headers=self.headers)
         res = conn.getresponse()
         data = res.read()
         return [Speaker(s) for s in json.loads(data)["result"]]

@@ -148,7 +144,7 @@ class CS_API:
         """List custom voices created by the user."""
         conn = http.client.HTTPSConnection("app.coqui.ai")
         url = self.MODEL_ENDPOINTS[self.model]["list_voices"]
-        conn.request("GET", f"{url}", headers=self.headers)
+        conn.request("GET", f"{url}?page=1&per_page=100", headers=self.headers)
         res = conn.getresponse()
         data = res.read()
         return [Speaker(s, True) for s in json.loads(data)["result"]]
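Both listing calls now send explicit pagination. A minimal standard-library sketch of the request they issue; the Bearer-token header format is an assumption here, since the diff only shows that the request uses `self.headers`.

```python
# Sketch of the paginated GET that list_speakers()/list_voices() now issue.
# The Authorization header format is an assumption; the diff only shows
# that the request passes self.headers.
import http.client
import json
import os

headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {os.environ['COQUI_STUDIO_TOKEN']}",  # assumed format
}
conn = http.client.HTTPSConnection("app.coqui.ai")
# page=1&per_page=100 mirrors the query string added by this commit.
conn.request("GET", "/api/v2/speakers?page=1&per_page=100", headers=headers)
res = conn.getresponse()
speakers = json.loads(res.read())["result"]
print(f"fetched {len(speakers)} speakers")
```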

@@ -197,14 +193,6 @@ class CS_API:
                 }
             )
         elif model == "XTTS":
-            payload.update(
-                {
-                    "name": speaker.name,
-                    "text": text,
-                    "speed": speed,
-                }
-            )
-        elif model == "XTTS-multilang":
             payload.update(
                 {
                     "name": speaker.name,
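Because git keeps the matching lines of the old `XTTS-multilang` body as context, the merged `XTTS` branch presumably ends up building a payload that carries the language field. A sketch of the resulting payload, with key names taken from the diff and illustrative stand-in values:

```python
# Sketch of the payload the merged XTTS branch ends up building. Key names
# come from the diff; the variable values here are illustrative stand-ins.
payload = {}
speaker_name = "Torcull Diarmuid"  # stand-in for speaker.name
text = "This is a test."
speed = 1.0
language = "en"  # new: XTTS now takes a language

payload.update(
    {
        "name": speaker_name,
        "text": text,
        "speed": speed,
        # assumption: inherited from the old XTTS-multilang branch,
        # whose body is kept as context by the diff above
        "language": language,
    }
)
```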

@@ -226,13 +214,10 @@ class CS_API:
             assert language is None, "❗ language is not supported for V1 model."
         elif self.model == "XTTS":
             assert emotion is None, f"❗ Emotions are not supported for XTTS model. Use V1 model."
-            assert language is None, "❗ Language is not supported for XTTS model. Use XTTS-multilang model."
-        elif self.model == "XTTS-multilang":
-            assert emotion is None, f"❗ Emotions are not supported for XTTS-multilang model. Use V1 model."
-            assert language is not None, "❗ Language is required for XTTS-multilang model."
+            assert language is not None, "❗ Language is required for XTTS model."
             assert (
                 language in self.SUPPORTED_LANGUAGES
-            ), f"❗ Language {language} is not yet supported. Use one of: en, es, de, fr, it, pt, pl"
+            ), f"❗ Language {language} is not yet supported. Check https://docs.coqui.ai/reference/samples_xtts_create."
         return text, speaker_name, speaker_id, emotion, speed, language

     def tts(
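Condensed, the new validation reduces to the following standalone sketch. `SUPPORTED_LANGUAGES` is as redefined earlier in this diff; `check_language` is a hypothetical name, not the library's.

```python
# Standalone sketch of the post-commit language checks; check_language is a
# hypothetical helper, the assertions mirror the method above.
from typing import Optional

SUPPORTED_LANGUAGES = ["en", "es", "de", "fr", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn", "ja"]

def check_language(model: str, language: Optional[str]) -> None:
    if model == "V1":
        assert language is None, "language is not supported for V1 model."
    elif model == "XTTS":
        assert language is not None, "Language is required for XTTS model."
        assert language in SUPPORTED_LANGUAGES, f"Language {language} is not yet supported."

check_language("XTTS", "tr")  # passes after this commit
check_language("V1", None)    # still fine
```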

@@ -255,7 +240,7 @@ class CS_API:
             supported by `V1` model. Defaults to None.
         speed (float): Speed of the speech. 1.0 is normal speed.
         language (str): Language of the text. If None, the default language of the speaker is used. Language is only
-            supported by `XTTS-multilang` model. Currently supports en, de, es, fr, it, pt, pl. Defaults to "en".
+            supported by `XTTS` model. See https://docs.coqui.ai/reference/samples_xtts_create for supported languages.
         """
         self._check_token()
         self.ping_api()

@@ -305,7 +290,7 @@ class CS_API:
         speed (float): Speed of the speech. 1.0 is normal speed.
         pipe_out (BytesIO, optional): Flag to stdout the generated TTS wav file for shell pipe.
         language (str): Language of the text. If None, the default language of the speaker is used. Language is only
-            supported by `XTTS-multilang` model. Currently supports en, de, es, fr, it, pt, pl. Defaults to "en".
+            supported by `XTTS` model. Currently supports en, de, es, fr, it, pt, pl. Defaults to "en".
         file_path (str): Path to save the file. If None, a temporary file is created.
         """
         if file_path is None:

@@ -323,20 +308,7 @@ if __name__ == "__main__":
     print(api.list_speakers_as_tts_models())

     ts = time.time()
-    wav, sr = api.tts("It took me quite a long time to develop a voice.", speaker_name=api.speakers[0].name)
+    wav, sr = api.tts("It took me quite a long time to develop a voice.", language="en", speaker_name=api.speakers[0].name)
     print(f" [i] XTTS took {time.time() - ts:.2f}s")

-    filepath = api.tts_to_file(text="Hello world!", speaker_name=api.speakers[0].name, file_path="output.wav")
-
-    api = CS_API(model="XTTS-multilang")
-    print(api.speakers)
-
-    ts = time.time()
-    wav, sr = api.tts(
-        "It took me quite a long time to develop a voice.", speaker_name=api.speakers[0].name, language="en"
-    )
-    print(f" [i] XTTS took {time.time() - ts:.2f}s")
-
-    filepath = api.tts_to_file(
-        text="Hello world!", speaker_name=api.speakers[0].name, file_path="output.wav", language="en"
-    )
+    filepath = api.tts_to_file(text="Hello world!", speaker_name=api.speakers[0].name, language="en", file_path="output.wav")
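The `__main__` block above exercises both the in-memory path (`wav, sr`) and the file path (`tts_to_file`). If you only have the in-memory pair, a hedged sketch for persisting it follows; it assumes `wav` comes back as a NumPy float array, which the diff itself does not show.

```python
# Sketch: persist the (wav, sr) pair returned by api.tts(). Assumes wav is a
# NumPy float array in [-1, 1]; the diff does not show the return type.
import numpy as np
from scipy.io import wavfile

def save_wav(wav, sr, path="cs_api_output.wav"):
    data = np.asarray(wav, dtype=np.float32)
    wavfile.write(path, sr, data)  # scipy accepts float32 PCM directly
```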

@@ -198,19 +198,12 @@ from TTS.api import CS_API
 # Init 🐸 Coqui Studio API
 # you can either set the API token as an environment variable `COQUI_STUDIO_TOKEN` or pass it as an argument.

-# XTTS - Best quality and life-like speech in EN
+# XTTS - Best quality and life-like speech in multiple languages. See https://docs.coqui.ai/reference/samples_xtts_create for supported languages.
 api = CS_API(api_token=<token>, model="XTTS")
 api.speakers  # all the speakers are available with all the models.
 api.list_speakers()
 api.list_voices()
-wav, sample_rate = api.tts(text="This is a test.", speaker=api.speakers[0].name, emotion="Happy", speed=1.5)
-
-# XTTS-multilingual - Multilingual XTTS with [en, de, es, fr, it, pt, ...] (more langs coming soon)
-api = CS_API(api_token=<token>, model="XTTS-multilingual")
-api.speakers
-api.list_speakers()
-api.list_voices()
-wav, sample_rate = api.tts(text="This is a test.", speaker=api.speakers[0].name, emotion="Happy", speed=1.5)
+wav, sample_rate = api.tts(text="This is a test.", speaker=api.speakers[0].name, emotion="Happy", language="en", speed=1.5)

 # V1 - Fast and lightweight TTS in EN with emotion control.
 api = CS_API(api_token=<token>, model="V1")
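Taken together, a minimal post-commit Studio session might look like the sketch below. `<token>` must be replaced with a real API token, and it uses `speaker_name=`, matching `CS_API.tts` as exercised in the `__main__` block earlier in this diff.

```python
# Sketch of an end-to-end CS_API session after this commit. Replace <token>
# with a real Coqui Studio API token (or rely on COQUI_STUDIO_TOKEN).
from TTS.api import CS_API

api = CS_API(api_token="<token>", model="XTTS")
print([s.name for s in api.speakers])  # speakers are shared across models

# language is now required for XTTS; any entry of CS_API.SUPPORTED_LANGUAGES works.
wav, sample_rate = api.tts(
    "Das ist ein Test.",
    speaker_name=api.speakers[0].name,
    language="de",
)
```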