Fix coqui api (#3168)

Eren Gölge 2023-11-08 10:51:23 +01:00 committed by GitHub
parent ce1a39a9a4
commit a24ebcd8a6
5 changed files with 22 additions and 63 deletions

README.md

@@ -2,7 +2,7 @@
 ## 🐸Coqui.ai News
 - 📣 ⓍTTSv2 is here with 16 languages and better performance across the board.
 - 📣 ⓍTTS fine-tuning code is out. Check the [example recipes](https://github.com/coqui-ai/TTS/tree/dev/recipes/ljspeech).
 - 📣 ⓍTTS can now stream with <200ms latency.
 - 📣 ⓍTTS, our production TTS model that can speak 13 languages, is released [Blog Post](https://coqui.ai/blog/tts/open_xtts), [Demo](https://huggingface.co/spaces/coqui/xtts), [Docs](https://tts.readthedocs.io/en/dev/models/xtts.html)
 - 📣 [🐶Bark](https://github.com/suno-ai/bark) is now available for inference with unconstrained voice cloning. [Docs](https://tts.readthedocs.io/en/dev/models/bark.html)
 - 📣 You can use [~1100 Fairseq models](https://github.com/facebookresearch/fairseq/tree/main/examples/mms) with 🐸TTS.
@@ -267,19 +267,13 @@ models = TTS(cs_api_model="XTTS").list_models()
 # Init TTS with the target studio speaker
 tts = TTS(model_name="coqui_studio/en/Torcull Diarmuid/coqui_studio", progress_bar=False)
 # Run TTS
-tts.tts_to_file(text="This is a test.", file_path=OUTPUT_PATH)
+tts.tts_to_file(text="This is a test.", language="en", file_path=OUTPUT_PATH)

 # V1 model
 models = TTS(cs_api_model="V1").list_models()
 # Run TTS with emotion and speed control
 # Emotion control only works with V1 model
 tts.tts_to_file(text="This is a test.", file_path=OUTPUT_PATH, emotion="Happy", speed=1.5)
-
-# XTTS-multilingual
-models = TTS(cs_api_model="XTTS-multilingual").list_models()
-# Run TTS with emotion and speed control
-# Emotion control only works with V1 model
-tts.tts_to_file(text="Das ist ein Test.", file_path=OUTPUT_PATH, language="de", speed=1.0)
 ```
 #### Example text to speech using **Fairseq models in ~1100 languages** 🤯.

TTS/api.py

@@ -60,7 +60,7 @@ class TTS(nn.Module):
             vocoder_config_path (str, optional): Path to the vocoder config. Defaults to None.
             progress_bar (bool, optional): Whether to print a progress bar while downloading a model. Defaults to True.
             cs_api_model (str, optional): Name of the model to use for the Coqui Studio API. Available models are
-                "XTTS", "XTTS-multilingual", "V1". You can also use `TTS.cs_api.CS_API" for more control.
+                "XTTS", "V1". You can also use `TTS.cs_api.CS_API` for more control.
                 Defaults to "XTTS".
             gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
         """
@@ -275,7 +275,7 @@ class TTS(nn.Module):
             speaker_name (str, optional):
                 Speaker name from Coqui Studio. Defaults to None.
             language (str): Language of the text. If None, the default language of the speaker is used. Language is only
-                supported by `XTTS-multilang` model. Currently supports en, de, es, fr, it, pt, pl. Defaults to "en".
+                supported by `XTTS` model.
             emotion (str, optional):
                 Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull". Emotions are only available
                 with "V1" model. Defaults to None.
@@ -321,7 +321,7 @@ class TTS(nn.Module):
                 Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by
                 `tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None.
             language (str): Language of the text. If None, the default language of the speaker is used. Language is only
-                supported by `XTTS-multilang` model. Currently supports en, de, es, fr, it, pt, pl. Defaults to "en".
+                supported by `XTTS` model.
             speaker_wav (str, optional):
                 Path to a reference wav file to use for voice cloning with supporting models like YourTTS.
                 Defaults to None.
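In practice, the `TTS/api.py` change means a Coqui Studio XTTS request must now carry an explicit language code. A minimal sketch of the updated call, assuming `COQUI_STUDIO_TOKEN` is set in the environment and reusing the Studio speaker name from the README example; the output path is a placeholder:

```python
from TTS.api import TTS

# Studio models are addressed by model_name; the API token is read from COQUI_STUDIO_TOKEN.
tts = TTS(model_name="coqui_studio/en/Torcull Diarmuid/coqui_studio", progress_bar=False)

# `language` is required for the XTTS Studio model after this change.
tts.tts_to_file(text="This is a test.", language="en", file_path="output.wav")
```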

TTS/bin/synthesize.py

@@ -227,7 +227,7 @@ def main():
     parser.add_argument(
         "--cs_model",
         type=str,
-        help="Name of the 🐸Coqui Studio model. Available models are `XTTS`, `XTTS-multilingual`, `V1`.",
+        help="Name of the 🐸Coqui Studio model. Available models are `XTTS`, `V1`.",
     )
     parser.add_argument(
         "--emotion",
@@ -238,7 +238,7 @@ def main():
     parser.add_argument(
         "--language",
         type=str,
-        help="Language to condition the model with. Only available for 🐸Coqui Studio `XTTS-multilingual` model.",
+        help="Language to condition the model with. Only available for 🐸Coqui Studio `XTTS` model.",
         default=None,
     )
     parser.add_argument(
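The same requirement surfaces on the command line: `--cs_model XTTS` now pairs with `--language`. A hedged sketch of an invocation via `subprocess`; `--text` and `--out_path` are the standard `tts` CLI flags, and the exact flag combination for Studio synthesis is an assumption:

```python
import subprocess

# Invoke the `tts` CLI with the updated Studio flags; `XTTS-multilingual`
# is no longer accepted, and `--language` conditions the XTTS model.
subprocess.run(
    [
        "tts",
        "--cs_model", "XTTS",
        "--language", "en",
        "--text", "This is a test.",
        "--out_path", "output.wav",
    ],
    check=True,
)
```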

TTS/cs_api.py

@@ -43,7 +43,7 @@ class CS_API:
     Args:
         api_token (str): 🐸Coqui Studio API token. If not provided, it will be read from the environment variable
             `COQUI_STUDIO_TOKEN`.
-        model (str): 🐸Coqui Studio model. It can be either `V1`, `XTTS`, or `XTTS-multilang`. Default is `XTTS`.
+        model (str): 🐸Coqui Studio model. It can be either `V1` or `XTTS`. Default is `XTTS`.

     Example listing all available speakers:
@@ -65,7 +65,7 @@ class CS_API:
     Example with multi-language model:
         >>> from TTS.api import CS_API
-        >>> tts = CS_API(model="XTTS-multilang")
+        >>> tts = CS_API(model="XTTS")
        >>> wav, sr = tts.tts("Hello world", speaker_name=tts.speakers[0].name, language="en")
     """
@@ -78,16 +78,12 @@ class CS_API:
         "XTTS": {
             "list_speakers": "https://app.coqui.ai/api/v2/speakers",
             "synthesize": "https://app.coqui.ai/api/v2/samples/xtts/render/",
-            "list_voices": "https://app.coqui.ai/api/v2/voices/xtts/",
-        },
-        "XTTS-multilang": {
-            "list_speakers": "https://app.coqui.ai/api/v2/speakers",
-            "synthesize": "https://app.coqui.ai/api/v2/samples/multilingual/render/",
-            "list_voices": "https://app.coqui.ai/api/v2/voices/xtts/",
+            "list_voices": "https://app.coqui.ai/api/v2/voices/xtts",
         },
     }

-    SUPPORTED_LANGUAGES = ["en", "es", "de", "fr", "it", "pt", "pl"]
+    SUPPORTED_LANGUAGES = ["en", "es", "de", "fr", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn", "ja"]

     def __init__(self, api_token=None, model="XTTS"):
         self.api_token = api_token
@@ -139,7 +135,7 @@ class CS_API:
         self._check_token()
         conn = http.client.HTTPSConnection("app.coqui.ai")
         url = self.MODEL_ENDPOINTS[self.model]["list_speakers"]
-        conn.request("GET", f"{url}?per_page=100", headers=self.headers)
+        conn.request("GET", f"{url}?page=1&per_page=100", headers=self.headers)
         res = conn.getresponse()
         data = res.read()
         return [Speaker(s) for s in json.loads(data)["result"]]
@@ -148,7 +144,7 @@ class CS_API:
         """List custom voices created by the user."""
         conn = http.client.HTTPSConnection("app.coqui.ai")
         url = self.MODEL_ENDPOINTS[self.model]["list_voices"]
-        conn.request("GET", f"{url}", headers=self.headers)
+        conn.request("GET", f"{url}?page=1&per_page=100", headers=self.headers)
         res = conn.getresponse()
         data = res.read()
         return [Speaker(s, True) for s in json.loads(data)["result"]]
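Both listing calls now request an explicit first page of up to 100 entries. A self-contained sketch of the same request; the bearer-token `Authorization` header layout is an assumption about how `self.headers` is built:

```python
import http.client
import json
import os

# Paginated speaker listing against the v2 API, mirroring the request above.
headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {os.environ['COQUI_STUDIO_TOKEN']}",  # assumed header format
}
conn = http.client.HTTPSConnection("app.coqui.ai")
conn.request("GET", "/api/v2/speakers?page=1&per_page=100", headers=headers)
data = json.loads(conn.getresponse().read())
print(len(data["result"]))
```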
@@ -197,14 +193,6 @@ class CS_API:
                 }
             )
         elif model == "XTTS":
-            payload.update(
-                {
-                    "name": speaker.name,
-                    "text": text,
-                    "speed": speed,
-                }
-            )
-        elif model == "XTTS-multilang":
             payload.update(
                 {
                     "name": speaker.name,
@@ -226,13 +214,10 @@ class CS_API:
             assert language is None, "❗ language is not supported for V1 model."
         elif self.model == "XTTS":
             assert emotion is None, f"❗ Emotions are not supported for XTTS model. Use V1 model."
-            assert language is None, "❗ Language is not supported for XTTS model. Use XTTS-multilang model."
-        elif self.model == "XTTS-multilang":
-            assert emotion is None, f"❗ Emotions are not supported for XTTS-multilang model. Use V1 model."
-            assert language is not None, "❗ Language is required for XTTS-multilang model."
+            assert language is not None, "❗ Language is required for XTTS model."
             assert (
                 language in self.SUPPORTED_LANGUAGES
-            ), f"❗ Language {language} is not yet supported. Use one of: en, es, de, fr, it, pt, pl"
+            ), f"❗ Language {language} is not yet supported. Check https://docs.coqui.ai/reference/samples_xtts_create."
         return text, speaker_name, speaker_id, emotion, speed, language

     def tts(
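The net effect on validation: XTTS now requires a language and still rejects emotions. A condensed, self-contained sketch of just that branch (`check_xtts_arguments` is a hypothetical stand-in, not the full `_check_arguments` method):

```python
# Languages accepted after this commit (from SUPPORTED_LANGUAGES above).
SUPPORTED_LANGUAGES = ["en", "es", "de", "fr", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn", "ja"]

def check_xtts_arguments(emotion=None, language=None):
    """Mirror of the XTTS branch of the argument checks."""
    assert emotion is None, "❗ Emotions are not supported for XTTS model. Use V1 model."
    assert language is not None, "❗ Language is required for XTTS model."
    assert (
        language in SUPPORTED_LANGUAGES
    ), f"❗ Language {language} is not yet supported. Check https://docs.coqui.ai/reference/samples_xtts_create."

check_xtts_arguments(language="de")  # passes silently
```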
@@ -255,7 +240,7 @@ class CS_API:
                 supported by `V1` model. Defaults to None.
             speed (float): Speed of the speech. 1.0 is normal speed.
             language (str): Language of the text. If None, the default language of the speaker is used. Language is only
-                supported by `XTTS-multilang` model. Currently supports en, de, es, fr, it, pt, pl. Defaults to "en".
+                supported by `XTTS` model. See https://docs.coqui.ai/reference/samples_xtts_create for supported languages.
         """
         self._check_token()
         self.ping_api()
@@ -305,7 +290,7 @@ class CS_API:
             speed (float): Speed of the speech. 1.0 is normal speed.
             pipe_out (BytesIO, optional): Flag to stdout the generated TTS wav file for shell pipe.
             language (str): Language of the text. If None, the default language of the speaker is used. Language is only
-                supported by `XTTS-multilang` model. Currently supports en, de, es, fr, it, pt, pl. Defaults to "en".
+                supported by `XTTS` model. See https://docs.coqui.ai/reference/samples_xtts_create for supported languages.
             file_path (str): Path to save the file. If None, a temporary file is created.
         """
         if file_path is None:
@@ -323,20 +308,7 @@ if __name__ == "__main__":
     print(api.list_speakers_as_tts_models())
     ts = time.time()
-    wav, sr = api.tts("It took me quite a long time to develop a voice.", speaker_name=api.speakers[0].name)
+    wav, sr = api.tts("It took me quite a long time to develop a voice.", language="en", speaker_name=api.speakers[0].name)
     print(f" [i] XTTS took {time.time() - ts:.2f}s")
-    filepath = api.tts_to_file(text="Hello world!", speaker_name=api.speakers[0].name, file_path="output.wav")
-
-    api = CS_API(model="XTTS-multilang")
-    print(api.speakers)
-    ts = time.time()
-    wav, sr = api.tts(
-        "It took me quite a long time to develop a voice.", speaker_name=api.speakers[0].name, language="en"
-    )
-    print(f" [i] XTTS took {time.time() - ts:.2f}s")
-    filepath = api.tts_to_file(
-        text="Hello world!", speaker_name=api.speakers[0].name, file_path="output.wav", language="en"
-    )
+    filepath = api.tts_to_file(text="Hello world!", speaker_name=api.speakers[0].name, language="en", file_path="output.wav")

docs/source/inference.md

@@ -198,19 +198,12 @@ from TTS.api import CS_API
 # Init 🐸 Coqui Studio API
 # you can either set the API token as an environment variable `COQUI_STUDIO_TOKEN` or pass it as an argument.

-# XTTS - Best quality and life-like speech in EN
+# XTTS - Best quality and life-like speech in multiple languages. See https://docs.coqui.ai/reference/samples_xtts_create for supported languages.
 api = CS_API(api_token=<token>, model="XTTS")
 api.speakers # all the speakers are available with all the models.
 api.list_speakers()
 api.list_voices()
-wav, sample_rate = api.tts(text="This is a test.", speaker=api.speakers[0].name, emotion="Happy", speed=1.5)
-
-# XTTS-multilingual - Multilingual XTTS with [en, de, es, fr, it, pt, ...] (more langs coming soon)
-api = CS_API(api_token=<token>, model="XTTS-multilingual")
-api.speakers
-api.list_speakers()
-api.list_voices()
-wav, sample_rate = api.tts(text="This is a test.", speaker=api.speakers[0].name, emotion="Happy", speed=1.5)
+wav, sample_rate = api.tts(text="This is a test.", speaker_name=api.speakers[0].name, language="en", speed=1.5)

 # V1 - Fast and lightweight TTS in EN with emotion control.
 api = CS_API(api_token=<token>, model="V1")
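For contrast, a short sketch of the `V1` path introduced above: per the assertions in `cs_api.py`, V1 keeps `emotion` and `speed` control and takes no `language`; the speaker lookup mirrors the XTTS example:

```python
from TTS.api import CS_API

# V1: emotion control is supported; language must be omitted (asserted in cs_api.py).
api = CS_API(api_token="<token>", model="V1")
wav, sample_rate = api.tts(
    text="This is a test.",
    speaker_name=api.speakers[0].name,
    emotion="Happy",
    speed=1.5,
)
```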