diff --git a/README.md b/README.md index 1a9285eb..353db7cf 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ ## 🐸Coqui.ai News - 📣 ⓍTTSv2 is here with 16 languages and better performance across the board. - 📣 ⓍTTS fine-tuning code is out. Check the [example recipes](https://github.com/coqui-ai/TTS/tree/dev/recipes/ljspeech). -- 📣 ⓍTTS can now stream with <200ms latency. +- 📣 ⓍTTS can now stream with <200ms latency. - 📣 ⓍTTS, our production TTS model that can speak 13 languages, is released [Blog Post](https://coqui.ai/blog/tts/open_xtts), [Demo](https://huggingface.co/spaces/coqui/xtts), [Docs](https://tts.readthedocs.io/en/dev/models/xtts.html) - 📣 [🐶Bark](https://github.com/suno-ai/bark) is now available for inference with unconstrained voice cloning. [Docs](https://tts.readthedocs.io/en/dev/models/bark.html) - 📣 You can use [~1100 Fairseq models](https://github.com/facebookresearch/fairseq/tree/main/examples/mms) with 🐸TTS. @@ -267,19 +267,13 @@ models = TTS(cs_api_model="XTTS").list_models() # Init TTS with the target studio speaker tts = TTS(model_name="coqui_studio/en/Torcull Diarmuid/coqui_studio", progress_bar=False) # Run TTS -tts.tts_to_file(text="This is a test.", file_path=OUTPUT_PATH) +tts.tts_to_file(text="This is a test.", language="en", file_path=OUTPUT_PATH) # V1 model models = TTS(cs_api_model="V1").list_models() # Run TTS with emotion and speed control # Emotion control only works with V1 model tts.tts_to_file(text="This is a test.", file_path=OUTPUT_PATH, emotion="Happy", speed=1.5) - -# XTTS-multilingual -models = TTS(cs_api_model="XTTS-multilingual").list_models() -# Run TTS with emotion and speed control -# Emotion control only works with V1 model -tts.tts_to_file(text="Das ist ein Test.", file_path=OUTPUT_PATH, language="de", speed=1.0) ``` #### Example text to speech using **Fairseq models in ~1100 languages** 🤯. 
diff --git a/TTS/api.py b/TTS/api.py index 5d1fbb5a..c8600dcd 100644 --- a/TTS/api.py +++ b/TTS/api.py @@ -60,7 +60,7 @@ class TTS(nn.Module): vocoder_config_path (str, optional): Path to the vocoder config. Defaults to None. progress_bar (bool, optional): Whether to pring a progress bar while downloading a model. Defaults to True. cs_api_model (str, optional): Name of the model to use for the Coqui Studio API. Available models are - "XTTS", "XTTS-multilingual", "V1". You can also use `TTS.cs_api.CS_API" for more control. + "XTTS", "V1". You can also use `TTS.cs_api.CS_API` for more control. Defaults to "XTTS". gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False. """ @@ -275,7 +275,7 @@ class TTS(nn.Module): speaker_name (str, optional): Speaker name from Coqui Studio. Defaults to None. language (str): Language of the text. If None, the default language of the speaker is used. Language is only - supported by `XTTS-multilang` model. Currently supports en, de, es, fr, it, pt, pl. Defaults to "en". + supported by `XTTS` model. emotion (str, optional): Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull". Emotions are only available with "V1" model. Defaults to None. @@ -321,7 +321,7 @@ class TTS(nn.Module): Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by `tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None. language (str): Language of the text. If None, the default language of the speaker is used. Language is only - supported by `XTTS-multilang` model. Currently supports en, de, es, fr, it, pt, pl. Defaults to "en". + supported by `XTTS` model. speaker_wav (str, optional): Path to a reference wav file to use for voice cloning with supporting models like YourTTS. Defaults to None. 
diff --git a/TTS/bin/synthesize.py b/TTS/bin/synthesize.py index ef41c8e1..ddfe35d2 100755 --- a/TTS/bin/synthesize.py +++ b/TTS/bin/synthesize.py @@ -227,7 +227,7 @@ def main(): parser.add_argument( "--cs_model", type=str, - help="Name of the 🐸Coqui Studio model. Available models are `XTTS`, `XTTS-multilingual`, `V1`.", + help="Name of the 🐸Coqui Studio model. Available models are `XTTS`, `V1`.", ) parser.add_argument( "--emotion", @@ -238,7 +238,7 @@ def main(): parser.add_argument( "--language", type=str, - help="Language to condition the model with. Only available for 🐸Coqui Studio `XTTS-multilingual` model.", + help="Language to condition the model with. Only available for 🐸Coqui Studio `XTTS` model.", default=None, ) parser.add_argument( diff --git a/TTS/cs_api.py b/TTS/cs_api.py index 4a44b535..c45f9d08 100644 --- a/TTS/cs_api.py +++ b/TTS/cs_api.py @@ -43,7 +43,7 @@ class CS_API: Args: api_token (str): 🐸Coqui Studio API token. If not provided, it will be read from the environment variable `COQUI_STUDIO_TOKEN`. - model (str): 🐸Coqui Studio model. It can be either `V1`, `XTTS`, or `XTTS-multilang`. Default is `XTTS`. + model (str): 🐸Coqui Studio model. It can be either `V1`, `XTTS`. Default is `XTTS`. 
Example listing all available speakers: @@ -65,7 +65,7 @@ class CS_API: Example with multi-language model: >>> from TTS.api import CS_API - >>> tts = CS_API(model="XTTS-multilang") + >>> tts = CS_API(model="XTTS") >>> wav, sr = api.tts("Hello world", speaker_name=tts.speakers[0].name, language="en") """ @@ -78,16 +78,12 @@ class CS_API: "XTTS": { "list_speakers": "https://app.coqui.ai/api/v2/speakers", "synthesize": "https://app.coqui.ai/api/v2/samples/xtts/render/", - "list_voices": "https://app.coqui.ai/api/v2/voices/xtts/", - }, - "XTTS-multilang": { - "list_speakers": "https://app.coqui.ai/api/v2/speakers", - "synthesize": "https://app.coqui.ai/api/v2/samples/multilingual/render/", - "list_voices": "https://app.coqui.ai/api/v2/voices/xtts/", + "list_voices": "https://app.coqui.ai/api/v2/voices/xtts", }, } - SUPPORTED_LANGUAGES = ["en", "es", "de", "fr", "it", "pt", "pl"] + + SUPPORTED_LANGUAGES = ["en", "es", "de", "fr", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn", "ja"] def __init__(self, api_token=None, model="XTTS"): self.api_token = api_token @@ -139,7 +135,7 @@ class CS_API: self._check_token() conn = http.client.HTTPSConnection("app.coqui.ai") url = self.MODEL_ENDPOINTS[self.model]["list_speakers"] - conn.request("GET", f"{url}?per_page=100", headers=self.headers) + conn.request("GET", f"{url}?page=1&per_page=100", headers=self.headers) res = conn.getresponse() data = res.read() return [Speaker(s) for s in json.loads(data)["result"]] @@ -148,7 +144,7 @@ class CS_API: """List custom voices created by the user.""" conn = http.client.HTTPSConnection("app.coqui.ai") url = self.MODEL_ENDPOINTS[self.model]["list_voices"] - conn.request("GET", f"{url}", headers=self.headers) + conn.request("GET", f"{url}?page=1&per_page=100", headers=self.headers) res = conn.getresponse() data = res.read() return [Speaker(s, True) for s in json.loads(data)["result"]] @@ -197,14 +193,6 @@ class CS_API: } ) elif model == "XTTS": - payload.update( - { - "name": 
speaker.name, - "text": text, - "speed": speed, - } - ) - elif model == "XTTS-multilang": payload.update( { "name": speaker.name, @@ -226,13 +214,10 @@ class CS_API: assert language is None, "❗ language is not supported for V1 model." elif self.model == "XTTS": assert emotion is None, f"❗ Emotions are not supported for XTTS model. Use V1 model." - assert language is None, "❗ Language is not supported for XTTS model. Use XTTS-multilang model." - elif self.model == "XTTS-multilang": - assert emotion is None, f"❗ Emotions are not supported for XTTS-multilang model. Use V1 model." - assert language is not None, "❗ Language is required for XTTS-multilang model." + assert language is not None, "❗ Language is required for XTTS model." assert ( language in self.SUPPORTED_LANGUAGES - ), f"❗ Language {language} is not yet supported. Use one of: en, es, de, fr, it, pt, pl" + ), f"❗ Language {language} is not yet supported. Check https://docs.coqui.ai/reference/samples_xtts_create." return text, speaker_name, speaker_id, emotion, speed, language def tts( @@ -255,7 +240,7 @@ class CS_API: supported by `V1` model. Defaults to None. speed (float): Speed of the speech. 1.0 is normal speed. language (str): Language of the text. If None, the default language of the speaker is used. Language is only - supported by `XTTS-multilang` model. Currently supports en, de, es, fr, it, pt, pl. Defaults to "en". + supported by `XTTS` model. See https://docs.coqui.ai/reference/samples_xtts_create for supported languages. """ self._check_token() self.ping_api() @@ -305,7 +290,7 @@ class CS_API: speed (float): Speed of the speech. 1.0 is normal speed. pipe_out (BytesIO, optional): Flag to stdout the generated TTS wav file for shell pipe. language (str): Language of the text. If None, the default language of the speaker is used. Language is only - supported by `XTTS-multilang` model. Currently supports en, de, es, fr, it, pt, pl. Defaults to "en". + supported by `XTTS` model. 
Currently supports en, de, es, fr, it, pt, pl. Defaults to "en". file_path (str): Path to save the file. If None, a temporary file is created. """ if file_path is None: @@ -323,20 +308,7 @@ if __name__ == "__main__": print(api.list_speakers_as_tts_models()) ts = time.time() - wav, sr = api.tts("It took me quite a long time to develop a voice.", speaker_name=api.speakers[0].name) + wav, sr = api.tts("It took me quite a long time to develop a voice.", language="en", speaker_name=api.speakers[0].name) print(f" [i] XTTS took {time.time() - ts:.2f}s") - filepath = api.tts_to_file(text="Hello world!", speaker_name=api.speakers[0].name, file_path="output.wav") - - api = CS_API(model="XTTS-multilang") - print(api.speakers) - - ts = time.time() - wav, sr = api.tts( - "It took me quite a long time to develop a voice.", speaker_name=api.speakers[0].name, language="en" - ) - print(f" [i] XTTS took {time.time() - ts:.2f}s") - - filepath = api.tts_to_file( - text="Hello world!", speaker_name=api.speakers[0].name, file_path="output.wav", language="en" - ) + filepath = api.tts_to_file(text="Hello world!", speaker_name=api.speakers[0].name, language="en", file_path="output.wav") diff --git a/docs/source/inference.md b/docs/source/inference.md index 4de9ecdd..b40445ae 100644 --- a/docs/source/inference.md +++ b/docs/source/inference.md @@ -198,19 +198,12 @@ from TTS.api import CS_API # Init 🐸 Coqui Studio API # you can either set the API token as an environment variable `COQUI_STUDIO_TOKEN` or pass it as an argument. -# XTTS - Best quality and life-like speech in EN +# XTTS - Best quality and life-like speech in multiple languages. See https://docs.coqui.ai/reference/samples_xtts_create for supported languages. api = CS_API(api_token=, model="XTTS") api.speakers # all the speakers are available with all the models. 
api.list_speakers() api.list_voices() -wav, sample_rate = api.tts(text="This is a test.", speaker=api.speakers[0].name, emotion="Happy", speed=1.5) - -# XTTS-multilingual - Multilingual XTTS with [en, de, es, fr, it, pt, ...] (more langs coming soon) -api = CS_API(api_token=, model="XTTS-multilingual") -api.speakers -api.list_speakers() -api.list_voices() -wav, sample_rate = api.tts(text="This is a test.", speaker=api.speakers[0].name, emotion="Happy", speed=1.5) +wav, sample_rate = api.tts(text="This is a test.", speaker=api.speakers[0].name, emotion="Happy", language="en", speed=1.5) # V1 - Fast and lightweight TTS in EN with emotion control. api = CS_API(api_token=, model="V1")