diff --git a/.github/workflows/inference_tests.yml b/.github/workflows/inference_tests.yml
index 0ff0857d..78a0a8f8 100644
--- a/.github/workflows/inference_tests.yml
+++ b/.github/workflows/inference_tests.yml
@@ -32,7 +32,8 @@ jobs:
       - name: check OS
         run: cat /etc/os-release
       - name: set ENV
-        run: export TRAINER_TELEMETRY=0
+        run: |
+          export TRAINER_TELEMETRY=0
       - name: Install dependencies
         run: |
           sudo apt-get update
@@ -49,4 +50,6 @@ jobs:
           python3 -m pip install .[all]
           python3 setup.py egg_info
       - name: Unit tests
-        run: make inference_tests
+        run: |
+          export COQUI_STUDIO_TOKEN=${{ secrets.COQUI_STUDIO_TOKEN }}
+          make inference_tests
diff --git a/README.md b/README.md
index 7df9bcb7..baa52912 100644
--- a/README.md
+++ b/README.md
@@ -197,6 +197,36 @@ tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_
 tts.tts_to_file("This is voice cloning.", speaker_wav="my/cloning/audio.wav", language="en", file_path="output.wav")
 tts.tts_to_file("C'est le clonage de la voix.", speaker_wav="my/cloning/audio.wav", language="fr-fr", file_path="output.wav")
 tts.tts_to_file("Isso é clonagem de voz.", speaker_wav="my/cloning/audio.wav", language="pt-br", file_path="output.wav")
+
+
+# Example voice conversion converting the speaker of the `source_wav` to the speaker of the `target_wav`
+
+tts = TTS(model_name="voice_conversion_models/multilingual/vctk/freevc24", progress_bar=False, gpu=True)
+tts.voice_conversion_to_file(source_wav="my/source.wav", target_wav="my/target.wav", file_path="output.wav")
+
+# Example voice cloning with a single-speaker TTS model combined with the voice conversion model. This way, you can
+# clone voices by using any model in 🐸TTS.
+
+tts = TTS("tts_models/de/thorsten/tacotron2-DDC")
+tts.tts_with_vc_to_file(
+    "Wie sage ich auf Italienisch, dass ich dich liebe?",
+    speaker_wav="target/speaker.wav",
+    file_path="output.wav"
+)
+
+# Example text to speech using [🐸Coqui Studio](https://coqui.ai) models. You can use all of your available speakers in the studio.
+# A [🐸Coqui Studio](https://coqui.ai) API token is required. You can get it from the [account page](https://coqui.ai/account).
+# You should set the `COQUI_STUDIO_TOKEN` environment variable to use the API token.
+
+# If you have a valid API token set, you will see the studio speakers as separate models in the list.
+# The name format is coqui_studio/en/<speaker_name>/coqui_studio
+models = TTS().list_models()
+# Init TTS with the target studio speaker
+tts = TTS(model_name="coqui_studio/en/Torcull Diarmuid/coqui_studio", progress_bar=False, gpu=False)
+# Run TTS
+tts.tts_to_file(text="This is a test.", file_path="output.wav")
+# Run TTS with emotion and speed control
+tts.tts_to_file(text="This is a test.", file_path="output.wav", emotion="Happy", speed=1.5)
 ```
 
 ### Command line `tts`
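The README snippet above assumes a valid `COQUI_STUDIO_TOKEN` is already exported. As a minimal, hypothetical sketch (the file names and fallback model choice are illustrative and not part of this diff), the studio call can be guarded so the example still runs when no token is configured:

```python
import os

from TTS.api import TTS

# Hypothetical guard (not part of this diff): use a studio speaker only when a token is configured.
if os.environ.get("COQUI_STUDIO_TOKEN"):
    tts = TTS(model_name="coqui_studio/en/Torcull Diarmuid/coqui_studio", progress_bar=False, gpu=False)
    tts.tts_to_file(text="This is a test.", file_path="studio_output.wav")
else:
    # Fall back to a fully local, single-speaker model when no studio token is available.
    tts = TTS(model_name="tts_models/de/thorsten/tacotron2-DDC", progress_bar=False, gpu=False)
    tts.tts_to_file(text="Dies ist ein Test.", file_path="local_output.wav")
```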
+ """ + return ["Neutral", "Happy", "Sad", "Angry", "Dull"] + + def _check_token(self): + if self.api_token is None: + self.api_token = os.environ.get("COQUI_STUDIO_TOKEN") + self.headers = {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_token}"} + if not self.api_token: + raise ValueError( + "No API token found for 🐸Coqui Studio voices - https://coqui.ai.\n" + "Visit 🔗https://app.coqui.ai/account to get one.\n" + "Set it as an environment variable `export COQUI_STUDIO_TOKEN=`\n" + "" + ) + + def list_all_speakers(self): + """Return both built-in Coqui Studio speakers and custom voices created by the user.""" + return self.list_speakers() + self.list_voices() + + def list_speakers(self): + """List built-in Coqui Studio speakers.""" + self._check_token() + conn = http.client.HTTPSConnection("app.coqui.ai") + conn.request("GET", f"{self.api_prefix}/speakers", headers=self.headers) + res = conn.getresponse() + data = res.read() + return [Speaker(s) for s in json.loads(data)["result"]] + + def list_voices(self): + """List custom voices created by the user.""" + conn = http.client.HTTPSConnection("app.coqui.ai") + conn.request("GET", f"{self.api_prefix}/voices", headers=self.headers) + res = conn.getresponse() + data = res.read() + return [Speaker(s, True) for s in json.loads(data)["result"]] + + def list_speakers_as_tts_models(self): + """List speakers in ModelManager format.""" + models = [] + for speaker in self.speakers: + model = f"coqui_studio/en/{speaker.name}/coqui_studio" + models.append(model) + return models + + def name_to_speaker(self, name): + for speaker in self.speakers: + if speaker.name == name: + return speaker + raise ValueError(f"Speaker {name} not found.") + + def id_to_speaker(self, speaker_id): + for speaker in self.speakers: + if speaker.id == speaker_id: + return speaker + raise ValueError(f"Speaker {speaker_id} not found.") + + @staticmethod + def url_to_np(url): + tmp_file, _ = urllib.request.urlretrieve(url) + rate, data = wavfile.read(tmp_file) + return data, rate + + @staticmethod + def _create_payload(text, speaker, emotion, speed): + payload = {} + if speaker.is_voice: + payload["voice_id"] = speaker.id + else: + payload["speaker_id"] = speaker.id + payload.update( + { + "emotion": emotion, + "name": speaker.name, + "text": text, + "speed": speed, + } + ) + return payload + + def tts( + self, + text: str, + speaker_name: str = None, + speaker_id=None, + emotion="Neutral", + speed=1.0, + language=None, # pylint: disable=unused-argument + ) -> Tuple[np.ndarray, int]: + """Synthesize speech from text. + + Args: + text (str): Text to synthesize. + speaker_name (str): Name of the speaker. You can get the list of speakers with `list_speakers()` and + voices (user generated speakers) with `list_voices()`. + speaker_id (str): Speaker ID. If None, the speaker name is used. + emotion (str): Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull". + speed (float): Speed of the speech. 1.0 is normal speed. + language (str): Language of the text. If None, the default language of the speaker is used. + """ + self._check_token() + if speaker_name is None and speaker_id is None: + raise ValueError(" [!] 
+
+    def tts(
+        self,
+        text: str,
+        speaker_name: str = None,
+        speaker_id=None,
+        emotion="Neutral",
+        speed=1.0,
+        language=None,  # pylint: disable=unused-argument
+    ) -> Tuple[np.ndarray, int]:
+        """Synthesize speech from text.
+
+        Args:
+            text (str): Text to synthesize.
+            speaker_name (str): Name of the speaker. You can get the list of speakers with `list_speakers()` and
+                voices (user generated speakers) with `list_voices()`.
+            speaker_id (str): Speaker ID. If None, the speaker name is used.
+            emotion (str): Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull".
+            speed (float): Speed of the speech. 1.0 is normal speed.
+            language (str): Language of the text. If None, the default language of the speaker is used.
+        """
+        self._check_token()
+        if speaker_name is None and speaker_id is None:
+            raise ValueError(" [!] Please provide either a `speaker_name` or a `speaker_id`.")
+        if speaker_id is None:
+            speaker = self.name_to_speaker(speaker_name)
+        else:
+            speaker = self.id_to_speaker(speaker_id)
+        conn = http.client.HTTPSConnection("app.coqui.ai")
+        payload = self._create_payload(text, speaker, emotion, speed)
+        conn.request("POST", "/api/v2/samples", json.dumps(payload), self.headers)
+        res = conn.getresponse()
+        data = res.read()
+        try:
+            wav, sr = self.url_to_np(json.loads(data)["audio_url"])
+        except KeyError as e:
+            raise ValueError(f" [!] 🐸 API returned error: {data}") from e
+        return wav, sr
+
+    def tts_to_file(
+        self,
+        text: str,
+        speaker_name: str,
+        speaker_id=None,
+        emotion="Neutral",
+        speed=1.0,
+        language=None,
+        file_path: str = None,
+    ) -> str:
+        """Synthesize speech from text and save it to a file.
+
+        Args:
+            text (str): Text to synthesize.
+            speaker_name (str): Name of the speaker. You can get the list of speakers with `list_speakers()` and
+                voices (user generated speakers) with `list_voices()`.
+            speaker_id (str): Speaker ID. If None, the speaker name is used.
+            emotion (str): Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull".
+            speed (float): Speed of the speech. 1.0 is normal speed.
+            language (str): Language of the text. If None, the default language of the speaker is used.
+            file_path (str): Path to save the file. If None, a temporary file is created.
+        """
+        if file_path is None:
+            file_path = tempfile.mktemp(".wav")
+        wav, sr = self.tts(text, speaker_name, speaker_id, emotion, speed, language)
+        wavfile.write(file_path, sr, wav)
+        return file_path
+
+
 class TTS:
     """TODO: Add voice conversion and Capacitron support."""
 
@@ -54,9 +270,12 @@ class TTS:
 
         self.synthesizer = None
         self.voice_converter = None
+        self.csapi = None
+        self.model_name = None
 
         if model_name:
             self.load_tts_model_by_name(model_name, gpu)
+
         if model_path:
             self.load_tts_model_by_path(
                 model_path, config_path, vocoder_path=vocoder_path, vocoder_config=vocoder_config_path, gpu=gpu
@@ -72,6 +291,10 @@ class TTS:
             return self.synthesizer.tts_model.speaker_manager.num_speakers > 1
         return False
 
+    @property
+    def is_coqui_studio(self):
+        return "coqui_studio" in self.model_name
+
     @property
     def is_multi_lingual(self):
         if hasattr(self.synthesizer.tts_model, "language_manager") and self.synthesizer.tts_model.language_manager:
@@ -96,8 +319,14 @@ class TTS:
 
     @staticmethod
     def list_models():
+        try:
+            csapi = CS_API()
+            models = csapi.list_speakers_as_tts_models()
+        except ValueError as e:
+            print(e)
+            models = []
         manager = ModelManager(models_file=TTS.get_models_file_path(), progress_bar=False, verbose=False)
-        return manager.list_tts_models()
+        return manager.list_tts_models() + models
 
     def download_model_by_name(self, model_name: str):
         model_path, config_path, model_item = self.manager.download_model(model_name)
@@ -125,22 +354,28 @@ class TTS:
 
         TODO: Add tests
         """
+        self.synthesizer = None
+        self.csapi = None
+        self.model_name = model_name
 
-        model_path, config_path, vocoder_path, vocoder_config_path = self.download_model_by_name(model_name)
+        if "coqui_studio" in model_name:
+            self.csapi = CS_API()
+        else:
+            model_path, config_path, vocoder_path, vocoder_config_path = self.download_model_by_name(model_name)
 
-        # init synthesizer
-        # None values are fetch from the model
-        self.synthesizer = Synthesizer(
-            tts_checkpoint=model_path,
-            tts_config_path=config_path,
-            tts_speakers_file=None,
-            tts_languages_file=None,
-            vocoder_checkpoint=vocoder_path,
-            vocoder_config=vocoder_config_path,
-            encoder_checkpoint=None,
-            encoder_config=None,
-            use_cuda=gpu,
-        )
+            # init synthesizer
+            # None values are fetched from the model
+            self.synthesizer = Synthesizer(
+                tts_checkpoint=model_path,
+                tts_config_path=config_path,
+                tts_speakers_file=None,
+                tts_languages_file=None,
+                vocoder_checkpoint=vocoder_path,
+                vocoder_config=vocoder_config_path,
+                encoder_checkpoint=None,
+                encoder_config=None,
+                use_cuda=gpu,
+            )
 
     def load_tts_model_by_path(
         self, model_path: str, config_path: str, vocoder_path: str = None, vocoder_config: str = None, gpu: bool = False
@@ -167,17 +402,88 @@ class TTS:
             use_cuda=gpu,
         )
 
-    def _check_arguments(self, speaker: str = None, language: str = None, speaker_wav: str = None):
-        if self.is_multi_speaker and (speaker is None and speaker_wav is None):
-            raise ValueError("Model is multi-speaker but no speaker is provided.")
-        if self.is_multi_lingual and language is None:
-            raise ValueError("Model is multi-lingual but no language is provided.")
-        if not self.is_multi_speaker and speaker is not None:
-            raise ValueError("Model is not multi-speaker but speaker is provided.")
-        if not self.is_multi_lingual and language is not None:
-            raise ValueError("Model is not multi-lingual but language is provided.")
+    def _check_arguments(
+        self,
+        speaker: str = None,
+        language: str = None,
+        speaker_wav: str = None,
+        emotion: str = None,
+        speed: float = None,
+    ) -> None:
+        """Check if the arguments are valid for the model."""
+        if not self.is_coqui_studio:
+            # check for the coqui tts models
+            if self.is_multi_speaker and (speaker is None and speaker_wav is None):
+                raise ValueError("Model is multi-speaker but no `speaker` is provided.")
+            if self.is_multi_lingual and language is None:
+                raise ValueError("Model is multi-lingual but no `language` is provided.")
+            if not self.is_multi_speaker and speaker is not None:
+                raise ValueError("Model is not multi-speaker but `speaker` is provided.")
+            if not self.is_multi_lingual and language is not None:
+                raise ValueError("Model is not multi-lingual but `language` is provided.")
+            if emotion is not None and speed is not None:
+                raise ValueError("Emotion and speed can only be used with Coqui Studio models.")
+        else:
+            if emotion is None:
+                emotion = "Neutral"
+            if speed is None:
+                speed = 1.0
+            # check for the studio models
+            if speaker_wav is not None:
+                raise ValueError("Coqui Studio models do not support `speaker_wav` argument.")
+            if speaker is not None:
+                raise ValueError("Coqui Studio models do not support `speaker` argument.")
+            if language is not None and language != "en":
+                raise ValueError("Coqui Studio models currently support only `language=en` argument.")
+            if emotion not in ["Neutral", "Happy", "Sad", "Angry", "Dull"]:
+                raise ValueError(f"Emotion - `{emotion}` - must be one of `Neutral`, `Happy`, `Sad`, `Angry`, `Dull`.")
 
-    def tts(self, text: str, speaker: str = None, language: str = None, speaker_wav: str = None):
+    def tts_coqui_studio(
+        self,
+        text: str,
+        speaker_name: str = None,
+        language: str = None,
+        emotion: str = "Neutral",
+        speed: float = 1.0,
+        file_path: str = None,
+    ):
+        """Convert text to speech using Coqui Studio models. Use the `CS_API` class if you are only interested in the API.
+
+        Args:
+            text (str):
+                Input text to synthesize.
+            speaker_name (str, optional):
+                Speaker name from Coqui Studio. Defaults to None.
+            language (str, optional):
+                Language code. Coqui Studio currently supports only English. Defaults to None.
+            emotion (str, optional):
One of "Neutral", "Happy", "Sad", "Angry", "Dull". Defaults to "Neutral". + speed (float, optional): + Speed of the speech. Defaults to 1.0. + file_path (str, optional): + Path to save the output file. When None it returns the `np.ndarray` of waveform. Defaults to None. + """ + speaker_name = self.model_name.split("/")[2] + if file_path is None: + return self.csapi.tts_to_file( + text=text, + speaker_name=speaker_name, + language=language, + speed=speed, + emotion=emotion, + file_path=file_path, + )[0] + return self.csapi.tts(text=text, speaker_name=speaker_name, language=language, speed=speed, emotion=emotion)[0] + + def tts( + self, + text: str, + speaker: str = None, + language: str = None, + speaker_wav: str = None, + emotion: str = None, + speed: float = None, + ): """Convert text to speech. Args: @@ -192,8 +498,17 @@ class TTS: speaker_wav (str, optional): Path to a reference wav file to use for voice cloning with supporting models like YourTTS. Defaults to None. + emotion (str, optional): + Emotion to use for 🐸Coqui Studio models. If None, Studio models use "Neutral". Defaults to None. + speed (float, optional): + Speed factor to use for 🐸Coqui Studio models, between 0 and 2.0. If None, Studio models use 1.0. + Defaults to None. """ - self._check_arguments(speaker=speaker, language=language, speaker_wav=speaker_wav) + self._check_arguments(speaker=speaker, language=language, speaker_wav=speaker_wav, emotion=emotion, speed=speed) + if self.csapi is not None: + return self.tts_coqui_studio( + text=text, speaker_name=speaker, language=language, emotion=emotion, speed=speed + ) wav = self.synthesizer.tts( text=text, @@ -213,6 +528,8 @@ class TTS: speaker: str = None, language: str = None, speaker_wav: str = None, + emotion: str = "Neutral", + speed: float = 1.0, file_path: str = "output.wav", ): """Convert text to speech. @@ -229,11 +546,22 @@ class TTS: speaker_wav (str, optional): Path to a reference wav file to use for voice cloning with supporting models like YourTTS. Defaults to None. + emotion (str, optional): + Emotion to use for 🐸Coqui Studio models. Defaults to "Neutral". + speed (float, optional): + Speed factor to use for 🐸Coqui Studio models, between 0.0 and 2.0. Defaults to None. file_path (str, optional): Output file path. Defaults to "output.wav". """ + self._check_arguments(speaker=speaker, language=language, speaker_wav=speaker_wav) + + if self.csapi is not None: + return self.tts_coqui_studio( + text=text, speaker_name=speaker, language=language, emotion=emotion, speed=speed, file_path=file_path + ) wav = self.tts(text=text, speaker=speaker, language=language, speaker_wav=speaker_wav) self.synthesizer.save_wav(wav=wav, path=file_path) + return file_path def voice_conversion( self, diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index 6c368a2b..104eb3a0 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -290,7 +290,14 @@ class Synthesizer(object): language_id = list(self.tts_model.language_manager.name_to_id.values())[0] elif language_name and isinstance(language_name, str): - language_id = self.tts_model.language_manager.name_to_id[language_name] + try: + language_id = self.tts_model.language_manager.name_to_id[language_name] + except KeyError as e: + raise ValueError( + f" [!] Looks like you use a multi-lingual model. " + f"Language {language_name} is not in the available languages: " + f"{self.tts_model.language_manager.name_to_id.keys()}." 
diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py
index 6c368a2b..104eb3a0 100644
--- a/TTS/utils/synthesizer.py
+++ b/TTS/utils/synthesizer.py
@@ -290,7 +290,14 @@ class Synthesizer(object):
                     language_id = list(self.tts_model.language_manager.name_to_id.values())[0]
 
                 elif language_name and isinstance(language_name, str):
-                    language_id = self.tts_model.language_manager.name_to_id[language_name]
+                    try:
+                        language_id = self.tts_model.language_manager.name_to_id[language_name]
+                    except KeyError as e:
+                        raise ValueError(
+                            f" [!] Looks like you are using a multi-lingual model. "
+                            f"Language {language_name} is not in the available languages: "
+                            f"{self.tts_model.language_manager.name_to_id.keys()}."
+                        ) from e
 
                 elif not language_name:
                     raise ValueError(
diff --git a/docs/source/inference.md b/docs/source/inference.md
index a69d714e..4abdd327 100644
--- a/docs/source/inference.md
+++ b/docs/source/inference.md
@@ -109,7 +109,7 @@ tts-server --model_name "<model_type>/<language>/<dataset>/<model_name>" \
     --vocoder_name "<model_type>/<language>/<dataset>/<model_name>"
 ```
 
-## Python API
+## Python 🐸TTS API
 
 You can run a multi-speaker and multi-lingual model in Python as
 
@@ -163,4 +163,34 @@ tts.tts_with_vc_to_file(
     speaker_wav="target/speaker.wav",
     file_path="ouptut.wav"
 )
 ```
+
+Example text to speech using [🐸Coqui Studio](https://coqui.ai) models. You can use all of your available speakers in the studio.
+A [🐸Coqui Studio](https://coqui.ai) API token is required. You can get it from the [account page](https://coqui.ai/account).
+You should set the `COQUI_STUDIO_TOKEN` environment variable to use the API token.
+
+```python
+# If you have a valid API token set, you will see the studio speakers as separate models in the list.
+# The name format is coqui_studio/en/<speaker_name>/coqui_studio
+models = TTS().list_models()
+# Init TTS with the target studio speaker
+tts = TTS(model_name="coqui_studio/en/Torcull Diarmuid/coqui_studio", progress_bar=False, gpu=False)
+# Run TTS
+tts.tts_to_file(text="This is a test.", file_path="output.wav")
+# Run TTS with emotion and speed control
+tts.tts_to_file(text="This is a test.", file_path="output.wav", emotion="Happy", speed=1.5)
+```
+
+If you just need 🐸 Coqui Studio speakers, you can use `CS_API`. It is a wrapper around the 🐸 Coqui Studio API.
+
+```python
+from TTS.api import CS_API
+
+# Init 🐸 Coqui Studio API
+# You can either set the API token as an environment variable `COQUI_STUDIO_TOKEN` or pass it as an argument.
+api = CS_API(api_token="<token>")
+api.speakers
+api.emotions
+api.list_speakers()
+api.list_voices()
+wav, sample_rate = api.tts(text="This is a test.", speaker_name=api.speakers[0].name, emotion="Happy", speed=1.5)
+```
\ No newline at end of file
diff --git a/tests/inference_tests/test_python_api.py b/tests/inference_tests/test_python_api.py
index 4072edab..dbfca837 100644
--- a/tests/inference_tests/test_python_api.py
+++ b/tests/inference_tests/test_python_api.py
@@ -2,12 +2,41 @@ import os
 import unittest
 
 from tests import get_tests_data_path, get_tests_output_path
-from TTS.api import TTS
+from TTS.api import CS_API, TTS
 
 OUTPUT_PATH = os.path.join(get_tests_output_path(), "test_python_api.wav")
 cloning_test_wav_path = os.path.join(get_tests_data_path(), "ljspeech/wavs/LJ001-0028.wav")
 
 
+class CS_APITest(unittest.TestCase):
+    def test_speakers(self):
+        tts = CS_API()
+        self.assertGreater(len(tts.speakers), 1)
+
+    def test_emotions(self):
+        tts = CS_API()
+        self.assertGreater(len(tts.emotions), 1)
+
+    def test_list_calls(self):
+        tts = CS_API()
+        self.assertGreater(len(tts.list_voices()), 1)
+        self.assertGreater(len(tts.list_speakers()), 1)
+        self.assertGreater(len(tts.list_all_speakers()), 1)
+        self.assertGreater(len(tts.list_speakers_as_tts_models()), 1)
+
+    def test_name_to_speaker(self):
+        tts = CS_API()
+        speaker_name = tts.list_speakers_as_tts_models()[0].split("/")[2]
+        speaker = tts.name_to_speaker(speaker_name)
+        self.assertEqual(speaker.name, speaker_name)
+
+    def test_tts(self):
+        tts = CS_API()
+        wav, sr = tts.tts(text="This is a test.", speaker_name=tts.list_speakers()[0].name)
+        self.assertEqual(sr, 44100)
+        self.assertGreater(len(wav), 1)
+
+
 class TTSTest(unittest.TestCase):
     def test_single_speaker_model(self):
         tts = TTS(model_name="tts_models/de/thorsten/tacotron2-DDC", progress_bar=False, gpu=False)
@@ -26,6 +55,30 @@ class TTSTest(unittest.TestCase):
         self.assertIsNone(tts.speakers)
         self.assertIsNone(tts.languages)
 
+    def test_studio_model(self):
+        tts = TTS(model_name="coqui_studio/en/Torcull Diarmuid/coqui_studio")
+        tts.tts_to_file(text="This is a test.")
+
+        # check speed > 2.0 raises error
+        raised_error = False
+        try:
+            _ = tts.tts(text="This is a test.", speed=4.0, emotion="Sad")  # should raise error with speed > 2.0
+        except ValueError:
+            raised_error = True
+        self.assertTrue(raised_error)
+
+        # check invalid emotion raises error
+        raised_error = False
+        try:
+            _ = tts.tts(text="This is a test.", speed=2.0, emotion="No Emo")  # should raise error with invalid emotion
+        except ValueError:
+            raised_error = True
+        self.assertTrue(raised_error)
+
+        # check valid call
+        wav = tts.tts(text="This is a test.", speed=2.0, emotion="Sad")
+        self.assertGreater(len(wav), 0)
+
     def test_multi_speaker_multi_lingual_model(self):
         tts = TTS()
         tts.load_tts_model_by_name(tts.models[0])  # YourTTS