mirror of https://github.com/coqui-ai/TTS.git
Update Studio API for XTTS (#2861)
* Update Studio API for XTTS
* Update the docs
* Update README.md
* Update README.md
* Update README

parent 37b558ccb9
commit 3a104d5c49
README.md (52 lines changed)

@@ -204,9 +204,11 @@ tts = TTS(model_name)
 wav = tts.tts("This is a test! This is also a test!!", speaker=tts.speakers[0], language=tts.languages[0])
 # Text to speech to a file
 tts.tts_to_file(text="Hello world!", speaker=tts.speakers[0], language=tts.languages[0], file_path="output.wav")
+```

-# Running a single speaker model
+#### Running a single speaker model

+```python
 # Init TTS with the target model name
 tts = TTS(model_name="tts_models/de/thorsten/tacotron2-DDC", progress_bar=False, gpu=False)
 # Run TTS

@@ -218,15 +220,21 @@ tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_
 tts.tts_to_file("This is voice cloning.", speaker_wav="my/cloning/audio.wav", language="en", file_path="output.wav")
 tts.tts_to_file("C'est le clonage de la voix.", speaker_wav="my/cloning/audio.wav", language="fr-fr", file_path="output.wav")
 tts.tts_to_file("Isso é clonagem de voz.", speaker_wav="my/cloning/audio.wav", language="pt-br", file_path="output.wav")
+```

+#### Example voice conversion

-# Example voice conversion converting speaker of the `source_wav` to the speaker of the `target_wav`
+Converting the voice in `source_wav` to the voice of `target_wav`

+```python
 tts = TTS(model_name="voice_conversion_models/multilingual/vctk/freevc24", progress_bar=False, gpu=True)
 tts.voice_conversion_to_file(source_wav="my/source.wav", target_wav="my/target.wav", file_path="output.wav")
+```

-# Example voice cloning by a single speaker TTS model combining with the voice conversion model. This way, you can
-# clone voices by using any model in 🐸TTS.
+#### Example voice cloning together with the voice conversion model.
+This way, you can clone voices by using any model in 🐸TTS.

+```python

 tts = TTS("tts_models/de/thorsten/tacotron2-DDC")
 tts.tts_with_vc_to_file(

@@ -234,29 +242,43 @@ tts.tts_with_vc_to_file(
     speaker_wav="target/speaker.wav",
     file_path="output.wav"
 )
+```

-# Example text to speech using [🐸Coqui Studio](https://coqui.ai) models.
+#### Example using [🐸Coqui Studio](https://coqui.ai) voices.
+You access all of your cloned voices and built-in speakers in [🐸Coqui Studio](https://coqui.ai).
+To do this, you'll need an API token, which you can obtain from the [account page](https://coqui.ai/account).
+After obtaining the API token, you'll need to configure the COQUI_STUDIO_TOKEN environment variable.

-# You can use all of your available speakers in the studio.
-# [🐸Coqui Studio](https://coqui.ai) API token is required. You can get it from the [account page](https://coqui.ai/account).
-# You should set the `COQUI_STUDIO_TOKEN` environment variable to use the API token.
+Once you have a valid API token in place, the studio speakers will be displayed as distinct models within the list.
+These models will follow the naming convention `coqui_studio/en/<studio_speaker_name>/coqui_studio`

-# If you have a valid API token set you will see the studio speakers as separate models in the list.
-# The name format is coqui_studio/en/<studio_speaker_name>/coqui_studio
-models = TTS().list_models()
+```python
+# XTTS model
+models = TTS(cs_api_model="XTTS").list_models()
 # Init TTS with the target studio speaker
 tts = TTS(model_name="coqui_studio/en/Torcull Diarmuid/coqui_studio", progress_bar=False, gpu=False)
 # Run TTS
 tts.tts_to_file(text="This is a test.", file_path=OUTPUT_PATH)

+# V1 model
+models = TTS(cs_api_model="V1").list_models()
 # Run TTS with emotion and speed control
+# Emotion control only works with V1 model
 tts.tts_to_file(text="This is a test.", file_path=OUTPUT_PATH, emotion="Happy", speed=1.5)

+# XTTS-multilingual
+models = TTS(cs_api_model="XTTS-multilingual").list_models()
+# Run TTS with emotion and speed control
+# Emotion control only works with V1 model
+tts.tts_to_file(text="Das ist ein Test.", file_path=OUTPUT_PATH, language="de", speed=1.0)
+```

-#Example text to speech using **Fairseq models in ~1100 languages** 🤯.
-#For these models use the following name format: `tts_models/<lang-iso_code>/fairseq/vits`.
-#You can find the list of language ISO codes [here](https://dl.fbaipublicfiles.com/mms/tts/all-tts-languages.html) and learn about the Fairseq models [here](https://github.com/facebookresearch/fairseq/tree/main/examples/mms).
+#### Example text to speech using **Fairseq models in ~1100 languages** 🤯.
+For Fairseq models, use the following name format: `tts_models/<lang-iso_code>/fairseq/vits`.
+You can find the language ISO codes [here](https://dl.fbaipublicfiles.com/mms/tts/all-tts-languages.html)
+and learn about the Fairseq models [here](https://github.com/facebookresearch/fairseq/tree/main/examples/mms).

+```python
 # TTS with on the fly voice conversion
 api = TTS("tts_models/deu/fairseq/vits")
 api.tts_with_vc_to_file(
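The updated README walks through the new Studio setup: set `COQUI_STUDIO_TOKEN`, and the studio speakers then appear as regular entries in the model list. A minimal sketch of that flow (the token value is a placeholder, and the printed names depend on your account):

    import os

    from TTS.api import TTS

    # Placeholder token; without a valid one the studio models are skipped.
    os.environ["COQUI_STUDIO_TOKEN"] = "<token>"

    # `cs_api_model` picks which Studio backend the listed speakers map to.
    models = TTS(cs_api_model="XTTS").list_models()
    print([m for m in models if m.startswith("coqui_studio/")][:5])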
TTS/api.py (249 lines changed)

@@ -1,234 +1,15 @@
-import http.client
-import json
-import os
 import tempfile
-import urllib.request
 from pathlib import Path
-from typing import Tuple, Union
+from typing import Union

 import numpy as np
-import requests
-from scipy.io import wavfile

+from TTS.cs_api import CS_API
 from TTS.utils.audio.numpy_transforms import save_wav
 from TTS.utils.manage import ModelManager
 from TTS.utils.synthesizer import Synthesizer


-class Speaker(object):
-    """Convert dict to object."""
-
-    def __init__(self, d, is_voice=False):
-        self.is_voice = is_voice
-        for k, v in d.items():
-            if isinstance(k, (list, tuple)):
-                setattr(self, k, [Speaker(x) if isinstance(x, dict) else x for x in v])
-            else:
-                setattr(self, k, Speaker(v) if isinstance(v, dict) else v)
-
-    def __repr__(self):
-        return str(self.__dict__)
-
-
-class CS_API:
-    """🐸Coqui Studio API Wrapper.
-
-    🐸Coqui Studio is the most advanced voice generation platform. You can generate new voices by voice cloning, voice
-    interpolation, or our unique prompt to voice technology. It also provides a set of built-in voices with different
-    characteristics. You can use these voices to generate new audio files or use them in your applications.
-    You can use all the built-in and your own 🐸Coqui Studio speakers with this API with an API token.
-    You can signup to 🐸Coqui Studio from https://app.coqui.ai/auth/signup and get an API token from
-    https://app.coqui.ai/account. We can either enter the token as an environment variable as
-    `export COQUI_STUDIO_TOKEN=<token>` or pass it as `CS_API(api_token=<toke>)`.
-    Visit https://app.coqui.ai/api for more information.
-
-    Example listing all available speakers:
-        >>> from TTS.api import CS_API
-        >>> tts = CS_API()
-        >>> tts.speakers
-
-    Example listing all emotions:
-        >>> from TTS.api import CS_API
-        >>> tts = CS_API()
-        >>> tts.emotions
-
-    Example with a built-in 🐸 speaker:
-        >>> from TTS.api import CS_API
-        >>> tts = CS_API()
-        >>> wav, sr = api.tts("Hello world", speaker_name="Claribel Dervla")
-        >>> filepath = tts.tts_to_file(text="Hello world!", speaker_name=tts.speakers[0].name, file_path="output.wav")
-    """
-
-    def __init__(self, api_token=None):
-        self.api_token = api_token
-        self.api_prefix = "/api/v2"
-        self.headers = None
-        self._speakers = None
-        self._check_token()
-
-    @staticmethod
-    def ping_api():
-        URL = "https://coqui.gateway.scarf.sh/tts/api"
-        _ = requests.get(URL)
-
-    @property
-    def speakers(self):
-        if self._speakers is None:
-            self._speakers = self.list_all_speakers()
-        return self._speakers
-
-    @property
-    def emotions(self):
-        """Return a list of available emotions.
-
-        TODO: Get this from the API endpoint.
-        """
-        return ["Neutral", "Happy", "Sad", "Angry", "Dull"]
-
-    def _check_token(self):
-        if self.api_token is None:
-            self.api_token = os.environ.get("COQUI_STUDIO_TOKEN")
-            self.headers = {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_token}"}
-        if not self.api_token:
-            raise ValueError(
-                "No API token found for 🐸Coqui Studio voices - https://coqui.ai \n"
-                "Visit 🔗https://app.coqui.ai/account to get one.\n"
-                "Set it as an environment variable `export COQUI_STUDIO_TOKEN=<token>`\n"
-                ""
-            )
-
-    def list_all_speakers(self):
-        """Return both built-in Coqui Studio speakers and custom voices created by the user."""
-        return self.list_speakers() + self.list_voices()
-
-    def list_speakers(self):
-        """List built-in Coqui Studio speakers."""
-        self._check_token()
-        conn = http.client.HTTPSConnection("app.coqui.ai")
-        conn.request("GET", f"{self.api_prefix}/speakers?per_page=100", headers=self.headers)
-        res = conn.getresponse()
-        data = res.read()
-        return [Speaker(s) for s in json.loads(data)["result"]]
-
-    def list_voices(self):
-        """List custom voices created by the user."""
-        conn = http.client.HTTPSConnection("app.coqui.ai")
-        conn.request("GET", f"{self.api_prefix}/voices", headers=self.headers)
-        res = conn.getresponse()
-        data = res.read()
-        return [Speaker(s, True) for s in json.loads(data)["result"]]
-
-    def list_speakers_as_tts_models(self):
-        """List speakers in ModelManager format."""
-        models = []
-        for speaker in self.speakers:
-            model = f"coqui_studio/en/{speaker.name}/coqui_studio"
-            models.append(model)
-        return models
-
-    def name_to_speaker(self, name):
-        for speaker in self.speakers:
-            if speaker.name == name:
-                return speaker
-        raise ValueError(f"Speaker {name} not found in {self.speakers}")
-
-    def id_to_speaker(self, speaker_id):
-        for speaker in self.speakers:
-            if speaker.id == speaker_id:
-                return speaker
-        raise ValueError(f"Speaker {speaker_id} not found.")
-
-    @staticmethod
-    def url_to_np(url):
-        tmp_file, _ = urllib.request.urlretrieve(url)
-        rate, data = wavfile.read(tmp_file)
-        return data, rate
-
-    @staticmethod
-    def _create_payload(text, speaker, emotion, speed):
-        payload = {}
-        if speaker.is_voice:
-            payload["voice_id"] = speaker.id
-        else:
-            payload["speaker_id"] = speaker.id
-        payload.update(
-            {
-                "emotion": emotion,
-                "name": speaker.name,
-                "text": text,
-                "speed": speed,
-            }
-        )
-        return payload
-
-    def tts(
-        self,
-        text: str,
-        speaker_name: str = None,
-        speaker_id=None,
-        emotion="Neutral",
-        speed=1.0,
-        language=None,  # pylint: disable=unused-argument
-    ) -> Tuple[np.ndarray, int]:
-        """Synthesize speech from text.
-
-        Args:
-            text (str): Text to synthesize.
-            speaker_name (str): Name of the speaker. You can get the list of speakers with `list_speakers()` and
-                voices (user generated speakers) with `list_voices()`.
-            speaker_id (str): Speaker ID. If None, the speaker name is used.
-            emotion (str): Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull".
-            speed (float): Speed of the speech. 1.0 is normal speed.
-            language (str): Language of the text. If None, the default language of the speaker is used.
-        """
-        self._check_token()
-        self.ping_api()
-        if speaker_name is None and speaker_id is None:
-            raise ValueError(" [!] Please provide either a `speaker_name` or a `speaker_id`.")
-        if speaker_id is None:
-            speaker = self.name_to_speaker(speaker_name)
-        else:
-            speaker = self.id_to_speaker(speaker_id)
-        conn = http.client.HTTPSConnection("app.coqui.ai")
-        payload = self._create_payload(text, speaker, emotion, speed)
-        conn.request("POST", "/api/v2/samples", json.dumps(payload), self.headers)
-        res = conn.getresponse()
-        data = res.read()
-        try:
-            wav, sr = self.url_to_np(json.loads(data)["audio_url"])
-        except KeyError as e:
-            raise ValueError(f" [!] 🐸 API returned error: {data}") from e
-        return wav, sr
-
-    def tts_to_file(
-        self,
-        text: str,
-        speaker_name: str,
-        speaker_id=None,
-        emotion="Neutral",
-        speed=1.0,
-        language=None,
-        file_path: str = None,
-    ) -> str:
-        """Synthesize speech from text and save it to a file.
-
-        Args:
-            text (str): Text to synthesize.
-            speaker_name (str): Name of the speaker. You can get the list of speakers with `list_speakers()` and
-                voices (user generated speakers) with `list_voices()`.
-            speaker_id (str): Speaker ID. If None, the speaker name is used.
-            emotion (str): Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull".
-            speed (float): Speed of the speech. 1.0 is normal speed.
-            language (str): Language of the text. If None, the default language of the speaker is used.
-            file_path (str): Path to save the file. If None, a temporary file is created.
-        """
-        if file_path is None:
-            file_path = tempfile.mktemp(".wav")
-        wav, sr = self.tts(text, speaker_name, speaker_id, emotion, speed, language)
-        wavfile.write(file_path, sr, wav)
-        return file_path


 class TTS:
     """TODO: Add voice conversion and Capacitron support."""

@@ -240,6 +21,7 @@ class TTS:
         vocoder_path: str = None,
         vocoder_config_path: str = None,
         progress_bar: bool = True,
+        cs_api_model: str = "XTTS",
         gpu=False,
     ):
         """🐸TTS python interface that allows to load and use the released models.

@@ -275,6 +57,9 @@ class TTS:
             vocoder_path (str, optional): Path to the vocoder checkpoint. Defaults to None.
             vocoder_config_path (str, optional): Path to the vocoder config. Defaults to None.
             progress_bar (bool, optional): Whether to print a progress bar while downloading a model. Defaults to True.
+            cs_api_model (str, optional): Name of the model to use for the Coqui Studio API. Available models are
+                "XTTS", "XTTS-multilingual", "V1". You can also use `TTS.cs_api.CS_API` for more control.
+                Defaults to "XTTS".
             gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
         """
         self.manager = ModelManager(models_file=self.get_models_file_path(), progress_bar=progress_bar, verbose=False)

@@ -282,6 +67,7 @@ class TTS:
         self.synthesizer = None
         self.voice_converter = None
         self.csapi = None
+        self.cs_api_model = cs_api_model
         self.model_name = None

         if model_name is not None:

@@ -333,10 +119,9 @@ class TTS:
     def get_models_file_path():
         return Path(__file__).parent / ".models.json"

-    @staticmethod
-    def list_models():
+    def list_models(self):
         try:
-            csapi = CS_API()
+            csapi = CS_API(model=self.cs_api_model)
             models = csapi.list_speakers_as_tts_models()
         except ValueError as e:
             print(e)

@@ -468,7 +253,7 @@ class TTS:
         text: str,
         speaker_name: str = None,
         language: str = None,
-        emotion: str = "Neutral",
+        emotion: str = None,
         speed: float = 1.0,
         file_path: str = None,
     ) -> Union[np.ndarray, str]:

@@ -479,10 +264,11 @@ class TTS:
                 Input text to synthesize.
             speaker_name (str, optional):
                 Speaker name from Coqui Studio. Defaults to None.
-            language (str, optional):
-                Language code. Coqui Studio currently supports only English. Defaults to None.
+            language (str): Language of the text. If None, the default language of the speaker is used. Language is only
+                supported by `XTTS-multilang` model. Currently supports en, de, es, fr, it, pt, pl. Defaults to "en".
             emotion (str, optional):
-                Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull". Defaults to "Neutral".
+                Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull". Emotions are only available
+                with "V1" model. Defaults to None.
             speed (float, optional):
                 Speed of the speech. Defaults to 1.0.
             file_path (str, optional):

@@ -521,9 +307,8 @@ class TTS:
             speaker (str, optional):
                 Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by
                 `tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None.
-            language (str, optional):
-                Language code for multi-lingual models. You can check whether loaded model is multi-lingual
-                `tts.is_multi_lingual` and list available languages by `tts.languages`. Defaults to None.
+            language (str): Language of the text. If None, the default language of the speaker is used. Language is only
+                supported by `XTTS-multilang` model. Currently supports en, de, es, fr, it, pt, pl. Defaults to "en".
             speaker_wav (str, optional):
                 Path to a reference wav file to use for voice cloning with supporting models like YourTTS.
                 Defaults to None.

@@ -559,7 +344,7 @@ class TTS:
         speaker: str = None,
         language: str = None,
         speaker_wav: str = None,
-        emotion: str = "Neutral",
+        emotion: str = None,
         speed: float = 1.0,
         file_path: str = "output.wav",
         **kwargs,
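Taken together, the TTS/api.py changes store `cs_api_model` on the TTS instance and forward it to CS_API, which is why `list_models()` becomes an instance method instead of a static one. A short sketch of the resulting call path (the speaker name and output path are illustrative, following the README examples above):

    from TTS.api import TTS

    # The stored `cs_api_model` decides which Studio backend serves the request.
    tts = TTS(model_name="coqui_studio/en/Torcull Diarmuid/coqui_studio",
              cs_api_model="V1", progress_bar=False, gpu=False)

    # `emotion` now defaults to None and is only honored by the V1 backend.
    tts.tts_to_file(text="This is a test.", emotion="Happy", speed=1.5,
                    file_path="output.wav")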
TTS/bin/synthesize.py

@@ -185,11 +185,22 @@ If you don't specify any models, then it uses LJSpeech based English model.
     parser.add_argument("--encoder_config_path", type=str, help="Path to speaker encoder config file.", default=None)

     # args for coqui studio
+    parser.add_argument(
+        "--cs_model",
+        type=str,
+        help="Name of the 🐸Coqui Studio model. Available models are `XTTS`, `XTTS-multilingual`, `V1`.",
+    )
     parser.add_argument(
         "--emotion",
         type=str,
-        help="Emotion to condition the model with. Only available for 🐸Coqui Studio models.",
-        default="Neutral",
+        help="Emotion to condition the model with. Only available for 🐸Coqui Studio `V1` model.",
+        default=None,
+    )
+    parser.add_argument(
+        "--language",
+        type=str,
+        help="Language to condition the model with. Only available for 🐸Coqui Studio `XTTS-multilingual` model.",
+        default=None,
     )

     # args for multi-speaker synthesis

@@ -335,8 +346,8 @@ If you don't specify any models, then it uses LJSpeech based English model.
     # CASE3: TTS with coqui studio models
     if "coqui_studio" in args.model_name:
         print(" > Using 🐸Coqui Studio model: ", args.model_name)
-        api = TTS(model_name=args.model_name)
-        api.tts_to_file(text=args.text, emotion=args.emotion, file_path=args.out_path)
+        api = TTS(model_name=args.model_name, cs_api_model=args.cs_model)
+        api.tts_to_file(text=args.text, emotion=args.emotion, file_path=args.out_path, language=args.language)
         print(" > Saving output to ", args.out_path)
         return
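With the new flags wired through, a Studio synthesis call from the shell might look like this (a hypothetical invocation, assuming the `tts` console script installed by the package):

    tts --model_name "coqui_studio/en/Torcull Diarmuid/coqui_studio" \
        --cs_model "V1" --emotion "Happy" \
        --text "This is a test." --out_path output.wav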
TTS/cs_api.py (new file, 338 lines)

import http.client
import json
import os
import tempfile
import urllib.request
from typing import Tuple

import numpy as np
import requests
from scipy.io import wavfile


class Speaker(object):
    """Convert dict to object."""

    def __init__(self, d, is_voice=False):
        self.is_voice = is_voice
        for k, v in d.items():
            if isinstance(k, (list, tuple)):
                setattr(self, k, [Speaker(x) if isinstance(x, dict) else x for x in v])
            else:
                setattr(self, k, Speaker(v) if isinstance(v, dict) else v)

    def __repr__(self):
        return str(self.__dict__)


class CS_API:
    """🐸Coqui Studio API Wrapper.

    🐸Coqui Studio is the most advanced voice generation platform. You can generate new voices by voice cloning, voice
    interpolation, or our unique prompt to voice technology. It also provides a set of built-in voices with different
    characteristics. You can use these voices to generate new audio files or use them in your applications.
    You can use all the built-in and your own 🐸Coqui Studio speakers with this API with an API token.
    You can signup to 🐸Coqui Studio from https://app.coqui.ai/auth/signup and get an API token from
    https://app.coqui.ai/account. We can either enter the token as an environment variable as
    `export COQUI_STUDIO_TOKEN=<token>` or pass it as `CS_API(api_token=<token>)`.
    Visit https://app.coqui.ai/api for more information.

    Args:
        api_token (str): 🐸Coqui Studio API token. If not provided, it will be read from the environment variable
            `COQUI_STUDIO_TOKEN`.
        model (str): 🐸Coqui Studio model. It can be either `V1`, `XTTS`, or `XTTS-multilang`. Default is `XTTS`.

    Example listing all available speakers:
        >>> from TTS.api import CS_API
        >>> tts = CS_API()
        >>> tts.speakers

    Example listing all emotions:
        >>> # emotions are only available for `V1` model
        >>> from TTS.api import CS_API
        >>> tts = CS_API(model="V1")
        >>> tts.emotions

    Example with a built-in 🐸 speaker:
        >>> from TTS.api import CS_API
        >>> tts = CS_API()
        >>> wav, sr = tts.tts("Hello world", speaker_name=tts.speakers[0].name)
        >>> filepath = tts.tts_to_file(text="Hello world!", speaker_name=tts.speakers[0].name, file_path="output.wav")

    Example with multi-language model:
        >>> from TTS.api import CS_API
        >>> tts = CS_API(model="XTTS-multilang")
        >>> wav, sr = tts.tts("Hello world", speaker_name=tts.speakers[0].name, language="en")
    """

    MODEL_ENDPOINTS = {
        "V1": {
            "list_speakers": "https://app.coqui.ai/api/v2/speakers",
            "synthesize": "https://app.coqui.ai/api/v2/samples",
            "list_voices": "https://app.coqui.ai/api/v2/voices",
        },
        "XTTS": {
            "list_speakers": "https://app.coqui.ai/api/v2/speakers",
            "synthesize": "https://app.coqui.ai/api/v2/samples/xtts/render/",
            "list_voices": "https://app.coqui.ai/api/v2/voices/xtts/",
        },
        "XTTS-multilang": {
            "list_speakers": "https://app.coqui.ai/api/v2/speakers",
            "synthesize": "https://app.coqui.ai/api/v2/samples/multilingual/render/",
            "list_voices": "https://app.coqui.ai/api/v2/voices/xtts/",
        },
    }

    SUPPORTED_LANGUAGES = ["en", "es", "de", "fr", "it", "pt", "pl"]

    def __init__(self, api_token=None, model="XTTS"):
        self.api_token = api_token
        self.model = model
        self.headers = None
        self._speakers = None
        self._check_token()

    @staticmethod
    def ping_api():
        URL = "https://coqui.gateway.scarf.sh/tts/api"
        _ = requests.get(URL)

    @property
    def speakers(self):
        if self._speakers is None:
            self._speakers = self.list_all_speakers()
        return self._speakers

    @property
    def emotions(self):
        """Return a list of available emotions.

        TODO: Get this from the API endpoint.
        """
        if self.model == "V1":
            return ["Neutral", "Happy", "Sad", "Angry", "Dull"]
        else:
            raise ValueError(f"❗ Emotions are not available for {self.model}.")

    def _check_token(self):
        if self.api_token is None:
            self.api_token = os.environ.get("COQUI_STUDIO_TOKEN")
            self.headers = {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_token}"}
        if not self.api_token:
            raise ValueError(
                "No API token found for 🐸Coqui Studio voices - https://coqui.ai \n"
                "Visit 🔗https://app.coqui.ai/account to get one.\n"
                "Set it as an environment variable `export COQUI_STUDIO_TOKEN=<token>`\n"
                ""
            )

    def list_all_speakers(self):
        """Return both built-in Coqui Studio speakers and custom voices created by the user."""
        return self.list_speakers() + self.list_voices()

    def list_speakers(self):
        """List built-in Coqui Studio speakers."""
        self._check_token()
        conn = http.client.HTTPSConnection("app.coqui.ai")
        url = self.MODEL_ENDPOINTS[self.model]["list_speakers"]
        conn.request("GET", f"{url}?per_page=100", headers=self.headers)
        res = conn.getresponse()
        data = res.read()
        return [Speaker(s) for s in json.loads(data)["result"]]

    def list_voices(self):
        """List custom voices created by the user."""
        conn = http.client.HTTPSConnection("app.coqui.ai")
        url = self.MODEL_ENDPOINTS[self.model]["list_voices"]
        conn.request("GET", f"{url}", headers=self.headers)
        res = conn.getresponse()
        data = res.read()
        return [Speaker(s, True) for s in json.loads(data)["result"]]

    def list_speakers_as_tts_models(self):
        """List speakers in ModelManager format."""
        models = []
        for speaker in self.speakers:
            model = f"coqui_studio/multilingual/{speaker.name}/{self.model}"
            models.append(model)
        return models

    def name_to_speaker(self, name):
        for speaker in self.speakers:
            if speaker.name == name:
                return speaker
        raise ValueError(f"Speaker {name} not found in {self.speakers}")

    def id_to_speaker(self, speaker_id):
        for speaker in self.speakers:
            if speaker.id == speaker_id:
                return speaker
        raise ValueError(f"Speaker {speaker_id} not found.")

    @staticmethod
    def url_to_np(url):
        tmp_file, _ = urllib.request.urlretrieve(url)
        rate, data = wavfile.read(tmp_file)
        return data, rate

    @staticmethod
    def _create_payload(model, text, speaker, speed, emotion, language):
        payload = {}
        # if speaker.is_voice:
        payload["voice_id"] = speaker.id
        # else:
        payload["speaker_id"] = speaker.id

        if model == "V1":
            payload.update(
                {
                    "emotion": emotion,
                    "name": speaker.name,
                    "text": text,
                    "speed": speed,
                }
            )
        elif model == "XTTS":
            payload.update(
                {
                    "name": speaker.name,
                    "text": text,
                    "speed": speed,
                }
            )
        elif model == "XTTS-multilang":
            payload.update(
                {
                    "name": speaker.name,
                    "text": text,
                    "speed": speed,
                    "language": language,
                }
            )
        else:
            raise ValueError(f"❗ Unknown model {model}")
        return payload

    def _check_tts_args(self, text, speaker_name, speaker_id, emotion, speed, language):
        assert text is not None, "❗ text is required for V1 model."
        assert speaker_name is not None, "❗ speaker_name is required for V1 model."
        if self.model == "V1":
            if emotion is None:
                emotion = "Neutral"
            assert language is None, "❗ language is not supported for V1 model."
        elif self.model == "XTTS":
            assert emotion is None, f"❗ Emotions are not supported for XTTS model. Use V1 model."
            assert language is None, "❗ Language is not supported for XTTS model. Use XTTS-multilang model."
        elif self.model == "XTTS-multilang":
            assert emotion is None, f"❗ Emotions are not supported for XTTS-multilang model. Use V1 model."
            assert language is not None, "❗ Language is required for XTTS-multilang model."
            assert (
                language in self.SUPPORTED_LANGUAGES
            ), f"❗ Language {language} is not yet supported. Use one of: en, es, de, fr, it, pt, pl"
        return text, speaker_name, speaker_id, emotion, speed, language

    def tts(
        self,
        text: str,
        speaker_name: str = None,
        speaker_id=None,
        emotion=None,
        speed=1.0,
        language=None,  # pylint: disable=unused-argument
    ) -> Tuple[np.ndarray, int]:
        """Synthesize speech from text.

        Args:
            text (str): Text to synthesize.
            speaker_name (str): Name of the speaker. You can get the list of speakers with `list_speakers()` and
                voices (user generated speakers) with `list_voices()`.
            speaker_id (str): Speaker ID. If None, the speaker name is used.
            emotion (str): Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull". Emotions are only
                supported by `V1` model. Defaults to None.
            speed (float): Speed of the speech. 1.0 is normal speed.
            language (str): Language of the text. If None, the default language of the speaker is used. Language is only
                supported by `XTTS-multilang` model. Currently supports en, de, es, fr, it, pt, pl. Defaults to "en".
        """
        self._check_token()
        self.ping_api()

        if speaker_name is None and speaker_id is None:
            raise ValueError(" [!] Please provide either a `speaker_name` or a `speaker_id`.")
        if speaker_id is None:
            speaker = self.name_to_speaker(speaker_name)
        else:
            speaker = self.id_to_speaker(speaker_id)

        text, speaker_name, speaker_id, emotion, speed, language = self._check_tts_args(
            text, speaker_name, speaker_id, emotion, speed, language
        )

        conn = http.client.HTTPSConnection("app.coqui.ai")
        payload = self._create_payload(self.model, text, speaker, speed, emotion, language)
        url = self.MODEL_ENDPOINTS[self.model]["synthesize"]
        conn.request("POST", url, json.dumps(payload), self.headers)
        res = conn.getresponse()
        data = res.read()
        try:
            wav, sr = self.url_to_np(json.loads(data)["audio_url"])
        except KeyError as e:
            raise ValueError(f" [!] 🐸 API returned error: {data}") from e
        return wav, sr

    def tts_to_file(
        self,
        text: str,
        speaker_name: str,
        speaker_id=None,
        emotion=None,
        speed=1.0,
        language=None,
        file_path: str = None,
    ) -> str:
        """Synthesize speech from text and save it to a file.

        Args:
            text (str): Text to synthesize.
            speaker_name (str): Name of the speaker. You can get the list of speakers with `list_speakers()` and
                voices (user generated speakers) with `list_voices()`.
            speaker_id (str): Speaker ID. If None, the speaker name is used.
            emotion (str): Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull".
            speed (float): Speed of the speech. 1.0 is normal speed.
            language (str): Language of the text. If None, the default language of the speaker is used. Language is only
                supported by `XTTS-multilang` model. Currently supports en, de, es, fr, it, pt, pl. Defaults to "en".
            file_path (str): Path to save the file. If None, a temporary file is created.
        """
        if file_path is None:
            file_path = tempfile.mktemp(".wav")
        wav, sr = self.tts(text, speaker_name, speaker_id, emotion, speed, language)
        wavfile.write(file_path, sr, wav)
        return file_path


if __name__ == "__main__":
    import time

    api = CS_API()
    print(api.speakers)
    print(api.list_speakers_as_tts_models())

    ts = time.time()
    wav, sr = api.tts("It took me quite a long time to develop a voice.", speaker_name=api.speakers[0].name)
    print(f" [i] XTTS took {time.time() - ts:.2f}s")

    filepath = api.tts_to_file(text="Hello world!", speaker_name=api.speakers[0].name, file_path="output.wav")

    api = CS_API(model="XTTS-multilang")
    print(api.speakers)

    ts = time.time()
    wav, sr = api.tts(
        "It took me quite a long time to develop a voice.", speaker_name=api.speakers[0].name, language="en"
    )
    print(f" [i] XTTS took {time.time() - ts:.2f}s")

    filepath = api.tts_to_file(
        text="Hello world!", speaker_name=api.speakers[0].name, file_path="output.wav", language="en"
    )
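Because `_create_payload` and `MODEL_ENDPOINTS` are both static/class-level, the per-backend request shapes can be inspected without a token or any network access. A sketch using a hypothetical speaker dict (real ids come from `list_speakers()` / `list_voices()`):

    import json

    from TTS.cs_api import CS_API, Speaker

    # Hypothetical speaker; real ones are fetched from the Studio API.
    speaker = Speaker({"id": "speaker-uuid", "name": "Torcull Diarmuid"})

    for model in ("V1", "XTTS", "XTTS-multilang"):
        payload = CS_API._create_payload(
            model,
            "This is a test.",
            speaker,
            speed=1.0,
            emotion="Happy" if model == "V1" else None,
            language="de" if model == "XTTS-multilang" else None,
        )
        print(model, "->", CS_API.MODEL_ENDPOINTS[model]["synthesize"])
        print(json.dumps(payload, indent=2))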
TTS/tts/models/tortoise.py

@@ -72,7 +72,7 @@ def load_discrete_vocoder_diffuser(
     )


-def format_conditioning(clip, cond_length=132300, device="cuda"):
+def format_conditioning(clip, cond_length=132300, device="cuda", **kwargs):
     """
     Converts the given conditioning signal to a MEL spectrogram and clips it as expected by the models.
     """

@@ -82,7 +82,7 @@ def format_conditioning(clip, cond_length=132300, device="cuda"):
     elif gap > 0:
         rand_start = random.randint(0, gap)
         clip = clip[:, rand_start : rand_start + cond_length]
-    mel_clip = TorchMelSpectrogram()(clip.unsqueeze(0)).squeeze(0)
+    mel_clip = TorchMelSpectrogram(**kwargs)(clip.unsqueeze(0)).squeeze(0)
     return mel_clip.unsqueeze(0).to(device)

@@ -321,6 +321,7 @@ class Tortoise(BaseTTS):

     def __init__(self, config: Coqpit):
         super().__init__(config, ap=None, tokenizer=None)
+        self.mel_norm_path = None
         self.config = config
         self.ar_checkpoint = self.args.ar_checkpoint
         self.diff_checkpoint = self.args.diff_checkpoint  # TODO: check if this is even needed

@@ -429,7 +430,7 @@ class Tortoise(BaseTTS):

             auto_conds = []
             for ls in voice_samples:
-                auto_conds.append(format_conditioning(ls[0], device=self.device))
+                auto_conds.append(format_conditioning(ls[0], device=self.device, mel_norm_file=self.mel_norm_path))
             auto_conds = torch.stack(auto_conds, dim=1)
             with self.temporary_cuda(self.autoregressive) as ar:
                 auto_latent = ar.get_conditioning(auto_conds)

@@ -873,6 +874,7 @@ class Tortoise(BaseTTS):
         diff_path = diff_checkpoint_path or os.path.join(checkpoint_dir, "diffusion_decoder.pth")
         clvp_path = clvp_checkpoint_path or os.path.join(checkpoint_dir, "clvp2.pth")
         vocoder_checkpoint_path = vocoder_checkpoint_path or os.path.join(checkpoint_dir, "vocoder.pth")
+        self.mel_norm_path = os.path.join(checkpoint_dir, "mel_norms.pth")

         if os.path.exists(ar_path):
             # remove keys from the checkpoint that are not in the model
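The net effect of the Tortoise changes is that the `mel_norms.pth` file shipped with a checkpoint is now threaded into conditioning: `format_conditioning` forwards extra kwargs to `TorchMelSpectrogram`, and the model passes `mel_norm_file=self.mel_norm_path`. A minimal sketch of the same plumbing on CPU with random audio, assuming `mel_norm_file=None` is accepted to skip normalization (which mirrors the pre-load default set in `__init__`):

    import torch

    from TTS.tts.models.tortoise import format_conditioning

    clip = torch.randn(1, 132300)  # ~6 s of 22.05 kHz audio, matching cond_length
    cond = format_conditioning(clip, device="cpu", mel_norm_file=None)
    print(cond.shape)  # (1, 80, T) mel conditioning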
TTS/utils/manage.py

@@ -88,7 +88,7 @@ class ModelManager(object):

     def _list_models(self, model_type, model_count=0):
         if self.verbose:
-            print(" Name format: type/language/dataset/model")
+            print("\n Name format: type/language/dataset/model")
         model_list = []
         for lang in self.models_dict[model_type]:
             for dataset in self.models_dict[model_type][lang]:
docs/source/inference.md

@@ -191,9 +191,25 @@ from TTS.api import CS_API

 # Init 🐸 Coqui Studio API
 # you can either set the API token as an environment variable `COQUI_STUDIO_TOKEN` or pass it as an argument.
-api = CS_API(api_token=<token>)
-api.speakers
-api.emotions
+
+# XTTS - Best quality and life-like speech in EN
+api = CS_API(api_token=<token>, model="XTTS")
+api.speakers  # all the speakers are available with all the models.
+api.list_speakers()
+api.list_voices()
+wav, sample_rate = api.tts(text="This is a test.", speaker_name=api.speakers[0].name, speed=1.5)
+
+# XTTS-multilingual - Multilingual XTTS with [en, de, es, fr, it, pt, ...] (more langs coming soon)
+api = CS_API(api_token=<token>, model="XTTS-multilingual")
+api.speakers
+api.list_speakers()
+api.list_voices()
+wav, sample_rate = api.tts(text="This is a test.", speaker_name=api.speakers[0].name, speed=1.5, language="en")
+
+# V1 - Fast and lightweight TTS in EN with emotion control.
+api = CS_API(api_token=<token>, model="V1")
+api.speakers
+api.emotions  # emotions are only for the V1 model.
 api.list_speakers()
 api.list_voices()
 wav, sample_rate = api.tts(text="This is a test.", speaker_name=api.speakers[0].name, emotion="Happy", speed=1.5)
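The docs now show one block per backend, and the backend also enforces which arguments are legal via `_check_tts_args` in TTS/cs_api.py. A sketch of those constraints (requires a valid token, since looking up a speaker hits the API; the failing calls are shown as comments because each would raise):

    from TTS.cs_api import CS_API

    api = CS_API(api_token="<token>", model="XTTS-multilang")  # placeholder token
    name = api.speakers[0].name

    api.tts("Das ist ein Test.", speaker_name=name, language="de")  # ok
    # api.tts("...", speaker_name=name)                  # AssertionError: language is required
    # api.tts("...", speaker_name=name, language="xx")   # AssertionError: language not supported
    # api.tts("...", speaker_name=name, language="de", emotion="Happy")  # AssertionError: emotions are V1-only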