mirror of https://github.com/coqui-ai/TTS.git
Merge branch 'dev' into reuben/docs-studio-refs
commit 3991d83b2c
.github/workflows/api_tests.yml (deleted)
@@ -1,53 +0,0 @@
-name: api_tests
-
-on:
-  push:
-    branches:
-      - main
-jobs:
-  check_skip:
-    runs-on: ubuntu-latest
-    if: "! contains(github.event.head_commit.message, '[ci skip]')"
-    steps:
-      - run: echo "${{ github.event.head_commit.message }}"
-
-  test:
-    runs-on: ubuntu-latest
-    strategy:
-      fail-fast: false
-      matrix:
-        python-version: [3.9, "3.10", "3.11"]
-        experimental: [false]
-    steps:
-      - uses: actions/checkout@v3
-      - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v4
-        with:
-          python-version: ${{ matrix.python-version }}
-          architecture: x64
-          cache: 'pip'
-          cache-dependency-path: 'requirements*'
-      - name: check OS
-        run: cat /etc/os-release
-      - name: set ENV
-        run: |
-          export TRAINER_TELEMETRY=0
-      - name: Install dependencies
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y --no-install-recommends git make gcc
-          sudo apt-get install espeak-ng
-          make system-deps
-      - name: Install/upgrade Python setup deps
-        run: python3 -m pip install --upgrade pip setuptools wheel
-      - name: Replace scarf urls
-        run: |
-          sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json
-      - name: Install TTS
-        run: |
-          python3 -m pip install .[all]
-          python3 setup.py egg_info
-      - name: Unit tests
-        run: make api_tests
-        env:
-          COQUI_STUDIO_TOKEN: ${{ secrets.COQUI_STUDIO_TOKEN }}
.github/workflows/zoo-tests-tortoise.yml (deleted)
@@ -1,52 +0,0 @@
-name: zoo-tests-tortoise
-
-on:
-  push:
-    branches:
-      - main
-  pull_request:
-    types: [opened, synchronize, reopened]
-jobs:
-  check_skip:
-    runs-on: ubuntu-latest
-    if: "! contains(github.event.head_commit.message, '[ci skip]')"
-    steps:
-      - run: echo "${{ github.event.head_commit.message }}"
-
-  test:
-    runs-on: ubuntu-latest
-    strategy:
-      fail-fast: false
-      matrix:
-        python-version: [3.9, "3.10", "3.11"]
-        experimental: [false]
-    steps:
-      - uses: actions/checkout@v3
-      - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v4
-        with:
-          python-version: ${{ matrix.python-version }}
-          architecture: x64
-          cache: 'pip'
-          cache-dependency-path: 'requirements*'
-      - name: check OS
-        run: cat /etc/os-release
-      - name: set ENV
-        run: export TRAINER_TELEMETRY=0
-      - name: Install dependencies
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y git make gcc
-          sudo apt-get install espeak espeak-ng
-          make system-deps
-      - name: Install/upgrade Python setup deps
-        run: python3 -m pip install --upgrade pip setuptools wheel
-      - name: Replace scarf urls
-        run: |
-          sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json
-      - name: Install TTS
-        run: |
-          python3 -m pip install .[all]
-          python3 setup.py egg_info
-      - name: Unit tests
-        run: nose2 -F -v -B --with-coverage --coverage TTS tests.zoo_tests.test_models.test_tortoise
Makefile
@@ -35,9 +35,6 @@ test_zoo: ## run zoo tests.
 inference_tests: ## run inference tests.
 	nose2 -F -v -B --with-coverage --coverage TTS tests.inference_tests

-api_tests: ## run api tests.
-	nose2 -F -v -B --with-coverage --coverage TTS tests.api_tests
-
 data_tests: ## run data tests.
 	nose2 -F -v -B --with-coverage --coverage TTS tests.data_tests
TTS/.models.json
@@ -3,12 +3,13 @@
     "multilingual": {
         "multi-dataset": {
             "xtts_v2": {
-                "description": "XTTS-v2.0.2 by Coqui with 16 languages.",
+                "description": "XTTS-v2.0.3 by Coqui with 17 languages.",
                 "hf_url": [
                     "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/model.pth",
                     "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/config.json",
                     "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/vocab.json",
-                    "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/hash.md5"
+                    "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/hash.md5",
+                    "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/speakers_xtts.pth"
                 ],
                 "model_hash": "10f92b55c512af7a8d39d650547a15a7",
                 "default_vocoder": null,
@@ -45,7 +46,7 @@
                 "hf_url": [
                     "https://coqui.gateway.scarf.sh/hf/bark/coarse_2.pt",
                     "https://coqui.gateway.scarf.sh/hf/bark/fine_2.pt",
-                    "https://app.coqui.ai/tts_model/text_2.pt",
+                    "https://coqui.gateway.scarf.sh/hf/text_2.pt",
                     "https://coqui.gateway.scarf.sh/hf/bark/config.json",
                     "https://coqui.gateway.scarf.sh/hf/bark/hubert.pt",
                     "https://coqui.gateway.scarf.sh/hf/bark/tokenizer.pth"
@@ -270,7 +271,7 @@
         "tortoise-v2": {
             "description": "Tortoise tts model https://github.com/neonbjb/tortoise-tts",
             "github_rls_url": [
-                "https://app.coqui.ai/tts_model/autoregressive.pth",
+                "https://coqui.gateway.scarf.sh/v0.14.1_models/autoregressive.pth",
                 "https://coqui.gateway.scarf.sh/v0.14.1_models/clvp2.pth",
                 "https://coqui.gateway.scarf.sh/v0.14.1_models/cvvp.pth",
                 "https://coqui.gateway.scarf.sh/v0.14.1_models/diffusion_decoder.pth",
TTS/VERSION
@@ -1 +1 @@
-0.21.3
+0.22.0
TTS/api.py
@@ -6,7 +6,6 @@ from typing import Union
 import numpy as np
 from torch import nn

-from TTS.cs_api import CS_API
 from TTS.utils.audio.numpy_transforms import save_wav
 from TTS.utils.manage import ModelManager
 from TTS.utils.synthesizer import Synthesizer
@@ -24,7 +23,6 @@ class TTS(nn.Module):
         vocoder_path: str = None,
         vocoder_config_path: str = None,
         progress_bar: bool = True,
-        cs_api_model: str = "XTTS",
         gpu=False,
     ):
         """🐸TTS python interface that allows to load and use the released models.
@@ -60,9 +58,6 @@
             vocoder_path (str, optional): Path to the vocoder checkpoint. Defaults to None.
             vocoder_config_path (str, optional): Path to the vocoder config. Defaults to None.
             progress_bar (bool, optional): Whether to print a progress bar while downloading a model. Defaults to True.
-            cs_api_model (str, optional): Name of the model to use for the Coqui Studio API. Available models are
-                "XTTS", "V1". You can also use `TTS.cs_api.CS_API` for more control.
-                Defaults to "XTTS".
             gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
         """
         super().__init__()
@@ -70,14 +65,12 @@
         self.config = load_config(config_path) if config_path else None
         self.synthesizer = None
         self.voice_converter = None
-        self.csapi = None
-        self.cs_api_model = cs_api_model
         self.model_name = ""
         if gpu:
             warnings.warn("`gpu` will be deprecated. Please use `tts.to(device)` instead.")

         if model_name is not None and len(model_name) > 0:
-            if "tts_models" in model_name or "coqui_studio" in model_name:
+            if "tts_models" in model_name:
                 self.load_tts_model_by_name(model_name, gpu)
             elif "voice_conversion_models" in model_name:
                 self.load_vc_model_by_name(model_name, gpu)
@@ -99,12 +92,6 @@
             return self.synthesizer.tts_model.speaker_manager.num_speakers > 1
         return False

-    @property
-    def is_coqui_studio(self):
-        if self.model_name is None:
-            return False
-        return "coqui_studio" in self.model_name
-
     @property
     def is_multi_lingual(self):
         # Not sure what sets this to None, but applied a fix to prevent crashing.
@@ -136,14 +123,7 @@
         return Path(__file__).parent / ".models.json"

     def list_models(self):
-        try:
-            csapi = CS_API(model=self.cs_api_model)
-            models = csapi.list_speakers_as_tts_models()
-        except ValueError as e:
-            print(e)
-            models = []
-        manager = ModelManager(models_file=TTS.get_models_file_path(), progress_bar=False, verbose=False)
-        return manager.list_tts_models() + models
+        return ModelManager(models_file=TTS.get_models_file_path(), progress_bar=False, verbose=False)

     def download_model_by_name(self, model_name: str):
         model_path, config_path, model_item = self.manager.download_model(model_name)
@@ -186,30 +166,26 @@
         TODO: Add tests
         """
         self.synthesizer = None
-        self.csapi = None
         self.model_name = model_name

-        if "coqui_studio" in model_name:
-            self.csapi = CS_API()
-        else:
-            model_path, config_path, vocoder_path, vocoder_config_path, model_dir = self.download_model_by_name(
-                model_name
-            )
+        model_path, config_path, vocoder_path, vocoder_config_path, model_dir = self.download_model_by_name(
+            model_name
+        )

         # init synthesizer
         # None values are fetched from the model
         self.synthesizer = Synthesizer(
             tts_checkpoint=model_path,
             tts_config_path=config_path,
             tts_speakers_file=None,
             tts_languages_file=None,
             vocoder_checkpoint=vocoder_path,
             vocoder_config=vocoder_config_path,
             encoder_checkpoint=None,
             encoder_config=None,
             model_dir=model_dir,
             use_cuda=gpu,
         )

     def load_tts_model_by_path(
         self, model_path: str, config_path: str, vocoder_path: str = None, vocoder_config: str = None, gpu: bool = False
@@ -246,77 +222,17 @@ class TTS(nn.Module):
         **kwargs,
     ) -> None:
         """Check if the arguments are valid for the model."""
-        if not self.is_coqui_studio:
-            # check for the coqui tts models
-            if self.is_multi_speaker and (speaker is None and speaker_wav is None):
-                raise ValueError("Model is multi-speaker but no `speaker` is provided.")
-            if self.is_multi_lingual and language is None:
-                raise ValueError("Model is multi-lingual but no `language` is provided.")
-            if not self.is_multi_speaker and speaker is not None and "voice_dir" not in kwargs:
-                raise ValueError("Model is not multi-speaker but `speaker` is provided.")
-            if not self.is_multi_lingual and language is not None:
-                raise ValueError("Model is not multi-lingual but `language` is provided.")
-            if not emotion is None and not speed is None:
-                raise ValueError("Emotion and speed can only be used with Coqui Studio models.")
-        else:
-            if emotion is None:
-                emotion = "Neutral"
-            if speed is None:
-                speed = 1.0
-            # check for the studio models
-            if speaker_wav is not None:
-                raise ValueError("Coqui Studio models do not support `speaker_wav` argument.")
-            if speaker is not None:
-                raise ValueError("Coqui Studio models do not support `speaker` argument.")
-            if language is not None and language != "en":
-                raise ValueError("Coqui Studio models currently support only `language=en` argument.")
-            if emotion not in ["Neutral", "Happy", "Sad", "Angry", "Dull"]:
-                raise ValueError(f"Emotion - `{emotion}` - must be one of `Neutral`, `Happy`, `Sad`, `Angry`, `Dull`.")
-
-    def tts_coqui_studio(
-        self,
-        text: str,
-        speaker_name: str = None,
-        language: str = None,
-        emotion: str = None,
-        speed: float = 1.0,
-        pipe_out=None,
-        file_path: str = None,
-    ) -> Union[np.ndarray, str]:
-        """Convert text to speech using Coqui Studio models. Use `CS_API` class if you are only interested in the API.
-
-        Args:
-            text (str):
-                Input text to synthesize.
-            speaker_name (str, optional):
-                Speaker name from Coqui Studio. Defaults to None.
-            language (str): Language of the text. If None, the default language of the speaker is used. Language is only
-                supported by `XTTS` model.
-            emotion (str, optional):
-                Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull". Emotions are only available
-                with "V1" model. Defaults to None.
-            speed (float, optional):
-                Speed of the speech. Defaults to 1.0.
-            pipe_out (BytesIO, optional):
-                Flag to stdout the generated TTS wav file for shell pipe.
-            file_path (str, optional):
-                Path to save the output file. When None it returns the `np.ndarray` of waveform. Defaults to None.
-
-        Returns:
-            Union[np.ndarray, str]: Waveform of the synthesized speech or path to the output file.
-        """
-        speaker_name = self.model_name.split("/")[2]
-        if file_path is not None:
-            return self.csapi.tts_to_file(
-                text=text,
-                speaker_name=speaker_name,
-                language=language,
-                speed=speed,
-                pipe_out=pipe_out,
-                emotion=emotion,
-                file_path=file_path,
-            )[0]
-        return self.csapi.tts(text=text, speaker_name=speaker_name, language=language, speed=speed, emotion=emotion)[0]
+        # check for the coqui tts models
+        if self.is_multi_speaker and (speaker is None and speaker_wav is None):
+            raise ValueError("Model is multi-speaker but no `speaker` is provided.")
+        if self.is_multi_lingual and language is None:
+            raise ValueError("Model is multi-lingual but no `language` is provided.")
+        if not self.is_multi_speaker and speaker is not None and "voice_dir" not in kwargs:
+            raise ValueError("Model is not multi-speaker but `speaker` is provided.")
+        if not self.is_multi_lingual and language is not None:
+            raise ValueError("Model is not multi-lingual but `language` is provided.")
+        if not emotion is None and not speed is None:
+            raise ValueError("Emotion and speed can only be used with Coqui Studio models. Which is discontinued.")

     def tts(
         self,
@@ -357,10 +273,6 @@
         self._check_arguments(
             speaker=speaker, language=language, speaker_wav=speaker_wav, emotion=emotion, speed=speed, **kwargs
         )
-        if self.csapi is not None:
-            return self.tts_coqui_studio(
-                text=text, speaker_name=speaker, language=language, emotion=emotion, speed=speed
-            )
         wav = self.synthesizer.tts(
             text=text,
             speaker_name=speaker,
@@ -419,16 +331,6 @@
         """
         self._check_arguments(speaker=speaker, language=language, speaker_wav=speaker_wav, **kwargs)

-        if self.csapi is not None:
-            return self.tts_coqui_studio(
-                text=text,
-                speaker_name=speaker,
-                language=language,
-                emotion=emotion,
-                speed=speed,
-                file_path=file_path,
-                pipe_out=pipe_out,
-            )
         wav = self.tts(
             text=text,
             speaker=speaker,
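One behavioral consequence of the api.py hunks above: `TTS.list_models()` no longer merges Studio speakers into a list; it now hands back the `ModelManager` itself. A minimal usage sketch (not part of the commit, written against the calls visible in this diff):

```python
from TTS.api import TTS

# list_models() now returns a ModelManager rather than a plain list of names,
# so callers enumerate models through the manager's own listing method.
manager = TTS().list_models()
model_names = manager.list_models()  # the available `tts_models/...` names
```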
TTS/bin/synthesize.py
@@ -66,12 +66,6 @@ If you don't specify any models, then it uses LJSpeech based English model.
   $ tts --text "Text for TTS" --pipe_out --out_path output/path/speech.wav | aplay
   ```

-- Run TTS and define speed factor to use for 🐸Coqui Studio models, between 0.0 and 2.0:
-
-  ```
-  $ tts --text "Text for TTS" --model_name "coqui_studio/<language>/<dataset>/<model_name>" --speed 1.2 --out_path output/path/speech.wav
-  ```
-
 - Run a TTS model with its default vocoder model:

   ```
@@ -222,25 +216,6 @@ def main():
         default=None,
     )
     parser.add_argument("--encoder_config_path", type=str, help="Path to speaker encoder config file.", default=None)

-    # args for coqui studio
-    parser.add_argument(
-        "--cs_model",
-        type=str,
-        help="Name of the 🐸Coqui Studio model. Available models are `XTTS`, `V1`.",
-    )
-    parser.add_argument(
-        "--emotion",
-        type=str,
-        help="Emotion to condition the model with. Only available for 🐸Coqui Studio `V1` model.",
-        default=None,
-    )
-    parser.add_argument(
-        "--language",
-        type=str,
-        help="Language to condition the model with. Only available for 🐸Coqui Studio `XTTS` model.",
-        default=None,
-    )
     parser.add_argument(
         "--pipe_out",
         help="stdout the generated TTS wav file for shell pipe.",
@@ -249,12 +224,6 @@ def main():
         const=True,
         default=False,
     )
-    parser.add_argument(
-        "--speed",
-        type=float,
-        help="Speed factor to use for 🐸Coqui Studio models, between 0.0 and 2.0.",
-        default=None,
-    )

     # args for multi-speaker synthesis
     parser.add_argument("--speakers_file_path", type=str, help="JSON file for multi-speaker model.", default=None)
@@ -389,7 +358,6 @@ def main():

     # CASE1 #list : list pre-trained TTS models
     if args.list_models:
-        manager.add_cs_api_models(api.list_models())
         manager.list_models()
         sys.exit()

@@ -404,29 +372,7 @@ def main():
         manager.model_info_by_full_name(model_query_full_name)
         sys.exit()

-    # CASE3: TTS with coqui studio models
-    if "coqui_studio" in args.model_name:
-        print(" > Using 🐸Coqui Studio model: ", args.model_name)
-        api = TTS(model_name=args.model_name, cs_api_model=args.cs_model)
-        api.tts_to_file(
-            text=args.text,
-            emotion=args.emotion,
-            file_path=args.out_path,
-            language=args.language,
-            speed=args.speed,
-            pipe_out=pipe_out,
-        )
-        print(" > Saving output to ", args.out_path)
-        return
-
-    if args.language_idx is None and args.language is not None:
-        msg = (
-            "--language is only supported for Coqui Studio models. "
-            "Use --language_idx to specify the target language for multilingual models."
-        )
-        raise ValueError(msg)
-
-    # CASE4: load pre-trained model paths
+    # CASE3: load pre-trained model paths
     if args.model_name is not None and not args.model_path:
         model_path, config_path, model_item = manager.download_model(args.model_name)
         # tts model
@@ -454,7 +400,7 @@ def main():
     if args.vocoder_name is not None and not args.vocoder_path:
         vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name)

-    # CASE5: set custom model paths
+    # CASE4: set custom model paths
     if args.model_path is not None:
         tts_path = args.model_path
         tts_config_path = args.config_path
TTS/config/__init__.py
@@ -16,12 +16,9 @@ def read_json_with_comments(json_path):
     # fallback to json
     with fsspec.open(json_path, "r", encoding="utf-8") as f:
         input_str = f.read()
-    # handle comments
-    input_str = re.sub(r"\\\n", "", input_str)
-    input_str = re.sub(r"//.*\n", "\n", input_str)
-    data = json.loads(input_str)
-    return data
+    # handle comments but not urls with //
+    input_str = re.sub(r"(\"(?:[^\"\\]|\\.)*\")|(/\*(?:.|[\\n\\r])*?\*/)|(//.*)", lambda m: m.group(1) or m.group(2) or "", input_str)
+    return json.loads(input_str)


 def register_config(model_name: str) -> Coqpit:
     """Find the right config for the given model name.
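The rewritten substitution works because regex alternatives are tried left to right: a complete JSON string literal (group 1) is matched first and re-emitted unchanged, so the `//` inside URLs is never treated as a comment, while a bare `//` comment falls through to group 3 and is dropped. A standalone sketch of that behavior (stdlib only; not part of the commit):

```python
import json
import re

def strip_json_comments(input_str: str) -> str:
    # Same substitution as the new read_json_with_comments: groups 1 and 2
    # are written back as matched, group 3 (a bare // comment) becomes "".
    return re.sub(
        r"(\"(?:[^\"\\]|\\.)*\")|(/\*(?:.|[\n\r])*?\*/)|(//.*)",
        lambda m: m.group(1) or m.group(2) or "",
        input_str,
    )

raw = '{"url": "https://coqui.gateway.scarf.sh/x"  // the old regex ate this URL\n}'
print(json.loads(strip_json_comments(raw)))
# -> {'url': 'https://coqui.gateway.scarf.sh/x'}
```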
TTS/cs_api.py (deleted)
@@ -1,317 +0,0 @@
-import http.client
-import json
-import os
-import tempfile
-import urllib.request
-from typing import Tuple
-
-import numpy as np
-import requests
-from scipy.io import wavfile
-
-from TTS.utils.audio.numpy_transforms import save_wav
-
-
-class Speaker(object):
-    """Convert dict to object."""
-
-    def __init__(self, d, is_voice=False):
-        self.is_voice = is_voice
-        for k, v in d.items():
-            if isinstance(k, (list, tuple)):
-                setattr(self, k, [Speaker(x) if isinstance(x, dict) else x for x in v])
-            else:
-                setattr(self, k, Speaker(v) if isinstance(v, dict) else v)
-
-    def __repr__(self):
-        return str(self.__dict__)
-
-
-class CS_API:
-    """🐸Coqui Studio API Wrapper.
-
-    🐸Coqui Studio is the most advanced voice generation platform. You can generate new voices by voice cloning, voice
-    interpolation, or our unique prompt to voice technology. It also provides a set of built-in voices with different
-    characteristics. You can use these voices to generate new audio files or use them in your applications.
-    You can use all the built-in and your own 🐸Coqui Studio speakers with this API with an API token.
-    You can signup to 🐸Coqui Studio from https://app.coqui.ai/auth/signup and get an API token from
-    https://app.coqui.ai/account. We can either enter the token as an environment variable as
-    `export COQUI_STUDIO_TOKEN=<token>` or pass it as `CS_API(api_token=<token>)`.
-    Visit https://app.coqui.ai/api for more information.
-
-    Args:
-        api_token (str): 🐸Coqui Studio API token. If not provided, it will be read from the environment variable
-            `COQUI_STUDIO_TOKEN`.
-        model (str): 🐸Coqui Studio model. It can be either `V1` or `XTTS`. Default is `XTTS`.
-
-    Example listing all available speakers:
-        >>> from TTS.api import CS_API
-        >>> tts = CS_API()
-        >>> tts.speakers
-
-    Example listing all emotions:
-        >>> # emotions are only available for `V1` model
-        >>> from TTS.api import CS_API
-        >>> tts = CS_API(model="V1")
-        >>> tts.emotions
-
-    Example with a built-in 🐸 speaker:
-        >>> from TTS.api import CS_API
-        >>> tts = CS_API()
-        >>> wav, sr = api.tts("Hello world", speaker_name=tts.speakers[0].name)
-        >>> filepath = tts.tts_to_file(text="Hello world!", speaker_name=tts.speakers[0].name, file_path="output.wav")
-
-    Example with multi-language model:
-        >>> from TTS.api import CS_API
-        >>> tts = CS_API(model="XTTS")
-        >>> wav, sr = api.tts("Hello world", speaker_name=tts.speakers[0].name, language="en")
-    """
-
-    MODEL_ENDPOINTS = {
-        "V1": {
-            "list_speakers": "https://app.coqui.ai/api/v2/speakers",
-            "synthesize": "https://app.coqui.ai/api/v2/samples",
-            "list_voices": "https://app.coqui.ai/api/v2/voices",
-        },
-        "XTTS": {
-            "list_speakers": "https://app.coqui.ai/api/v2/speakers",
-            "synthesize": "https://app.coqui.ai/api/v2/samples/xtts/render/",
-            "list_voices": "https://app.coqui.ai/api/v2/voices/xtts",
-        },
-    }
-
-    SUPPORTED_LANGUAGES = ["en", "es", "de", "fr", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn", "ja"]
-
-    def __init__(self, api_token=None, model="XTTS"):
-        self.api_token = api_token
-        self.model = model
-        self.headers = None
-        self._speakers = None
-        self._check_token()
-
-    @staticmethod
-    def ping_api():
-        URL = "https://coqui.gateway.scarf.sh/tts/api"
-        _ = requests.get(URL)
-
-    @property
-    def speakers(self):
-        if self._speakers is None:
-            self._speakers = self.list_all_speakers()
-        return self._speakers
-
-    @property
-    def emotions(self):
-        """Return a list of available emotions.
-
-        TODO: Get this from the API endpoint.
-        """
-        if self.model == "V1":
-            return ["Neutral", "Happy", "Sad", "Angry", "Dull"]
-        else:
-            raise ValueError(f"❗ Emotions are not available for {self.model}.")
-
-    def _check_token(self):
-        if self.api_token is None:
-            self.api_token = os.environ.get("COQUI_STUDIO_TOKEN")
-            self.headers = {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_token}"}
-        if not self.api_token:
-            raise ValueError(
-                "No API token found for 🐸Coqui Studio voices - https://coqui.ai \n"
-                "Visit 🔗https://app.coqui.ai/account to get one.\n"
-                "Set it as an environment variable `export COQUI_STUDIO_TOKEN=<token>`\n"
-                ""
-            )
-
-    def list_all_speakers(self):
-        """Return both built-in Coqui Studio speakers and custom voices created by the user."""
-        return self.list_speakers() + self.list_voices()
-
-    def list_speakers(self):
-        """List built-in Coqui Studio speakers."""
-        self._check_token()
-        conn = http.client.HTTPSConnection("app.coqui.ai")
-        url = self.MODEL_ENDPOINTS[self.model]["list_speakers"]
-        conn.request("GET", f"{url}?page=1&per_page=100", headers=self.headers)
-        res = conn.getresponse()
-        data = res.read()
-        return [Speaker(s) for s in json.loads(data)["result"]]
-
-    def list_voices(self):
-        """List custom voices created by the user."""
-        conn = http.client.HTTPSConnection("app.coqui.ai")
-        url = self.MODEL_ENDPOINTS[self.model]["list_voices"]
-        conn.request("GET", f"{url}?page=1&per_page=100", headers=self.headers)
-        res = conn.getresponse()
-        data = res.read()
-        return [Speaker(s, True) for s in json.loads(data)["result"]]
-
-    def list_speakers_as_tts_models(self):
-        """List speakers in ModelManager format."""
-        models = []
-        for speaker in self.speakers:
-            model = f"coqui_studio/multilingual/{speaker.name}/{self.model}"
-            models.append(model)
-        return models
-
-    def name_to_speaker(self, name):
-        for speaker in self.speakers:
-            if speaker.name == name:
-                return speaker
-        raise ValueError(f"Speaker {name} not found in {self.speakers}")
-
-    def id_to_speaker(self, speaker_id):
-        for speaker in self.speakers:
-            if speaker.id == speaker_id:
-                return speaker
-        raise ValueError(f"Speaker {speaker_id} not found.")
-
-    @staticmethod
-    def url_to_np(url):
-        tmp_file, _ = urllib.request.urlretrieve(url)
-        rate, data = wavfile.read(tmp_file)
-        return data, rate
-
-    @staticmethod
-    def _create_payload(model, text, speaker, speed, emotion, language):
-        payload = {}
-        # if speaker.is_voice:
-        payload["voice_id"] = speaker.id
-        # else:
-        payload["speaker_id"] = speaker.id
-
-        if model == "V1":
-            payload.update(
-                {
-                    "emotion": emotion,
-                    "name": speaker.name,
-                    "text": text,
-                    "speed": speed,
-                }
-            )
-        elif model == "XTTS":
-            payload.update(
-                {
-                    "name": speaker.name,
-                    "text": text,
-                    "speed": speed,
-                    "language": language,
-                }
-            )
-        else:
-            raise ValueError(f"❗ Unknown model {model}")
-        return payload
-
-    def _check_tts_args(self, text, speaker_name, speaker_id, emotion, speed, language):
-        assert text is not None, "❗ text is required for V1 model."
-        assert speaker_name is not None, "❗ speaker_name is required for V1 model."
-        if self.model == "V1":
-            if emotion is None:
-                emotion = "Neutral"
-            assert language is None, "❗ language is not supported for V1 model."
-        elif self.model == "XTTS":
-            assert emotion is None, f"❗ Emotions are not supported for XTTS model. Use V1 model."
-            assert language is not None, "❗ Language is required for XTTS model."
-            assert (
-                language in self.SUPPORTED_LANGUAGES
-            ), f"❗ Language {language} is not yet supported. Check https://docs.coqui.ai/reference/samples_xtts_create."
-        return text, speaker_name, speaker_id, emotion, speed, language
-
-    def tts(
-        self,
-        text: str,
-        speaker_name: str = None,
-        speaker_id=None,
-        emotion=None,
-        speed=1.0,
-        language=None,  # pylint: disable=unused-argument
-    ) -> Tuple[np.ndarray, int]:
-        """Synthesize speech from text.
-
-        Args:
-            text (str): Text to synthesize.
-            speaker_name (str): Name of the speaker. You can get the list of speakers with `list_speakers()` and
-                voices (user generated speakers) with `list_voices()`.
-            speaker_id (str): Speaker ID. If None, the speaker name is used.
-            emotion (str): Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull". Emotions are only
-                supported by `V1` model. Defaults to None.
-            speed (float): Speed of the speech. 1.0 is normal speed.
-            language (str): Language of the text. If None, the default language of the speaker is used. Language is only
-                supported by `XTTS` model. See https://docs.coqui.ai/reference/samples_xtts_create for supported languages.
-        """
-        self._check_token()
-        self.ping_api()
-
-        if speaker_name is None and speaker_id is None:
-            raise ValueError(" [!] Please provide either a `speaker_name` or a `speaker_id`.")
-        if speaker_id is None:
-            speaker = self.name_to_speaker(speaker_name)
-        else:
-            speaker = self.id_to_speaker(speaker_id)
-
-        text, speaker_name, speaker_id, emotion, speed, language = self._check_tts_args(
-            text, speaker_name, speaker_id, emotion, speed, language
-        )
-
-        conn = http.client.HTTPSConnection("app.coqui.ai")
-        payload = self._create_payload(self.model, text, speaker, speed, emotion, language)
-        url = self.MODEL_ENDPOINTS[self.model]["synthesize"]
-        conn.request("POST", url, json.dumps(payload), self.headers)
-        res = conn.getresponse()
-        data = res.read()
-        try:
-            wav, sr = self.url_to_np(json.loads(data)["audio_url"])
-        except KeyError as e:
-            raise ValueError(f" [!] 🐸 API returned error: {data}") from e
-        return wav, sr
-
-    def tts_to_file(
-        self,
-        text: str,
-        speaker_name: str,
-        speaker_id=None,
-        emotion=None,
-        speed=1.0,
-        pipe_out=None,
-        language=None,
-        file_path: str = None,
-    ) -> str:
-        """Synthesize speech from text and save it to a file.
-
-        Args:
-            text (str): Text to synthesize.
-            speaker_name (str): Name of the speaker. You can get the list of speakers with `list_speakers()` and
-                voices (user generated speakers) with `list_voices()`.
-            speaker_id (str): Speaker ID. If None, the speaker name is used.
-            emotion (str): Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull".
-            speed (float): Speed of the speech. 1.0 is normal speed.
-            pipe_out (BytesIO, optional): Flag to stdout the generated TTS wav file for shell pipe.
-            language (str): Language of the text. If None, the default language of the speaker is used. Language is only
-                supported by `XTTS` model. Currently supports en, de, es, fr, it, pt, pl. Defaults to "en".
-            file_path (str): Path to save the file. If None, a temporary file is created.
-        """
-        if file_path is None:
-            file_path = tempfile.mktemp(".wav")
-        wav, sr = self.tts(text, speaker_name, speaker_id, emotion, speed, language)
-        save_wav(wav=wav, path=file_path, sample_rate=sr, pipe_out=pipe_out)
-        return file_path
-
-
-if __name__ == "__main__":
-    import time
-
-    api = CS_API()
-    print(api.speakers)
-    print(api.list_speakers_as_tts_models())
-
-    ts = time.time()
-    wav, sr = api.tts(
-        "It took me quite a long time to develop a voice.", language="en", speaker_name=api.speakers[0].name
-    )
-    print(f" [i] XTTS took {time.time() - ts:.2f}s")
-
-    filepath = api.tts_to_file(
-        text="Hello world!", speaker_name=api.speakers[0].name, language="en", file_path="output.wav"
-    )
TTS/tts/layers/xtts/xtts_manager.py (new file)
@@ -0,0 +1,34 @@
+import torch
+
+
+class SpeakerManager():
+    def __init__(self, speaker_file_path=None):
+        self.speakers = torch.load(speaker_file_path)
+
+    @property
+    def name_to_id(self):
+        return self.speakers.keys()
+
+    @property
+    def num_speakers(self):
+        return len(self.name_to_id)
+
+    @property
+    def speaker_names(self):
+        return list(self.name_to_id.keys())
+
+
+class LanguageManager():
+    def __init__(self, config):
+        self.langs = config["languages"]
+
+    @property
+    def name_to_id(self):
+        return self.langs
+
+    @property
+    def num_languages(self):
+        return len(self.name_to_id)
+
+    @property
+    def language_names(self):
+        return list(self.name_to_id)
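`speakers_xtts.pth` itself is not shown in the diff, but the way xtts.py consumes it below (`self.speaker_manager.speakers[speaker_id].values()`) implies a dict of per-speaker tensors stored with `gpt_cond_latent` before `speaker_embedding`. A hedged sketch of that layout (shapes are illustrative assumptions; the sketch avoids the `speaker_names` property, which as committed calls `.keys()` on a `dict_keys` view):

```python
import torch

from TTS.tts.layers.xtts.xtts_manager import SpeakerManager

# Illustrative stand-in for speakers_xtts.pth; the real file ships with the
# model release and its exact tensor shapes are not part of this diff.
speakers = {
    "Ana Florence": {
        "gpt_cond_latent": torch.zeros(1, 32, 1024),  # assumed shape
        "speaker_embedding": torch.zeros(1, 512, 1),  # assumed shape
    },
}
torch.save(speakers, "speakers_xtts.pth")

manager = SpeakerManager("speakers_xtts.pth")
print(manager.num_speakers)      # 1
print(list(manager.name_to_id))  # ['Ana Florence']
# Insertion order matters: .values() unpacks as (gpt_cond_latent, speaker_embedding)
gpt_cond_latent, speaker_embedding = manager.speakers["Ana Florence"].values()
```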
TTS/tts/models/xtts.py
@@ -11,6 +11,7 @@ from TTS.tts.layers.xtts.gpt import GPT
 from TTS.tts.layers.xtts.hifigan_decoder import HifiDecoder
 from TTS.tts.layers.xtts.stream_generator import init_stream_support
 from TTS.tts.layers.xtts.tokenizer import VoiceBpeTokenizer, split_sentence
+from TTS.tts.layers.xtts.xtts_manager import SpeakerManager, LanguageManager
 from TTS.tts.models.base_tts import BaseTTS
 from TTS.utils.io import load_fsspec

@@ -378,7 +379,7 @@ class Xtts(BaseTTS):

         return gpt_cond_latents, speaker_embedding

-    def synthesize(self, text, config, speaker_wav, language, **kwargs):
+    def synthesize(self, text, config, speaker_wav, language, speaker_id=None, **kwargs):
         """Synthesize speech with the given input text.

         Args:
@@ -393,12 +394,6 @@ class Xtts(BaseTTS):
             `text_input` as text token IDs after tokenizer, `voice_samples` as samples used for cloning, `conditioning_latents`
             as latents used at inference.

-        """
-        return self.inference_with_config(text, config, ref_audio_path=speaker_wav, language=language, **kwargs)
-
-    def inference_with_config(self, text, config, ref_audio_path, language, **kwargs):
-        """
-        inference with config
         """
         assert (
             "zh-cn" if language == "zh" else language in self.config.languages
@@ -410,13 +405,18 @@ class Xtts(BaseTTS):
             "repetition_penalty": config.repetition_penalty,
             "top_k": config.top_k,
             "top_p": config.top_p,
+        }
+        settings.update(kwargs)  # allow overriding of preset settings with kwargs
+        if speaker_id is not None:
+            gpt_cond_latent, speaker_embedding = self.speaker_manager.speakers[speaker_id].values()
+            return self.inference(text, language, gpt_cond_latent, speaker_embedding, **settings)
+        settings.update({
             "gpt_cond_len": config.gpt_cond_len,
             "gpt_cond_chunk_len": config.gpt_cond_chunk_len,
             "max_ref_len": config.max_ref_len,
             "sound_norm_refs": config.sound_norm_refs,
-        }
-        settings.update(kwargs)  # allow overriding of preset settings with kwargs
-        return self.full_inference(text, ref_audio_path, language, **settings)
+        })
+        return self.full_inference(text, speaker_wav, language, **settings)

     @torch.inference_mode()
     def full_inference(
@@ -520,6 +520,8 @@ class Xtts(BaseTTS):
     ):
         language = language.split("-")[0]  # remove the country code
         length_scale = 1.0 / max(speed, 0.05)
+        gpt_cond_latent = gpt_cond_latent.to(self.device)
+        speaker_embedding = speaker_embedding.to(self.device)
         if enable_text_splitting:
             text = split_sentence(text, language, self.tokenizer.char_limits[language])
         else:
@@ -628,6 +630,8 @@ class Xtts(BaseTTS):
     ):
         language = language.split("-")[0]  # remove the country code
         length_scale = 1.0 / max(speed, 0.05)
+        gpt_cond_latent = gpt_cond_latent.to(self.device)
+        speaker_embedding = speaker_embedding.to(self.device)
         if enable_text_splitting:
             text = split_sentence(text, language, self.tokenizer.char_limits[language])
         else:
@@ -733,6 +737,7 @@ class Xtts(BaseTTS):
         eval=True,
         strict=True,
         use_deepspeed=False,
+        speaker_file_path=None,
     ):
         """
         Loads a checkpoint from disk and initializes the model's state and tokenizer.
@@ -751,6 +756,12 @@ class Xtts(BaseTTS):

         model_path = checkpoint_path or os.path.join(checkpoint_dir, "model.pth")
         vocab_path = vocab_path or os.path.join(checkpoint_dir, "vocab.json")
+        speaker_file_path = speaker_file_path or os.path.join(checkpoint_dir, "speakers_xtts.pth")
+
+        self.language_manager = LanguageManager(config)
+        self.speaker_manager = None
+        if os.path.exists(speaker_file_path):
+            self.speaker_manager = SpeakerManager(speaker_file_path)

         if os.path.exists(vocab_path):
             self.tokenizer = VoiceBpeTokenizer(vocab_file=vocab_path)
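Read together, the xtts.py hunks add a second entry point to `synthesize`: if `speaker_id` names one of the shipped speakers, the reference-audio conditioning step is bypassed and the stored latents go straight to `inference`. A hedged end-to-end sketch (paths and the speaker name are placeholders, not from the diff):

```python
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts

config = XttsConfig()
config.load_json("/path/to/xtts/config.json")  # placeholder path
model = Xtts.init_from_config(config)
# load_checkpoint now also picks up speakers_xtts.pth from the checkpoint dir
model.load_checkpoint(config, checkpoint_dir="/path/to/xtts/", eval=True)

# Built-in speaker: latents come from the new SpeakerManager, no reference wav
out = model.synthesize(
    "Hello world.", config, speaker_wav=None, language="en", speaker_id="Ana Florence"
)

# Voice cloning: without speaker_id the call falls through to full_inference
out = model.synthesize(
    "Hello world.", config, speaker_wav="/path/to/reference.wav", language="en"
)
```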
TTS/utils/manage.py
@@ -11,7 +11,7 @@ import fsspec
 import requests
 from tqdm import tqdm

-from TTS.config import load_config
+from TTS.config import load_config, read_json_with_comments
 from TTS.utils.generic_utils import get_user_data_dir

 LICENSE_URLS = {
@@ -65,30 +65,7 @@ class ModelManager(object):
         Args:
             file_path (str): path to .models.json.
         """
-        with open(file_path, "r", encoding="utf-8") as json_file:
-            self.models_dict = json.load(json_file)
-
-    def add_cs_api_models(self, model_list: List[str]):
-        """Add list of Coqui Studio model names that are returned from the api
-
-        Each has the following format `<coqui_studio_model>/en/<speaker_name>/<coqui_studio_model>`
-        """
-
-        def _add_model(model_name: str):
-            if not "coqui_studio" in model_name:
-                return
-            model_type, lang, dataset, model = model_name.split("/")
-            if model_type not in self.models_dict:
-                self.models_dict[model_type] = {}
-            if lang not in self.models_dict[model_type]:
-                self.models_dict[model_type][lang] = {}
-            if dataset not in self.models_dict[model_type][lang]:
-                self.models_dict[model_type][lang][dataset] = {}
-            if model not in self.models_dict[model_type][lang][dataset]:
-                self.models_dict[model_type][lang][dataset][model] = {}
-
-        for model_name in model_list:
-            _add_model(model_name)
+        self.models_dict = read_json_with_comments(file_path)

     def _list_models(self, model_type, model_count=0):
         if self.verbose:
@@ -315,6 +292,7 @@ class ModelManager(object):
                     f"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/{model_version}/config.json",
                     f"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/{model_version}/vocab.json",
                     f"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/{model_version}/hash.md5",
+                    f"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/{model_version}/speakers_xtts.pth",
                 ],
             }
         else:
TTS/utils/synthesizer.py
@@ -305,7 +305,7 @@ class Synthesizer(nn.Module):
         speaker_embedding = None
         speaker_id = None
         if self.tts_speakers_file or hasattr(self.tts_model.speaker_manager, "name_to_id"):
-            if speaker_name and isinstance(speaker_name, str):
+            if speaker_name and isinstance(speaker_name, str) and not self.tts_config.model == "xtts":
                 if self.tts_config.use_d_vector_file:
                     # get the average speaker embedding from the saved d_vectors.
                     speaker_embedding = self.tts_model.speaker_manager.get_mean_embedding(
@@ -335,7 +335,9 @@ class Synthesizer(nn.Module):
         # handle multi-lingual
         language_id = None
         if self.tts_languages_file or (
-            hasattr(self.tts_model, "language_manager") and self.tts_model.language_manager is not None
+            hasattr(self.tts_model, "language_manager")
+            and self.tts_model.language_manager is not None
+            and not self.tts_config.model == "xtts"
         ):
             if len(self.tts_model.language_manager.name_to_id) == 1:
                 language_id = list(self.tts_model.language_manager.name_to_id.values())[0]
@@ -366,6 +368,7 @@ class Synthesizer(nn.Module):
             if (
                 speaker_wav is not None
                 and self.tts_model.speaker_manager is not None
+                and hasattr(self.tts_model.speaker_manager, "encoder_ap")
                 and self.tts_model.speaker_manager.encoder_ap is not None
             ):
                 speaker_embedding = self.tts_model.speaker_manager.compute_embedding_from_clip(speaker_wav)
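Why these guards exist: the new XTTS `SpeakerManager` is a thin dict wrapper with no `encoder_ap`, and XTTS resolves speaker names and languages itself inside `Xtts.synthesize`, so the Synthesizer must not try to compute d-vectors or look up language IDs for it. A minimal illustration of the `hasattr` guard (the class here is a stand-in, not the real manager):

```python
class FileBasedSpeakerManager:  # stand-in for the new xtts_manager.SpeakerManager
    pass

sm = FileBasedSpeakerManager()
# Without the new hasattr() check, `sm.encoder_ap is not None` would raise
# AttributeError; with it, the condition short-circuits to False.
print(hasattr(sm, "encoder_ap") and sm.encoder_ap is not None)  # False
```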
README.md
@@ -172,48 +172,6 @@ tts.tts_with_vc_to_file(
 )
 ```

-#### Example text to speech using [🐸Coqui Studio](https://coqui.ai) models.
-
-You can use all of your available speakers in the studio.
-[🐸Coqui Studio](https://coqui.ai) API token is required. You can get it from the [account page](https://coqui.ai/account).
-You should set the `COQUI_STUDIO_TOKEN` environment variable to use the API token.
-
-```python
-# If you have a valid API token set you will see the studio speakers as separate models in the list.
-# The name format is coqui_studio/en/<studio_speaker_name>/coqui_studio
-models = TTS().list_models()
-# Init TTS with the target studio speaker
-tts = TTS(model_name="coqui_studio/en/Torcull Diarmuid/coqui_studio", progress_bar=False)
-# Run TTS
-tts.tts_to_file(text="This is a test.", file_path=OUTPUT_PATH)
-# Run TTS with emotion and speed control
-tts.tts_to_file(text="This is a test.", file_path=OUTPUT_PATH, emotion="Happy", speed=1.5)
-```
-
-If you just need 🐸 Coqui Studio speakers, you can use `CS_API`. It is a wrapper around the 🐸 Coqui Studio API.
-
-```python
-from TTS.api import CS_API
-
-# Init 🐸 Coqui Studio API
-# you can either set the API token as an environment variable `COQUI_STUDIO_TOKEN` or pass it as an argument.
-
-# XTTS - Best quality and life-like speech in multiple languages. See https://docs.coqui.ai/reference/samples_xtts_create for supported languages.
-api = CS_API(api_token=<token>, model="XTTS")
-api.speakers # all the speakers are available with all the models.
-api.list_speakers()
-api.list_voices()
-wav, sample_rate = api.tts(text="This is a test.", speaker=api.speakers[0].name, emotion="Happy", language="en", speed=1.5)
-
-# V1 - Fast and lightweight TTS in EN with emotion control.
-api = CS_API(api_token=<token>, model="V1")
-api.speakers
-api.emotions # emotions are only for the V1 model.
-api.list_speakers()
-api.list_voices()
-wav, sample_rate = api.tts(text="This is a test.", speaker=api.speakers[0].name, emotion="Happy", speed=1.5)
-```
-
 #### Example text to speech using **Fairseq models in ~1100 languages** 🤯.
 For these models use the following name format: `tts_models/<lang-iso_code>/fairseq/vits`.
@ -21,7 +21,7 @@ a few tricks to make it faster and support streaming inference.
|
||||||
- Across the board quality improvements.
|
- Across the board quality improvements.
|
||||||
|
|
||||||
### Code
|
### Code
|
||||||
Current implementation only supports inference.
|
Current implementation only supports inference and GPT encoder training.
|
||||||
|
|
||||||
### Languages
|
### Languages
|
||||||
As of now, XTTS-v2 supports 16 languages: English (en), Spanish (es), French (fr), German (de), Italian (it), Portuguese (pt), Polish (pl), Turkish (tr), Russian (ru), Dutch (nl), Czech (cs), Arabic (ar), Chinese (zh-cn), Japanese (ja), Hungarian (hu) and Korean (ko).
|
As of now, XTTS-v2 supports 16 languages: English (en), Spanish (es), French (fr), German (de), Italian (it), Portuguese (pt), Polish (pl), Turkish (tr), Russian (ru), Dutch (nl), Czech (cs), Arabic (ar), Chinese (zh-cn), Japanese (ja), Hungarian (hu) and Korean (ko).
|
||||||
|
@ -36,9 +36,71 @@ Come and join in our 🐸Community. We're active on [Discord](https://discord.gg
|
||||||
You can also mail us at info@coqui.ai.
|
You can also mail us at info@coqui.ai.
|
||||||
|
|
||||||
### Inference
|
### Inference
|
||||||

#### 🐸TTS Command line

You can check all supported languages with the following command:

```console
tts --model_name tts_models/multilingual/multi-dataset/xtts_v2 \
    --list_language_idx
```

You can check all available Coqui speakers with the following command:

```console
tts --model_name tts_models/multilingual/multi-dataset/xtts_v2 \
    --list_speaker_idx
```

##### Coqui speakers

You can run inference with one of the available speakers using the following command:

```console
tts --model_name tts_models/multilingual/multi-dataset/xtts_v2 \
    --text "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent." \
    --speaker_idx "Ana Florence" \
    --language_idx en \
    --use_cuda true
```

##### Clone a voice

You can clone a speaker voice using a single or multiple references:

###### Single reference

```console
tts --model_name tts_models/multilingual/multi-dataset/xtts_v2 \
    --text "Bugün okula gitmek istemiyorum." \
    --speaker_wav /path/to/target/speaker.wav \
    --language_idx tr \
    --use_cuda true
```

###### Multiple references

```console
tts --model_name tts_models/multilingual/multi-dataset/xtts_v2 \
    --text "Bugün okula gitmek istemiyorum." \
    --speaker_wav /path/to/target/speaker.wav /path/to/target/speaker_2.wav /path/to/target/speaker_3.wav \
    --language_idx tr \
    --use_cuda true
```

or, for all wav files in a directory, you can use:

```console
tts --model_name tts_models/multilingual/multi-dataset/xtts_v2 \
    --text "Bugün okula gitmek istemiyorum." \
    --speaker_wav /path/to/target/*.wav \
    --language_idx tr \
    --use_cuda true
```

#### 🐸TTS API

##### Clone a voice

You can clone a speaker voice using a single or multiple references:

###### Single reference

Splits the text into sentences and generates audio for each sentence. The audio files are then concatenated to produce the final audio.
You can optionally disable sentence splitting for better coherence, at the cost of more VRAM and possibly hitting the model's context length limit.
@ -56,7 +118,7 @@ tts.tts_to_file(text="It took me quite a long time to develop a voice, and now t
)
```
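
As a concrete sketch of such a call (placeholder paths; `split_sentences=False` disables the splitting described above):

```python
from TTS.api import TTS

tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=True)

# Keep the whole text in one pass: better coherence, but more VRAM and a
# chance of hitting the model's context length limit.
tts.tts_to_file(
    text="It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
    file_path="output.wav",
    speaker_wav="/path/to/target/speaker.wav",
    language="en",
    split_sentences=False,
)
```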
##### Multiple references
|
###### Multiple references
|
||||||
|
|
||||||
You can pass multiple audio files to the `speaker_wav` argument for better voice cloning.
|
You can pass multiple audio files to the `speaker_wav` argument for better voice cloning.
|
||||||
|
|
||||||
|
@ -81,34 +143,23 @@ tts.tts_to_file(text="It took me quite a long time to develop a voice, and now t
language="en")
```
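
For example, a sketch with placeholder reference paths:

```python
from TTS.api import TTS

tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=True)

# Several reference clips give the model a richer picture of the target voice.
tts.tts_to_file(
    text="It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
    file_path="output.wav",
    speaker_wav=["/path/to/target/speaker.wav", "/path/to/target/speaker_2.wav"],
    language="en",
)
```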

##### Coqui speakers

You can run inference with one of the available speakers using the following code:

```python
from TTS.api import TTS

tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=True)

# generate speech by cloning a voice using default settings
tts.tts_to_file(text="It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
                file_path="output.wav",
                speaker="Ana Florence",
                language="en",
                split_sentences=True
                )
```

#### 🐸TTS Model API

@ -1,113 +0,0 @@
import os
import unittest

from tests import get_tests_data_path, get_tests_output_path
from TTS.api import CS_API, TTS

OUTPUT_PATH = os.path.join(get_tests_output_path(), "test_python_api.wav")
cloning_test_wav_path = os.path.join(get_tests_data_path(), "ljspeech/wavs/LJ001-0028.wav")


is_coqui_available = os.environ.get("COQUI_STUDIO_TOKEN")


if is_coqui_available:

    class CS_APITest(unittest.TestCase):
        def test_speakers(self):
            tts = CS_API()
            self.assertGreater(len(tts.speakers), 1)

        def test_emotions(self):
            tts = CS_API()
            self.assertGreater(len(tts.emotions), 1)

        def test_list_calls(self):
            tts = CS_API()
            self.assertGreater(len(tts.list_voices()), 1)
            self.assertGreater(len(tts.list_speakers()), 1)
            self.assertGreater(len(tts.list_all_speakers()), 1)
            self.assertGreater(len(tts.list_speakers_as_tts_models()), 1)

        def test_name_to_speaker(self):
            tts = CS_API()
            speaker_name = tts.list_speakers_as_tts_models()[0].split("/")[2]
            speaker = tts.name_to_speaker(speaker_name)
            self.assertEqual(speaker.name, speaker_name)

        def test_tts(self):
            tts = CS_API()
            wav, sr = tts.tts(text="This is a test.", speaker_name=tts.list_speakers()[0].name)
            self.assertEqual(sr, 44100)
            self.assertGreater(len(wav), 1)


class TTSTest(unittest.TestCase):
    def test_single_speaker_model(self):
        tts = TTS(model_name="tts_models/de/thorsten/tacotron2-DDC", progress_bar=False, gpu=False)

        # a single-speaker model must reject speaker/language arguments
        error_raised = False
        try:
            tts.tts_to_file(text="Ich bin eine Testnachricht.", speaker="Thorsten", language="de")
        except ValueError:
            error_raised = True

        tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path=OUTPUT_PATH)

        self.assertTrue(error_raised)
        self.assertFalse(tts.is_multi_speaker)
        self.assertFalse(tts.is_multi_lingual)
        self.assertIsNone(tts.speakers)
        self.assertIsNone(tts.languages)

    def test_studio_model(self):
        tts = TTS(model_name="coqui_studio/en/Zacharie Aimilios/coqui_studio")
        tts.tts_to_file(text="This is a test.")

        # check that speed > 2.0 raises an error
        raised_error = False
        try:
            _ = tts.tts(text="This is a test.", speed=4.0, emotion="Sad")  # should raise error with speed > 2.0
        except ValueError:
            raised_error = True
        self.assertTrue(raised_error)

        # check that an invalid emotion raises an error
        raised_error = False
        try:
            _ = tts.tts(text="This is a test.", speed=2.0, emotion="No Emo")  # should raise error with an invalid emotion
        except ValueError:
            raised_error = True
        self.assertTrue(raised_error)

        # check a valid call
        wav = tts.tts(text="This is a test.", speed=2.0, emotion="Sad")
        self.assertGreater(len(wav), 0)

    def test_fairseq_model(self):  # pylint: disable=no-self-use
        tts = TTS(model_name="tts_models/eng/fairseq/vits")
        tts.tts_to_file(text="This is a test.")

    def test_multi_speaker_multi_lingual_model(self):
        tts = TTS()
        tts.load_tts_model_by_name(tts.models[0])  # YourTTS
        tts.tts_to_file(
            text="Hello world!", speaker=tts.speakers[0], language=tts.languages[0], file_path=OUTPUT_PATH
        )

        self.assertTrue(tts.is_multi_speaker)
        self.assertTrue(tts.is_multi_lingual)
        self.assertGreater(len(tts.speakers), 1)
        self.assertGreater(len(tts.languages), 1)

    def test_voice_cloning(self):  # pylint: disable=no-self-use
        tts = TTS()
        tts.load_tts_model_by_name("tts_models/multilingual/multi-dataset/your_tts")
        tts.tts_to_file("Hello world!", speaker_wav=cloning_test_wav_path, language="en", file_path=OUTPUT_PATH)

    def test_voice_conversion(self):  # pylint: disable=no-self-use
        tts = TTS(model_name="voice_conversion_models/multilingual/vctk/freevc24", progress_bar=False, gpu=False)
        tts.voice_conversion_to_file(
            source_wav=cloning_test_wav_path,
            target_wav=cloning_test_wav_path,
            file_path=OUTPUT_PATH,
        )
@ -1,25 +0,0 @@
import os

from tests import get_tests_output_path, run_cli


def test_synthesize():
    """Test synthesize.py with different arguments."""
    output_path = os.path.join(get_tests_output_path(), "output.wav")

    # 🐸 Coqui studio model
    run_cli(
        'tts --model_name "coqui_studio/en/Torcull Diarmuid/coqui_studio" '
        '--text "This is it" '
        f'--out_path "{output_path}"'
    )

    # 🐸 Coqui studio model with speed arg.
    run_cli(
        'tts --model_name "coqui_studio/en/Torcull Diarmuid/coqui_studio" '
        '--text "This is it but slow" --speed 0.1 '
        f'--out_path "{output_path}"'
    )

    # test pipe_out command
    run_cli(f'tts --text "test." --pipe_out --out_path "{output_path}" | aplay')