Merge pull request #3173 from coqui-ai/dev

v0.20.2
Eren Gölge 2023-11-08 16:08:22 +01:00 committed by GitHub
commit ab57c36c2b
15 changed files with 383 additions and 437 deletions


@ -2,7 +2,7 @@
## 🐸Coqui.ai News
- 📣 ⓍTTSv2 is here with 16 languages and better performance across the board.
- 📣 ⓍTTS fine-tuning code is out. Check the [example recipes](https://github.com/coqui-ai/TTS/tree/dev/recipes/ljspeech).
- 📣 ⓍTTS can now stream with <200ms latency.
- 📣 ⓍTTS, our production TTS model that can speak 13 languages, is released [Blog Post](https://coqui.ai/blog/tts/open_xtts), [Demo](https://huggingface.co/spaces/coqui/xtts), [Docs](https://tts.readthedocs.io/en/dev/models/xtts.html)
- 📣 [🐶Bark](https://github.com/suno-ai/bark) is now available for inference with unconstrained voice cloning. [Docs](https://tts.readthedocs.io/en/dev/models/bark.html)
- 📣 You can use [~1100 Fairseq models](https://github.com/facebookresearch/fairseq/tree/main/examples/mms) with 🐸TTS.
@ -205,7 +205,7 @@ device = "cuda" if torch.cuda.is_available() else "cpu"
print(TTS().list_models())
# Init TTS
-tts = TTS("tts_models/multilingual/multi-dataset/xtts_v1").to(device)
+tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
# Run TTS
# ❗ Since this model is multi-lingual voice cloning model, we must set the target speaker_wav and language
@ -267,19 +267,13 @@ models = TTS(cs_api_model="XTTS").list_models()
# Init TTS with the target studio speaker
tts = TTS(model_name="coqui_studio/en/Torcull Diarmuid/coqui_studio", progress_bar=False)
# Run TTS
-tts.tts_to_file(text="This is a test.", file_path=OUTPUT_PATH)
+tts.tts_to_file(text="This is a test.", language="en", file_path=OUTPUT_PATH)
# V1 model
models = TTS(cs_api_model="V1").list_models()
# Run TTS with emotion and speed control
# Emotion control only works with V1 model
tts.tts_to_file(text="This is a test.", file_path=OUTPUT_PATH, emotion="Happy", speed=1.5)
-# XTTS-multilingual
-models = TTS(cs_api_model="XTTS-multilingual").list_models()
-# Run TTS with emotion and speed control
-# Emotion control only works with V1 model
-tts.tts_to_file(text="Das ist ein Test.", file_path=OUTPUT_PATH, language="de", speed=1.0)
```
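For orientation, here is a minimal end-to-end sketch of the updated snippet; the reference clip path is hypothetical and the XTTS v2 weights are downloaded on first use:

```python
import torch
from TTS.api import TTS

device = "cuda" if torch.cuda.is_available() else "cpu"
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)

# XTTS is a voice-cloning model, so both a reference clip and a language are required.
tts.tts_to_file(
    text="Hello, this is a cloned voice speaking.",
    speaker_wav="path/to/reference.wav",  # hypothetical local recording of the target speaker
    language="en",
    file_path="output.wav",
)
```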
#### Example text to speech using **Fairseq models in ~1100 languages** 🤯.


@ -1 +1 @@
-0.20.1
+0.20.2


@ -60,7 +60,7 @@ class TTS(nn.Module):
vocoder_config_path (str, optional): Path to the vocoder config. Defaults to None.
progress_bar (bool, optional): Whether to print a progress bar while downloading a model. Defaults to True.
cs_api_model (str, optional): Name of the model to use for the Coqui Studio API. Available models are
-    "XTTS", "XTTS-multilingual", "V1". You can also use `TTS.cs_api.CS_API` for more control.
+    "XTTS", "V1". You can also use `TTS.cs_api.CS_API` for more control.
    Defaults to "XTTS".
gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
"""
@ -275,7 +275,7 @@ class TTS(nn.Module):
speaker_name (str, optional):
    Speaker name from Coqui Studio. Defaults to None.
language (str): Language of the text. If None, the default language of the speaker is used. Language is only
-    supported by `XTTS-multilang` model. Currently supports en, de, es, fr, it, pt, pl. Defaults to "en".
+    supported by `XTTS` model.
emotion (str, optional):
    Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull". Emotions are only available
    with "V1" model. Defaults to None.
@ -321,7 +321,7 @@ class TTS(nn.Module):
Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by
`tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None.
language (str): Language of the text. If None, the default language of the speaker is used. Language is only
-    supported by `XTTS-multilang` model. Currently supports en, de, es, fr, it, pt, pl. Defaults to "en".
+    supported by `XTTS` model.
speaker_wav (str, optional):
    Path to a reference wav file to use for voice cloning with supporting models like YourTTS.
    Defaults to None.


@ -227,7 +227,7 @@ def main():
parser.add_argument(
    "--cs_model",
    type=str,
-    help="Name of the 🐸Coqui Studio model. Available models are `XTTS`, `XTTS-multilingual`, `V1`.",
+    help="Name of the 🐸Coqui Studio model. Available models are `XTTS`, `V1`.",
)
parser.add_argument( parser.add_argument(
"--emotion", "--emotion",
@ -238,7 +238,7 @@ def main():
parser.add_argument(
    "--language",
    type=str,
-    help="Language to condition the model with. Only available for 🐸Coqui Studio `XTTS-multilingual` model.",
+    help="Language to condition the model with. Only available for 🐸Coqui Studio `XTTS` model.",
    default=None,
)
parser.add_argument(


@ -43,7 +43,7 @@ class CS_API:
Args:
    api_token (str): 🐸Coqui Studio API token. If not provided, it will be read from the environment variable
        `COQUI_STUDIO_TOKEN`.
-    model (str): 🐸Coqui Studio model. It can be either `V1`, `XTTS`, or `XTTS-multilang`. Default is `XTTS`.
+    model (str): 🐸Coqui Studio model. It can be either `V1` or `XTTS`. Default is `XTTS`.
Example listing all available speakers:
@ -65,7 +65,7 @@ class CS_API:
Example with multi-language model:
    >>> from TTS.api import CS_API
-    >>> tts = CS_API(model="XTTS-multilang")
+    >>> tts = CS_API(model="XTTS")
    >>> wav, sr = api.tts("Hello world", speaker_name=tts.speakers[0].name, language="en")
"""
@ -78,16 +78,12 @@ class CS_API:
"XTTS": { "XTTS": {
"list_speakers": "https://app.coqui.ai/api/v2/speakers", "list_speakers": "https://app.coqui.ai/api/v2/speakers",
"synthesize": "https://app.coqui.ai/api/v2/samples/xtts/render/", "synthesize": "https://app.coqui.ai/api/v2/samples/xtts/render/",
"list_voices": "https://app.coqui.ai/api/v2/voices/xtts/", "list_voices": "https://app.coqui.ai/api/v2/voices/xtts",
},
"XTTS-multilang": {
"list_speakers": "https://app.coqui.ai/api/v2/speakers",
"synthesize": "https://app.coqui.ai/api/v2/samples/multilingual/render/",
"list_voices": "https://app.coqui.ai/api/v2/voices/xtts/",
}, },
} }
SUPPORTED_LANGUAGES = ["en", "es", "de", "fr", "it", "pt", "pl"]
SUPPORTED_LANGUAGES = ["en", "es", "de", "fr", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn", "ja"]
def __init__(self, api_token=None, model="XTTS"):
    self.api_token = api_token
@ -139,7 +135,7 @@ class CS_API:
self._check_token()
conn = http.client.HTTPSConnection("app.coqui.ai")
url = self.MODEL_ENDPOINTS[self.model]["list_speakers"]
-conn.request("GET", f"{url}?per_page=100", headers=self.headers)
+conn.request("GET", f"{url}?page=1&per_page=100", headers=self.headers)
res = conn.getresponse()
data = res.read()
return [Speaker(s) for s in json.loads(data)["result"]]
@ -148,7 +144,7 @@ class CS_API:
"""List custom voices created by the user.""" """List custom voices created by the user."""
conn = http.client.HTTPSConnection("app.coqui.ai") conn = http.client.HTTPSConnection("app.coqui.ai")
url = self.MODEL_ENDPOINTS[self.model]["list_voices"] url = self.MODEL_ENDPOINTS[self.model]["list_voices"]
conn.request("GET", f"{url}", headers=self.headers) conn.request("GET", f"{url}?page=1&per_page=100", headers=self.headers)
res = conn.getresponse() res = conn.getresponse()
data = res.read() data = res.read()
return [Speaker(s, True) for s in json.loads(data)["result"]] return [Speaker(s, True) for s in json.loads(data)["result"]]
@ -197,14 +193,6 @@ class CS_API:
    }
)
elif model == "XTTS":
-    payload.update(
-        {
-            "name": speaker.name,
-            "text": text,
-            "speed": speed,
-        }
-    )
-elif model == "XTTS-multilang":
    payload.update(
        {
            "name": speaker.name,
@ -226,13 +214,10 @@ class CS_API:
assert language is None, "❗ language is not supported for V1 model."
elif self.model == "XTTS":
    assert emotion is None, f"❗ Emotions are not supported for XTTS model. Use V1 model."
-    assert language is None, "❗ Language is not supported for XTTS model. Use XTTS-multilang model."
+    assert language is not None, "❗ Language is required for XTTS model."
-elif self.model == "XTTS-multilang":
-    assert emotion is None, f"❗ Emotions are not supported for XTTS-multilang model. Use V1 model."
-    assert language is not None, "❗ Language is required for XTTS-multilang model."
    assert (
        language in self.SUPPORTED_LANGUAGES
-    ), f"❗ Language {language} is not yet supported. Use one of: en, es, de, fr, it, pt, pl"
+    ), f"❗ Language {language} is not yet supported. Check https://docs.coqui.ai/reference/samples_xtts_create."
return text, speaker_name, speaker_id, emotion, speed, language
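For context, a quick sketch of how a caller hits the checks above now that the multilingual variant is folded into `XTTS`; the token is a placeholder:

```python
from TTS.cs_api import CS_API

# Placeholder token; a real Coqui Studio API token is required.
api = CS_API(api_token="YOUR_TOKEN", model="XTTS")

# `language` is now mandatory for XTTS and must be in CS_API.SUPPORTED_LANGUAGES,
# otherwise the assertions above fail before any request is sent.
wav, sample_rate = api.tts(
    text="Hello world",
    speaker_name=api.speakers[0].name,
    language="en",
)
```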
def tts(
@ -255,7 +240,7 @@ class CS_API:
    supported by `V1` model. Defaults to None.
speed (float): Speed of the speech. 1.0 is normal speed.
language (str): Language of the text. If None, the default language of the speaker is used. Language is only
-    supported by `XTTS-multilang` model. Currently supports en, de, es, fr, it, pt, pl. Defaults to "en".
+    supported by `XTTS` model. See https://docs.coqui.ai/reference/samples_xtts_create for supported languages.
"""
self._check_token()
self.ping_api()
@ -305,7 +290,7 @@ class CS_API:
speed (float): Speed of the speech. 1.0 is normal speed.
pipe_out (BytesIO, optional): Flag to stdout the generated TTS wav file for shell pipe.
language (str): Language of the text. If None, the default language of the speaker is used. Language is only
-    supported by `XTTS-multilang` model. Currently supports en, de, es, fr, it, pt, pl. Defaults to "en".
+    supported by `XTTS` model. Currently supports en, de, es, fr, it, pt, pl. Defaults to "en".
file_path (str): Path to save the file. If None, a temporary file is created.
"""
if file_path is None:
@ -323,20 +308,7 @@ if __name__ == "__main__":
print(api.list_speakers_as_tts_models())
ts = time.time()
-wav, sr = api.tts("It took me quite a long time to develop a voice.", speaker_name=api.speakers[0].name)
+wav, sr = api.tts("It took me quite a long time to develop a voice.", language="en", speaker_name=api.speakers[0].name)
print(f" [i] XTTS took {time.time() - ts:.2f}s")
-filepath = api.tts_to_file(text="Hello world!", speaker_name=api.speakers[0].name, file_path="output.wav")
+filepath = api.tts_to_file(text="Hello world!", speaker_name=api.speakers[0].name, language="en", file_path="output.wav")
-api = CS_API(model="XTTS-multilang")
-print(api.speakers)
-ts = time.time()
-wav, sr = api.tts(
-    "It took me quite a long time to develop a voice.", speaker_name=api.speakers[0].name, language="en"
-)
-print(f" [i] XTTS took {time.time() - ts:.2f}s")
-filepath = api.tts_to_file(
-    text="Hello world!", speaker_name=api.speakers[0].name, file_path="output.wav", language="en"
-)


@ -37,29 +37,11 @@ class XttsConfig(BaseTTSConfig):
    If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.
    Defaults to `0.8`.
-cond_free_k (float):
-    Knob that determines how to balance the conditioning free signal with the conditioning-present signal. [0,inf].
-    As cond_free_k increases, the output becomes dominated by the conditioning-free signal.
-    Formula is: output=cond_present_output*(cond_free_k+1)-cond_absenct_output*cond_free_k. Defaults to `2.0`.
-diffusion_temperature (float):
-    Controls the variance of the noise fed into the diffusion model. [0,1]. Values at 0
-    are the "mean" prediction of the diffusion network and will sound bland and smeared.
-    Defaults to `1.0`.
num_gpt_outputs (int):
    Number of samples taken from the autoregressive model, all of which are filtered using CLVP.
    As XTTS is a probabilistic model, more samples means a higher probability of creating something "great".
    Defaults to `16`.
-decoder_iterations (int):
-    Number of diffusion steps to perform. [0,4000]. More steps means the network has more chances to iteratively refine
-    the output, which should theoretically mean a higher quality output. Generally a value above 250 is not noticeably better,
-    however. Defaults to `30`.
-decoder_sampler (str):
-    Diffusion sampler to be used. `ddim` or `dpm++2m`. Defaults to `ddim`.
gpt_cond_len (int):
    Secs audio to be used as conditioning for the autoregressive model. Defaults to `3`.
@ -110,11 +92,7 @@ class XttsConfig(BaseTTSConfig):
repetition_penalty: float = 2.0
top_k: int = 50
top_p: float = 0.85
-cond_free_k: float = 2.0
-diffusion_temperature: float = 1.0
num_gpt_outputs: int = 1
-decoder_iterations: int = 30
-decoder_sampler: str = "ddim"
# cloning
gpt_cond_len: int = 3
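As a quick sanity check of what is still configurable, a hedged sketch that only uses fields visible in this diff; removed fields such as `decoder_iterations` or `cond_free_k` are no longer defined on the dataclass:

```python
from TTS.tts.configs.xtts_config import XttsConfig

config = XttsConfig(
    repetition_penalty=2.0,
    top_k=50,
    top_p=0.85,
    num_gpt_outputs=1,
    gpt_cond_len=3,
)
print(config.top_p)  # 0.85
```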


@ -8,6 +8,7 @@ from hangul_romanize import Transliter
from hangul_romanize.rule import academic
from num2words import num2words
from tokenizers import Tokenizer
+from functools import cached_property
from TTS.tts.layers.xtts.zh_num2words import TextNorm as zh_num2words
@ -535,11 +536,50 @@ DEFAULT_VOCAB_FILE = os.path.join(os.path.dirname(os.path.realpath(__file__)), "
class VoiceBpeTokenizer:
    def __init__(self, vocab_file=None):
        self.tokenizer = None
-       self.katsu = None
        if vocab_file is not None:
            self.tokenizer = Tokenizer.from_file(vocab_file)
        self.char_limits = {
            "en": 250,
            "de": 253,
            "fr": 273,
            "es": 239,
            "it": 213,
            "pt": 203,
            "pl": 224,
            "zh-cn": 82,
            "ar": 166,
            "cs": 186,
            "ru": 182,
            "nl": 251,
            "tr": 226,
            "ja": 71,
            "hu": 224,
            "ko": 95,
        }

    @cached_property
    def katsu(self):
        import cutlet
        return cutlet.Cutlet()

    def check_input_length(self, txt, lang):
        limit = self.char_limits.get(lang, 250)
        if len(txt) > limit:
            print(f"[!] Warning: The text length exceeds the character limit of {limit} for language '{lang}', this might cause truncated audio.")

    def preprocess_text(self, txt, lang):
        if lang in ["en", "es", "fr", "de", "pt", "it", "pl", "ar", "cs", "ru", "nl", "tr", "zh-cn"]:
            txt = multilingual_cleaners(txt, lang)
            if lang == "zh-cn":
                txt = chinese_transliterate(txt)
        elif lang == "ja":
            txt = japanese_cleaners(txt, self.katsu)
        else:
            raise NotImplementedError()
        return txt

    def encode(self, txt, lang):
+       self.check_input_length(txt, lang)
        txt = self.preprocess_text(txt, lang)
        txt = f"[{lang}]{txt}"
        txt = txt.replace(" ", "[SPACE]")
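The effect of the new per-language limit is easiest to see in isolation; a small sketch assuming a vocab file from an XTTS checkpoint is available locally:

```python
from TTS.tts.layers.xtts.tokenizer import VoiceBpeTokenizer

# Hypothetical path; in practice this is the vocab.json shipped with an XTTS checkpoint.
tok = VoiceBpeTokenizer(vocab_file="/path/to/xtts/vocab.json")

short = "Hello there."
long_text = "word " * 100  # ~500 characters, well past the 250-character English limit

tok.check_input_length(short, "en")      # stays silent
tok.check_input_length(long_text, "en")  # prints the truncation warning

ids = tok.encode(short, "en")  # encode() now runs the same check before tokenizing
```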


@ -152,19 +152,6 @@ class XttsArgs(Coqpit):
gpt_code_stride_len (int, optional): The hop_size of dvae and consequently of the gpt output. Defaults to 1024.
gpt_use_masking_gt_prompt_approach (bool, optional): If True, it will use ground truth as prompt and it will mask the loss to avoid repetition. Defaults to True.
gpt_use_perceiver_resampler (bool, optional): If True, it will use perceiver resampler from flamingo paper - https://arxiv.org/abs/2204.14198. Defaults to False.
-For DiffTTS model:
-    diff_model_channels (int, optional): The number of channels for the DiffTTS model. Defaults to 1024.
-    diff_num_layers (int, optional): The number of layers for the DiffTTS model. Defaults to 10.
-    diff_in_channels (int, optional): The input channels for the DiffTTS model. Defaults to 100.
-    diff_out_channels (int, optional): The output channels for the DiffTTS model. Defaults to 200.
-    diff_in_latent_channels (int, optional): The input latent channels for the DiffTTS model. Defaults to 1024.
-    diff_in_tokens (int, optional): The input tokens for the DiffTTS model. Defaults to 8193.
-    diff_dropout (int, optional): The dropout percentage for the DiffTTS model. Defaults to 0.
-    diff_use_fp16 (bool, optional): Whether to use fp16 for the DiffTTS model. Defaults to False.
-    diff_num_heads (int, optional): The number of heads for the DiffTTS model. Defaults to 16.
-    diff_layer_drop (int, optional): The layer dropout percentage for the DiffTTS model. Defaults to 0.
-    diff_unconditioned_percentage (int, optional): The percentage of unconditioned inputs for the DiffTTS model. Defaults to 0.
"""
gpt_batch_size: int = 1
@ -193,19 +180,6 @@ class XttsArgs(Coqpit):
gpt_use_masking_gt_prompt_approach: bool = True
gpt_use_perceiver_resampler: bool = False
-# Diffusion Decoder params
-diff_model_channels: int = 1024
-diff_num_layers: int = 10
-diff_in_channels: int = 100
-diff_out_channels: int = 200
-diff_in_latent_channels: int = 1024
-diff_in_tokens: int = 8193
-diff_dropout: int = 0
-diff_use_fp16: bool = False
-diff_num_heads: int = 16
-diff_layer_drop: int = 0
-diff_unconditioned_percentage: int = 0
# HifiGAN Decoder params
input_sample_rate: int = 22050
output_sample_rate: int = 24000
@ -426,10 +400,6 @@ class Xtts(BaseTTS):
"repetition_penalty": config.repetition_penalty, "repetition_penalty": config.repetition_penalty,
"top_k": config.top_k, "top_k": config.top_k,
"top_p": config.top_p, "top_p": config.top_p,
"cond_free_k": config.cond_free_k,
"diffusion_temperature": config.diffusion_temperature,
"decoder_iterations": config.decoder_iterations,
"decoder_sampler": config.decoder_sampler,
"gpt_cond_len": config.gpt_cond_len, "gpt_cond_len": config.gpt_cond_len,
"max_ref_len": config.max_ref_len, "max_ref_len": config.max_ref_len,
"sound_norm_refs": config.sound_norm_refs, "sound_norm_refs": config.sound_norm_refs,
@ -454,13 +424,6 @@ class Xtts(BaseTTS):
gpt_cond_len=6,
max_ref_len=10,
sound_norm_refs=False,
-# Decoder inference
-decoder_iterations=100,
-cond_free=True,
-cond_free_k=2,
-diffusion_temperature=1.0,
-decoder_sampler="ddim",
-decoder="hifigan",
**hf_generate_kwargs,
):
"""
@ -603,10 +566,21 @@ class Xtts(BaseTTS):
if wav_gen_prev is not None:
    wav_chunk = wav_gen[(wav_gen_prev.shape[0] - overlap_len) : -overlap_len]
if wav_overlap is not None:
-    crossfade_wav = wav_chunk[:overlap_len]
-    crossfade_wav = crossfade_wav * torch.linspace(0.0, 1.0, overlap_len).to(crossfade_wav.device)
-    wav_chunk[:overlap_len] = wav_overlap * torch.linspace(1.0, 0.0, overlap_len).to(wav_overlap.device)
-    wav_chunk[:overlap_len] += crossfade_wav
+    # cross fade the overlap section
+    if overlap_len > len(wav_chunk):
+        # wav_chunk is smaller than overlap_len, pass on last wav_gen
+        if wav_gen_prev is not None:
+            wav_chunk = wav_gen[(wav_gen_prev.shape[0] - overlap_len):]
+        else:
+            # not expecting will hit here as problem happens on last chunk
+            wav_chunk = wav_gen[-overlap_len:]
+        return wav_chunk, wav_gen, None
+    else:
+        crossfade_wav = wav_chunk[:overlap_len]
+        crossfade_wav = crossfade_wav * torch.linspace(0.0, 1.0, overlap_len).to(crossfade_wav.device)
+        wav_chunk[:overlap_len] = wav_overlap * torch.linspace(1.0, 0.0, overlap_len).to(wav_overlap.device)
+        wav_chunk[:overlap_len] += crossfade_wav
wav_overlap = wav_gen[-overlap_len:]
wav_gen_prev = wav_gen
return wav_chunk, wav_gen_prev, wav_overlap
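The extra branch exists because the final chunk of a stream can be shorter than the configured overlap, in which case the old in-place crossfade could fail with a shape mismatch. A self-contained sketch of the same linear crossfade idea, with illustrative tensors rather than the model's buffers:

```python
import torch

def linear_crossfade(prev_tail: torch.Tensor, new_head: torch.Tensor) -> torch.Tensor:
    """Fade out the previous overlap while fading in the new one."""
    n = prev_tail.shape[0]
    fade_in = torch.linspace(0.0, 1.0, n)
    fade_out = torch.linspace(1.0, 0.0, n)
    return prev_tail * fade_out + new_head * fade_in

overlap_len = 1024
prev_chunk = torch.randn(4096)   # tail of previously generated audio
new_chunk = torch.randn(2048)    # freshly generated audio

if overlap_len > len(new_chunk):
    # mirrors the new guard: a too-short chunk is passed through instead of crossfaded
    out = new_chunk
else:
    new_chunk[:overlap_len] = linear_crossfade(prev_chunk[-overlap_len:], new_chunk[:overlap_len])
    out = new_chunk
print(out.shape)
```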


@ -109,7 +109,6 @@ class ModelManager(object):
def _list_for_model_type(self, model_type):
    models_name_list = []
    model_count = 1
-    model_type = "tts_models"
    models_name_list.extend(self._list_models(model_type, model_count))
    return models_name_list
@ -298,22 +297,22 @@ class ModelManager(object):
model_item = self.set_model_url(model_item)
return model_item, model_full_name, model, md5hash

-    def ask_tos(self, model_full_path):
+    @staticmethod
+    def ask_tos(model_full_path):
        """Ask the user to agree to the terms of service"""
        tos_path = os.path.join(model_full_path, "tos_agreed.txt")
-        if not os.path.exists(tos_path):
-            print(" > You must agree to the terms of service to use this model.")
-            print(" | > Please see the terms of service at https://coqui.ai/cpml.txt")
-            print(' | > "I have read, understood and agreed the Terms and Conditions." - [y/n]')
-            answer = input(" | | > ")
-            if answer.lower() == "y":
-                with open(tos_path, "w") as f:
-                    f.write("I have read, understood ad agree the Terms and Conditions.")
-                return True
-            else:
-                return False
+        print(" > You must agree to the terms of service to use this model.")
+        print(" | > Please see the terms of service at https://coqui.ai/cpml.txt")
+        print(' | > "I have read, understood and agreed to the Terms and Conditions." - [y/n]')
+        answer = input(" | | > ")
+        if answer.lower() == "y":
+            with open(tos_path, "w", encoding="utf-8") as f:
+                f.write("I have read, understood and agreed to the Terms and Conditions.")
+            return True
+        return False

-    def tos_agreed(self, model_item, model_full_path):
+    @staticmethod
+    def tos_agreed(model_item, model_full_path):
        """Check if the user has agreed to the terms of service"""
        if "tos_required" in model_item and model_item["tos_required"]:
            tos_path = os.path.join(model_full_path, "tos_agreed.txt")
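Since both helpers are now static, they can be called without constructing a manager; a hedged sketch of the expected flow (the model path is hypothetical):

```python
from TTS.utils.manage import ModelManager

# Hypothetical download location of a CPML-licensed model.
model_path = "/home/user/.local/share/tts/tts_models--multilingual--multi-dataset--xtts_v2"
model_item = {"tos_required": True}

if not ModelManager.tos_agreed(model_item, model_path):
    # Prompts on stdin and, on "y", writes tos_agreed.txt into the model directory.
    accepted = ModelManager.ask_tos(model_path)
```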


@ -1,5 +1,278 @@
from dataclasses import dataclass, field
-from typing import List
+from typing import List, Optional
+from coqpit import Coqpit
from TTS.vc.configs.shared_configs import BaseVCConfig
-from TTS.vc.models.freevc import FreeVCArgs, FreeVCAudioConfig, FreeVCConfig
@dataclass
class FreeVCAudioConfig(Coqpit):
"""Audio configuration
Args:
max_wav_value (float):
The maximum value of the waveform.
input_sample_rate (int):
The sampling rate of the input waveform.
output_sample_rate (int):
The sampling rate of the output waveform.
filter_length (int):
The length of the filter.
hop_length (int):
The hop length.
win_length (int):
The window length.
n_mel_channels (int):
The number of mel channels.
mel_fmin (float):
The minimum frequency of the mel filterbank.
mel_fmax (Optional[float]):
The maximum frequency of the mel filterbank.
"""
max_wav_value: float = field(default=32768.0)
input_sample_rate: int = field(default=16000)
output_sample_rate: int = field(default=24000)
filter_length: int = field(default=1280)
hop_length: int = field(default=320)
win_length: int = field(default=1280)
n_mel_channels: int = field(default=80)
mel_fmin: float = field(default=0.0)
mel_fmax: Optional[float] = field(default=None)
@dataclass
class FreeVCArgs(Coqpit):
"""FreeVC model arguments
Args:
spec_channels (int):
The number of channels in the spectrogram.
inter_channels (int):
The number of channels in the intermediate layers.
hidden_channels (int):
The number of channels in the hidden layers.
filter_channels (int):
The number of channels in the filter layers.
n_heads (int):
The number of attention heads.
n_layers (int):
The number of layers.
kernel_size (int):
The size of the kernel.
p_dropout (float):
The dropout probability.
resblock (str):
The type of residual block.
resblock_kernel_sizes (List[int]):
The kernel sizes for the residual blocks.
resblock_dilation_sizes (List[List[int]]):
The dilation sizes for the residual blocks.
upsample_rates (List[int]):
The upsample rates.
upsample_initial_channel (int):
The number of channels in the initial upsample layer.
upsample_kernel_sizes (List[int]):
The kernel sizes for the upsample layers.
n_layers_q (int):
The number of layers in the quantization network.
use_spectral_norm (bool):
Whether to use spectral normalization.
gin_channels (int):
The number of channels in the global conditioning vector.
ssl_dim (int):
The dimension of the self-supervised learning embedding.
use_spk (bool):
Whether to use external speaker encoder.
"""
spec_channels: int = field(default=641)
inter_channels: int = field(default=192)
hidden_channels: int = field(default=192)
filter_channels: int = field(default=768)
n_heads: int = field(default=2)
n_layers: int = field(default=6)
kernel_size: int = field(default=3)
p_dropout: float = field(default=0.1)
resblock: str = field(default="1")
resblock_kernel_sizes: List[int] = field(default_factory=lambda: [3, 7, 11])
resblock_dilation_sizes: List[List[int]] = field(default_factory=lambda: [[1, 3, 5], [1, 3, 5], [1, 3, 5]])
upsample_rates: List[int] = field(default_factory=lambda: [10, 8, 2, 2])
upsample_initial_channel: int = field(default=512)
upsample_kernel_sizes: List[int] = field(default_factory=lambda: [16, 16, 4, 4])
n_layers_q: int = field(default=3)
use_spectral_norm: bool = field(default=False)
gin_channels: int = field(default=256)
ssl_dim: int = field(default=1024)
use_spk: bool = field(default=False)
num_spks: int = field(default=0)
segment_size: int = field(default=8960)
@dataclass
class FreeVCConfig(BaseVCConfig):
"""Defines parameters for FreeVC End2End TTS model.
Args:
model (str):
Model name. Do not change unless you know what you are doing.
model_args (FreeVCArgs):
Model architecture arguments. Defaults to `FreeVCArgs()`.
audio (FreeVCAudioConfig):
Audio processing configuration. Defaults to `FreeVCAudioConfig()`.
grad_clip (List):
Gradient clipping thresholds for each optimizer. Defaults to `[1000.0, 1000.0]`.
lr_gen (float):
Initial learning rate for the generator. Defaults to 0.0002.
lr_disc (float):
Initial learning rate for the discriminator. Defaults to 0.0002.
lr_scheduler_gen (str):
Name of the learning rate scheduler for the generator. One of the `torch.optim.lr_scheduler.*`. Defaults to
`ExponentialLR`.
lr_scheduler_gen_params (dict):
Parameters for the learning rate scheduler of the generator. Defaults to `{'gamma': 0.999875, "last_epoch":-1}`.
lr_scheduler_disc (str):
Name of the learning rate scheduler for the discriminator. One of the `torch.optim.lr_scheduler.*`. Defaults to
`ExponentialLR`.
lr_scheduler_disc_params (dict):
Parameters for the learning rate scheduler of the discriminator. Defaults to `{'gamma': 0.999875, "last_epoch":-1}`.
scheduler_after_epoch (bool):
If true, step the schedulers after each epoch else after each step. Defaults to `False`.
optimizer (str):
Name of the optimizer to use with both the generator and the discriminator networks. One of the
`torch.optim.*`. Defaults to `AdamW`.
kl_loss_alpha (float):
Loss weight for KL loss. Defaults to 1.0.
disc_loss_alpha (float):
Loss weight for the discriminator loss. Defaults to 1.0.
gen_loss_alpha (float):
Loss weight for the generator loss. Defaults to 1.0.
feat_loss_alpha (float):
Loss weight for the feature matching loss. Defaults to 1.0.
mel_loss_alpha (float):
Loss weight for the mel loss. Defaults to 45.0.
return_wav (bool):
If true, data loader returns the waveform as well as the other outputs. Do not change. Defaults to `True`.
compute_linear_spec (bool):
If true, the linear spectrogram is computed and returned alongside the mel output. Do not change. Defaults to `True`.
use_weighted_sampler (bool):
If true, use weighted sampler with bucketing for balancing samples between datasets used in training. Defaults to `False`.
weighted_sampler_attrs (dict):
Key returned by the formatter to be used for weighted sampler. For example `{"root_path": 2.0, "speaker_name": 1.0}` sets sample probabilities
by overweighting `root_path` by 2.0. Defaults to `{}`.
weighted_sampler_multipliers (dict):
Weight each unique value of a key returned by the formatter for weighted sampling.
For example `{"root_path":{"/raid/datasets/libritts-clean-16khz-bwe-coqui_44khz/LibriTTS/train-clean-100/":1.0, "/raid/datasets/libritts-clean-16khz-bwe-coqui_44khz/LibriTTS/train-clean-360/": 0.5}`.
It will sample instances from `train-clean-100` 2 times more than `train-clean-360`. Defaults to `{}`.
r (int):
Number of spectrogram frames to be generated at a time. Do not change. Defaults to `1`.
add_blank (bool):
If true, a blank token is added in between every character. Defaults to `True`.
test_sentences (List[List]):
List of sentences with speaker and language information to be used for testing.
language_ids_file (str):
Path to the language ids file.
use_language_embedding (bool):
If true, language embedding is used. Defaults to `False`.
Note:
Check :class:`TTS.tts.configs.shared_configs.BaseTTSConfig` for the inherited parameters.
Example:
>>> from TTS.vc.configs.freevc_config import FreeVCConfig
>>> config = FreeVCConfig()
"""
model: str = "freevc"
# model specific params
model_args: FreeVCArgs = field(default_factory=FreeVCArgs)
audio: FreeVCAudioConfig = field(default_factory=FreeVCAudioConfig)
# optimizer
# TODO with training support
# loss params
# TODO with training support
# data loader params
return_wav: bool = True
compute_linear_spec: bool = True
# sampler params
use_weighted_sampler: bool = False # TODO: move it to the base config
weighted_sampler_attrs: dict = field(default_factory=lambda: {})
weighted_sampler_multipliers: dict = field(default_factory=lambda: {})
# overrides
r: int = 1 # DO NOT CHANGE
add_blank: bool = True
# multi-speaker settings
# use speaker embedding layer
num_speakers: int = 0
speakers_file: str = None
speaker_embedding_channels: int = 256
# use d-vectors
use_d_vector_file: bool = False
d_vector_file: List[str] = None
d_vector_dim: int = None
def __post_init__(self):
for key, val in self.model_args.items():
if hasattr(self, key):
self[key] = val
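With the config classes moved into their own module, they are imported from `TTS.vc.configs.freevc_config` rather than `TTS.vc.models.freevc`; a minimal construction sketch using the defaults defined above:

```python
from TTS.vc.configs.freevc_config import FreeVCArgs, FreeVCAudioConfig, FreeVCConfig

config = FreeVCConfig(
    audio=FreeVCAudioConfig(input_sample_rate=16000, output_sample_rate=24000),
    model_args=FreeVCArgs(ssl_dim=1024, use_spk=False),
)
print(config.model)  # "freevc"
```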


@ -1,4 +1,3 @@
-from dataclasses import dataclass, field
from typing import Dict, List, Optional, Tuple, Union
import librosa
@ -13,8 +12,8 @@ from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm
import TTS.vc.modules.freevc.commons as commons
import TTS.vc.modules.freevc.modules as modules
from TTS.tts.utils.speakers import SpeakerManager
-from TTS.utils.io import load_fsspec, save_checkpoint
+from TTS.utils.io import load_fsspec
-from TTS.vc.configs.shared_configs import BaseVCConfig
+from TTS.vc.configs.freevc_config import FreeVCConfig
from TTS.vc.models.base_vc import BaseVC
from TTS.vc.modules.freevc.commons import get_padding, init_weights
from TTS.vc.modules.freevc.mel_processing import mel_spectrogram_torch
@ -294,136 +293,6 @@ class SpeakerEncoder(torch.nn.Module):
return embed
@dataclass
class FreeVCAudioConfig(Coqpit):
"""Audio configuration
Args:
max_wav_value (float):
The maximum value of the waveform.
input_sample_rate (int):
The sampling rate of the input waveform.
output_sample_rate (int):
The sampling rate of the output waveform.
filter_length (int):
The length of the filter.
hop_length (int):
The hop length.
win_length (int):
The window length.
n_mel_channels (int):
The number of mel channels.
mel_fmin (float):
The minimum frequency of the mel filterbank.
mel_fmax (Optional[float]):
The maximum frequency of the mel filterbank.
"""
max_wav_value: float = field(default=32768.0)
input_sample_rate: int = field(default=16000)
output_sample_rate: int = field(default=24000)
filter_length: int = field(default=1280)
hop_length: int = field(default=320)
win_length: int = field(default=1280)
n_mel_channels: int = field(default=80)
mel_fmin: float = field(default=0.0)
mel_fmax: Optional[float] = field(default=None)
@dataclass
class FreeVCArgs(Coqpit):
"""FreeVC model arguments
Args:
spec_channels (int):
The number of channels in the spectrogram.
inter_channels (int):
The number of channels in the intermediate layers.
hidden_channels (int):
The number of channels in the hidden layers.
filter_channels (int):
The number of channels in the filter layers.
n_heads (int):
The number of attention heads.
n_layers (int):
The number of layers.
kernel_size (int):
The size of the kernel.
p_dropout (float):
The dropout probability.
resblock (str):
The type of residual block.
resblock_kernel_sizes (List[int]):
The kernel sizes for the residual blocks.
resblock_dilation_sizes (List[List[int]]):
The dilation sizes for the residual blocks.
upsample_rates (List[int]):
The upsample rates.
upsample_initial_channel (int):
The number of channels in the initial upsample layer.
upsample_kernel_sizes (List[int]):
The kernel sizes for the upsample layers.
n_layers_q (int):
The number of layers in the quantization network.
use_spectral_norm (bool):
Whether to use spectral normalization.
gin_channels (int):
The number of channels in the global conditioning vector.
ssl_dim (int):
The dimension of the self-supervised learning embedding.
use_spk (bool):
Whether to use external speaker encoder.
"""
spec_channels: int = field(default=641)
inter_channels: int = field(default=192)
hidden_channels: int = field(default=192)
filter_channels: int = field(default=768)
n_heads: int = field(default=2)
n_layers: int = field(default=6)
kernel_size: int = field(default=3)
p_dropout: float = field(default=0.1)
resblock: str = field(default="1")
resblock_kernel_sizes: List[int] = field(default_factory=lambda: [3, 7, 11])
resblock_dilation_sizes: List[List[int]] = field(default_factory=lambda: [[1, 3, 5], [1, 3, 5], [1, 3, 5]])
upsample_rates: List[int] = field(default_factory=lambda: [10, 8, 2, 2])
upsample_initial_channel: int = field(default=512)
upsample_kernel_sizes: List[int] = field(default_factory=lambda: [16, 16, 4, 4])
n_layers_q: int = field(default=3)
use_spectral_norm: bool = field(default=False)
gin_channels: int = field(default=256)
ssl_dim: int = field(default=1024)
use_spk: bool = field(default=False)
num_spks: int = field(default=0)
segment_size: int = field(default=8960)
class FreeVC(BaseVC):
    """
@ -677,7 +546,7 @@ class FreeVC(BaseVC):
...

@staticmethod
-def init_from_config(config: "VitsConfig", samples: Union[List[List], List[Dict]] = None, verbose=True):
+def init_from_config(config: FreeVCConfig, samples: Union[List[List], List[Dict]] = None, verbose=True):
    model = FreeVC(config)
    return model
@ -689,145 +558,3 @@ class FreeVC(BaseVC):
def train_step():
    ...
@dataclass
class FreeVCConfig(BaseVCConfig):
"""Defines parameters for FreeVC End2End TTS model.
Args:
model (str):
Model name. Do not change unless you know what you are doing.
model_args (FreeVCArgs):
Model architecture arguments. Defaults to `FreeVCArgs()`.
audio (FreeVCAudioConfig):
Audio processing configuration. Defaults to `FreeVCAudioConfig()`.
grad_clip (List):
Gradient clipping thresholds for each optimizer. Defaults to `[1000.0, 1000.0]`.
lr_gen (float):
Initial learning rate for the generator. Defaults to 0.0002.
lr_disc (float):
Initial learning rate for the discriminator. Defaults to 0.0002.
lr_scheduler_gen (str):
Name of the learning rate scheduler for the generator. One of the `torch.optim.lr_scheduler.*`. Defaults to
`ExponentialLR`.
lr_scheduler_gen_params (dict):
Parameters for the learning rate scheduler of the generator. Defaults to `{'gamma': 0.999875, "last_epoch":-1}`.
lr_scheduler_disc (str):
Name of the learning rate scheduler for the discriminator. One of the `torch.optim.lr_scheduler.*`. Defaults to
`ExponentialLR`.
lr_scheduler_disc_params (dict):
Parameters for the learning rate scheduler of the discriminator. Defaults to `{'gamma': 0.999875, "last_epoch":-1}`.
scheduler_after_epoch (bool):
If true, step the schedulers after each epoch else after each step. Defaults to `False`.
optimizer (str):
Name of the optimizer to use with both the generator and the discriminator networks. One of the
`torch.optim.*`. Defaults to `AdamW`.
kl_loss_alpha (float):
Loss weight for KL loss. Defaults to 1.0.
disc_loss_alpha (float):
Loss weight for the discriminator loss. Defaults to 1.0.
gen_loss_alpha (float):
Loss weight for the generator loss. Defaults to 1.0.
feat_loss_alpha (float):
Loss weight for the feature matching loss. Defaults to 1.0.
mel_loss_alpha (float):
Loss weight for the mel loss. Defaults to 45.0.
return_wav (bool):
If true, data loader returns the waveform as well as the other outputs. Do not change. Defaults to `True`.
compute_linear_spec (bool):
If true, the linear spectrogram is computed and returned alongside the mel output. Do not change. Defaults to `True`.
use_weighted_sampler (bool):
If true, use weighted sampler with bucketing for balancing samples between datasets used in training. Defaults to `False`.
weighted_sampler_attrs (dict):
Key retuned by the formatter to be used for weighted sampler. For example `{"root_path": 2.0, "speaker_name": 1.0}` sets sample probabilities
by overweighting `root_path` by 2.0. Defaults to `{}`.
weighted_sampler_multipliers (dict):
Weight each unique value of a key returned by the formatter for weighted sampling.
For example `{"root_path":{"/raid/datasets/libritts-clean-16khz-bwe-coqui_44khz/LibriTTS/train-clean-100/":1.0, "/raid/datasets/libritts-clean-16khz-bwe-coqui_44khz/LibriTTS/train-clean-360/": 0.5}`.
It will sample instances from `train-clean-100` 2 times more than `train-clean-360`. Defaults to `{}`.
r (int):
Number of spectrogram frames to be generated at a time. Do not change. Defaults to `1`.
add_blank (bool):
If true, a blank token is added in between every character. Defaults to `True`.
test_sentences (List[List]):
List of sentences with speaker and language information to be used for testing.
language_ids_file (str):
Path to the language ids file.
use_language_embedding (bool):
If true, language embedding is used. Defaults to `False`.
Note:
Check :class:`TTS.tts.configs.shared_configs.BaseTTSConfig` for the inherited parameters.
Example:
>>> from TTS.tts.configs.freevc_config import FreeVCConfig
>>> config = FreeVCConfig()
"""
model: str = "freevc"
# model specific params
model_args: FreeVCArgs = field(default_factory=FreeVCArgs)
audio: FreeVCAudioConfig = field(default_factory=FreeVCAudioConfig)
# optimizer
# TODO with training support
# loss params
# TODO with training support
# data loader params
return_wav: bool = True
compute_linear_spec: bool = True
# sampler params
use_weighted_sampler: bool = False # TODO: move it to the base config
weighted_sampler_attrs: dict = field(default_factory=lambda: {})
weighted_sampler_multipliers: dict = field(default_factory=lambda: {})
# overrides
r: int = 1 # DO NOT CHANGE
add_blank: bool = True
# multi-speaker settings
# use speaker embedding layer
num_speakers: int = 0
speakers_file: str = None
speaker_embedding_channels: int = 256
# use d-vectors
use_d_vector_file: bool = False
d_vector_file: List[str] = None
d_vector_dim: int = None
def __post_init__(self):
for key, val in self.model_args.items():
if hasattr(self, key):
self[key] = val


@ -195,10 +195,10 @@ def _apply_D_loss(scores_fake, scores_real, loss_func):
if isinstance(scores_fake, list):
    # multi-scale loss
    for score_fake, score_real in zip(scores_fake, scores_real):
-        total_loss, real_loss, fake_loss = loss_func(score_fake=score_fake, score_real=score_real)
+        total_loss, real_loss_, fake_loss_ = loss_func(score_fake=score_fake, score_real=score_real)
        loss += total_loss
-        real_loss += real_loss
-        fake_loss += fake_loss
+        real_loss += real_loss_
+        fake_loss += fake_loss_
    # normalize loss values with number of scales (discriminators)
    loss /= len(scores_fake)
    real_loss /= len(scores_real)
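The old code rebound `real_loss` and `fake_loss` inside the loop, so each scale overwrote the running totals instead of adding to them. A standalone sketch of the corrected accumulation with toy numbers (not the library's discriminator outputs):

```python
def accumulate(per_scale_losses):
    """Average (total, real, fake) losses over discriminator scales."""
    loss = real_loss = fake_loss = 0.0
    for total_loss, real_loss_, fake_loss_ in per_scale_losses:
        # distinct loop names keep the accumulators from being clobbered each iteration
        loss += total_loss
        real_loss += real_loss_
        fake_loss += fake_loss_
    n = len(per_scale_losses)
    return loss / n, real_loss / n, fake_loss / n

print(accumulate([(1.0, 0.6, 0.4), (2.0, 1.2, 0.8), (3.0, 1.8, 1.2)]))
# -> (2.0, 1.2, 0.8)
```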


@ -124,7 +124,7 @@ device = "cuda" if torch.cuda.is_available() else "cpu"
print(TTS().list_models())
# Init TTS
-tts = TTS("tts_models/multilingual/multi-dataset/xtts_v1").to(device)
+tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
# Run TTS
# ❗ Since this model is multi-lingual voice cloning model, we must set the target speaker_wav and language
@ -198,19 +198,12 @@ from TTS.api import CS_API
# Init 🐸 Coqui Studio API
# you can either set the API token as an environment variable `COQUI_STUDIO_TOKEN` or pass it as an argument.
-# XTTS - Best quality and life-like speech in EN
+# XTTS - Best quality and life-like speech in multiple languages. See https://docs.coqui.ai/reference/samples_xtts_create for supported languages.
api = CS_API(api_token=<token>, model="XTTS")
api.speakers # all the speakers are available with all the models.
api.list_speakers()
api.list_voices()
-wav, sample_rate = api.tts(text="This is a test.", speaker=api.speakers[0].name, emotion="Happy", speed=1.5)
+wav, sample_rate = api.tts(text="This is a test.", speaker=api.speakers[0].name, emotion="Happy", language="en", speed=1.5)
-# XTTS-multilingual - Multilingual XTTS with [en, de, es, fr, it, pt, ...] (more langs coming soon)
-api = CS_API(api_token=<token>, model="XTTS-multilingual")
-api.speakers
-api.list_speakers()
-api.list_voices()
-wav, sample_rate = api.tts(text="This is a test.", speaker=api.speakers[0].name, emotion="Happy", speed=1.5)
# V1 - Fast and lightweight TTS in EN with emotion control.
api = CS_API(api_token=<token>, model="V1")
@ -238,4 +231,4 @@ api.tts_with_vc_to_file(
speaker_wav="target/speaker.wav", speaker_wav="target/speaker.wav",
file_path="ouptut.wav" file_path="ouptut.wav"
) )
``` ```


@ -24,8 +24,7 @@ a few tricks to make it faster and support streaming inference.
Current implementation only supports inference.

### Languages
-As of now, XTTS-v2 supports 16 languages: English, Spanish, French, German, Italian, Portuguese,
-Polish, Turkish, Russian, Dutch, Czech, Arabic, Chinese (Simplified), Japanese, Hungarian, Korean
+As of now, XTTS-v2 supports 16 languages: English (en), Spanish (es), French (fr), German (de), Italian (it), Portuguese (pt), Polish (pl), Turkish (tr), Russian (ru), Dutch (nl), Czech (cs), Arabic (ar), Chinese (zh-cn), Japanese (ja), Hungarian (hu) and Korean (ko).

Stay tuned as we continue to add support for more languages. If you have any language requests, please feel free to reach out.
@ -116,7 +115,7 @@ model.load_checkpoint(config, checkpoint_dir="/path/to/xtts/", use_deepspeed=Tru
model.cuda()

print("Computing speaker latents...")
-gpt_cond_latent, diffusion_conditioning, speaker_embedding = model.get_conditioning_latents(audio_path=["reference.wav"])
+gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=["reference.wav"])

print("Inference...")
out = model.inference(
@ -124,7 +123,6 @@ out = model.inference(
"en", "en",
gpt_cond_latent, gpt_cond_latent,
speaker_embedding, speaker_embedding,
diffusion_conditioning,
temperature=0.7, # Add custom parameters here temperature=0.7, # Add custom parameters here
) )
torchaudio.save("xtts.wav", torch.tensor(out["wav"]).unsqueeze(0), 24000) torchaudio.save("xtts.wav", torch.tensor(out["wav"]).unsqueeze(0), 24000)
@ -153,7 +151,7 @@ model.load_checkpoint(config, checkpoint_dir="/path/to/xtts/", use_deepspeed=Tru
model.cuda()

print("Computing speaker latents...")
-gpt_cond_latent, _, speaker_embedding = model.get_conditioning_latents(audio_path=["reference.wav"])
+gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=["reference.wav"])

print("Inference...")
t0 = time.time()
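The streaming example continues past this hunk; a hedged sketch of the remainder, reusing `model`, `gpt_cond_latent`, `speaker_embedding` and `t0` from the snippet above together with the two-value `get_conditioning_latents()` shown in this diff:

```python
chunks = model.inference_stream(
    "It took me quite a long time to develop a voice.",
    "en",
    gpt_cond_latent,
    speaker_embedding,
)

wav_chunks = []
for i, chunk in enumerate(chunks):
    if i == 0:
        print(f"Time to first chunk: {time.time() - t0}")
    print(f"Received chunk {i} of audio length {chunk.shape[-1]}")
    wav_chunks.append(chunk)

wav = torch.cat(wav_chunks, dim=0)
torchaudio.save("xtts_streaming.wav", wav.squeeze().unsqueeze(0).cpu(), 24000)
```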
@ -210,7 +208,7 @@ model.load_checkpoint(config, checkpoint_path=XTTS_CHECKPOINT, vocab_path=TOKENI
model.cuda()

print("Computing speaker latents...")
-gpt_cond_latent, diffusion_conditioning, speaker_embedding = model.get_conditioning_latents(audio_path=[SPEAKER_REFERENCE])
+gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=[SPEAKER_REFERENCE])

print("Inference...")
out = model.inference(
@ -218,7 +216,6 @@ out = model.inference(
"en", "en",
gpt_cond_latent, gpt_cond_latent,
speaker_embedding, speaker_embedding,
diffusion_conditioning,
temperature=0.7, # Add custom parameters here temperature=0.7, # Add custom parameters here
) )
torchaudio.save(OUTPUT_WAV_PATH, torch.tensor(out["wav"]).unsqueeze(0), 24000) torchaudio.save(OUTPUT_WAV_PATH, torch.tensor(out["wav"]).unsqueeze(0), 24000)


@ -14,7 +14,6 @@ from TTS.utils.manage import ModelManager
MODELS_WITH_SEP_TESTS = [
    "tts_models/multilingual/multi-dataset/bark",
    "tts_models/en/multi-dataset/tortoise-v2",
-    "tts_models/multilingual/multi-dataset/xtts_v1",
    "tts_models/multilingual/multi-dataset/xtts_v1.1",
    "tts_models/multilingual/multi-dataset/xtts_v2",
]
@ -83,14 +82,14 @@ def test_xtts():
if use_gpu:
    run_cli(
        "yes | "
-        f"tts --model_name tts_models/multilingual/multi-dataset/xtts_v1 "
+        f"tts --model_name tts_models/multilingual/multi-dataset/xtts_v1.1 "
        f'--text "This is an example." --out_path "{output_path}" --progress_bar False --use_cuda True '
        f'--speaker_wav "{speaker_wav}" --language_idx "en"'
    )
else:
    run_cli(
        "yes | "
-        f"tts --model_name tts_models/multilingual/multi-dataset/xtts_v1 "
+        f"tts --model_name tts_models/multilingual/multi-dataset/xtts_v1.1 "
        f'--text "This is an example." --out_path "{output_path}" --progress_bar False '
        f'--speaker_wav "{speaker_wav}" --language_idx "en"'
    )
@ -104,7 +103,7 @@ def test_xtts_streaming():
speaker_wav = [os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav")]
speaker_wav_2 = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0002.wav")
speaker_wav.append(speaker_wav_2)
-model_path = os.path.join(get_user_data_dir("tts"), "tts_models--multilingual--multi-dataset--xtts_v1")
+model_path = os.path.join(get_user_data_dir("tts"), "tts_models--multilingual--multi-dataset--xtts_v1.1")
config = XttsConfig()
config.load_json(os.path.join(model_path, "config.json"))
model = Xtts.init_from_config(config)