Merge pull request #3173 from coqui-ai/dev

v0.20.2
Eren Gölge 2023-11-08 16:08:22 +01:00 committed by GitHub
commit ab57c36c2b
15 changed files with 383 additions and 437 deletions

View File

@@ -2,7 +2,7 @@
## 🐸Coqui.ai News
- 📣 ⓍTTSv2 is here with 16 languages and better performance across the board.
- 📣 ⓍTTS fine-tuning code is out. Check the [example recipes](https://github.com/coqui-ai/TTS/tree/dev/recipes/ljspeech).
- 📣 ⓍTTS can now stream with <200ms latency.
- 📣 ⓍTTS, our production TTS model that can speak 13 languages, is released [Blog Post](https://coqui.ai/blog/tts/open_xtts), [Demo](https://huggingface.co/spaces/coqui/xtts), [Docs](https://tts.readthedocs.io/en/dev/models/xtts.html)
- 📣 [🐶Bark](https://github.com/suno-ai/bark) is now available for inference with unconstrained voice cloning. [Docs](https://tts.readthedocs.io/en/dev/models/bark.html)
- 📣 You can use [~1100 Fairseq models](https://github.com/facebookresearch/fairseq/tree/main/examples/mms) with 🐸TTS.
@@ -205,7 +205,7 @@ device = "cuda" if torch.cuda.is_available() else "cpu"
print(TTS().list_models())
# Init TTS
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v1").to(device)
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
# Run TTS
# ❗ Since this is a multi-lingual voice cloning model, we must set the target speaker_wav and language
@@ -267,19 +267,13 @@ models = TTS(cs_api_model="XTTS").list_models()
# Init TTS with the target studio speaker
tts = TTS(model_name="coqui_studio/en/Torcull Diarmuid/coqui_studio", progress_bar=False)
# Run TTS
tts.tts_to_file(text="This is a test.", file_path=OUTPUT_PATH)
tts.tts_to_file(text="This is a test.", language="en", file_path=OUTPUT_PATH)
# V1 model
models = TTS(cs_api_model="V1").list_models()
# Run TTS with emotion and speed control
# Emotion control only works with V1 model
tts.tts_to_file(text="This is a test.", file_path=OUTPUT_PATH, emotion="Happy", speed=1.5)
# XTTS-multilingual
models = TTS(cs_api_model="XTTS-multilingual").list_models()
# Run TTS with emotion and speed control
# Emotion control only works with V1 model
tts.tts_to_file(text="Das ist ein Test.", file_path=OUTPUT_PATH, language="de", speed=1.0)
```
#### Example text to speech using **Fairseq models in ~1100 languages** 🤯.

View File

@@ -1 +1 @@
0.20.1
0.20.2

View File

@@ -60,7 +60,7 @@ class TTS(nn.Module):
vocoder_config_path (str, optional): Path to the vocoder config. Defaults to None.
progress_bar (bool, optional): Whether to print a progress bar while downloading a model. Defaults to True.
cs_api_model (str, optional): Name of the model to use for the Coqui Studio API. Available models are
"XTTS", "XTTS-multilingual", "V1". You can also use `TTS.cs_api.CS_API" for more control.
"XTTS", "V1". You can also use `TTS.cs_api.CS_API" for more control.
Defaults to "XTTS".
gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
"""
@@ -275,7 +275,7 @@ class TTS(nn.Module):
speaker_name (str, optional):
Speaker name from Coqui Studio. Defaults to None.
language (str): Language of the text. If None, the default language of the speaker is used. Language is only
supported by `XTTS-multilang` model. Currently supports en, de, es, fr, it, pt, pl. Defaults to "en".
supported by `XTTS` model.
emotion (str, optional):
Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull". Emotions are only available
with "V1" model. Defaults to None.
@@ -321,7 +321,7 @@ class TTS(nn.Module):
Speaker name for multi-speaker models. You can check whether the loaded model is multi-speaker by
`tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None.
language (str): Language of the text. If None, the default language of the speaker is used. Language is only
supported by `XTTS-multilang` model. Currently supports en, de, es, fr, it, pt, pl. Defaults to "en".
supported by `XTTS` model.
speaker_wav (str, optional):
Path to a reference wav file to use for voice cloning with supporting models like YourTTS.
Defaults to None.

View File

@@ -227,7 +227,7 @@ def main():
parser.add_argument(
"--cs_model",
type=str,
help="Name of the 🐸Coqui Studio model. Available models are `XTTS`, `XTTS-multilingual`, `V1`.",
help="Name of the 🐸Coqui Studio model. Available models are `XTTS`, `V1`.",
)
parser.add_argument(
"--emotion",
@@ -238,7 +238,7 @@ def main():
parser.add_argument(
"--language",
type=str,
help="Language to condition the model with. Only available for 🐸Coqui Studio `XTTS-multilingual` model.",
help="Language to condition the model with. Only available for 🐸Coqui Studio `XTTS` model.",
default=None,
)
parser.add_argument(

View File

@@ -43,7 +43,7 @@ class CS_API:
Args:
api_token (str): 🐸Coqui Studio API token. If not provided, it will be read from the environment variable
`COQUI_STUDIO_TOKEN`.
model (str): 🐸Coqui Studio model. It can be either `V1`, `XTTS`, or `XTTS-multilang`. Default is `XTTS`.
model (str): 🐸Coqui Studio model. It can be either `V1` or `XTTS`. Defaults to `XTTS`.
Example listing all available speakers:
@@ -65,7 +65,7 @@ class CS_API:
Example with multi-language model:
>>> from TTS.api import CS_API
>>> tts = CS_API(model="XTTS-multilang")
>>> tts = CS_API(model="XTTS")
>>> wav, sr = api.tts("Hello world", speaker_name=tts.speakers[0].name, language="en")
"""
@@ -78,16 +78,12 @@ class CS_API:
"XTTS": {
"list_speakers": "https://app.coqui.ai/api/v2/speakers",
"synthesize": "https://app.coqui.ai/api/v2/samples/xtts/render/",
"list_voices": "https://app.coqui.ai/api/v2/voices/xtts/",
},
"XTTS-multilang": {
"list_speakers": "https://app.coqui.ai/api/v2/speakers",
"synthesize": "https://app.coqui.ai/api/v2/samples/multilingual/render/",
"list_voices": "https://app.coqui.ai/api/v2/voices/xtts/",
"list_voices": "https://app.coqui.ai/api/v2/voices/xtts",
},
}
SUPPORTED_LANGUAGES = ["en", "es", "de", "fr", "it", "pt", "pl"]
SUPPORTED_LANGUAGES = ["en", "es", "de", "fr", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn", "ja"]
def __init__(self, api_token=None, model="XTTS"):
self.api_token = api_token
@@ -139,7 +135,7 @@ class CS_API:
self._check_token()
conn = http.client.HTTPSConnection("app.coqui.ai")
url = self.MODEL_ENDPOINTS[self.model]["list_speakers"]
conn.request("GET", f"{url}?per_page=100", headers=self.headers)
conn.request("GET", f"{url}?page=1&per_page=100", headers=self.headers)
res = conn.getresponse()
data = res.read()
return [Speaker(s) for s in json.loads(data)["result"]]
@@ -148,7 +144,7 @@ class CS_API:
"""List custom voices created by the user."""
conn = http.client.HTTPSConnection("app.coqui.ai")
url = self.MODEL_ENDPOINTS[self.model]["list_voices"]
conn.request("GET", f"{url}", headers=self.headers)
conn.request("GET", f"{url}?page=1&per_page=100", headers=self.headers)
res = conn.getresponse()
data = res.read()
return [Speaker(s, True) for s in json.loads(data)["result"]]
@@ -197,14 +193,6 @@ class CS_API:
}
)
elif model == "XTTS":
payload.update(
{
"name": speaker.name,
"text": text,
"speed": speed,
}
)
elif model == "XTTS-multilang":
payload.update(
{
"name": speaker.name,
@@ -226,13 +214,10 @@ class CS_API:
assert language is None, "❗ language is not supported for V1 model."
elif self.model == "XTTS":
assert emotion is None, f"❗ Emotions are not supported for XTTS model. Use V1 model."
assert language is None, "❗ Language is not supported for XTTS model. Use XTTS-multilang model."
elif self.model == "XTTS-multilang":
assert emotion is None, f"❗ Emotions are not supported for XTTS-multilang model. Use V1 model."
assert language is not None, "❗ Language is required for XTTS-multilang model."
assert language is not None, "❗ Language is required for XTTS model."
assert (
language in self.SUPPORTED_LANGUAGES
), f"❗ Language {language} is not yet supported. Use one of: en, es, de, fr, it, pt, pl"
), f"❗ Language {language} is not yet supported. Check https://docs.coqui.ai/reference/samples_xtts_create."
return text, speaker_name, speaker_id, emotion, speed, language
def tts(
@@ -255,7 +240,7 @@ class CS_API:
supported by `V1` model. Defaults to None.
speed (float): Speed of the speech. 1.0 is normal speed.
language (str): Language of the text. If None, the default language of the speaker is used. Language is only
supported by `XTTS-multilang` model. Currently supports en, de, es, fr, it, pt, pl. Defaults to "en".
supported by `XTTS` model. See https://docs.coqui.ai/reference/samples_xtts_create for supported languages.
"""
self._check_token()
self.ping_api()
@@ -305,7 +290,7 @@ class CS_API:
speed (float): Speed of the speech. 1.0 is normal speed.
pipe_out (BytesIO, optional): Flag to stdout the generated TTS wav file for shell pipe.
language (str): Language of the text. If None, the default language of the speaker is used. Language is only
supported by `XTTS-multilang` model. Currently supports en, de, es, fr, it, pt, pl. Defaults to "en".
supported by `XTTS` model. See https://docs.coqui.ai/reference/samples_xtts_create for supported languages. Defaults to "en".
file_path (str): Path to save the file. If None, a temporary file is created.
"""
if file_path is None:
@@ -323,20 +308,7 @@ if __name__ == "__main__":
print(api.list_speakers_as_tts_models())
ts = time.time()
wav, sr = api.tts("It took me quite a long time to develop a voice.", speaker_name=api.speakers[0].name)
wav, sr = api.tts("It took me quite a long time to develop a voice.", language="en", speaker_name=api.speakers[0].name)
print(f" [i] XTTS took {time.time() - ts:.2f}s")
filepath = api.tts_to_file(text="Hello world!", speaker_name=api.speakers[0].name, file_path="output.wav")
api = CS_API(model="XTTS-multilang")
print(api.speakers)
ts = time.time()
wav, sr = api.tts(
"It took me quite a long time to develop a voice.", speaker_name=api.speakers[0].name, language="en"
)
print(f" [i] XTTS took {time.time() - ts:.2f}s")
filepath = api.tts_to_file(
text="Hello world!", speaker_name=api.speakers[0].name, file_path="output.wav", language="en"
)
filepath = api.tts_to_file(text="Hello world!", speaker_name=api.speakers[0].name, language="en", file_path="output.wav")

View File

@@ -37,29 +37,11 @@ class XttsConfig(BaseTTSConfig):
If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.
Defaults to `0.8`.
cond_free_k (float):
Knob that determines how to balance the conditioning free signal with the conditioning-present signal. [0,inf].
As cond_free_k increases, the output becomes dominated by the conditioning-free signal.
Formula is: output=cond_present_output*(cond_free_k+1)-cond_absenct_output*cond_free_k. Defaults to `2.0`.
diffusion_temperature (float):
Controls the variance of the noise fed into the diffusion model. [0,1]. Values at 0
are the "mean" prediction of the diffusion network and will sound bland and smeared.
Defaults to `1.0`.
num_gpt_outputs (int):
Number of samples taken from the autoregressive model, all of which are filtered using CLVP.
As XTTS is a probabilistic model, more samples means a higher probability of creating something "great".
Defaults to `16`.
decoder_iterations (int):
Number of diffusion steps to perform. [0,4000]. More steps means the network has more chances to iteratively refine
the output, which should theoretically mean a higher quality output. Generally a value above 250 is not noticeably better,
however. Defaults to `30`.
decoder_sampler (str):
Diffusion sampler to be used. `ddim` or `dpm++2m`. Defaults to `ddim`.
gpt_cond_len (int):
Secs audio to be used as conditioning for the autoregressive model. Defaults to `3`.
@@ -110,11 +92,7 @@ class XttsConfig(BaseTTSConfig):
repetition_penalty: float = 2.0
top_k: int = 50
top_p: float = 0.85
cond_free_k: float = 2.0
diffusion_temperature: float = 1.0
num_gpt_outputs: int = 1
decoder_iterations: int = 30
decoder_sampler: str = "ddim"
# cloning
gpt_cond_len: int = 3
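
A quick, hedged sanity check of the slimmed-down config (assumes this version of 🐸TTS is installed; attribute names are taken from the diff above):

```python
# Sketch: the diffusion-decoder knobs removed above are gone from XttsConfig,
# while the remaining sampling defaults stay as documented.
from TTS.tts.configs.xtts_config import XttsConfig

config = XttsConfig()
print(config.top_k, config.top_p)             # 50 0.85
print(config.gpt_cond_len)                    # 3
print(hasattr(config, "cond_free_k"))         # False after this change
print(hasattr(config, "decoder_iterations"))  # False after this change
```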

View File

@@ -8,6 +8,7 @@ from hangul_romanize import Transliter
from hangul_romanize.rule import academic
from num2words import num2words
from tokenizers import Tokenizer
from functools import cached_property
from TTS.tts.layers.xtts.zh_num2words import TextNorm as zh_num2words
@@ -535,11 +536,50 @@ DEFAULT_VOCAB_FILE = os.path.join(os.path.dirname(os.path.realpath(__file__)), "
class VoiceBpeTokenizer:
def __init__(self, vocab_file=None):
self.tokenizer = None
self.katsu = None
if vocab_file is not None:
self.tokenizer = Tokenizer.from_file(vocab_file)
self.char_limits = {
"en": 250,
"de": 253,
"fr": 273,
"es": 239,
"it": 213,
"pt": 203,
"pl": 224,
"zh-cn": 82,
"ar": 166,
"cs": 186,
"ru": 182,
"nl": 251,
"tr": 226,
"ja": 71,
"hu": 224,
"ko": 95,
}
@cached_property
def katsu(self):
import cutlet
return cutlet.Cutlet()
def check_input_length(self, txt, lang):
limit = self.char_limits.get(lang, 250)
if len(txt) > limit:
print(f"[!] Warning: The text length exceeds the character limit of {limit} for language '{lang}', this might cause truncated audio.")
def preprocess_text(self, txt, lang):
if lang in ["en", "es", "fr", "de", "pt", "it", "pl", "ar", "cs", "ru", "nl", "tr", "zh-cn"]:
txt = multilingual_cleaners(txt, lang)
if lang == "zh-cn":
txt = chinese_transliterate(txt)
elif lang == "ja":
txt = japanese_cleaners(txt, self.katsu)
else:
raise NotImplementedError()
return txt
def encode(self, txt, lang):
self.check_input_length(txt, lang)
txt = self.preprocess_text(txt, lang)
txt = f"[{lang}]{txt}"
txt = txt.replace(" ", "[SPACE]")
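
A minimal sketch of the new length guard (assumes this version of the tokenizer; `VoiceBpeTokenizer` can be constructed without a vocab file just to exercise `check_input_length`):

```python
from TTS.tts.layers.xtts.tokenizer import VoiceBpeTokenizer

tokenizer = VoiceBpeTokenizer()  # no vocab file needed for the length check

too_long = "word " * 60  # 300 characters, above the 250-character English limit
tokenizer.check_input_length(too_long, "en")  # prints the truncation warning

short = "This is fine."
tokenizer.check_input_length(short, "en")  # silent, under the limit
tokenizer.check_input_length(short, "xx")  # unknown languages fall back to 250
```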

View File

@@ -152,19 +152,6 @@ class XttsArgs(Coqpit):
gpt_code_stride_len (int, optional): The hop_size of dvae and consequently of the gpt output. Defaults to 1024.
gpt_use_masking_gt_prompt_approach (bool, optional): If True, it will use ground truth as prompt and it will mask the loss to avoid repetition. Defaults to True.
gpt_use_perceiver_resampler (bool, optional): If True, it will use perceiver resampler from flamingo paper - https://arxiv.org/abs/2204.14198. Defaults to False.
For DiffTTS model:
diff_model_channels (int, optional): The number of channels for the DiffTTS model. Defaults to 1024.
diff_num_layers (int, optional): The number of layers for the DiffTTS model. Defaults to 10.
diff_in_channels (int, optional): The input channels for the DiffTTS model. Defaults to 100.
diff_out_channels (int, optional): The output channels for the DiffTTS model. Defaults to 200.
diff_in_latent_channels (int, optional): The input latent channels for the DiffTTS model. Defaults to 1024.
diff_in_tokens (int, optional): The input tokens for the DiffTTS model. Defaults to 8193.
diff_dropout (int, optional): The dropout percentage for the DiffTTS model. Defaults to 0.
diff_use_fp16 (bool, optional): Whether to use fp16 for the DiffTTS model. Defaults to False.
diff_num_heads (int, optional): The number of heads for the DiffTTS model. Defaults to 16.
diff_layer_drop (int, optional): The layer dropout percentage for the DiffTTS model. Defaults to 0.
diff_unconditioned_percentage (int, optional): The percentage of unconditioned inputs for the DiffTTS model. Defaults to 0.
"""
gpt_batch_size: int = 1
@@ -193,19 +180,6 @@ class XttsArgs(Coqpit):
gpt_use_masking_gt_prompt_approach: bool = True
gpt_use_perceiver_resampler: bool = False
# Diffusion Decoder params
diff_model_channels: int = 1024
diff_num_layers: int = 10
diff_in_channels: int = 100
diff_out_channels: int = 200
diff_in_latent_channels: int = 1024
diff_in_tokens: int = 8193
diff_dropout: int = 0
diff_use_fp16: bool = False
diff_num_heads: int = 16
diff_layer_drop: int = 0
diff_unconditioned_percentage: int = 0
# HifiGAN Decoder params
input_sample_rate: int = 22050
output_sample_rate: int = 24000
@@ -426,10 +400,6 @@ class Xtts(BaseTTS):
"repetition_penalty": config.repetition_penalty,
"top_k": config.top_k,
"top_p": config.top_p,
"cond_free_k": config.cond_free_k,
"diffusion_temperature": config.diffusion_temperature,
"decoder_iterations": config.decoder_iterations,
"decoder_sampler": config.decoder_sampler,
"gpt_cond_len": config.gpt_cond_len,
"max_ref_len": config.max_ref_len,
"sound_norm_refs": config.sound_norm_refs,
@@ -454,13 +424,6 @@ class Xtts(BaseTTS):
gpt_cond_len=6,
max_ref_len=10,
sound_norm_refs=False,
# Decoder inference
decoder_iterations=100,
cond_free=True,
cond_free_k=2,
diffusion_temperature=1.0,
decoder_sampler="ddim",
decoder="hifigan",
**hf_generate_kwargs,
):
"""
@@ -603,10 +566,21 @@ class Xtts(BaseTTS):
if wav_gen_prev is not None:
wav_chunk = wav_gen[(wav_gen_prev.shape[0] - overlap_len) : -overlap_len]
if wav_overlap is not None:
crossfade_wav = wav_chunk[:overlap_len]
crossfade_wav = crossfade_wav * torch.linspace(0.0, 1.0, overlap_len).to(crossfade_wav.device)
wav_chunk[:overlap_len] = wav_overlap * torch.linspace(1.0, 0.0, overlap_len).to(wav_overlap.device)
wav_chunk[:overlap_len] += crossfade_wav
# cross fade the overlap section
if overlap_len > len(wav_chunk):
# wav_chunk is smaller than overlap_len, pass on last wav_gen
if wav_gen_prev is not None:
wav_chunk = wav_gen[(wav_gen_prev.shape[0] - overlap_len):]
else:
# not expected to be hit here, as the problem only occurs on the last chunk
wav_chunk = wav_gen[-overlap_len:]
return wav_chunk, wav_gen, None
else:
crossfade_wav = wav_chunk[:overlap_len]
crossfade_wav = crossfade_wav * torch.linspace(0.0, 1.0, overlap_len).to(crossfade_wav.device)
wav_chunk[:overlap_len] = wav_overlap * torch.linspace(1.0, 0.0, overlap_len).to(wav_overlap.device)
wav_chunk[:overlap_len] += crossfade_wav
wav_overlap = wav_gen[-overlap_len:]
wav_gen_prev = wav_gen
return wav_chunk, wav_gen_prev, wav_overlap
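
The cross-fade above is easier to follow in isolation; this standalone sketch (synthetic tensors, not model output) applies the same linear fade-in/fade-out over the overlap region:

```python
import torch

overlap_len = 4
wav_overlap = torch.ones(overlap_len)  # tail of the previous chunk
wav_chunk = torch.full((8,), 0.5)      # freshly generated chunk

# fade the new chunk in while fading the previous tail out, then sum
crossfade_wav = wav_chunk[:overlap_len] * torch.linspace(0.0, 1.0, overlap_len)
wav_chunk[:overlap_len] = wav_overlap * torch.linspace(1.0, 0.0, overlap_len)
wav_chunk[:overlap_len] += crossfade_wav

print(wav_chunk)  # ramps smoothly from 1.0 down to 0.5 over the overlap
```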

View File

@@ -109,7 +109,6 @@ class ModelManager(object):
def _list_for_model_type(self, model_type):
models_name_list = []
model_count = 1
model_type = "tts_models"
models_name_list.extend(self._list_models(model_type, model_count))
return models_name_list
@@ -298,22 +297,22 @@ class ModelManager(object):
model_item = self.set_model_url(model_item)
return model_item, model_full_name, model, md5hash
def ask_tos(self, model_full_path):
@staticmethod
def ask_tos(model_full_path):
"""Ask the user to agree to the terms of service"""
tos_path = os.path.join(model_full_path, "tos_agreed.txt")
if not os.path.exists(tos_path):
print(" > You must agree to the terms of service to use this model.")
print(" | > Please see the terms of service at https://coqui.ai/cpml.txt")
print(' | > "I have read, understood and agreed the Terms and Conditions." - [y/n]')
answer = input(" | | > ")
if answer.lower() == "y":
with open(tos_path, "w") as f:
f.write("I have read, understood ad agree the Terms and Conditions.")
return True
else:
return False
print(" > You must agree to the terms of service to use this model.")
print(" | > Please see the terms of service at https://coqui.ai/cpml.txt")
print(' | > "I have read, understood and agreed to the Terms and Conditions." - [y/n]')
answer = input(" | | > ")
if answer.lower() == "y":
with open(tos_path, "w", encoding="utf-8") as f:
f.write("I have read, understood and agreed to the Terms and Conditions.")
return True
return False
def tos_agreed(self, model_item, model_full_path):
@staticmethod
def tos_agreed(model_item, model_full_path):
"""Check if the user has agreed to the terms of service"""
if "tos_required" in model_item and model_item["tos_required"]:
tos_path = os.path.join(model_full_path, "tos_agreed.txt")

View File

@@ -1,5 +1,278 @@
from dataclasses import dataclass, field
from typing import List
from typing import List, Optional
from coqpit import Coqpit
from TTS.vc.configs.shared_configs import BaseVCConfig
from TTS.vc.models.freevc import FreeVCArgs, FreeVCAudioConfig, FreeVCConfig
@dataclass
class FreeVCAudioConfig(Coqpit):
"""Audio configuration
Args:
max_wav_value (float):
The maximum value of the waveform.
input_sample_rate (int):
The sampling rate of the input waveform.
output_sample_rate (int):
The sampling rate of the output waveform.
filter_length (int):
The length of the filter.
hop_length (int):
The hop length.
win_length (int):
The window length.
n_mel_channels (int):
The number of mel channels.
mel_fmin (float):
The minimum frequency of the mel filterbank.
mel_fmax (Optional[float]):
The maximum frequency of the mel filterbank.
"""
max_wav_value: float = field(default=32768.0)
input_sample_rate: int = field(default=16000)
output_sample_rate: int = field(default=24000)
filter_length: int = field(default=1280)
hop_length: int = field(default=320)
win_length: int = field(default=1280)
n_mel_channels: int = field(default=80)
mel_fmin: float = field(default=0.0)
mel_fmax: Optional[float] = field(default=None)
@dataclass
class FreeVCArgs(Coqpit):
"""FreeVC model arguments
Args:
spec_channels (int):
The number of channels in the spectrogram.
inter_channels (int):
The number of channels in the intermediate layers.
hidden_channels (int):
The number of channels in the hidden layers.
filter_channels (int):
The number of channels in the filter layers.
n_heads (int):
The number of attention heads.
n_layers (int):
The number of layers.
kernel_size (int):
The size of the kernel.
p_dropout (float):
The dropout probability.
resblock (str):
The type of residual block.
resblock_kernel_sizes (List[int]):
The kernel sizes for the residual blocks.
resblock_dilation_sizes (List[List[int]]):
The dilation sizes for the residual blocks.
upsample_rates (List[int]):
The upsample rates.
upsample_initial_channel (int):
The number of channels in the initial upsample layer.
upsample_kernel_sizes (List[int]):
The kernel sizes for the upsample layers.
n_layers_q (int):
The number of layers in the quantization network.
use_spectral_norm (bool):
Whether to use spectral normalization.
gin_channels (int):
The number of channels in the global conditioning vector.
ssl_dim (int):
The dimension of the self-supervised learning embedding.
use_spk (bool):
Whether to use external speaker encoder.
"""
spec_channels: int = field(default=641)
inter_channels: int = field(default=192)
hidden_channels: int = field(default=192)
filter_channels: int = field(default=768)
n_heads: int = field(default=2)
n_layers: int = field(default=6)
kernel_size: int = field(default=3)
p_dropout: float = field(default=0.1)
resblock: str = field(default="1")
resblock_kernel_sizes: List[int] = field(default_factory=lambda: [3, 7, 11])
resblock_dilation_sizes: List[List[int]] = field(default_factory=lambda: [[1, 3, 5], [1, 3, 5], [1, 3, 5]])
upsample_rates: List[int] = field(default_factory=lambda: [10, 8, 2, 2])
upsample_initial_channel: int = field(default=512)
upsample_kernel_sizes: List[int] = field(default_factory=lambda: [16, 16, 4, 4])
n_layers_q: int = field(default=3)
use_spectral_norm: bool = field(default=False)
gin_channels: int = field(default=256)
ssl_dim: int = field(default=1024)
use_spk: bool = field(default=False)
num_spks: int = field(default=0)
segment_size: int = field(default=8960)
@dataclass
class FreeVCConfig(BaseVCConfig):
"""Defines parameters for FreeVC End2End TTS model.
Args:
model (str):
Model name. Do not change unless you know what you are doing.
model_args (FreeVCArgs):
Model architecture arguments. Defaults to `FreeVCArgs()`.
audio (FreeVCAudioConfig):
Audio processing configuration. Defaults to `FreeVCAudioConfig()`.
grad_clip (List):
Gradient clipping thresholds for each optimizer. Defaults to `[1000.0, 1000.0]`.
lr_gen (float):
Initial learning rate for the generator. Defaults to 0.0002.
lr_disc (float):
Initial learning rate for the discriminator. Defaults to 0.0002.
lr_scheduler_gen (str):
Name of the learning rate scheduler for the generator. One of the `torch.optim.lr_scheduler.*`. Defaults to
`ExponentialLR`.
lr_scheduler_gen_params (dict):
Parameters for the learning rate scheduler of the generator. Defaults to `{'gamma': 0.999875, "last_epoch":-1}`.
lr_scheduler_disc (str):
Name of the learning rate scheduler for the discriminator. One of the `torch.optim.lr_scheduler.*`. Defaults to
`ExponentialLR`.
lr_scheduler_disc_params (dict):
Parameters for the learning rate scheduler of the discriminator. Defaults to `{'gamma': 0.999875, "last_epoch":-1}`.
scheduler_after_epoch (bool):
If true, step the schedulers after each epoch else after each step. Defaults to `False`.
optimizer (str):
Name of the optimizer to use with both the generator and the discriminator networks. One of the
`torch.optim.*`. Defaults to `AdamW`.
kl_loss_alpha (float):
Loss weight for KL loss. Defaults to 1.0.
disc_loss_alpha (float):
Loss weight for the discriminator loss. Defaults to 1.0.
gen_loss_alpha (float):
Loss weight for the generator loss. Defaults to 1.0.
feat_loss_alpha (float):
Loss weight for the feature matching loss. Defaults to 1.0.
mel_loss_alpha (float):
Loss weight for the mel loss. Defaults to 45.0.
return_wav (bool):
If true, data loader returns the waveform as well as the other outputs. Do not change. Defaults to `True`.
compute_linear_spec (bool):
If true, the linear spectrogram is computed and returned alongside the mel output. Do not change. Defaults to `True`.
use_weighted_sampler (bool):
If true, use weighted sampler with bucketing for balancing samples between datasets used in training. Defaults to `False`.
weighted_sampler_attrs (dict):
Key returned by the formatter to be used for weighted sampler. For example `{"root_path": 2.0, "speaker_name": 1.0}` sets sample probabilities
by overweighting `root_path` by 2.0. Defaults to `{}`.
weighted_sampler_multipliers (dict):
Weight each unique value of a key returned by the formatter for weighted sampling.
For example `{"root_path":{"/raid/datasets/libritts-clean-16khz-bwe-coqui_44khz/LibriTTS/train-clean-100/":1.0, "/raid/datasets/libritts-clean-16khz-bwe-coqui_44khz/LibriTTS/train-clean-360/": 0.5}`.
It will sample instances from `train-clean-100` 2 times more than `train-clean-360`. Defaults to `{}`.
r (int):
Number of spectrogram frames to be generated at a time. Do not change. Defaults to `1`.
add_blank (bool):
If true, a blank token is added in between every character. Defaults to `True`.
test_sentences (List[List]):
List of sentences with speaker and language information to be used for testing.
language_ids_file (str):
Path to the language ids file.
use_language_embedding (bool):
If true, language embedding is used. Defaults to `False`.
Note:
Check :class:`TTS.tts.configs.shared_configs.BaseTTSConfig` for the inherited parameters.
Example:
>>> from TTS.vc.configs.freevc_config import FreeVCConfig
>>> config = FreeVCConfig()
"""
model: str = "freevc"
# model specific params
model_args: FreeVCArgs = field(default_factory=FreeVCArgs)
audio: FreeVCAudioConfig = field(default_factory=FreeVCAudioConfig)
# optimizer
# TODO with training support
# loss params
# TODO with training support
# data loader params
return_wav: bool = True
compute_linear_spec: bool = True
# sampler params
use_weighted_sampler: bool = False # TODO: move it to the base config
weighted_sampler_attrs: dict = field(default_factory=lambda: {})
weighted_sampler_multipliers: dict = field(default_factory=lambda: {})
# overrides
r: int = 1 # DO NOT CHANGE
add_blank: bool = True
# multi-speaker settings
# use speaker embedding layer
num_speakers: int = 0
speakers_file: str = None
speaker_embedding_channels: int = 256
# use d-vectors
use_d_vector_file: bool = False
d_vector_file: List[str] = None
d_vector_dim: int = None
def __post_init__(self):
for key, val in self.model_args.items():
if hasattr(self, key):
self[key] = val

View File

@@ -1,4 +1,3 @@
from dataclasses import dataclass, field
from typing import Dict, List, Optional, Tuple, Union
import librosa
@@ -13,8 +12,8 @@ from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm
import TTS.vc.modules.freevc.commons as commons
import TTS.vc.modules.freevc.modules as modules
from TTS.tts.utils.speakers import SpeakerManager
from TTS.utils.io import load_fsspec, save_checkpoint
from TTS.vc.configs.shared_configs import BaseVCConfig
from TTS.utils.io import load_fsspec
from TTS.vc.configs.freevc_config import FreeVCConfig
from TTS.vc.models.base_vc import BaseVC
from TTS.vc.modules.freevc.commons import get_padding, init_weights
from TTS.vc.modules.freevc.mel_processing import mel_spectrogram_torch
@@ -294,136 +293,6 @@ class SpeakerEncoder(torch.nn.Module):
return embed
@dataclass
class FreeVCAudioConfig(Coqpit):
"""Audio configuration
Args:
max_wav_value (float):
The maximum value of the waveform.
input_sample_rate (int):
The sampling rate of the input waveform.
output_sample_rate (int):
The sampling rate of the output waveform.
filter_length (int):
The length of the filter.
hop_length (int):
The hop length.
win_length (int):
The window length.
n_mel_channels (int):
The number of mel channels.
mel_fmin (float):
The minimum frequency of the mel filterbank.
mel_fmax (Optional[float]):
The maximum frequency of the mel filterbank.
"""
max_wav_value: float = field(default=32768.0)
input_sample_rate: int = field(default=16000)
output_sample_rate: int = field(default=24000)
filter_length: int = field(default=1280)
hop_length: int = field(default=320)
win_length: int = field(default=1280)
n_mel_channels: int = field(default=80)
mel_fmin: float = field(default=0.0)
mel_fmax: Optional[float] = field(default=None)
@dataclass
class FreeVCArgs(Coqpit):
"""FreeVC model arguments
Args:
spec_channels (int):
The number of channels in the spectrogram.
inter_channels (int):
The number of channels in the intermediate layers.
hidden_channels (int):
The number of channels in the hidden layers.
filter_channels (int):
The number of channels in the filter layers.
n_heads (int):
The number of attention heads.
n_layers (int):
The number of layers.
kernel_size (int):
The size of the kernel.
p_dropout (float):
The dropout probability.
resblock (str):
The type of residual block.
resblock_kernel_sizes (List[int]):
The kernel sizes for the residual blocks.
resblock_dilation_sizes (List[List[int]]):
The dilation sizes for the residual blocks.
upsample_rates (List[int]):
The upsample rates.
upsample_initial_channel (int):
The number of channels in the initial upsample layer.
upsample_kernel_sizes (List[int]):
The kernel sizes for the upsample layers.
n_layers_q (int):
The number of layers in the quantization network.
use_spectral_norm (bool):
Whether to use spectral normalization.
gin_channels (int):
The number of channels in the global conditioning vector.
ssl_dim (int):
The dimension of the self-supervised learning embedding.
use_spk (bool):
Whether to use external speaker encoder.
"""
spec_channels: int = field(default=641)
inter_channels: int = field(default=192)
hidden_channels: int = field(default=192)
filter_channels: int = field(default=768)
n_heads: int = field(default=2)
n_layers: int = field(default=6)
kernel_size: int = field(default=3)
p_dropout: float = field(default=0.1)
resblock: str = field(default="1")
resblock_kernel_sizes: List[int] = field(default_factory=lambda: [3, 7, 11])
resblock_dilation_sizes: List[List[int]] = field(default_factory=lambda: [[1, 3, 5], [1, 3, 5], [1, 3, 5]])
upsample_rates: List[int] = field(default_factory=lambda: [10, 8, 2, 2])
upsample_initial_channel: int = field(default=512)
upsample_kernel_sizes: List[int] = field(default_factory=lambda: [16, 16, 4, 4])
n_layers_q: int = field(default=3)
use_spectral_norm: bool = field(default=False)
gin_channels: int = field(default=256)
ssl_dim: int = field(default=1024)
use_spk: bool = field(default=False)
num_spks: int = field(default=0)
segment_size: int = field(default=8960)
class FreeVC(BaseVC):
"""
@@ -677,7 +546,7 @@ class FreeVC(BaseVC):
...
@staticmethod
def init_from_config(config: "VitsConfig", samples: Union[List[List], List[Dict]] = None, verbose=True):
def init_from_config(config: FreeVCConfig, samples: Union[List[List], List[Dict]] = None, verbose=True):
model = FreeVC(config)
return model
@@ -689,145 +558,3 @@ class FreeVC(BaseVC):
def train_step():
...
@dataclass
class FreeVCConfig(BaseVCConfig):
"""Defines parameters for FreeVC End2End TTS model.
Args:
model (str):
Model name. Do not change unless you know what you are doing.
model_args (FreeVCArgs):
Model architecture arguments. Defaults to `FreeVCArgs()`.
audio (FreeVCAudioConfig):
Audio processing configuration. Defaults to `FreeVCAudioConfig()`.
grad_clip (List):
Gradient clipping thresholds for each optimizer. Defaults to `[1000.0, 1000.0]`.
lr_gen (float):
Initial learning rate for the generator. Defaults to 0.0002.
lr_disc (float):
Initial learning rate for the discriminator. Defaults to 0.0002.
lr_scheduler_gen (str):
Name of the learning rate scheduler for the generator. One of the `torch.optim.lr_scheduler.*`. Defaults to
`ExponentialLR`.
lr_scheduler_gen_params (dict):
Parameters for the learning rate scheduler of the generator. Defaults to `{'gamma': 0.999875, "last_epoch":-1}`.
lr_scheduler_disc (str):
Name of the learning rate scheduler for the discriminator. One of the `torch.optim.lr_scheduler.*`. Defaults to
`ExponentialLR`.
lr_scheduler_disc_params (dict):
Parameters for the learning rate scheduler of the discriminator. Defaults to `{'gamma': 0.999875, "last_epoch":-1}`.
scheduler_after_epoch (bool):
If true, step the schedulers after each epoch else after each step. Defaults to `False`.
optimizer (str):
Name of the optimizer to use with both the generator and the discriminator networks. One of the
`torch.optim.*`. Defaults to `AdamW`.
kl_loss_alpha (float):
Loss weight for KL loss. Defaults to 1.0.
disc_loss_alpha (float):
Loss weight for the discriminator loss. Defaults to 1.0.
gen_loss_alpha (float):
Loss weight for the generator loss. Defaults to 1.0.
feat_loss_alpha (float):
Loss weight for the feature matching loss. Defaults to 1.0.
mel_loss_alpha (float):
Loss weight for the mel loss. Defaults to 45.0.
return_wav (bool):
If true, data loader returns the waveform as well as the other outputs. Do not change. Defaults to `True`.
compute_linear_spec (bool):
If true, the linear spectrogram is computed and returned alongside the mel output. Do not change. Defaults to `True`.
use_weighted_sampler (bool):
If true, use weighted sampler with bucketing for balancing samples between datasets used in training. Defaults to `False`.
weighted_sampler_attrs (dict):
Key retuned by the formatter to be used for weighted sampler. For example `{"root_path": 2.0, "speaker_name": 1.0}` sets sample probabilities
by overweighting `root_path` by 2.0. Defaults to `{}`.
weighted_sampler_multipliers (dict):
Weight each unique value of a key returned by the formatter for weighted sampling.
For example `{"root_path":{"/raid/datasets/libritts-clean-16khz-bwe-coqui_44khz/LibriTTS/train-clean-100/":1.0, "/raid/datasets/libritts-clean-16khz-bwe-coqui_44khz/LibriTTS/train-clean-360/": 0.5}`.
It will sample instances from `train-clean-100` 2 times more than `train-clean-360`. Defaults to `{}`.
r (int):
Number of spectrogram frames to be generated at a time. Do not change. Defaults to `1`.
add_blank (bool):
If true, a blank token is added in between every character. Defaults to `True`.
test_sentences (List[List]):
List of sentences with speaker and language information to be used for testing.
language_ids_file (str):
Path to the language ids file.
use_language_embedding (bool):
If true, language embedding is used. Defaults to `False`.
Note:
Check :class:`TTS.tts.configs.shared_configs.BaseTTSConfig` for the inherited parameters.
Example:
>>> from TTS.tts.configs.freevc_config import FreeVCConfig
>>> config = FreeVCConfig()
"""
model: str = "freevc"
# model specific params
model_args: FreeVCArgs = field(default_factory=FreeVCArgs)
audio: FreeVCAudioConfig = field(default_factory=FreeVCAudioConfig)
# optimizer
# TODO with training support
# loss params
# TODO with training support
# data loader params
return_wav: bool = True
compute_linear_spec: bool = True
# sampler params
use_weighted_sampler: bool = False # TODO: move it to the base config
weighted_sampler_attrs: dict = field(default_factory=lambda: {})
weighted_sampler_multipliers: dict = field(default_factory=lambda: {})
# overrides
r: int = 1 # DO NOT CHANGE
add_blank: bool = True
# multi-speaker settings
# use speaker embedding layer
num_speakers: int = 0
speakers_file: str = None
speaker_embedding_channels: int = 256
# use d-vectors
use_d_vector_file: bool = False
d_vector_file: List[str] = None
d_vector_dim: int = None
def __post_init__(self):
for key, val in self.model_args.items():
if hasattr(self, key):
self[key] = val

View File

@@ -195,10 +195,10 @@ def _apply_D_loss(scores_fake, scores_real, loss_func):
if isinstance(scores_fake, list):
# multi-scale loss
for score_fake, score_real in zip(scores_fake, scores_real):
total_loss, real_loss, fake_loss = loss_func(score_fake=score_fake, score_real=score_real)
total_loss, real_loss_, fake_loss_ = loss_func(score_fake=score_fake, score_real=score_real)
loss += total_loss
real_loss += real_loss
fake_loss += fake_loss
real_loss += real_loss_
fake_loss += fake_loss_
# normalize loss values with number of scales (discriminators)
loss /= len(scores_fake)
real_loss /= len(scores_real)
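
For context, the renaming matters because the old tuple unpacking shadowed the accumulators; a minimal illustration with made-up per-scale losses:

```python
per_scale = [1.0, 2.0, 4.0]

# old pattern: unpacking reused the accumulator's name, so
# `real_loss += real_loss` only doubles the current scale's loss
real_loss = 0.0
for current in per_scale:
    real_loss = current   # shadowed by `..., real_loss, ... = loss_func(...)`
    real_loss += real_loss
assert real_loss == 8.0   # 2 * 4.0; earlier scales are lost

# new pattern: accumulate under a distinct name
real_loss = 0.0
for current in per_scale:
    real_loss_ = current
    real_loss += real_loss_
assert real_loss == 7.0   # 1.0 + 2.0 + 4.0
```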

View File

@@ -124,7 +124,7 @@ device = "cuda" if torch.cuda.is_available() else "cpu"
print(TTS().list_models())
# Init TTS
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v1").to(device)
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
# Run TTS
# ❗ Since this is a multi-lingual voice cloning model, we must set the target speaker_wav and language
@@ -198,19 +198,12 @@ from TTS.api import CS_API
# Init 🐸 Coqui Studio API
# you can either set the API token as an environment variable `COQUI_STUDIO_TOKEN` or pass it as an argument.
# XTTS - Best quality and life-like speech in EN
# XTTS - Best quality and life-like speech in multiple languages. See https://docs.coqui.ai/reference/samples_xtts_create for supported languages.
api = CS_API(api_token=<token>, model="XTTS")
api.speakers # all the speakers are available with all the models.
api.list_speakers()
api.list_voices()
wav, sample_rate = api.tts(text="This is a test.", speaker=api.speakers[0].name, emotion="Happy", speed=1.5)
# XTTS-multilingual - Multilingual XTTS with [en, de, es, fr, it, pt, ...] (more langs coming soon)
api = CS_API(api_token=<token>, model="XTTS-multilingual")
api.speakers
api.list_speakers()
api.list_voices()
wav, sample_rate = api.tts(text="This is a test.", speaker=api.speakers[0].name, emotion="Happy", speed=1.5)
wav, sample_rate = api.tts(text="This is a test.", speaker_name=api.speakers[0].name, language="en", speed=1.5)
# V1 - Fast and lightweight TTS in EN with emotion control.
api = CS_API(api_token=<token>, model="V1")
@@ -238,4 +231,4 @@ api.tts_with_vc_to_file(
speaker_wav="target/speaker.wav",
file_path="ouptut.wav"
)
```

View File

@@ -24,8 +24,7 @@ a few tricks to make it faster and support streaming inference.
Current implementation only supports inference.
### Languages
As of now, XTTS-v2 supports 16 languages: English, Spanish, French, German, Italian, Portuguese,
Polish, Turkish, Russian, Dutch, Czech, Arabic, Chinese (Simplified), Japanese, Hungarian, Korean
As of now, XTTS-v2 supports 16 languages: English (en), Spanish (es), French (fr), German (de), Italian (it), Portuguese (pt), Polish (pl), Turkish (tr), Russian (ru), Dutch (nl), Czech (cs), Arabic (ar), Chinese (zh-cn), Japanese (ja), Hungarian (hu) and Korean (ko).
Stay tuned as we continue to add support for more languages. If you have any language requests, please feel free to reach out.
@@ -116,7 +115,7 @@ model.load_checkpoint(config, checkpoint_dir="/path/to/xtts/", use_deepspeed=Tru
model.cuda()
print("Computing speaker latents...")
gpt_cond_latent, diffusion_conditioning, speaker_embedding = model.get_conditioning_latents(audio_path=["reference.wav"])
gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=["reference.wav"])
print("Inference...")
out = model.inference(
@@ -124,7 +123,6 @@ out = model.inference(
"en",
gpt_cond_latent,
speaker_embedding,
diffusion_conditioning,
temperature=0.7, # Add custom parameters here
)
torchaudio.save("xtts.wav", torch.tensor(out["wav"]).unsqueeze(0), 24000)
@@ -153,7 +151,7 @@ model.load_checkpoint(config, checkpoint_dir="/path/to/xtts/", use_deepspeed=Tru
model.cuda()
print("Computing speaker latents...")
gpt_cond_latent, _, speaker_embedding = model.get_conditioning_latents(audio_path=["reference.wav"])
gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=["reference.wav"])
print("Inference...")
t0 = time.time()
@@ -210,7 +208,7 @@ model.load_checkpoint(config, checkpoint_path=XTTS_CHECKPOINT, vocab_path=TOKENI
model.cuda()
print("Computing speaker latents...")
gpt_cond_latent, diffusion_conditioning, speaker_embedding = model.get_conditioning_latents(audio_path=[SPEAKER_REFERENCE])
gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=[SPEAKER_REFERENCE])
print("Inference...")
out = model.inference(
@@ -218,7 +216,6 @@ out = model.inference(
"en",
gpt_cond_latent,
speaker_embedding,
diffusion_conditioning,
temperature=0.7, # Add custom parameters here
)
torchaudio.save(OUTPUT_WAV_PATH, torch.tensor(out["wav"]).unsqueeze(0), 24000)

View File

@@ -14,7 +14,6 @@ from TTS.utils.manage import ModelManager
MODELS_WITH_SEP_TESTS = [
"tts_models/multilingual/multi-dataset/bark",
"tts_models/en/multi-dataset/tortoise-v2",
"tts_models/multilingual/multi-dataset/xtts_v1",
"tts_models/multilingual/multi-dataset/xtts_v1.1",
"tts_models/multilingual/multi-dataset/xtts_v2",
]
@@ -83,14 +82,14 @@ def test_xtts():
if use_gpu:
run_cli(
"yes | "
f"tts --model_name tts_models/multilingual/multi-dataset/xtts_v1 "
f"tts --model_name tts_models/multilingual/multi-dataset/xtts_v1.1 "
f'--text "This is an example." --out_path "{output_path}" --progress_bar False --use_cuda True '
f'--speaker_wav "{speaker_wav}" --language_idx "en"'
)
else:
run_cli(
"yes | "
f"tts --model_name tts_models/multilingual/multi-dataset/xtts_v1 "
f"tts --model_name tts_models/multilingual/multi-dataset/xtts_v1.1 "
f'--text "This is an example." --out_path "{output_path}" --progress_bar False '
f'--speaker_wav "{speaker_wav}" --language_idx "en"'
)
@@ -104,7 +103,7 @@ def test_xtts_streaming():
speaker_wav = [os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav")]
speaker_wav_2 = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0002.wav")
speaker_wav.append(speaker_wav_2)
model_path = os.path.join(get_user_data_dir("tts"), "tts_models--multilingual--multi-dataset--xtts_v1")
model_path = os.path.join(get_user_data_dir("tts"), "tts_models--multilingual--multi-dataset--xtts_v1.1")
config = XttsConfig()
config.load_json(os.path.join(model_path, "config.json"))
model = Xtts.init_from_config(config)