From 5ae369d6296959db652d29ac33c06218098ce07e Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Tue, 31 Oct 2023 16:56:25 +0200 Subject: [PATCH 1/9] Move FreeVCConfig to TTS.vc.configs (like all other config classes) --- TTS/vc/configs/freevc_config.py | 277 ++++++++++++++++++++++++++++++- TTS/vc/models/freevc.py | 279 +------------------------------- 2 files changed, 278 insertions(+), 278 deletions(-) diff --git a/TTS/vc/configs/freevc_config.py b/TTS/vc/configs/freevc_config.py index 890a2693..207181b3 100644 --- a/TTS/vc/configs/freevc_config.py +++ b/TTS/vc/configs/freevc_config.py @@ -1,5 +1,278 @@ from dataclasses import dataclass, field -from typing import List +from typing import List, Optional + +from coqpit import Coqpit from TTS.vc.configs.shared_configs import BaseVCConfig -from TTS.vc.models.freevc import FreeVCArgs, FreeVCAudioConfig, FreeVCConfig + + +@dataclass +class FreeVCAudioConfig(Coqpit): + """Audio configuration + + Args: + max_wav_value (float): + The maximum value of the waveform. + + input_sample_rate (int): + The sampling rate of the input waveform. + + output_sample_rate (int): + The sampling rate of the output waveform. + + filter_length (int): + The length of the filter. + + hop_length (int): + The hop length. + + win_length (int): + The window length. + + n_mel_channels (int): + The number of mel channels. + + mel_fmin (float): + The minimum frequency of the mel filterbank. + + mel_fmax (Optional[float]): + The maximum frequency of the mel filterbank. + """ + + max_wav_value: float = field(default=32768.0) + input_sample_rate: int = field(default=16000) + output_sample_rate: int = field(default=24000) + filter_length: int = field(default=1280) + hop_length: int = field(default=320) + win_length: int = field(default=1280) + n_mel_channels: int = field(default=80) + mel_fmin: float = field(default=0.0) + mel_fmax: Optional[float] = field(default=None) + + +@dataclass +class FreeVCArgs(Coqpit): + """FreeVC model arguments + + Args: + spec_channels (int): + The number of channels in the spectrogram. + + inter_channels (int): + The number of channels in the intermediate layers. + + hidden_channels (int): + The number of channels in the hidden layers. + + filter_channels (int): + The number of channels in the filter layers. + + n_heads (int): + The number of attention heads. + + n_layers (int): + The number of layers. + + kernel_size (int): + The size of the kernel. + + p_dropout (float): + The dropout probability. + + resblock (str): + The type of residual block. + + resblock_kernel_sizes (List[int]): + The kernel sizes for the residual blocks. + + resblock_dilation_sizes (List[List[int]]): + The dilation sizes for the residual blocks. + + upsample_rates (List[int]): + The upsample rates. + + upsample_initial_channel (int): + The number of channels in the initial upsample layer. + + upsample_kernel_sizes (List[int]): + The kernel sizes for the upsample layers. + + n_layers_q (int): + The number of layers in the quantization network. + + use_spectral_norm (bool): + Whether to use spectral normalization. + + gin_channels (int): + The number of channels in the global conditioning vector. + + ssl_dim (int): + The dimension of the self-supervised learning embedding. + + use_spk (bool): + Whether to use external speaker encoder. 
+ """ + + spec_channels: int = field(default=641) + inter_channels: int = field(default=192) + hidden_channels: int = field(default=192) + filter_channels: int = field(default=768) + n_heads: int = field(default=2) + n_layers: int = field(default=6) + kernel_size: int = field(default=3) + p_dropout: float = field(default=0.1) + resblock: str = field(default="1") + resblock_kernel_sizes: List[int] = field(default_factory=lambda: [3, 7, 11]) + resblock_dilation_sizes: List[List[int]] = field(default_factory=lambda: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]) + upsample_rates: List[int] = field(default_factory=lambda: [10, 8, 2, 2]) + upsample_initial_channel: int = field(default=512) + upsample_kernel_sizes: List[int] = field(default_factory=lambda: [16, 16, 4, 4]) + n_layers_q: int = field(default=3) + use_spectral_norm: bool = field(default=False) + gin_channels: int = field(default=256) + ssl_dim: int = field(default=1024) + use_spk: bool = field(default=False) + num_spks: int = field(default=0) + segment_size: int = field(default=8960) + + +@dataclass +class FreeVCConfig(BaseVCConfig): + """Defines parameters for FreeVC End2End TTS model. + + Args: + model (str): + Model name. Do not change unless you know what you are doing. + + model_args (FreeVCArgs): + Model architecture arguments. Defaults to `FreeVCArgs()`. + + audio (FreeVCAudioConfig): + Audio processing configuration. Defaults to `FreeVCAudioConfig()`. + + grad_clip (List): + Gradient clipping thresholds for each optimizer. Defaults to `[1000.0, 1000.0]`. + + lr_gen (float): + Initial learning rate for the generator. Defaults to 0.0002. + + lr_disc (float): + Initial learning rate for the discriminator. Defaults to 0.0002. + + lr_scheduler_gen (str): + Name of the learning rate scheduler for the generator. One of the `torch.optim.lr_scheduler.*`. Defaults to + `ExponentialLR`. + + lr_scheduler_gen_params (dict): + Parameters for the learning rate scheduler of the generator. Defaults to `{'gamma': 0.999875, "last_epoch":-1}`. + + lr_scheduler_disc (str): + Name of the learning rate scheduler for the discriminator. One of the `torch.optim.lr_scheduler.*`. Defaults to + `ExponentialLR`. + + lr_scheduler_disc_params (dict): + Parameters for the learning rate scheduler of the discriminator. Defaults to `{'gamma': 0.999875, "last_epoch":-1}`. + + scheduler_after_epoch (bool): + If true, step the schedulers after each epoch else after each step. Defaults to `False`. + + optimizer (str): + Name of the optimizer to use with both the generator and the discriminator networks. One of the + `torch.optim.*`. Defaults to `AdamW`. + + kl_loss_alpha (float): + Loss weight for KL loss. Defaults to 1.0. + + disc_loss_alpha (float): + Loss weight for the discriminator loss. Defaults to 1.0. + + gen_loss_alpha (float): + Loss weight for the generator loss. Defaults to 1.0. + + feat_loss_alpha (float): + Loss weight for the feature matching loss. Defaults to 1.0. + + mel_loss_alpha (float): + Loss weight for the mel loss. Defaults to 45.0. + + return_wav (bool): + If true, data loader returns the waveform as well as the other outputs. Do not change. Defaults to `True`. + + compute_linear_spec (bool): + If true, the linear spectrogram is computed and returned alongside the mel output. Do not change. Defaults to `True`. + + use_weighted_sampler (bool): + If true, use weighted sampler with bucketing for balancing samples between datasets used in training. Defaults to `False`. 
+
+        weighted_sampler_attrs (dict):
+            Key returned by the formatter to be used for weighted sampler. For example `{"root_path": 2.0, "speaker_name": 1.0}` sets sample probabilities
+            by overweighting `root_path` by 2.0. Defaults to `{}`.
+
+        weighted_sampler_multipliers (dict):
+            Weight each unique value of a key returned by the formatter for weighted sampling.
+            For example `{"root_path":{"/raid/datasets/libritts-clean-16khz-bwe-coqui_44khz/LibriTTS/train-clean-100/":1.0, "/raid/datasets/libritts-clean-16khz-bwe-coqui_44khz/LibriTTS/train-clean-360/": 0.5}}`.
+            It will sample instances from `train-clean-100` 2 times more than `train-clean-360`. Defaults to `{}`.
+
+        r (int):
+            Number of spectrogram frames to be generated at a time. Do not change. Defaults to `1`.
+
+        add_blank (bool):
+            If true, a blank token is added in between every character. Defaults to `True`.
+
+        test_sentences (List[List]):
+            List of sentences with speaker and language information to be used for testing.
+
+        language_ids_file (str):
+            Path to the language ids file.
+
+        use_language_embedding (bool):
+            If true, language embedding is used. Defaults to `False`.
+
+    Note:
+        Check :class:`TTS.tts.configs.shared_configs.BaseTTSConfig` for the inherited parameters.
+
+    Example:
+
+        >>> from TTS.vc.configs.freevc_config import FreeVCConfig
+        >>> config = FreeVCConfig()
+    """
+
+    model: str = "freevc"
+    # model specific params
+    model_args: FreeVCArgs = field(default_factory=FreeVCArgs)
+    audio: FreeVCAudioConfig = field(default_factory=FreeVCAudioConfig)
+
+    # optimizer
+    # TODO with training support
+
+    # loss params
+    # TODO with training support
+
+    # data loader params
+    return_wav: bool = True
+    compute_linear_spec: bool = True
+
+    # sampler params
+    use_weighted_sampler: bool = False  # TODO: move it to the base config
+    weighted_sampler_attrs: dict = field(default_factory=lambda: {})
+    weighted_sampler_multipliers: dict = field(default_factory=lambda: {})
+
+    # overrides
+    r: int = 1  # DO NOT CHANGE
+    add_blank: bool = True
+
+    # multi-speaker settings
+    # use speaker embedding layer
+    num_speakers: int = 0
+    speakers_file: str = None
+    speaker_embedding_channels: int = 256
+
+    # use d-vectors
+    use_d_vector_file: bool = False
+    d_vector_file: List[str] = None
+    d_vector_dim: int = None
+
+    def __post_init__(self):
+        for key, val in self.model_args.items():
+            if hasattr(self, key):
+                self[key] = val
diff --git a/TTS/vc/models/freevc.py b/TTS/vc/models/freevc.py
index ae22ad28..fd53a77f 100644
--- a/TTS/vc/models/freevc.py
+++ b/TTS/vc/models/freevc.py
@@ -1,4 +1,3 @@
-from dataclasses import dataclass, field
 from typing import Dict, List, Optional, Tuple, Union
 
 import librosa
@@ -13,8 +12,8 @@ from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm
 import TTS.vc.modules.freevc.commons as commons
 import TTS.vc.modules.freevc.modules as modules
 from TTS.tts.utils.speakers import SpeakerManager
-from TTS.utils.io import load_fsspec, save_checkpoint
-from TTS.vc.configs.shared_configs import BaseVCConfig
+from TTS.utils.io import load_fsspec
+from TTS.vc.configs.freevc_config import FreeVCConfig
 from TTS.vc.models.base_vc import BaseVC
 from TTS.vc.modules.freevc.commons import get_padding, init_weights
 from TTS.vc.modules.freevc.mel_processing import mel_spectrogram_torch
@@ -294,136 +293,6 @@ class SpeakerEncoder(torch.nn.Module):
         return embed
 
 
-@dataclass
-class FreeVCAudioConfig(Coqpit):
-    """Audio configuration
-
-    Args:
-        max_wav_value (float):
-            The maximum value of the waveform.
- - input_sample_rate (int): - The sampling rate of the input waveform. - - output_sample_rate (int): - The sampling rate of the output waveform. - - filter_length (int): - The length of the filter. - - hop_length (int): - The hop length. - - win_length (int): - The window length. - - n_mel_channels (int): - The number of mel channels. - - mel_fmin (float): - The minimum frequency of the mel filterbank. - - mel_fmax (Optional[float]): - The maximum frequency of the mel filterbank. - """ - - max_wav_value: float = field(default=32768.0) - input_sample_rate: int = field(default=16000) - output_sample_rate: int = field(default=24000) - filter_length: int = field(default=1280) - hop_length: int = field(default=320) - win_length: int = field(default=1280) - n_mel_channels: int = field(default=80) - mel_fmin: float = field(default=0.0) - mel_fmax: Optional[float] = field(default=None) - - -@dataclass -class FreeVCArgs(Coqpit): - """FreeVC model arguments - - Args: - spec_channels (int): - The number of channels in the spectrogram. - - inter_channels (int): - The number of channels in the intermediate layers. - - hidden_channels (int): - The number of channels in the hidden layers. - - filter_channels (int): - The number of channels in the filter layers. - - n_heads (int): - The number of attention heads. - - n_layers (int): - The number of layers. - - kernel_size (int): - The size of the kernel. - - p_dropout (float): - The dropout probability. - - resblock (str): - The type of residual block. - - resblock_kernel_sizes (List[int]): - The kernel sizes for the residual blocks. - - resblock_dilation_sizes (List[List[int]]): - The dilation sizes for the residual blocks. - - upsample_rates (List[int]): - The upsample rates. - - upsample_initial_channel (int): - The number of channels in the initial upsample layer. - - upsample_kernel_sizes (List[int]): - The kernel sizes for the upsample layers. - - n_layers_q (int): - The number of layers in the quantization network. - - use_spectral_norm (bool): - Whether to use spectral normalization. - - gin_channels (int): - The number of channels in the global conditioning vector. - - ssl_dim (int): - The dimension of the self-supervised learning embedding. - - use_spk (bool): - Whether to use external speaker encoder. - """ - - spec_channels: int = field(default=641) - inter_channels: int = field(default=192) - hidden_channels: int = field(default=192) - filter_channels: int = field(default=768) - n_heads: int = field(default=2) - n_layers: int = field(default=6) - kernel_size: int = field(default=3) - p_dropout: float = field(default=0.1) - resblock: str = field(default="1") - resblock_kernel_sizes: List[int] = field(default_factory=lambda: [3, 7, 11]) - resblock_dilation_sizes: List[List[int]] = field(default_factory=lambda: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]) - upsample_rates: List[int] = field(default_factory=lambda: [10, 8, 2, 2]) - upsample_initial_channel: int = field(default=512) - upsample_kernel_sizes: List[int] = field(default_factory=lambda: [16, 16, 4, 4]) - n_layers_q: int = field(default=3) - use_spectral_norm: bool = field(default=False) - gin_channels: int = field(default=256) - ssl_dim: int = field(default=1024) - use_spk: bool = field(default=False) - num_spks: int = field(default=0) - segment_size: int = field(default=8960) - - class FreeVC(BaseVC): """ @@ -677,7 +546,7 @@ class FreeVC(BaseVC): ... 
@staticmethod - def init_from_config(config: "VitsConfig", samples: Union[List[List], List[Dict]] = None, verbose=True): + def init_from_config(config: FreeVCConfig, samples: Union[List[List], List[Dict]] = None, verbose=True): model = FreeVC(config) return model @@ -689,145 +558,3 @@ class FreeVC(BaseVC): def train_step(): ... - - -@dataclass -class FreeVCConfig(BaseVCConfig): - """Defines parameters for FreeVC End2End TTS model. - - Args: - model (str): - Model name. Do not change unless you know what you are doing. - - model_args (FreeVCArgs): - Model architecture arguments. Defaults to `FreeVCArgs()`. - - audio (FreeVCAudioConfig): - Audio processing configuration. Defaults to `FreeVCAudioConfig()`. - - grad_clip (List): - Gradient clipping thresholds for each optimizer. Defaults to `[1000.0, 1000.0]`. - - lr_gen (float): - Initial learning rate for the generator. Defaults to 0.0002. - - lr_disc (float): - Initial learning rate for the discriminator. Defaults to 0.0002. - - lr_scheduler_gen (str): - Name of the learning rate scheduler for the generator. One of the `torch.optim.lr_scheduler.*`. Defaults to - `ExponentialLR`. - - lr_scheduler_gen_params (dict): - Parameters for the learning rate scheduler of the generator. Defaults to `{'gamma': 0.999875, "last_epoch":-1}`. - - lr_scheduler_disc (str): - Name of the learning rate scheduler for the discriminator. One of the `torch.optim.lr_scheduler.*`. Defaults to - `ExponentialLR`. - - lr_scheduler_disc_params (dict): - Parameters for the learning rate scheduler of the discriminator. Defaults to `{'gamma': 0.999875, "last_epoch":-1}`. - - scheduler_after_epoch (bool): - If true, step the schedulers after each epoch else after each step. Defaults to `False`. - - optimizer (str): - Name of the optimizer to use with both the generator and the discriminator networks. One of the - `torch.optim.*`. Defaults to `AdamW`. - - kl_loss_alpha (float): - Loss weight for KL loss. Defaults to 1.0. - - disc_loss_alpha (float): - Loss weight for the discriminator loss. Defaults to 1.0. - - gen_loss_alpha (float): - Loss weight for the generator loss. Defaults to 1.0. - - feat_loss_alpha (float): - Loss weight for the feature matching loss. Defaults to 1.0. - - mel_loss_alpha (float): - Loss weight for the mel loss. Defaults to 45.0. - - return_wav (bool): - If true, data loader returns the waveform as well as the other outputs. Do not change. Defaults to `True`. - - compute_linear_spec (bool): - If true, the linear spectrogram is computed and returned alongside the mel output. Do not change. Defaults to `True`. - - use_weighted_sampler (bool): - If true, use weighted sampler with bucketing for balancing samples between datasets used in training. Defaults to `False`. - - weighted_sampler_attrs (dict): - Key retuned by the formatter to be used for weighted sampler. For example `{"root_path": 2.0, "speaker_name": 1.0}` sets sample probabilities - by overweighting `root_path` by 2.0. Defaults to `{}`. - - weighted_sampler_multipliers (dict): - Weight each unique value of a key returned by the formatter for weighted sampling. - For example `{"root_path":{"/raid/datasets/libritts-clean-16khz-bwe-coqui_44khz/LibriTTS/train-clean-100/":1.0, "/raid/datasets/libritts-clean-16khz-bwe-coqui_44khz/LibriTTS/train-clean-360/": 0.5}`. - It will sample instances from `train-clean-100` 2 times more than `train-clean-360`. Defaults to `{}`. - - r (int): - Number of spectrogram frames to be generated at a time. Do not change. Defaults to `1`. 
-
-        add_blank (bool):
-            If true, a blank token is added in between every character. Defaults to `True`.
-
-        test_sentences (List[List]):
-            List of sentences with speaker and language information to be used for testing.
-
-        language_ids_file (str):
-            Path to the language ids file.
-
-        use_language_embedding (bool):
-            If true, language embedding is used. Defaults to `False`.
-
-    Note:
-        Check :class:`TTS.tts.configs.shared_configs.BaseTTSConfig` for the inherited parameters.
-
-    Example:
-
-        >>> from TTS.tts.configs.freevc_config import FreeVCConfig
-        >>> config = FreeVCConfig()
-    """
-
-    model: str = "freevc"
-    # model specific params
-    model_args: FreeVCArgs = field(default_factory=FreeVCArgs)
-    audio: FreeVCAudioConfig = field(default_factory=FreeVCAudioConfig)
-
-    # optimizer
-    # TODO with training support
-
-    # loss params
-    # TODO with training support
-
-    # data loader params
-    return_wav: bool = True
-    compute_linear_spec: bool = True
-
-    # sampler params
-    use_weighted_sampler: bool = False  # TODO: move it to the base config
-    weighted_sampler_attrs: dict = field(default_factory=lambda: {})
-    weighted_sampler_multipliers: dict = field(default_factory=lambda: {})
-
-    # overrides
-    r: int = 1  # DO NOT CHANGE
-    add_blank: bool = True
-
-    # multi-speaker settings
-    # use speaker embedding layer
-    num_speakers: int = 0
-    speakers_file: str = None
-    speaker_embedding_channels: int = 256
-
-    # use d-vectors
-    use_d_vector_file: bool = False
-    d_vector_file: List[str] = None
-    d_vector_dim: int = None
-
-    def __post_init__(self):
-        for key, val in self.model_args.items():
-            if hasattr(self, key):
-                self[key] = val

From ce1a39a9a4d106c68320c3cb00954fbf69b17a87 Mon Sep 17 00:00:00 2001
From: Julian Weber
Date: Wed, 8 Nov 2023 10:24:23 +0100
Subject: [PATCH 2/9] Add char limit warn (#3130)

* Add char limit warning

* Adding v2 langs

* cached_property for cutlet

* Fix import
---
 TTS/tts/layers/xtts/tokenizer.py | 42 +++++++++++++++++++++++++++++++-
 1 file changed, 41 insertions(+), 1 deletion(-)

diff --git a/TTS/tts/layers/xtts/tokenizer.py b/TTS/tts/layers/xtts/tokenizer.py
index 4f2da02d..4c7ae6e3 100644
--- a/TTS/tts/layers/xtts/tokenizer.py
+++ b/TTS/tts/layers/xtts/tokenizer.py
@@ -8,6 +8,7 @@ from hangul_romanize import Transliter
 from hangul_romanize.rule import academic
 from num2words import num2words
 from tokenizers import Tokenizer
+from functools import cached_property
 
 from TTS.tts.layers.xtts.zh_num2words import TextNorm as zh_num2words
 
@@ -535,11 +536,50 @@ DEFAULT_VOCAB_FILE = os.path.join(os.path.dirname(os.path.realpath(__file__)), "
 class VoiceBpeTokenizer:
     def __init__(self, vocab_file=None):
         self.tokenizer = None
-        self.katsu = None
         if vocab_file is not None:
             self.tokenizer = Tokenizer.from_file(vocab_file)
+        self.char_limits = {
+            "en": 250,
+            "de": 253,
+            "fr": 273,
+            "es": 239,
+            "it": 213,
+            "pt": 203,
+            "pl": 224,
+            "zh-cn": 82,
+            "ar": 166,
+            "cs": 186,
+            "ru": 182,
+            "nl": 251,
+            "tr": 226,
+            "ja": 71,
+            "hu": 224,
+            "ko": 95,
+        }
+
+    @cached_property
+    def katsu(self):
+        import cutlet
+        return cutlet.Cutlet()
+
+    def check_input_length(self, txt, lang):
+        limit = self.char_limits.get(lang, 250)
+        if len(txt) > limit:
+            print(f"[!] Warning: The text length exceeds the character limit of {limit} for language '{lang}', this might cause truncated audio.")
+
+    def preprocess_text(self, txt, lang):
+        if lang in ["en", "es", "fr", "de", "pt", "it", "pl", "ar", "cs", "ru", "nl", "tr", "zh-cn"]:
+            txt = multilingual_cleaners(txt, lang)
+            if lang == "zh-cn":
+                txt = chinese_transliterate(txt)
+        elif lang == "ja":
+            txt = japanese_cleaners(txt, self.katsu)
+        else:
+            raise NotImplementedError()
+        return txt
 
     def encode(self, txt, lang):
+        self.check_input_length(txt, lang)
         txt = self.preprocess_text(txt, lang)
         txt = f"[{lang}]{txt}"
         txt = txt.replace(" ", "[SPACE]")

From a24ebcd8a6be0a233cf3bb3dfd23916b276dd591 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Wed, 8 Nov 2023 10:51:23 +0100
Subject: [PATCH 3/9] Fix coqui api (#3168)

---
 README.md                | 10 ++------
 TTS/api.py               |  6 ++---
 TTS/bin/synthesize.py    |  4 +--
 TTS/cs_api.py            | 54 ++++++++++------------------------------
 docs/source/inference.md | 11 ++------
 5 files changed, 22 insertions(+), 63 deletions(-)

diff --git a/README.md b/README.md
index 1a9285eb..353db7cf 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@
 ## 🐸Coqui.ai News
 - 📣 ⓍTTSv2 is here with 16 languages and better performance across the board.
 - 📣 ⓍTTS fine-tuning code is out. Check the [example recipes](https://github.com/coqui-ai/TTS/tree/dev/recipes/ljspeech).
-- 📣 ⓍTTS can now stream with <200ms latency. 
+- 📣 ⓍTTS can now stream with <200ms latency.
 - 📣 ⓍTTS, our production TTS model that can speak 13 languages, is released [Blog Post](https://coqui.ai/blog/tts/open_xtts), [Demo](https://huggingface.co/spaces/coqui/xtts), [Docs](https://tts.readthedocs.io/en/dev/models/xtts.html)
 - 📣 [🐶Bark](https://github.com/suno-ai/bark) is now available for inference with unconstrained voice cloning. [Docs](https://tts.readthedocs.io/en/dev/models/bark.html)
 - 📣 You can use [~1100 Fairseq models](https://github.com/facebookresearch/fairseq/tree/main/examples/mms) with 🐸TTS.
@@ -267,19 +267,13 @@ models = TTS(cs_api_model="XTTS").list_models()
 # Init TTS with the target studio speaker
 tts = TTS(model_name="coqui_studio/en/Torcull Diarmuid/coqui_studio", progress_bar=False)
 # Run TTS
-tts.tts_to_file(text="This is a test.", file_path=OUTPUT_PATH)
+tts.tts_to_file(text="This is a test.", language="en", file_path=OUTPUT_PATH)
 
 # V1 model
 models = TTS(cs_api_model="V1").list_models()
 # Run TTS with emotion and speed control
 # Emotion control only works with V1 model
 tts.tts_to_file(text="This is a test.", file_path=OUTPUT_PATH, emotion="Happy", speed=1.5)
-
-# XTTS-multilingual
-models = TTS(cs_api_model="XTTS-multilingual").list_models()
-# Run TTS with emotion and speed control
-# Emotion control only works with V1 model
-tts.tts_to_file(text="Das ist ein Test.", file_path=OUTPUT_PATH, language="de", speed=1.0)
 ```
 
 #### Example text to speech using **Fairseq models in ~1100 languages** 🤯.
diff --git a/TTS/api.py b/TTS/api.py
index 5d1fbb5a..c8600dcd 100644
--- a/TTS/api.py
+++ b/TTS/api.py
@@ -60,7 +60,7 @@ class TTS(nn.Module):
             vocoder_config_path (str, optional): Path to the vocoder config. Defaults to None.
             progress_bar (bool, optional): Whether to pring a progress bar while downloading a model. Defaults to True.
             cs_api_model (str, optional): Name of the model to use for the Coqui Studio API. Available models are
-                "XTTS", "XTTS-multilingual", "V1". You can also use `TTS.cs_api.CS_API" for more control.
+                "XTTS", "V1". You can also use `TTS.cs_api.CS_API` for more control.
                 Defaults to "XTTS".
gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False. """ @@ -275,7 +275,7 @@ class TTS(nn.Module): speaker_name (str, optional): Speaker name from Coqui Studio. Defaults to None. language (str): Language of the text. If None, the default language of the speaker is used. Language is only - supported by `XTTS-multilang` model. Currently supports en, de, es, fr, it, pt, pl. Defaults to "en". + supported by `XTTS` model. emotion (str, optional): Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull". Emotions are only available with "V1" model. Defaults to None. @@ -321,7 +321,7 @@ class TTS(nn.Module): Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by `tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None. language (str): Language of the text. If None, the default language of the speaker is used. Language is only - supported by `XTTS-multilang` model. Currently supports en, de, es, fr, it, pt, pl. Defaults to "en". + supported by `XTTS` model. speaker_wav (str, optional): Path to a reference wav file to use for voice cloning with supporting models like YourTTS. Defaults to None. diff --git a/TTS/bin/synthesize.py b/TTS/bin/synthesize.py index ef41c8e1..ddfe35d2 100755 --- a/TTS/bin/synthesize.py +++ b/TTS/bin/synthesize.py @@ -227,7 +227,7 @@ def main(): parser.add_argument( "--cs_model", type=str, - help="Name of the 🐸Coqui Studio model. Available models are `XTTS`, `XTTS-multilingual`, `V1`.", + help="Name of the 🐸Coqui Studio model. Available models are `XTTS`, `V1`.", ) parser.add_argument( "--emotion", @@ -238,7 +238,7 @@ def main(): parser.add_argument( "--language", type=str, - help="Language to condition the model with. Only available for 🐸Coqui Studio `XTTS-multilingual` model.", + help="Language to condition the model with. Only available for 🐸Coqui Studio `XTTS` model.", default=None, ) parser.add_argument( diff --git a/TTS/cs_api.py b/TTS/cs_api.py index 4a44b535..c45f9d08 100644 --- a/TTS/cs_api.py +++ b/TTS/cs_api.py @@ -43,7 +43,7 @@ class CS_API: Args: api_token (str): 🐸Coqui Studio API token. If not provided, it will be read from the environment variable `COQUI_STUDIO_TOKEN`. - model (str): 🐸Coqui Studio model. It can be either `V1`, `XTTS`, or `XTTS-multilang`. Default is `XTTS`. + model (str): 🐸Coqui Studio model. It can be either `V1`, `XTTS`. Default is `XTTS`. 
Example listing all available speakers: @@ -65,7 +65,7 @@ class CS_API: Example with multi-language model: >>> from TTS.api import CS_API - >>> tts = CS_API(model="XTTS-multilang") + >>> tts = CS_API(model="XTTS") >>> wav, sr = api.tts("Hello world", speaker_name=tts.speakers[0].name, language="en") """ @@ -78,16 +78,12 @@ class CS_API: "XTTS": { "list_speakers": "https://app.coqui.ai/api/v2/speakers", "synthesize": "https://app.coqui.ai/api/v2/samples/xtts/render/", - "list_voices": "https://app.coqui.ai/api/v2/voices/xtts/", - }, - "XTTS-multilang": { - "list_speakers": "https://app.coqui.ai/api/v2/speakers", - "synthesize": "https://app.coqui.ai/api/v2/samples/multilingual/render/", - "list_voices": "https://app.coqui.ai/api/v2/voices/xtts/", + "list_voices": "https://app.coqui.ai/api/v2/voices/xtts", }, } - SUPPORTED_LANGUAGES = ["en", "es", "de", "fr", "it", "pt", "pl"] + + SUPPORTED_LANGUAGES = ["en", "es", "de", "fr", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn", "ja"] def __init__(self, api_token=None, model="XTTS"): self.api_token = api_token @@ -139,7 +135,7 @@ class CS_API: self._check_token() conn = http.client.HTTPSConnection("app.coqui.ai") url = self.MODEL_ENDPOINTS[self.model]["list_speakers"] - conn.request("GET", f"{url}?per_page=100", headers=self.headers) + conn.request("GET", f"{url}?page=1&per_page=100", headers=self.headers) res = conn.getresponse() data = res.read() return [Speaker(s) for s in json.loads(data)["result"]] @@ -148,7 +144,7 @@ class CS_API: """List custom voices created by the user.""" conn = http.client.HTTPSConnection("app.coqui.ai") url = self.MODEL_ENDPOINTS[self.model]["list_voices"] - conn.request("GET", f"{url}", headers=self.headers) + conn.request("GET", f"{url}?page=1&per_page=100", headers=self.headers) res = conn.getresponse() data = res.read() return [Speaker(s, True) for s in json.loads(data)["result"]] @@ -197,14 +193,6 @@ class CS_API: } ) elif model == "XTTS": - payload.update( - { - "name": speaker.name, - "text": text, - "speed": speed, - } - ) - elif model == "XTTS-multilang": payload.update( { "name": speaker.name, @@ -226,13 +214,10 @@ class CS_API: assert language is None, "❗ language is not supported for V1 model." elif self.model == "XTTS": assert emotion is None, f"❗ Emotions are not supported for XTTS model. Use V1 model." - assert language is None, "❗ Language is not supported for XTTS model. Use XTTS-multilang model." - elif self.model == "XTTS-multilang": - assert emotion is None, f"❗ Emotions are not supported for XTTS-multilang model. Use V1 model." - assert language is not None, "❗ Language is required for XTTS-multilang model." + assert language is not None, "❗ Language is required for XTTS model." assert ( language in self.SUPPORTED_LANGUAGES - ), f"❗ Language {language} is not yet supported. Use one of: en, es, de, fr, it, pt, pl" + ), f"❗ Language {language} is not yet supported. Check https://docs.coqui.ai/reference/samples_xtts_create." return text, speaker_name, speaker_id, emotion, speed, language def tts( @@ -255,7 +240,7 @@ class CS_API: supported by `V1` model. Defaults to None. speed (float): Speed of the speech. 1.0 is normal speed. language (str): Language of the text. If None, the default language of the speaker is used. Language is only - supported by `XTTS-multilang` model. Currently supports en, de, es, fr, it, pt, pl. Defaults to "en". + supported by `XTTS` model. See https://docs.coqui.ai/reference/samples_xtts_create for supported languages. 
""" self._check_token() self.ping_api() @@ -305,7 +290,7 @@ class CS_API: speed (float): Speed of the speech. 1.0 is normal speed. pipe_out (BytesIO, optional): Flag to stdout the generated TTS wav file for shell pipe. language (str): Language of the text. If None, the default language of the speaker is used. Language is only - supported by `XTTS-multilang` model. Currently supports en, de, es, fr, it, pt, pl. Defaults to "en". + supported by `XTTS` model. Currently supports en, de, es, fr, it, pt, pl. Defaults to "en". file_path (str): Path to save the file. If None, a temporary file is created. """ if file_path is None: @@ -323,20 +308,7 @@ if __name__ == "__main__": print(api.list_speakers_as_tts_models()) ts = time.time() - wav, sr = api.tts("It took me quite a long time to develop a voice.", speaker_name=api.speakers[0].name) + wav, sr = api.tts("It took me quite a long time to develop a voice.", language="en", speaker_name=api.speakers[0].name) print(f" [i] XTTS took {time.time() - ts:.2f}s") - filepath = api.tts_to_file(text="Hello world!", speaker_name=api.speakers[0].name, file_path="output.wav") - - api = CS_API(model="XTTS-multilang") - print(api.speakers) - - ts = time.time() - wav, sr = api.tts( - "It took me quite a long time to develop a voice.", speaker_name=api.speakers[0].name, language="en" - ) - print(f" [i] XTTS took {time.time() - ts:.2f}s") - - filepath = api.tts_to_file( - text="Hello world!", speaker_name=api.speakers[0].name, file_path="output.wav", language="en" - ) + filepath = api.tts_to_file(text="Hello world!", speaker_name=api.speakers[0].name, language="en", file_path="output.wav") diff --git a/docs/source/inference.md b/docs/source/inference.md index 4de9ecdd..b40445ae 100644 --- a/docs/source/inference.md +++ b/docs/source/inference.md @@ -198,19 +198,12 @@ from TTS.api import CS_API # Init 🐸 Coqui Studio API # you can either set the API token as an environment variable `COQUI_STUDIO_TOKEN` or pass it as an argument. -# XTTS - Best quality and life-like speech in EN +# XTTS - Best quality and life-like speech in multiple languages. See https://docs.coqui.ai/reference/samples_xtts_create for supported languages. api = CS_API(api_token=, model="XTTS") api.speakers # all the speakers are available with all the models. api.list_speakers() api.list_voices() -wav, sample_rate = api.tts(text="This is a test.", speaker=api.speakers[0].name, emotion="Happy", speed=1.5) - -# XTTS-multilingual - Multilingual XTTS with [en, de, es, fr, it, pt, ...] (more langs coming soon) -api = CS_API(api_token=, model="XTTS-multilingual") -api.speakers -api.list_speakers() -api.list_voices() -wav, sample_rate = api.tts(text="This is a test.", speaker=api.speakers[0].name, emotion="Happy", speed=1.5) +wav, sample_rate = api.tts(text="This is a test.", speaker=api.speakers[0].name, emotion="Happy", language="en", speed=1.5) # V1 - Fast and lightweight TTS in EN with emotion control. 
api = CS_API(api_token=, model="V1") From cc6e9fcaa72a6ba7255a1a39c77ffdb5b7bc7e4d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 8 Nov 2023 11:13:58 +0100 Subject: [PATCH 4/9] Fix #3153 (#3169) --- TTS/vocoder/layers/losses.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/TTS/vocoder/layers/losses.py b/TTS/vocoder/layers/losses.py index befc43cc..74cfc726 100644 --- a/TTS/vocoder/layers/losses.py +++ b/TTS/vocoder/layers/losses.py @@ -195,10 +195,10 @@ def _apply_D_loss(scores_fake, scores_real, loss_func): if isinstance(scores_fake, list): # multi-scale loss for score_fake, score_real in zip(scores_fake, scores_real): - total_loss, real_loss, fake_loss = loss_func(score_fake=score_fake, score_real=score_real) + total_loss, real_loss_, fake_loss_ = loss_func(score_fake=score_fake, score_real=score_real) loss += total_loss - real_loss += real_loss - fake_loss += fake_loss + real_loss += real_loss_ + fake_loss += fake_loss_ # normalize loss values with number of scales (discriminators) loss /= len(scores_fake) real_loss /= len(scores_real) From 99edd6daa38f929c703e29f6dfcdcdd8f5260b0d Mon Sep 17 00:00:00 2001 From: Enno Hermann Date: Wed, 8 Nov 2023 11:29:01 +0100 Subject: [PATCH 5/9] Fix ModelManager.list_models() (#3128) * fix(utils.manage): remove hard-coded model_type variable * refactor(utils.manage): address lint issues, fix typos Addressed the following: TTS/utils/manage.py:307:12: R1705: Unnecessary "else" after "return" (no-else-return) TTS/utils/manage.py:308:21: W1514: Using open without explicitly specifying an encoding (unspecified-encoding) TTS/utils/manage.py:299:4: R1710: Either all return statements in a function should return an expression, or none of them should. (inconsistent-return-statements) TTS/utils/manage.py:299:4: R0201: Method could be a function (no-self-use) TTS/utils/manage.py:314:4: R0201: Method could be a function (no-self-use) --- TTS/utils/manage.py | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/TTS/utils/manage.py b/TTS/utils/manage.py index c732e1f5..1cd437e6 100644 --- a/TTS/utils/manage.py +++ b/TTS/utils/manage.py @@ -109,7 +109,6 @@ class ModelManager(object): def _list_for_model_type(self, model_type): models_name_list = [] model_count = 1 - model_type = "tts_models" models_name_list.extend(self._list_models(model_type, model_count)) return models_name_list @@ -298,22 +297,22 @@ class ModelManager(object): model_item = self.set_model_url(model_item) return model_item, model_full_name, model, md5hash - def ask_tos(self, model_full_path): + @staticmethod + def ask_tos(model_full_path): """Ask the user to agree to the terms of service""" tos_path = os.path.join(model_full_path, "tos_agreed.txt") - if not os.path.exists(tos_path): - print(" > You must agree to the terms of service to use this model.") - print(" | > Please see the terms of service at https://coqui.ai/cpml.txt") - print(' | > "I have read, understood and agreed the Terms and Conditions." - [y/n]') - answer = input(" | | > ") - if answer.lower() == "y": - with open(tos_path, "w") as f: - f.write("I have read, understood ad agree the Terms and Conditions.") - return True - else: - return False + print(" > You must agree to the terms of service to use this model.") + print(" | > Please see the terms of service at https://coqui.ai/cpml.txt") + print(' | > "I have read, understood and agreed to the Terms and Conditions." 
- [y/n]') + answer = input(" | | > ") + if answer.lower() == "y": + with open(tos_path, "w", encoding="utf-8") as f: + f.write("I have read, understood and agreed to the Terms and Conditions.") + return True + return False - def tos_agreed(self, model_item, model_full_path): + @staticmethod + def tos_agreed(model_item, model_full_path): """Check if the user has agreed to the terms of service""" if "tos_required" in model_item and model_item["tos_required"]: tos_path = os.path.join(model_full_path, "tos_agreed.txt") From 78a596618a4deb21ef1058e911c15334b81b0669 Mon Sep 17 00:00:00 2001 From: Gorkem Date: Wed, 8 Nov 2023 13:32:02 +0300 Subject: [PATCH 6/9] Fix for exception on streaming if last chunk empty (#3160) --- TTS/tts/models/xtts.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/TTS/tts/models/xtts.py b/TTS/tts/models/xtts.py index 4ab00270..a8a574c0 100644 --- a/TTS/tts/models/xtts.py +++ b/TTS/tts/models/xtts.py @@ -603,10 +603,21 @@ class Xtts(BaseTTS): if wav_gen_prev is not None: wav_chunk = wav_gen[(wav_gen_prev.shape[0] - overlap_len) : -overlap_len] if wav_overlap is not None: - crossfade_wav = wav_chunk[:overlap_len] - crossfade_wav = crossfade_wav * torch.linspace(0.0, 1.0, overlap_len).to(crossfade_wav.device) - wav_chunk[:overlap_len] = wav_overlap * torch.linspace(1.0, 0.0, overlap_len).to(wav_overlap.device) - wav_chunk[:overlap_len] += crossfade_wav + # cross fade the overlap section + if overlap_len > len(wav_chunk): + # wav_chunk is smaller than overlap_len, pass on last wav_gen + if wav_gen_prev is not None: + wav_chunk = wav_gen[(wav_gen_prev.shape[0] - overlap_len):] + else: + # not expecting will hit here as problem happens on last chunk + wav_chunk = wav_gen[-overlap_len:] + return wav_chunk, wav_gen, None + else: + crossfade_wav = wav_chunk[:overlap_len] + crossfade_wav = crossfade_wav * torch.linspace(0.0, 1.0, overlap_len).to(crossfade_wav.device) + wav_chunk[:overlap_len] = wav_overlap * torch.linspace(1.0, 0.0, overlap_len).to(wav_overlap.device) + wav_chunk[:overlap_len] += crossfade_wav + wav_overlap = wav_gen[-overlap_len:] wav_gen_prev = wav_gen return wav_chunk, wav_gen_prev, wav_overlap From 03ad90135bb70d1ca6b46b3b7f3e89563aa65af6 Mon Sep 17 00:00:00 2001 From: Julian Weber Date: Wed, 8 Nov 2023 13:47:33 +0100 Subject: [PATCH 7/9] Add lang code in XTTS doc (#3158) * Add lang code in XTTS doc * Remove ununsed config and args * update docs * woops --- TTS/tts/configs/xtts_config.py | 22 -------------------- TTS/tts/models/xtts.py | 37 ---------------------------------- docs/source/models/xtts.md | 11 ++++------ 3 files changed, 4 insertions(+), 66 deletions(-) diff --git a/TTS/tts/configs/xtts_config.py b/TTS/tts/configs/xtts_config.py index ea95faf5..2d3edaf4 100644 --- a/TTS/tts/configs/xtts_config.py +++ b/TTS/tts/configs/xtts_config.py @@ -37,29 +37,11 @@ class XttsConfig(BaseTTSConfig): If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation. Defaults to `0.8`. - cond_free_k (float): - Knob that determines how to balance the conditioning free signal with the conditioning-present signal. [0,inf]. - As cond_free_k increases, the output becomes dominated by the conditioning-free signal. - Formula is: output=cond_present_output*(cond_free_k+1)-cond_absenct_output*cond_free_k. Defaults to `2.0`. - - diffusion_temperature (float): - Controls the variance of the noise fed into the diffusion model. [0,1]. 
Values at 0 - are the "mean" prediction of the diffusion network and will sound bland and smeared. - Defaults to `1.0`. - num_gpt_outputs (int): Number of samples taken from the autoregressive model, all of which are filtered using CLVP. As XTTS is a probabilistic model, more samples means a higher probability of creating something "great". Defaults to `16`. - decoder_iterations (int): - Number of diffusion steps to perform. [0,4000]. More steps means the network has more chances to iteratively refine - the output, which should theoretically mean a higher quality output. Generally a value above 250 is not noticeably better, - however. Defaults to `30`. - - decoder_sampler (str): - Diffusion sampler to be used. `ddim` or `dpm++2m`. Defaults to `ddim`. - gpt_cond_len (int): Secs audio to be used as conditioning for the autoregressive model. Defaults to `3`. @@ -110,11 +92,7 @@ class XttsConfig(BaseTTSConfig): repetition_penalty: float = 2.0 top_k: int = 50 top_p: float = 0.85 - cond_free_k: float = 2.0 - diffusion_temperature: float = 1.0 num_gpt_outputs: int = 1 - decoder_iterations: int = 30 - decoder_sampler: str = "ddim" # cloning gpt_cond_len: int = 3 diff --git a/TTS/tts/models/xtts.py b/TTS/tts/models/xtts.py index a8a574c0..7cc9836a 100644 --- a/TTS/tts/models/xtts.py +++ b/TTS/tts/models/xtts.py @@ -152,19 +152,6 @@ class XttsArgs(Coqpit): gpt_code_stride_len (int, optional): The hop_size of dvae and consequently of the gpt output. Defaults to 1024. gpt_use_masking_gt_prompt_approach (bool, optional): If True, it will use ground truth as prompt and it will mask the loss to avoid repetition. Defaults to True. gpt_use_perceiver_resampler (bool, optional): If True, it will use perceiver resampler from flamingo paper - https://arxiv.org/abs/2204.14198. Defaults to False. - - For DiffTTS model: - diff_model_channels (int, optional): The number of channels for the DiffTTS model. Defaults to 1024. - diff_num_layers (int, optional): The number of layers for the DiffTTS model. Defaults to 10. - diff_in_channels (int, optional): The input channels for the DiffTTS model. Defaults to 100. - diff_out_channels (int, optional): The output channels for the DiffTTS model. Defaults to 200. - diff_in_latent_channels (int, optional): The input latent channels for the DiffTTS model. Defaults to 1024. - diff_in_tokens (int, optional): The input tokens for the DiffTTS model. Defaults to 8193. - diff_dropout (int, optional): The dropout percentage for the DiffTTS model. Defaults to 0. - diff_use_fp16 (bool, optional): Whether to use fp16 for the DiffTTS model. Defaults to False. - diff_num_heads (int, optional): The number of heads for the DiffTTS model. Defaults to 16. - diff_layer_drop (int, optional): The layer dropout percentage for the DiffTTS model. Defaults to 0. - diff_unconditioned_percentage (int, optional): The percentage of unconditioned inputs for the DiffTTS model. Defaults to 0. 
""" gpt_batch_size: int = 1 @@ -193,19 +180,6 @@ class XttsArgs(Coqpit): gpt_use_masking_gt_prompt_approach: bool = True gpt_use_perceiver_resampler: bool = False - # Diffusion Decoder params - diff_model_channels: int = 1024 - diff_num_layers: int = 10 - diff_in_channels: int = 100 - diff_out_channels: int = 200 - diff_in_latent_channels: int = 1024 - diff_in_tokens: int = 8193 - diff_dropout: int = 0 - diff_use_fp16: bool = False - diff_num_heads: int = 16 - diff_layer_drop: int = 0 - diff_unconditioned_percentage: int = 0 - # HifiGAN Decoder params input_sample_rate: int = 22050 output_sample_rate: int = 24000 @@ -426,10 +400,6 @@ class Xtts(BaseTTS): "repetition_penalty": config.repetition_penalty, "top_k": config.top_k, "top_p": config.top_p, - "cond_free_k": config.cond_free_k, - "diffusion_temperature": config.diffusion_temperature, - "decoder_iterations": config.decoder_iterations, - "decoder_sampler": config.decoder_sampler, "gpt_cond_len": config.gpt_cond_len, "max_ref_len": config.max_ref_len, "sound_norm_refs": config.sound_norm_refs, @@ -454,13 +424,6 @@ class Xtts(BaseTTS): gpt_cond_len=6, max_ref_len=10, sound_norm_refs=False, - # Decoder inference - decoder_iterations=100, - cond_free=True, - cond_free_k=2, - diffusion_temperature=1.0, - decoder_sampler="ddim", - decoder="hifigan", **hf_generate_kwargs, ): """ diff --git a/docs/source/models/xtts.md b/docs/source/models/xtts.md index 8167a1d1..03e44af1 100644 --- a/docs/source/models/xtts.md +++ b/docs/source/models/xtts.md @@ -24,8 +24,7 @@ a few tricks to make it faster and support streaming inference. Current implementation only supports inference. ### Languages -As of now, XTTS-v2 supports 16 languages: English, Spanish, French, German, Italian, Portuguese, -Polish, Turkish, Russian, Dutch, Czech, Arabic, Chinese (Simplified), Japanese, Hungarian, Korean +As of now, XTTS-v2 supports 16 languages: English (en), Spanish (es), French (fr), German (de), Italian (it), Portuguese (pt), Polish (pl), Turkish (tr), Russian (ru), Dutch (nl), Czech (cs), Arabic (ar), Chinese (zh-cn), Japanese (ja), Hungarian (hu) and Korean (ko). Stay tuned as we continue to add support for more languages. If you have any language requests, please feel free to reach out. 
@@ -116,7 +115,7 @@ model.load_checkpoint(config, checkpoint_dir="/path/to/xtts/", use_deepspeed=Tru model.cuda() print("Computing speaker latents...") -gpt_cond_latent, diffusion_conditioning, speaker_embedding = model.get_conditioning_latents(audio_path=["reference.wav"]) +gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=["reference.wav"]) print("Inference...") out = model.inference( @@ -124,7 +123,6 @@ out = model.inference( "en", gpt_cond_latent, speaker_embedding, - diffusion_conditioning, temperature=0.7, # Add custom parameters here ) torchaudio.save("xtts.wav", torch.tensor(out["wav"]).unsqueeze(0), 24000) @@ -153,7 +151,7 @@ model.load_checkpoint(config, checkpoint_dir="/path/to/xtts/", use_deepspeed=Tru model.cuda() print("Computing speaker latents...") -gpt_cond_latent, _, speaker_embedding = model.get_conditioning_latents(audio_path=["reference.wav"]) +gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=["reference.wav"]) print("Inference...") t0 = time.time() @@ -210,7 +208,7 @@ model.load_checkpoint(config, checkpoint_path=XTTS_CHECKPOINT, vocab_path=TOKENI model.cuda() print("Computing speaker latents...") -gpt_cond_latent, diffusion_conditioning, speaker_embedding = model.get_conditioning_latents(audio_path=[SPEAKER_REFERENCE]) +gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=[SPEAKER_REFERENCE]) print("Inference...") out = model.inference( @@ -218,7 +216,6 @@ out = model.inference( "en", gpt_cond_latent, speaker_embedding, - diffusion_conditioning, temperature=0.7, # Add custom parameters here ) torchaudio.save(OUTPUT_WAV_PATH, torch.tensor(out["wav"]).unsqueeze(0), 24000) From 58cb0d8dd0a67e0d599d264987e518d823a78f46 Mon Sep 17 00:00:00 2001 From: Julian Weber Date: Wed, 8 Nov 2023 14:51:42 +0100 Subject: [PATCH 8/9] Remove v1 doc and tests (#3172) * remove v1 in inference.md * remove v1 in README.md * Update test_models.py --- README.md | 2 +- docs/source/inference.md | 4 ++-- tests/zoo_tests/test_models.py | 7 +++---- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 353db7cf..935627e5 100644 --- a/README.md +++ b/README.md @@ -205,7 +205,7 @@ device = "cuda" if torch.cuda.is_available() else "cpu" print(TTS().list_models()) # Init TTS -tts = TTS("tts_models/multilingual/multi-dataset/xtts_v1").to(device) +tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device) # Run TTS # ❗ Since this model is multi-lingual voice cloning model, we must set the target speaker_wav and language diff --git a/docs/source/inference.md b/docs/source/inference.md index b40445ae..611a2445 100644 --- a/docs/source/inference.md +++ b/docs/source/inference.md @@ -124,7 +124,7 @@ device = "cuda" if torch.cuda.is_available() else "cpu" print(TTS().list_models()) # Init TTS -tts = TTS("tts_models/multilingual/multi-dataset/xtts_v1").to(device) +tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device) # Run TTS # ❗ Since this model is multi-lingual voice cloning model, we must set the target speaker_wav and language @@ -231,4 +231,4 @@ api.tts_with_vc_to_file( speaker_wav="target/speaker.wav", file_path="ouptut.wav" ) -``` \ No newline at end of file +``` diff --git a/tests/zoo_tests/test_models.py b/tests/zoo_tests/test_models.py index 79aef5cb..d1c6b67c 100644 --- a/tests/zoo_tests/test_models.py +++ b/tests/zoo_tests/test_models.py @@ -14,7 +14,6 @@ from TTS.utils.manage import ModelManager MODELS_WITH_SEP_TESTS = [ 
"tts_models/multilingual/multi-dataset/bark", "tts_models/en/multi-dataset/tortoise-v2", - "tts_models/multilingual/multi-dataset/xtts_v1", "tts_models/multilingual/multi-dataset/xtts_v1.1", "tts_models/multilingual/multi-dataset/xtts_v2", ] @@ -83,14 +82,14 @@ def test_xtts(): if use_gpu: run_cli( "yes | " - f"tts --model_name tts_models/multilingual/multi-dataset/xtts_v1 " + f"tts --model_name tts_models/multilingual/multi-dataset/xtts_v1.1 " f'--text "This is an example." --out_path "{output_path}" --progress_bar False --use_cuda True ' f'--speaker_wav "{speaker_wav}" --language_idx "en"' ) else: run_cli( "yes | " - f"tts --model_name tts_models/multilingual/multi-dataset/xtts_v1 " + f"tts --model_name tts_models/multilingual/multi-dataset/xtts_v1.1 " f'--text "This is an example." --out_path "{output_path}" --progress_bar False ' f'--speaker_wav "{speaker_wav}" --language_idx "en"' ) @@ -104,7 +103,7 @@ def test_xtts_streaming(): speaker_wav = [os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav")] speaker_wav_2 = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0002.wav") speaker_wav.append(speaker_wav_2) - model_path = os.path.join(get_user_data_dir("tts"), "tts_models--multilingual--multi-dataset--xtts_v1") + model_path = os.path.join(get_user_data_dir("tts"), "tts_models--multilingual--multi-dataset--xtts_v1.1") config = XttsConfig() config.load_json(os.path.join(model_path, "config.json")) model = Xtts.init_from_config(config) From 46d9c27212939aa54b22f9df842c753de67b1f34 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 8 Nov 2023 16:07:56 +0100 Subject: [PATCH 9/9] Update to v0.20.2 --- TTS/VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TTS/VERSION b/TTS/VERSION index 847e9aef..727d97b9 100644 --- a/TTS/VERSION +++ b/TTS/VERSION @@ -1 +1 @@ -0.20.1 +0.20.2