Tortoise inference

2023-04-22 17:21:27 +05:30 · 2023-04-22 17:21:27 +05:30 · b09d1889c3
parent f06c41492f
commit b09d1889c3
120 changed files with 7124 additions and 177 deletions
--- a/TTS/tts/configs/tortoise.py
+++ b/TTS/tts/configs/tortoise.py
--- a/TTS/tts/layers/tortoise/arch_utils.py
+++ b/TTS/tts/layers/tortoise/arch_utils.py
@ -3,11 +3,13 @@ import functools
 import math

 import torch
+
 import torch.nn as nn
 import torch.nn.functional as F
 import torchaudio
-from tortoise.models.xtransformers import ContinuousTransformerWrapper, RelativePositionBias
+from transformers import LogitsWarper

+from TTS.tts.layers.tortoise.xtransformers import ContinuousTransformerWrapper, RelativePositionBias

 def zero_module(module):
    """
@ -289,12 +291,15 @@ class AudioMiniEncoder(nn.Module):
        return h[:, :, 0]


-DEFAULT_MEL_NORM_FILE = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data/mel_norms.pth')
+DEFAULT_MEL_NORM_FILE = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../utils/assets/tortoise/mel_norms.pth')


 class TorchMelSpectrogram(nn.Module):
-    def __init__(self, filter_length=1024, hop_length=256, win_length=1024, n_mel_channels=80, mel_fmin=0, mel_fmax=8000,
-                 sampling_rate=22050, normalize=False, mel_norm_file=DEFAULT_MEL_NORM_FILE):
+    def __init__(self, filter_length=1024, hop_length=256, 
+                 win_length=1024, n_mel_channels=80, 
+                 mel_fmin=0, mel_fmax=8000,
+                 sampling_rate=22050, normalize=False, 
+                 mel_norm_file=DEFAULT_MEL_NORM_FILE):
        super().__init__()
        # These are the default tacotron values for the MEL spectrogram.
        self.filter_length = filter_length
@ -304,10 +309,15 @@ class TorchMelSpectrogram(nn.Module):
        self.mel_fmin = mel_fmin
        self.mel_fmax = mel_fmax
        self.sampling_rate = sampling_rate
-        self.mel_stft = torchaudio.transforms.MelSpectrogram(n_fft=self.filter_length, hop_length=self.hop_length,
-                                                             win_length=self.win_length, power=2, normalized=normalize,
-                                                             sample_rate=self.sampling_rate, f_min=self.mel_fmin,
-                                                             f_max=self.mel_fmax, n_mels=self.n_mel_channels,
+        self.mel_stft = torchaudio.transforms.MelSpectrogram(n_fft=self.filter_length,
+                                                            hop_length=self.hop_length,
+                                                            win_length=self.win_length,
+                                                            power=2,
+                                                            normalized=normalize,
+                                                            sample_rate=self.sampling_rate,
+                                                            f_min=self.mel_fmin,
+                                                            f_max=self.mel_fmax,
+                                                            n_mels=self.n_mel_channels,
                                                            norm="slaney")
        self.mel_norm_file = mel_norm_file
        if self.mel_norm_file is not None:
@ -369,3 +379,45 @@ class CheckpointedXTransformerEncoder(nn.Module):
        if self.exit_permute:
            h = h.permute(0,2,1)
        return h
+
+
+class TypicalLogitsWarper(LogitsWarper):
+    def __init__(
+        self,
+        mass: float = 0.9,
+        filter_value: float = -float("Inf"),
+        min_tokens_to_keep: int = 1,
+    ):
+        self.filter_value = filter_value
+        self.mass = mass
+        self.min_tokens_to_keep = min_tokens_to_keep
+
+    def __call__(
+        self, input_ids: torch.LongTensor, scores: torch.FloatTensor
+    ) -> torch.FloatTensor:
+        # calculate entropy
+        normalized = torch.nn.functional.log_softmax(scores, dim=-1)
+        p = torch.exp(normalized)
+        ent = -(normalized * p).nansum(-1, keepdim=True)
+
+        # shift and sort
+        shifted_scores = torch.abs((-normalized) - ent)
+        sorted_scores, sorted_indices = torch.sort(shifted_scores, descending=False)
+        sorted_logits = scores.gather(-1, sorted_indices)
+        cumulative_probs = sorted_logits.softmax(dim=-1).cumsum(dim=-1)
+
+        # Remove tokens with cumulative mass above the threshold
+        last_ind = (cumulative_probs < self.mass).sum(dim=1)
+        last_ind[last_ind < 0] = 0
+        sorted_indices_to_remove = sorted_scores > sorted_scores.gather(
+            1, last_ind.view(-1, 1)
+        )
+        if self.min_tokens_to_keep > 1:
+            # Keep at least min_tokens_to_keep (set to min_tokens_to_keep-1 because we add the first one below)
+            sorted_indices_to_remove[..., : self.min_tokens_to_keep] = 0
+        indices_to_remove = sorted_indices_to_remove.scatter(
+            1, sorted_indices, sorted_indices_to_remove
+        )
+
+        scores = scores.masked_fill(indices_to_remove, self.filter_value)
+        return scores
--- a/TTS/tts/layers/tortoise/audio_utils.py
+++ b/TTS/tts/layers/tortoise/audio_utils.py
@ -0,0 +1,188 @@
+import os
+from glob import glob
+from typing import Dict, List
+
+import librosa
+import numpy as np
+import torch
+import torchaudio
+from scipy.io.wavfile import read
+from TTS.utils.audio.torch_transforms import TorchSTFT
+
+BUILTIN_VOICES_DIR = os.path.join(
+    os.path.dirname(os.path.realpath(__file__)), "../../utils/assets/tortoise/voices"
+)
+
+
+def load_wav_to_torch(full_path):
+    sampling_rate, data = read(full_path)
+    if data.dtype == np.int32:
+        norm_fix = 2**31
+    elif data.dtype == np.int16:
+        norm_fix = 2**15
+    elif data.dtype == np.float16 or data.dtype == np.float32:
+        norm_fix = 1.0
+    else:
+        raise NotImplementedError(f"Provided data dtype not supported: {data.dtype}")
+    return (torch.FloatTensor(data.astype(np.float32)) / norm_fix, sampling_rate)
+
+
+def check_audio(audio, audiopath: str):
+    # Check some assumptions about audio range. This should be automatically fixed in load_wav_to_torch, but might not be in some edge cases, where we should squawk.
+    # '2' is arbitrarily chosen since it seems like audio will often "overdrive" the [-1,1] bounds.
+    if torch.any(audio > 2) or not torch.any(audio < 0):
+        print(f"Error with {audiopath}. Max={audio.max()} min={audio.min()}")
+    audio.clip_(-1, 1)
+
+
+def read_audio_file(audiopath: str):
+    if audiopath[-4:] == ".wav":
+        audio, lsr = load_wav_to_torch(audiopath)
+    elif audiopath[-4:] == ".mp3":
+        audio, lsr = librosa.load(audiopath, sr=None)
+        audio = torch.FloatTensor(audio)
+    else:
+        assert False, f"Unsupported audio format provided: {audiopath[-4:]}"
+
+    # Remove any channel data.
+    if len(audio.shape) > 1:
+        if audio.shape[0] < 5:
+            audio = audio[0]
+        else:
+            assert audio.shape[1] < 5
+            audio = audio[:, 0]
+
+    return audio, lsr
+
+
+def load_required_audio(audiopath: str):
+    audio, lsr = read_audio_file(audiopath)
+
+    audios = [
+        torchaudio.functional.resample(audio, lsr, sampling_rate)
+        for sampling_rate in (22050, 24000)
+    ]
+    for audio in audios:
+        check_audio(audio, audiopath)
+
+    return [audio.unsqueeze(0) for audio in audios]
+
+
+def load_audio(audiopath, sampling_rate):
+    audio, lsr = read_audio_file(audiopath)
+
+    if lsr != sampling_rate:
+        audio = torchaudio.functional.resample(audio, lsr, sampling_rate)
+    check_audio(audio, audiopath)
+
+    return audio.unsqueeze(0)
+
+
+TACOTRON_MEL_MAX = 2.3143386840820312
+TACOTRON_MEL_MIN = -11.512925148010254
+
+
+def denormalize_tacotron_mel(norm_mel):
+    return ((norm_mel + 1) / 2) * (
+        TACOTRON_MEL_MAX - TACOTRON_MEL_MIN
+    ) + TACOTRON_MEL_MIN
+
+
+def normalize_tacotron_mel(mel):
+    return 2 * ((mel - TACOTRON_MEL_MIN) / (TACOTRON_MEL_MAX - TACOTRON_MEL_MIN)) - 1
+
+
+def dynamic_range_compression(x, C=1, clip_val=1e-5):
+    """
+    PARAMS
+    ------
+    C: compression factor
+    """
+    return torch.log(torch.clamp(x, min=clip_val) * C)
+
+
+def dynamic_range_decompression(x, C=1):
+    """
+    PARAMS
+    ------
+    C: compression factor used to compress
+    """
+    return torch.exp(x) / C
+
+
+def get_voices(extra_voice_dirs: List[str] = []):
+    dirs = [BUILTIN_VOICES_DIR] + extra_voice_dirs
+    voices: Dict[str, List[str]] = {}
+    for d in dirs:
+        subs = os.listdir(d)
+        for sub in subs:
+            subj = os.path.join(d, sub)
+            if os.path.isdir(subj):
+                voices[sub] = (
+                    list(glob(f"{subj}/*.wav"))
+                    + list(glob(f"{subj}/*.mp3"))
+                    + list(glob(f"{subj}/*.pth"))
+                )
+    return voices
+
+
+def load_voice(voice: str, extra_voice_dirs: List[str] = []):
+    if voice == "random":
+        return None, None
+
+    voices = get_voices(extra_voice_dirs)
+    paths = voices[voice]
+    if len(paths) == 1 and paths[0].endswith(".pth"):
+        return None, torch.load(paths[0])
+    else:
+        conds = []
+        for cond_path in paths:
+            c = load_required_audio(cond_path)
+            conds.append(c)
+        return conds, None
+
+
+def load_voices(voices: List[str], extra_voice_dirs: List[str] = []):
+    latents = []
+    clips = []
+    for voice in voices:
+        if voice == "random":
+            if len(voices) > 1:
+                print(
+                    "Cannot combine a random voice with a non-random voice. Just using a random voice."
+                )
+            return None, None
+        clip, latent = load_voice(voice, extra_voice_dirs)
+        if latent is None:
+            assert (
+                len(latents) == 0
+            ), "Can only combine raw audio voices or latent voices, not both. Do it yourself if you want this."
+            clips.extend(clip)
+        elif clip is None:
+            assert (
+                len(clips) == 0
+            ), "Can only combine raw audio voices or latent voices, not both. Do it yourself if you want this."
+            latents.append(latent)
+    if len(latents) == 0:
+        return clips, None
+    else:
+        latents_0 = torch.stack([l[0] for l in latents], dim=0).mean(dim=0)
+        latents_1 = torch.stack([l[1] for l in latents], dim=0).mean(dim=0)
+        latents = (latents_0, latents_1)
+        return None, latents
+
+def wav_to_univnet_mel(wav, do_normalization=False, device="cuda"):
+    stft = TorchSTFT(n_fft=1024, 
+                     hop_length=256, 
+                     win_length=1024, 
+                     use_mel=True,
+                     n_mels=100, 
+                     sample_rate=24000, 
+                     mel_fmin=0, 
+                     mel_fmax=12000)
+    stft = stft.to(device)
+    mel = stft(wav)
+    mel = dynamic_range_compression(mel)
+    if do_normalization:
+        mel = normalize_tacotron_mel(mel)
+    return mel
--- a/TTS/tts/layers/tortoise/autoregressive.py
+++ b/TTS/tts/layers/tortoise/autoregressive.py
@ -1,3 +1,4 @@
+# AGPL: a notification must be added stating that changes have been made to that file.
 import functools

 import torch
@ -5,19 +6,22 @@ import torch.nn as nn
 import torch.nn.functional as F
 from transformers import GPT2Config, GPT2PreTrainedModel, LogitsProcessorList
 from transformers.modeling_outputs import CausalLMOutputWithCrossAttentions
-from transformers.utils.model_parallel_utils import get_device_map, assert_device_map
-from tortoise.models.arch_util import AttentionBlock
-from tortoise.utils.typical_sampling import TypicalLogitsWarper

+from TTS.tts.layers.tortoise.arch_utils import AttentionBlock, TypicalLogitsWarper

 def null_position_embeddings(range, dim):
    return torch.zeros((range.shape[0], range.shape[1], dim), device=range.device)


+def _p(t):
+    return t and (len(t), len(t[0]), t[0][0].shape)  # kv_cache debug
+
+
 class ResBlock(nn.Module):
    """
    Basic residual convolutional block that uses GroupNorm.
    """
+
    def __init__(self, chan):
        super().__init__()
        self.net = nn.Sequential(
@ -25,7 +29,7 @@ class ResBlock(nn.Module):
            nn.GroupNorm(chan // 8, chan),
            nn.ReLU(),
            nn.Conv1d(chan, chan, kernel_size=3, padding=1),
-            nn.GroupNorm(chan//8, chan)
+            nn.GroupNorm(chan // 8, chan),
        )

    def forward(self, x):
@ -33,50 +37,23 @@ class ResBlock(nn.Module):


 class GPT2InferenceModel(GPT2PreTrainedModel):
-    def __init__(self, config, gpt, text_pos_emb, embeddings, norm, linear):
+    def __init__(self, config, gpt, text_pos_emb, embeddings, norm, linear, kv_cache):
        super().__init__(config)
        self.transformer = gpt
        self.text_pos_embedding = text_pos_emb
        self.embeddings = embeddings
        self.lm_head = nn.Sequential(norm, linear)
-
-        # Model parallel
-        self.model_parallel = False
-        self.device_map = None
-        self.cached_mel_emb = None
-
-    def parallelize(self, device_map=None):
-        self.device_map = (
-            get_device_map(len(self.transformer.h), range(torch.cuda.device_count()))
-            if device_map is None
-            else device_map
-        )
-        assert_device_map(self.device_map, len(self.transformer.h))
-        self.transformer.parallelize(self.device_map)
-        self.lm_head = self.lm_head.to(self.transformer.first_device)
-        self.model_parallel = True
-
-    def deparallelize(self):
-        self.transformer.deparallelize()
-        self.transformer = self.transformer.to("cpu")
-        self.lm_head = self.lm_head.to("cpu")
-        self.model_parallel = False
-        torch.cuda.empty_cache()
-
-    def get_output_embeddings(self):
-        return self.lm_head
-
-    def set_output_embeddings(self, new_embeddings):
-        self.lm_head = new_embeddings
+        self.kv_cache = kv_cache

    def store_mel_emb(self, mel_emb):
        self.cached_mel_emb = mel_emb

-    def prepare_inputs_for_generation(self, input_ids, past=None, **kwargs):
-
-        token_type_ids = kwargs.get("token_type_ids", None)
+    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwargs):
+        token_type_ids = kwargs.get("token_type_ids", None)  # usually None
+        if not self.kv_cache:
+            past_key_values = None
        # only last token for inputs_ids if past is defined in kwargs
-        if past:
+        if past_key_values:
            input_ids = input_ids[:, -1].unsqueeze(-1)
            if token_type_ids is not None:
                token_type_ids = token_type_ids[:, -1].unsqueeze(-1)
@ -88,13 +65,13 @@ class GPT2InferenceModel(GPT2PreTrainedModel):
            # create position_ids on the fly for batch generation
            position_ids = attention_mask.long().cumsum(-1) - 1
            position_ids.masked_fill_(attention_mask == 0, 1)
-            if past:
+            if past_key_values:
                position_ids = position_ids[:, -1].unsqueeze(-1)
        else:
            position_ids = None
        return {
            "input_ids": input_ids,
-            "past_key_values": past,
+            "past_key_values": past_key_values,
            "use_cache": kwargs.get("use_cache"),
            "position_ids": position_ids,
            "attention_mask": attention_mask,
@ -121,7 +98,9 @@ class GPT2InferenceModel(GPT2PreTrainedModel):
        assert self.cached_mel_emb is not None
        assert inputs_embeds is None  # Not supported by this inference model.
        assert labels is None  # Training not supported by this inference model.
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        return_dict = (
+            return_dict if return_dict is not None else self.config.use_return_dict
+        )

        # Create embedding
        mel_len = self.cached_mel_emb.shape[1]
@ -130,13 +109,17 @@ class GPT2InferenceModel(GPT2PreTrainedModel):
            text_emb = self.embeddings(text_inputs)
            text_emb = text_emb + self.text_pos_embedding(text_emb)
            if self.cached_mel_emb.shape[0] != text_emb.shape[0]:
-                mel_emb = self.cached_mel_emb.repeat_interleave(text_emb.shape[0]//self.cached_mel_emb.shape[0], 0)
-            else:
+                mel_emb = self.cached_mel_emb.repeat_interleave(
+                    text_emb.shape[0] // self.cached_mel_emb.shape[0], 0
+                )
+            else:  # this outcome only occurs once per loop in most cases
                mel_emb = self.cached_mel_emb
            emb = torch.cat([mel_emb, text_emb], dim=1)
        else:
            emb = self.embeddings(input_ids)
-            emb = emb + self.text_pos_embedding.get_fixed_embedding(attention_mask.shape[1]-mel_len, attention_mask.device)
+            emb = emb + self.text_pos_embedding.get_fixed_embedding(
+                attention_mask.shape[1] - mel_len, attention_mask.device
+            )

        transformer_outputs = self.transformer(
            inputs_embeds=emb,
@ -153,12 +136,6 @@ class GPT2InferenceModel(GPT2PreTrainedModel):
            return_dict=return_dict,
        )
        hidden_states = transformer_outputs[0]
-
-        # Set device for model parallelism
-        if self.model_parallel:
-            torch.cuda.set_device(self.transformer.first_device)
-            hidden_states = hidden_states.to(self.lm_head.weight.device)
-
        lm_logits = self.lm_head(hidden_states)

        if not return_dict:
@ -181,19 +158,24 @@ class GPT2InferenceModel(GPT2PreTrainedModel):
        called. This is required to match :obj:`past_key_values` with the correct beam_idx at every generation step.
        """
        return tuple(
-            tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past)
+            tuple(
+                past_state.index_select(0, beam_idx.to(past_state.device))
+                for past_state in layer_past
+            )
            for layer_past in past
        )


 class ConditioningEncoder(nn.Module):
-    def __init__(self,
+    def __init__(
+        self,
        spec_dim,
        embedding_dim,
        attn_blocks=6,
        num_attn_heads=4,
        do_checkpointing=False,
-                 mean=False):
+        mean=False,
+    ):
        super().__init__()
        attn = []
        self.init = nn.Conv1d(spec_dim, embedding_dim, kernel_size=1)
@ -214,7 +196,7 @@ class ConditioningEncoder(nn.Module):


 class LearnedPositionEmbeddings(nn.Module):
-    def __init__(self, seq_len, model_dim, init=.02):
+    def __init__(self, seq_len, model_dim, init=0.02):
        super().__init__()
        self.emb = nn.Embedding(seq_len, model_dim)
        # Initializing this way is standard for GPT-2
@ -225,50 +207,68 @@ class LearnedPositionEmbeddings(nn.Module):
        return self.emb(torch.arange(0, sl, device=x.device))

    def get_fixed_embedding(self, ind, dev):
-        return self.emb(torch.tensor([ind], device=dev)).unsqueeze(0)
+        return self.emb(torch.arange(0, ind, device=dev))[ind - 1 : ind]


-def build_hf_gpt_transformer(layers, model_dim, heads, max_mel_seq_len, max_text_seq_len, checkpointing):
+def build_hf_gpt_transformer(
+    layers, model_dim, heads, max_mel_seq_len, max_text_seq_len, checkpointing
+):
    """
    GPT-2 implemented by the HuggingFace library.
    """
    from transformers import GPT2Config, GPT2Model
-    gpt_config = GPT2Config(vocab_size=256,  # Unused.
+
+    gpt_config = GPT2Config(
+        vocab_size=256,  # Unused.
        n_positions=max_mel_seq_len + max_text_seq_len,
        n_ctx=max_mel_seq_len + max_text_seq_len,
        n_embd=model_dim,
        n_layer=layers,
        n_head=heads,
        gradient_checkpointing=checkpointing,
-                             use_cache=not checkpointing)
+        use_cache=not checkpointing,
+    )
    gpt = GPT2Model(gpt_config)
    # Override the built in positional embeddings
-    del gpt.wpe
+    del (
+        gpt.wpe
+    )  # TODO: figure out relevance in fixing exported model definition: Embedding(1012, 1024)
    gpt.wpe = functools.partial(null_position_embeddings, dim=model_dim)
    # Built-in token embeddings are unused.
    del gpt.wte
-    return gpt, LearnedPositionEmbeddings(max_mel_seq_len, model_dim), LearnedPositionEmbeddings(max_text_seq_len, model_dim),\
-           None, None
+    return (
+        gpt,
+        LearnedPositionEmbeddings(max_mel_seq_len, model_dim),
+        LearnedPositionEmbeddings(max_text_seq_len, model_dim),
+        None,
+        None,
+    )


 class MelEncoder(nn.Module):
    def __init__(self, channels, mel_channels=80, resblocks_per_reduction=2):
        super().__init__()
        self.channels = channels
-        self.encoder = nn.Sequential(nn.Conv1d(mel_channels, channels//4, kernel_size=3, padding=1),
-                                     nn.Sequential(*[ResBlock(channels//4) for _ in range(resblocks_per_reduction)]),
+        self.encoder = nn.Sequential(
+            nn.Conv1d(mel_channels, channels // 4, kernel_size=3, padding=1),
+            nn.Sequential(
+                *[ResBlock(channels // 4) for _ in range(resblocks_per_reduction)]
+            ),
            nn.Conv1d(channels // 4, channels // 2, kernel_size=3, stride=2, padding=1),
            nn.GroupNorm(channels // 16, channels // 2),
            nn.ReLU(),
-                                     nn.Sequential(*[ResBlock(channels//2) for _ in range(resblocks_per_reduction)]),
+            nn.Sequential(
+                *[ResBlock(channels // 2) for _ in range(resblocks_per_reduction)]
+            ),
            nn.Conv1d(channels // 2, channels, kernel_size=3, stride=2, padding=1),
            nn.GroupNorm(channels // 8, channels),
            nn.ReLU(),
-                                     nn.Sequential(*[ResBlock(channels) for _ in range(resblocks_per_reduction)]),
+            nn.Sequential(
+                *[ResBlock(channels) for _ in range(resblocks_per_reduction)]
+            ),
        )
        self.reduction = 4

-
    def forward(self, x):
        for e in self.encoder:
            x = e(x)
@ -276,11 +276,25 @@ class MelEncoder(nn.Module):


 class UnifiedVoice(nn.Module):
-    def __init__(self, layers=8, model_dim=512, heads=8, max_text_tokens=120, max_mel_tokens=250, max_conditioning_inputs=1,
-                 mel_length_compression=1024, number_text_tokens=256,
-                 start_text_token=None, number_mel_codes=8194, start_mel_token=8192,
-                 stop_mel_token=8193, train_solo_embeddings=False, use_mel_codes_as_input=True,
-                 checkpointing=True, types=1):
+    def __init__(
+        self,
+        layers=8,
+        model_dim=512,
+        heads=8,
+        max_text_tokens=120,
+        max_mel_tokens=250,
+        max_conditioning_inputs=1,
+        mel_length_compression=1024,
+        number_text_tokens=256,
+        start_text_token=None,
+        number_mel_codes=8194,
+        start_mel_token=8192,
+        stop_mel_token=8193,
+        train_solo_embeddings=False,
+        use_mel_codes_as_input=True,
+        checkpointing=True,
+        types=1,
+    ):
        """
        Args:
            layers: Number of layers in transformer stack.
@ -303,7 +317,9 @@ class UnifiedVoice(nn.Module):
        super().__init__()

        self.number_text_tokens = number_text_tokens
-        self.start_text_token = number_text_tokens * types if start_text_token is None else start_text_token
+        self.start_text_token = (
+            number_text_tokens * types if start_text_token is None else start_text_token
+        )
        self.stop_text_token = 0
        self.number_mel_codes = number_mel_codes
        self.start_mel_token = start_mel_token
@ -315,17 +331,37 @@ class UnifiedVoice(nn.Module):
        self.model_dim = model_dim
        self.max_conditioning_inputs = max_conditioning_inputs
        self.mel_length_compression = mel_length_compression
-        self.conditioning_encoder = ConditioningEncoder(80, model_dim, num_attn_heads=heads)
-        self.text_embedding = nn.Embedding(self.number_text_tokens*types+1, model_dim)
+        self.conditioning_encoder = ConditioningEncoder(
+            80, model_dim, num_attn_heads=heads
+        )
+        self.text_embedding = nn.Embedding(
+            self.number_text_tokens * types + 1, model_dim
+        )
        if use_mel_codes_as_input:
            self.mel_embedding = nn.Embedding(self.number_mel_codes, model_dim)
        else:
            self.mel_embedding = MelEncoder(model_dim, resblocks_per_reduction=1)
-        self.gpt, self.mel_pos_embedding, self.text_pos_embedding, self.mel_layer_pos_embedding, self.text_layer_pos_embedding = \
-            build_hf_gpt_transformer(layers, model_dim, heads, self.max_mel_tokens+2+self.max_conditioning_inputs, self.max_text_tokens+2, checkpointing)
+        (
+            self.gpt,
+            self.mel_pos_embedding,
+            self.text_pos_embedding,
+            self.mel_layer_pos_embedding,
+            self.text_layer_pos_embedding,
+        ) = build_hf_gpt_transformer(
+            layers,
+            model_dim,
+            heads,
+            self.max_mel_tokens + 2 + self.max_conditioning_inputs,
+            self.max_text_tokens + 2,
+            checkpointing,
+        )
        if train_solo_embeddings:
-            self.mel_solo_embedding = nn.Parameter(torch.randn(1, 1, model_dim) * .02, requires_grad=True)
-            self.text_solo_embedding = nn.Parameter(torch.randn(1, 1, model_dim) * .02, requires_grad=True)
+            self.mel_solo_embedding = nn.Parameter(
+                torch.randn(1, 1, model_dim) * 0.02, requires_grad=True
+            )
+            self.text_solo_embedding = nn.Parameter(
+                torch.randn(1, 1, model_dim) * 0.02, requires_grad=True
+            )
        else:
            self.mel_solo_embedding = 0
            self.text_solo_embedding = 0
@ -339,7 +375,32 @@ class UnifiedVoice(nn.Module):
        if use_mel_codes_as_input:
            embeddings.append(self.mel_embedding)
        for module in embeddings:
-            module.weight.data.normal_(mean=0.0, std=.02)
+            module.weight.data.normal_(mean=0.0, std=0.02)
+
+    def post_init_gpt2_config(self, kv_cache=True):
+        seq_length = self.max_mel_tokens + self.max_text_tokens + 2
+        gpt_config = GPT2Config(
+            vocab_size=self.max_mel_tokens,
+            n_positions=seq_length,
+            n_ctx=seq_length,
+            n_embd=self.model_dim,
+            n_layer=self.layers,
+            n_head=self.heads,
+            gradient_checkpointing=False,
+            use_cache=True,
+        )
+        self.inference_model = GPT2InferenceModel(
+            gpt_config,
+            self.gpt,
+            self.mel_pos_embedding,
+            self.mel_embedding,
+            self.final_norm,
+            self.mel_head,
+            kv_cache=kv_cache,
+        )
+        # self.inference_model = PrunedGPT2InferenceModel(gpt_config, self.gpt, self.mel_pos_embedding, self.mel_embedding, self.final_norm, self.mel_head)
+        self.gpt.wte = self.mel_embedding
+        # self.inference_model.save_pretrained("")

    def build_aligned_inputs_and_targets(self, input, start_token, stop_token):
        inp = F.pad(input, (1, 0), value=start_token)
@ -353,28 +414,56 @@ class UnifiedVoice(nn.Module):
        preformatting to create a working TTS model.
        """
        # Set padding areas within MEL (currently it is coded with the MEL code for <zero>).
-        mel_lengths = torch.div(wav_lengths, self.mel_length_compression, rounding_mode='trunc')
+        mel_lengths = torch.div(
+            wav_lengths, self.mel_length_compression, rounding_mode="trunc"
+        )
        for b in range(len(mel_lengths)):
-            actual_end = mel_lengths[b] + 1  # Due to the convolutional nature of how these tokens are generated, it would be best if the model predicts a token past the actual last token.
+            actual_end = (
+                mel_lengths[b] + 1
+            )  # Due to the convolutional nature of how these tokens are generated, it would be best if the model predicts a token past the actual last token.
            if actual_end < mel_input_tokens.shape[-1]:
                mel_input_tokens[b, actual_end:] = self.stop_mel_token
        return mel_input_tokens

-    def get_logits(self, speech_conditioning_inputs, first_inputs, first_head, second_inputs=None, second_head=None, get_attns=False, return_latent=False):
+    def get_logits(
+        self,
+        speech_conditioning_inputs,
+        first_inputs,
+        first_head,
+        second_inputs=None,
+        second_head=None,
+        get_attns=False,
+        return_latent=False,
+    ):
        if second_inputs is not None:
-            emb = torch.cat([speech_conditioning_inputs, first_inputs, second_inputs], dim=1)
+            emb = torch.cat(
+                [speech_conditioning_inputs, first_inputs, second_inputs], dim=1
+            )
        else:
            emb = torch.cat([speech_conditioning_inputs, first_inputs], dim=1)

-        gpt_out = self.gpt(inputs_embeds=emb, return_dict=True, output_attentions=get_attns)
+        gpt_out = self.gpt(
+            inputs_embeds=emb, return_dict=True, output_attentions=get_attns
+        )
        if get_attns:
            return gpt_out.attentions

-        enc = gpt_out.last_hidden_state[:, 1:]  # The first logit is tied to the speech_conditioning_input
+        enc = gpt_out.last_hidden_state[
+            :, 1:
+        ]  # The first logit is tied to the speech_conditioning_input
        enc = self.final_norm(enc)

        if return_latent:
-            return enc[:, speech_conditioning_inputs.shape[1]:speech_conditioning_inputs.shape[1]+first_inputs.shape[1]], enc[:, -second_inputs.shape[1]:]
+            return (
+                enc[
+                    :,
+                    speech_conditioning_inputs.shape[
+                        1
+                    ] : speech_conditioning_inputs.shape[1]
+                    + first_inputs.shape[1],
+                ],
+                enc[:, -second_inputs.shape[1] :],
+            )

        first_logits = enc[:, : first_inputs.shape[1]]
        first_logits = first_head(first_logits)
@ -388,8 +477,11 @@ class UnifiedVoice(nn.Module):
            return first_logits

    def get_conditioning(self, speech_conditioning_input):
-        speech_conditioning_input = speech_conditioning_input.unsqueeze(1) if len(
-            speech_conditioning_input.shape) == 3 else speech_conditioning_input
+        speech_conditioning_input = (
+            speech_conditioning_input.unsqueeze(1)
+            if len(speech_conditioning_input.shape) == 3
+            else speech_conditioning_input
+        )
        conds = []
        for j in range(speech_conditioning_input.shape[1]):
            conds.append(self.conditioning_encoder(speech_conditioning_input[:, j]))
@ -397,8 +489,20 @@ class UnifiedVoice(nn.Module):
        conds = conds.mean(dim=1)
        return conds

-    def forward(self, speech_conditioning_latent, text_inputs, text_lengths, mel_codes, wav_lengths, types=None, text_first=True, raw_mels=None, return_attentions=False,
-                return_latent=False, clip_inputs=True):
+    def forward(
+        self,
+        speech_conditioning_latent,
+        text_inputs,
+        text_lengths,
+        mel_codes,
+        wav_lengths,
+        types=None,
+        text_first=True,
+        raw_mels=None,
+        return_attentions=False,
+        return_latent=False,
+        clip_inputs=True,
+    ):
        """
        Forward pass that uses both text and voice in either text conditioning mode or voice conditioning mode
        (actuated by `text_first`).
@ -432,9 +536,15 @@ class UnifiedVoice(nn.Module):
        mel_codes = F.pad(mel_codes, (0, 1), value=self.stop_mel_token)

        conds = speech_conditioning_latent.unsqueeze(1)
-        text_inputs, text_targets = self.build_aligned_inputs_and_targets(text_inputs, self.start_text_token, self.stop_text_token)
-        text_emb = self.text_embedding(text_inputs) + self.text_pos_embedding(text_inputs)
-        mel_codes, mel_targets = self.build_aligned_inputs_and_targets(mel_codes, self.start_mel_token, self.stop_mel_token)
+        text_inputs, text_targets = self.build_aligned_inputs_and_targets(
+            text_inputs, self.start_text_token, self.stop_text_token
+        )
+        text_emb = self.text_embedding(text_inputs) + self.text_pos_embedding(
+            text_inputs
+        )
+        mel_codes, mel_targets = self.build_aligned_inputs_and_targets(
+            mel_codes, self.start_mel_token, self.stop_mel_token
+        )
        if raw_mels is not None:
            mel_inp = F.pad(raw_mels, (0, 8))
        else:
@ -443,13 +553,33 @@ class UnifiedVoice(nn.Module):
        mel_emb = mel_emb + self.mel_pos_embedding(mel_codes)

        if text_first:
-            text_logits, mel_logits = self.get_logits(conds, text_emb, self.text_head, mel_emb, self.mel_head, get_attns=return_attentions, return_latent=return_latent)
+            text_logits, mel_logits = self.get_logits(
+                conds,
+                text_emb,
+                self.text_head,
+                mel_emb,
+                self.mel_head,
+                get_attns=return_attentions,
+                return_latent=return_latent,
+            )
            if return_latent:
-                return mel_logits[:, :-2]  # Despite the name, these are not logits. Strip off the two tokens added by this forward pass.
+                return mel_logits[
+                    :, :-2
+                ]  # Despite the name, these are not logits. Strip off the two tokens added by this forward pass.
        else:
-            mel_logits, text_logits = self.get_logits(conds, mel_emb, self.mel_head, text_emb, self.text_head, get_attns=return_attentions, return_latent=return_latent)
+            mel_logits, text_logits = self.get_logits(
+                conds,
+                mel_emb,
+                self.mel_head,
+                text_emb,
+                self.text_head,
+                get_attns=return_attentions,
+                return_latent=return_latent,
+            )
            if return_latent:
-                return text_logits[:, :-2]  # Despite the name, these are not logits. Strip off the two tokens added by this forward pass.
+                return text_logits[
+                    :, :-2
+                ]  # Despite the name, these are not logits. Strip off the two tokens added by this forward pass.

        if return_attentions:
            return mel_logits
@ -457,55 +587,92 @@ class UnifiedVoice(nn.Module):
        loss_mel = F.cross_entropy(mel_logits, mel_targets.long())
        return loss_text.mean(), loss_mel.mean(), mel_logits

-    def inference_speech(self, speech_conditioning_latent, text_inputs, input_tokens=None, num_return_sequences=1,
-                         max_generate_length=None, typical_sampling=False, typical_mass=.9, **hf_generate_kwargs):
-        seq_length = self.max_mel_tokens + self.max_text_tokens + 2
-        if not hasattr(self, 'inference_model'):
-            # TODO: Decouple gpt_config from this inference model.
-            gpt_config = GPT2Config(vocab_size=self.max_mel_tokens,
-                                    n_positions=seq_length,
-                                    n_ctx=seq_length,
-                                    n_embd=self.model_dim,
-                                    n_layer=self.layers,
-                                    n_head=self.heads,
-                                    gradient_checkpointing=False,
-                                    use_cache=True)
-            self.inference_model = GPT2InferenceModel(gpt_config, self.gpt, self.mel_pos_embedding, self.mel_embedding, self.final_norm, self.mel_head)
-            self.gpt.wte = self.mel_embedding
-
+    def inference_speech(
+        self,
+        speech_conditioning_latent,
+        text_inputs,
+        input_tokens=None,
+        num_return_sequences=1,
+        max_generate_length=None,
+        typical_sampling=False,
+        typical_mass=0.9,
+        **hf_generate_kwargs
+    ):
        text_inputs = F.pad(text_inputs, (0, 1), value=self.stop_text_token)
-        text_inputs, text_targets = self.build_aligned_inputs_and_targets(text_inputs, self.start_text_token, self.stop_text_token)
-        text_emb = self.text_embedding(text_inputs) + self.text_pos_embedding(text_inputs)
+        text_inputs, text_targets = self.build_aligned_inputs_and_targets(
+            text_inputs, self.start_text_token, self.stop_text_token
+        )
+        text_emb = self.text_embedding(text_inputs) + self.text_pos_embedding(
+            text_inputs
+        )

        conds = speech_conditioning_latent.unsqueeze(1)
        emb = torch.cat([conds, text_emb], dim=1)
        self.inference_model.store_mel_emb(emb)

-        fake_inputs = torch.full((emb.shape[0], conds.shape[1] + emb.shape[1],), fill_value=1, dtype=torch.long,
-                                 device=text_inputs.device)
+        fake_inputs = torch.full(
+            (
+                emb.shape[0],
+                conds.shape[1] + emb.shape[1],
+            ),
+            fill_value=1,
+            dtype=torch.long,
+            device=text_inputs.device,
+        )
        fake_inputs[:, -1] = self.start_mel_token
        trunc_index = fake_inputs.shape[1]
        if input_tokens is None:
            inputs = fake_inputs
        else:
-            assert num_return_sequences % input_tokens.shape[0] == 0, "The number of return sequences must be divisible by the number of input sequences"
+            assert (
+                num_return_sequences % input_tokens.shape[0] == 0
+            ), "The number of return sequences must be divisible by the number of input sequences"
            fake_inputs = fake_inputs.repeat(num_return_sequences, 1)
-            input_tokens = input_tokens.repeat(num_return_sequences // input_tokens.shape[0], 1)
+            input_tokens = input_tokens.repeat(
+                num_return_sequences // input_tokens.shape[0], 1
+            )
            inputs = torch.cat([fake_inputs, input_tokens], dim=1)

-        logits_processor = LogitsProcessorList([TypicalLogitsWarper(mass=typical_mass)]) if typical_sampling else LogitsProcessorList()
-        max_length = trunc_index + self.max_mel_tokens - 1  if max_generate_length is None else trunc_index + max_generate_length
-        gen = self.inference_model.generate(inputs, bos_token_id=self.start_mel_token, pad_token_id=self.stop_mel_token, eos_token_id=self.stop_mel_token,
-                                            max_length=max_length, logits_processor=logits_processor,
-                                            num_return_sequences=num_return_sequences, **hf_generate_kwargs)
+        logits_processor = (
+            LogitsProcessorList([TypicalLogitsWarper(mass=typical_mass)])
+            if typical_sampling
+            else LogitsProcessorList()
+        )  # TODO disable this
+        max_length = (
+            trunc_index + self.max_mel_tokens - 1
+            if max_generate_length is None
+            else trunc_index + max_generate_length
+        )
+        gen = self.inference_model.generate(
+            inputs,
+            bos_token_id=self.start_mel_token,
+            pad_token_id=self.stop_mel_token,
+            eos_token_id=self.stop_mel_token,
+            max_length=max_length,
+            logits_processor=logits_processor,
+            num_return_sequences=num_return_sequences,
+            **hf_generate_kwargs
+        )
        return gen[:, trunc_index:]


-if __name__ == '__main__':
-    gpt = UnifiedVoice(model_dim=256, heads=4, train_solo_embeddings=True, use_mel_codes_as_input=True, max_conditioning_inputs=4)
-    l = gpt(torch.randn(2, 3, 80, 800),
+if __name__ == "__main__":
+    gpt = UnifiedVoice(
+        model_dim=256,
+        heads=4,
+        train_solo_embeddings=True,
+        use_mel_codes_as_input=True,
+        max_conditioning_inputs=4,
+    )
+    l = gpt(
+        torch.randn(2, 3, 80, 800),
        torch.randint(high=120, size=(2, 120)),
        torch.tensor([32, 120]),
        torch.randint(high=8192, size=(2, 250)),
-            torch.tensor([250*256,195*256]))
-    gpt.text_forward(torch.randn(2,80,800), torch.randint(high=50, size=(2,80)), torch.tensor([32, 80]))
+        torch.tensor([250 * 256, 195 * 256]),
+    )
+    gpt.text_forward(
+        torch.randn(2, 80, 800),
+        torch.randint(high=50, size=(2, 80)),
+        torch.tensor([32, 80]),
+    )
--- a/TTS/tts/layers/tortoise/classifier.py
+++ b/TTS/tts/layers/tortoise/classifier.py
@ -0,0 +1,166 @@
+import torch
+import torch.nn as nn
+
+from TTS.tts.layers.tortoise.arch_utils import (
+    AttentionBlock,
+    Downsample,
+    Upsample,
+    normalization,
+    zero_module,
+)
+
+
+class ResBlock(nn.Module):
+    def __init__(
+        self,
+        channels,
+        dropout,
+        out_channels=None,
+        use_conv=False,
+        use_scale_shift_norm=False,
+        dims=2,
+        up=False,
+        down=False,
+        kernel_size=3,
+        do_checkpoint=True,
+    ):
+        super().__init__()
+        self.channels = channels
+        self.dropout = dropout
+        self.out_channels = out_channels or channels
+        self.use_conv = use_conv
+        self.use_scale_shift_norm = use_scale_shift_norm
+        self.do_checkpoint = do_checkpoint
+        padding = 1 if kernel_size == 3 else 2
+
+        self.in_layers = nn.Sequential(
+            normalization(channels),
+            nn.SiLU(),
+            nn.Conv1d(channels, self.out_channels, kernel_size, padding=padding),
+        )
+
+        self.updown = up or down
+
+        if up:
+            self.h_upd = Upsample(channels, False, dims)
+            self.x_upd = Upsample(channels, False, dims)
+        elif down:
+            self.h_upd = Downsample(channels, False, dims)
+            self.x_upd = Downsample(channels, False, dims)
+        else:
+            self.h_upd = self.x_upd = nn.Identity()
+
+        self.out_layers = nn.Sequential(
+            normalization(self.out_channels),
+            nn.SiLU(),
+            nn.Dropout(p=dropout),
+            zero_module(
+                nn.Conv1d(
+                    self.out_channels, self.out_channels, kernel_size, padding=padding
+                )
+            ),
+        )
+
+        if self.out_channels == channels:
+            self.skip_connection = nn.Identity()
+        elif use_conv:
+            self.skip_connection = nn.Conv1d(
+                dims, channels, self.out_channels, kernel_size, padding=padding
+            )
+        else:
+            self.skip_connection = nn.Conv1d(dims, channels, self.out_channels, 1)
+
+    def forward(self, x):
+        if self.updown:
+            in_rest, in_conv = self.in_layers[:-1], self.in_layers[-1]
+            h = in_rest(x)
+            h = self.h_upd(h)
+            x = self.x_upd(x)
+            h = in_conv(h)
+        else:
+            h = self.in_layers(x)
+        h = self.out_layers(h)
+        return self.skip_connection(x) + h
+
+
+class AudioMiniEncoder(nn.Module):
+    def __init__(
+        self,
+        spec_dim,
+        embedding_dim,
+        base_channels=128,
+        depth=2,
+        resnet_blocks=2,
+        attn_blocks=4,
+        num_attn_heads=4,
+        dropout=0,
+        downsample_factor=2,
+        kernel_size=3,
+    ):
+        super().__init__()
+        self.init = nn.Sequential(nn.Conv1d(spec_dim, base_channels, 3, padding=1))
+        ch = base_channels
+        res = []
+        self.layers = depth
+        for l in range(depth):
+            for r in range(resnet_blocks):
+                res.append(
+                    ResBlock(ch, dropout, do_checkpoint=False, kernel_size=kernel_size)
+                )
+            res.append(
+                Downsample(
+                    ch, use_conv=True, out_channels=ch * 2, factor=downsample_factor
+                )
+            )
+            ch *= 2
+        self.res = nn.Sequential(*res)
+        self.final = nn.Sequential(
+            normalization(ch), nn.SiLU(), nn.Conv1d(ch, embedding_dim, 1)
+        )
+        attn = []
+        for a in range(attn_blocks):
+            attn.append(
+                AttentionBlock(embedding_dim, num_attn_heads, do_checkpoint=False)
+            )
+        self.attn = nn.Sequential(*attn)
+        self.dim = embedding_dim
+
+    def forward(self, x):
+        h = self.init(x)
+        h = self.res(h)
+        h = self.final(h)
+        for blk in self.attn:
+            h = blk(h)
+        return h[:, :, 0]
+
+
+class AudioMiniEncoderWithClassifierHead(nn.Module):
+    def __init__(self, classes, distribute_zero_label=True, **kwargs):
+        super().__init__()
+        self.enc = AudioMiniEncoder(**kwargs)
+        self.head = nn.Linear(self.enc.dim, classes)
+        self.num_classes = classes
+        self.distribute_zero_label = distribute_zero_label
+
+    def forward(self, x, labels=None):
+        h = self.enc(x)
+        logits = self.head(h)
+        if labels is None:
+            return logits
+        else:
+            if self.distribute_zero_label:
+                oh_labels = nn.functional.one_hot(labels, num_classes=self.num_classes)
+                zeros_indices = (labels == 0).unsqueeze(-1)
+                # Distribute 20% of the probability mass on all classes when zero is specified, to compensate for dataset noise.
+                zero_extra_mass = torch.full_like(
+                    oh_labels,
+                    dtype=torch.float,
+                    fill_value=0.2 / (self.num_classes - 1),
+                )
+                zero_extra_mass[:, 0] = -0.2
+                zero_extra_mass = zero_extra_mass * zeros_indices
+                oh_labels = oh_labels + zero_extra_mass
+            else:
+                oh_labels = labels
+            loss = nn.functional.cross_entropy(logits, oh_labels)
+            return loss
--- a/TTS/tts/layers/tortoise/clvp.py
+++ b/TTS/tts/layers/tortoise/clvp.py
@ -3,9 +3,9 @@ import torch.nn as nn
 import torch.nn.functional as F
 from torch import einsum

-from tortoise.models.arch_util import CheckpointedXTransformerEncoder
-from tortoise.models.transformer import Transformer
-from tortoise.models.xtransformers import Encoder
+from TTS.tts.layers.tortoise.arch_utils import CheckpointedXTransformerEncoder
+from TTS.tts.layers.tortoise.transformer import Transformer
+from TTS.tts.layers.tortoise.xtransformers import Encoder


 def exists(val):
--- a/TTS/tts/layers/tortoise/cvvp.py
+++ b/TTS/tts/layers/tortoise/cvvp.py
@ -0,0 +1,156 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch import einsum
+
+from TTS.tts.layers.tortoise.arch_utils import AttentionBlock
+from TTS.tts.layers.tortoise.xtransformers import ContinuousTransformerWrapper, Encoder
+
+
+def exists(val):
+    return val is not None
+
+
+def masked_mean(t, mask):
+    t = t.masked_fill(~mask, 0.0)
+    return t.sum(dim=1) / mask.sum(dim=1)
+
+
+class CollapsingTransformer(nn.Module):
+    def __init__(
+        self,
+        model_dim,
+        output_dims,
+        heads,
+        dropout,
+        depth,
+        mask_percentage=0,
+        **encoder_kwargs
+    ):
+        super().__init__()
+        self.transformer = ContinuousTransformerWrapper(
+            max_seq_len=-1,
+            use_pos_emb=False,
+            attn_layers=Encoder(
+                dim=model_dim,
+                depth=depth,
+                heads=heads,
+                ff_dropout=dropout,
+                ff_mult=1,
+                attn_dropout=dropout,
+                use_rmsnorm=True,
+                ff_glu=True,
+                rotary_pos_emb=True,
+                **encoder_kwargs,
+            ),
+        )
+        self.pre_combiner = nn.Sequential(
+            nn.Conv1d(model_dim, output_dims, 1),
+            AttentionBlock(output_dims, num_heads=heads, do_checkpoint=False),
+            nn.Conv1d(output_dims, output_dims, 1),
+        )
+        self.mask_percentage = mask_percentage
+
+    def forward(self, x, **transformer_kwargs):
+        h = self.transformer(x, **transformer_kwargs)
+        h = h.permute(0, 2, 1)
+        h = self.pre_combiner(h).permute(0, 2, 1)
+        if self.training:
+            mask = torch.rand_like(h.float()) > self.mask_percentage
+        else:
+            mask = torch.ones_like(h.float()).bool()
+        return masked_mean(h, mask)
+
+
+class ConvFormatEmbedding(nn.Module):
+    def __init__(self, *args, **kwargs):
+        super().__init__()
+        self.emb = nn.Embedding(*args, **kwargs)
+
+    def forward(self, x):
+        y = self.emb(x)
+        return y.permute(0, 2, 1)
+
+
+class CVVP(nn.Module):
+    def __init__(
+        self,
+        model_dim=512,
+        transformer_heads=8,
+        dropout=0.1,
+        conditioning_enc_depth=8,
+        cond_mask_percentage=0,
+        mel_channels=80,
+        mel_codes=None,
+        speech_enc_depth=8,
+        speech_mask_percentage=0,
+        latent_multiplier=1,
+    ):
+        super().__init__()
+        latent_dim = latent_multiplier * model_dim
+        self.temperature = nn.Parameter(torch.tensor(1.0))
+
+        self.cond_emb = nn.Sequential(
+            nn.Conv1d(mel_channels, model_dim // 2, kernel_size=5, stride=2, padding=2),
+            nn.Conv1d(model_dim // 2, model_dim, kernel_size=3, stride=2, padding=1),
+        )
+        self.conditioning_transformer = CollapsingTransformer(
+            model_dim,
+            model_dim,
+            transformer_heads,
+            dropout,
+            conditioning_enc_depth,
+            cond_mask_percentage,
+        )
+        self.to_conditioning_latent = nn.Linear(latent_dim, latent_dim, bias=False)
+
+        if mel_codes is None:
+            self.speech_emb = nn.Conv1d(
+                mel_channels, model_dim, kernel_size=5, padding=2
+            )
+        else:
+            self.speech_emb = ConvFormatEmbedding(mel_codes, model_dim)
+        self.speech_transformer = CollapsingTransformer(
+            model_dim,
+            latent_dim,
+            transformer_heads,
+            dropout,
+            speech_enc_depth,
+            speech_mask_percentage,
+        )
+        self.to_speech_latent = nn.Linear(latent_dim, latent_dim, bias=False)
+
+    def get_grad_norm_parameter_groups(self):
+        return {
+            "conditioning": list(self.conditioning_transformer.parameters()),
+            "speech": list(self.speech_transformer.parameters()),
+        }
+
+    def forward(self, mel_cond, mel_input, return_loss=False):
+        cond_emb = self.cond_emb(mel_cond).permute(0, 2, 1)
+        enc_cond = self.conditioning_transformer(cond_emb)
+        cond_latents = self.to_conditioning_latent(enc_cond)
+
+        speech_emb = self.speech_emb(mel_input).permute(0, 2, 1)
+        enc_speech = self.speech_transformer(speech_emb)
+        speech_latents = self.to_speech_latent(enc_speech)
+
+        cond_latents, speech_latents = map(
+            lambda t: F.normalize(t, p=2, dim=-1), (cond_latents, speech_latents)
+        )
+        temp = self.temperature.exp()
+
+        if not return_loss:
+            sim = einsum("n d, n d -> n", cond_latents, speech_latents) * temp
+            return sim
+
+        sim = einsum("i d, j d -> i j", cond_latents, speech_latents) * temp
+        labels = torch.arange(cond_latents.shape[0], device=mel_input.device)
+        loss = (F.cross_entropy(sim, labels) + F.cross_entropy(sim.t(), labels)) / 2
+
+        return loss
+
+
+if __name__ == "__main__":
+    clvp = CVVP()
+    clvp(torch.randn(2, 80, 100), torch.randn(2, 80, 95), return_loss=True)
--- a/TTS/tts/layers/tortoise/diffusion.py
+++ b/TTS/tts/layers/tortoise/diffusion.py
--- a/TTS/tts/layers/tortoise/diffusion_decoder.py
+++ b/TTS/tts/layers/tortoise/diffusion_decoder.py
@ -0,0 +1,445 @@
+import math
+import random
+from abc import abstractmethod
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch import autocast
+
+from TTS.tts.layers.tortoise.arch_utils import AttentionBlock, normalization
+
+
+def is_latent(t):
+    return t.dtype == torch.float
+
+
+def is_sequence(t):
+    return t.dtype == torch.long
+
+
+def timestep_embedding(timesteps, dim, max_period=10000):
+    """
+    Create sinusoidal timestep embeddings.
+
+    :param timesteps: a 1-D Tensor of N indices, one per batch element.
+                      These may be fractional.
+    :param dim: the dimension of the output.
+    :param max_period: controls the minimum frequency of the embeddings.
+    :return: an [N x dim] Tensor of positional embeddings.
+    """
+    half = dim // 2
+    freqs = torch.exp(
+        -math.log(max_period)
+        * torch.arange(start=0, end=half, dtype=torch.float32)
+        / half
+    ).to(device=timesteps.device)
+    args = timesteps[:, None].float() * freqs[None]
+    embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
+    if dim % 2:
+        embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
+    return embedding
+
+
+class TimestepBlock(nn.Module):
+    @abstractmethod
+    def forward(self, x, emb):
+        """
+        Apply the module to `x` given `emb` timestep embeddings.
+        """
+
+
+class TimestepEmbedSequential(nn.Sequential, TimestepBlock):
+    def forward(self, x, emb):
+        for layer in self:
+            if isinstance(layer, TimestepBlock):
+                x = layer(x, emb)
+            else:
+                x = layer(x)
+        return x
+
+
+class ResBlock(TimestepBlock):
+    def __init__(
+        self,
+        channels,
+        emb_channels,
+        dropout,
+        out_channels=None,
+        dims=2,
+        kernel_size=3,
+        efficient_config=True,
+        use_scale_shift_norm=False,
+    ):
+        super().__init__()
+        self.channels = channels
+        self.emb_channels = emb_channels
+        self.dropout = dropout
+        self.out_channels = out_channels or channels
+        self.use_scale_shift_norm = use_scale_shift_norm
+        padding = {1: 0, 3: 1, 5: 2}[kernel_size]
+        eff_kernel = 1 if efficient_config else 3
+        eff_padding = 0 if efficient_config else 1
+
+        self.in_layers = nn.Sequential(
+            normalization(channels),
+            nn.SiLU(),
+            nn.Conv1d(channels, self.out_channels, eff_kernel, padding=eff_padding),
+        )
+
+        self.emb_layers = nn.Sequential(
+            nn.SiLU(),
+            nn.Linear(
+                emb_channels,
+                2 * self.out_channels if use_scale_shift_norm else self.out_channels,
+            ),
+        )
+        self.out_layers = nn.Sequential(
+            normalization(self.out_channels),
+            nn.SiLU(),
+            nn.Dropout(p=dropout),
+            nn.Conv1d(
+                self.out_channels, self.out_channels, kernel_size, padding=padding
+            ),
+        )
+
+        if self.out_channels == channels:
+            self.skip_connection = nn.Identity()
+        else:
+            self.skip_connection = nn.Conv1d(
+                channels, self.out_channels, eff_kernel, padding=eff_padding
+            )
+
+    def forward(self, x, emb):
+        h = self.in_layers(x)
+        emb_out = self.emb_layers(emb).type(h.dtype)
+        while len(emb_out.shape) < len(h.shape):
+            emb_out = emb_out[..., None]
+        if self.use_scale_shift_norm:
+            out_norm, out_rest = self.out_layers[0], self.out_layers[1:]
+            scale, shift = torch.chunk(emb_out, 2, dim=1)
+            h = out_norm(h) * (1 + scale) + shift
+            h = out_rest(h)
+        else:
+            h = h + emb_out
+            h = self.out_layers(h)
+        return self.skip_connection(x) + h
+
+
+class DiffusionLayer(TimestepBlock):
+    def __init__(self, model_channels, dropout, num_heads):
+        super().__init__()
+        self.resblk = ResBlock(
+            model_channels,
+            model_channels,
+            dropout,
+            model_channels,
+            dims=1,
+            use_scale_shift_norm=True,
+        )
+        self.attn = AttentionBlock(
+            model_channels, num_heads, relative_pos_embeddings=True
+        )
+
+    def forward(self, x, time_emb):
+        y = self.resblk(x, time_emb)
+        return self.attn(y)
+
+
+class DiffusionTts(nn.Module):
+    def __init__(
+        self,
+        model_channels=512,
+        num_layers=8,
+        in_channels=100,
+        in_latent_channels=512,
+        in_tokens=8193,
+        out_channels=200,  # mean and variance
+        dropout=0,
+        use_fp16=False,
+        num_heads=16,
+        # Parameters for regularization.
+        layer_drop=0.1,
+        unconditioned_percentage=0.1,  # This implements a mechanism similar to what is used in classifier-free training.
+    ):
+        super().__init__()
+
+        self.in_channels = in_channels
+        self.model_channels = model_channels
+        self.out_channels = out_channels
+        self.dropout = dropout
+        self.num_heads = num_heads
+        self.unconditioned_percentage = unconditioned_percentage
+        self.enable_fp16 = use_fp16
+        self.layer_drop = layer_drop
+
+        self.inp_block = nn.Conv1d(in_channels, model_channels, 3, 1, 1)
+        self.time_embed = nn.Sequential(
+            nn.Linear(model_channels, model_channels),
+            nn.SiLU(),
+            nn.Linear(model_channels, model_channels),
+        )
+
+        # Either code_converter or latent_converter is used, depending on what type of conditioning data is fed.
+        # This model is meant to be able to be trained on both for efficiency purposes - it is far less computationally
+        # complex to generate tokens, while generating latents will normally mean propagating through a deep autoregressive
+        # transformer network.
+        self.code_embedding = nn.Embedding(in_tokens, model_channels)
+        self.code_converter = nn.Sequential(
+            AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True),
+            AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True),
+            AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True),
+        )
+        self.code_norm = normalization(model_channels)
+        self.latent_conditioner = nn.Sequential(
+            nn.Conv1d(in_latent_channels, model_channels, 3, padding=1),
+            AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True),
+            AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True),
+            AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True),
+            AttentionBlock(model_channels, num_heads, relative_pos_embeddings=True),
+        )
+        self.contextual_embedder = nn.Sequential(
+            nn.Conv1d(in_channels, model_channels, 3, padding=1, stride=2),
+            nn.Conv1d(model_channels, model_channels * 2, 3, padding=1, stride=2),
+            AttentionBlock(
+                model_channels * 2,
+                num_heads,
+                relative_pos_embeddings=True,
+                do_checkpoint=False,
+            ),
+            AttentionBlock(
+                model_channels * 2,
+                num_heads,
+                relative_pos_embeddings=True,
+                do_checkpoint=False,
+            ),
+            AttentionBlock(
+                model_channels * 2,
+                num_heads,
+                relative_pos_embeddings=True,
+                do_checkpoint=False,
+            ),
+            AttentionBlock(
+                model_channels * 2,
+                num_heads,
+                relative_pos_embeddings=True,
+                do_checkpoint=False,
+            ),
+            AttentionBlock(
+                model_channels * 2,
+                num_heads,
+                relative_pos_embeddings=True,
+                do_checkpoint=False,
+            ),
+        )
+        self.unconditioned_embedding = nn.Parameter(torch.randn(1, model_channels, 1))
+        self.conditioning_timestep_integrator = TimestepEmbedSequential(
+            DiffusionLayer(model_channels, dropout, num_heads),
+            DiffusionLayer(model_channels, dropout, num_heads),
+            DiffusionLayer(model_channels, dropout, num_heads),
+        )
+
+        self.integrating_conv = nn.Conv1d(
+            model_channels * 2, model_channels, kernel_size=1
+        )
+        self.mel_head = nn.Conv1d(model_channels, in_channels, kernel_size=3, padding=1)
+
+        self.layers = nn.ModuleList(
+            [
+                DiffusionLayer(model_channels, dropout, num_heads)
+                for _ in range(num_layers)
+            ]
+            + [
+                ResBlock(
+                    model_channels,
+                    model_channels,
+                    dropout,
+                    dims=1,
+                    use_scale_shift_norm=True,
+                )
+                for _ in range(3)
+            ]
+        )
+
+        self.out = nn.Sequential(
+            normalization(model_channels),
+            nn.SiLU(),
+            nn.Conv1d(model_channels, out_channels, 3, padding=1),
+        )
+
+    def get_grad_norm_parameter_groups(self):
+        groups = {
+            "minicoder": list(self.contextual_embedder.parameters()),
+            "layers": list(self.layers.parameters()),
+            "code_converters": list(self.code_embedding.parameters())
+            + list(self.code_converter.parameters())
+            + list(self.latent_conditioner.parameters())
+            + list(self.latent_conditioner.parameters()),
+            "timestep_integrator": list(
+                self.conditioning_timestep_integrator.parameters()
+            )
+            + list(self.integrating_conv.parameters()),
+            "time_embed": list(self.time_embed.parameters()),
+        }
+        return groups
+
+    def get_conditioning(self, conditioning_input):
+        speech_conditioning_input = (
+            conditioning_input.unsqueeze(1)
+            if len(conditioning_input.shape) == 3
+            else conditioning_input
+        )
+        conds = []
+        for j in range(speech_conditioning_input.shape[1]):
+            conds.append(self.contextual_embedder(speech_conditioning_input[:, j]))
+        conds = torch.cat(conds, dim=-1)
+        conds = conds.mean(dim=-1)
+        return conds
+
+    def timestep_independent(
+        self,
+        aligned_conditioning,
+        conditioning_latent,
+        expected_seq_len,
+        return_code_pred,
+    ):
+        # Shuffle aligned_latent to BxCxS format
+        if is_latent(aligned_conditioning):
+            aligned_conditioning = aligned_conditioning.permute(0, 2, 1)
+
+        cond_scale, cond_shift = torch.chunk(conditioning_latent, 2, dim=1)
+        if is_latent(aligned_conditioning):
+            code_emb = self.latent_conditioner(aligned_conditioning)
+        else:
+            code_emb = self.code_embedding(aligned_conditioning).permute(0, 2, 1)
+            code_emb = self.code_converter(code_emb)
+        code_emb = self.code_norm(code_emb) * (
+            1 + cond_scale.unsqueeze(-1)
+        ) + cond_shift.unsqueeze(-1)
+
+        unconditioned_batches = torch.zeros(
+            (code_emb.shape[0], 1, 1), device=code_emb.device
+        )
+        # Mask out the conditioning branch for whole batch elements, implementing something similar to classifier-free guidance.
+        if self.training and self.unconditioned_percentage > 0:
+            unconditioned_batches = (
+                torch.rand((code_emb.shape[0], 1, 1), device=code_emb.device)
+                < self.unconditioned_percentage
+            )
+            code_emb = torch.where(
+                unconditioned_batches,
+                self.unconditioned_embedding.repeat(
+                    aligned_conditioning.shape[0], 1, 1
+                ),
+                code_emb,
+            )
+        expanded_code_emb = F.interpolate(
+            code_emb, size=expected_seq_len, mode="nearest"
+        )
+
+        if not return_code_pred:
+            return expanded_code_emb
+        else:
+            mel_pred = self.mel_head(expanded_code_emb)
+            # Multiply mel_pred by !unconditioned_branches, which drops the gradient on unconditioned branches. This is because we don't want that gradient being used to train parameters through the codes_embedder as it unbalances contributions to that network from the MSE loss.
+            mel_pred = mel_pred * unconditioned_batches.logical_not()
+            return expanded_code_emb, mel_pred
+
+    def forward(
+        self,
+        x,
+        timesteps,
+        aligned_conditioning=None,
+        conditioning_latent=None,
+        precomputed_aligned_embeddings=None,
+        conditioning_free=False,
+        return_code_pred=False,
+    ):
+        """
+        Apply the model to an input batch.
+
+        :param x: an [N x C x ...] Tensor of inputs.
+        :param timesteps: a 1-D batch of timesteps.
+        :param aligned_conditioning: an aligned latent or sequence of tokens providing useful data about the sample to be produced.
+        :param conditioning_latent: a pre-computed conditioning latent; see get_conditioning().
+        :param precomputed_aligned_embeddings: Embeddings returned from self.timestep_independent()
+        :param conditioning_free: When set, all conditioning inputs (including tokens and conditioning_input) will not be considered.
+        :return: an [N x C x ...] Tensor of outputs.
+        """
+        assert precomputed_aligned_embeddings is not None or (
+            aligned_conditioning is not None and conditioning_latent is not None
+        )
+        assert not (
+            return_code_pred and precomputed_aligned_embeddings is not None
+        )  # These two are mutually exclusive.
+
+        unused_params = []
+        if conditioning_free:
+            code_emb = self.unconditioned_embedding.repeat(x.shape[0], 1, x.shape[-1])
+            unused_params.extend(
+                list(self.code_converter.parameters())
+                + list(self.code_embedding.parameters())
+            )
+            unused_params.extend(list(self.latent_conditioner.parameters()))
+        else:
+            if precomputed_aligned_embeddings is not None:
+                code_emb = precomputed_aligned_embeddings
+            else:
+                code_emb, mel_pred = self.timestep_independent(
+                    aligned_conditioning, conditioning_latent, x.shape[-1], True
+                )
+                if is_latent(aligned_conditioning):
+                    unused_params.extend(
+                        list(self.code_converter.parameters())
+                        + list(self.code_embedding.parameters())
+                    )
+                else:
+                    unused_params.extend(list(self.latent_conditioner.parameters()))
+
+            unused_params.append(self.unconditioned_embedding)
+
+        time_emb = self.time_embed(timestep_embedding(timesteps, self.model_channels))
+        code_emb = self.conditioning_timestep_integrator(code_emb, time_emb)
+        x = self.inp_block(x)
+        x = torch.cat([x, code_emb], dim=1)
+        x = self.integrating_conv(x)
+        for i, lyr in enumerate(self.layers):
+            # Do layer drop where applicable. Do not drop first and last layers.
+            if (
+                self.training
+                and self.layer_drop > 0
+                and i != 0
+                and i != (len(self.layers) - 1)
+                and random.random() < self.layer_drop
+            ):
+                unused_params.extend(list(lyr.parameters()))
+            else:
+                # First and last blocks will have autocast disabled for improved precision.
+                with autocast(x.device.type, enabled=self.enable_fp16 and i != 0):
+                    x = lyr(x, time_emb)
+
+        x = x.float()
+        out = self.out(x)
+
+        # Involve probabilistic or possibly unused parameters in loss so we don't get DDP errors.
+        extraneous_addition = 0
+        for p in unused_params:
+            extraneous_addition = extraneous_addition + p.mean()
+        out = out + extraneous_addition * 0
+
+        if return_code_pred:
+            return out, mel_pred
+        return out
+
+
+if __name__ == "__main__":
+    clip = torch.randn(2, 100, 400)
+    aligned_latent = torch.randn(2, 388, 512)
+    aligned_sequence = torch.randint(0, 8192, (2, 100))
+    cond = torch.randn(2, 100, 400)
+    ts = torch.LongTensor([600, 600])
+    model = DiffusionTts(512, layer_drop=0.3, unconditioned_percentage=0.5)
+    # Test with latent aligned conditioning
+    # o = model(clip, ts, aligned_latent, cond)
+    # Test with sequence aligned conditioning
+    o = model(clip, ts, aligned_sequence, cond)
--- a/TTS/tts/layers/tortoise/dpm_solver.py
+++ b/TTS/tts/layers/tortoise/dpm_solver.py
--- a/TTS/tts/layers/tortoise/random_latent_generator.py
+++ b/TTS/tts/layers/tortoise/random_latent_generator.py
@ -0,0 +1,56 @@
+import math
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+def fused_leaky_relu(input, bias=None, negative_slope=0.2, scale=2**0.5):
+    if bias is not None:
+        rest_dim = [1] * (input.ndim - bias.ndim - 1)
+        return (
+            F.leaky_relu(
+                input + bias.view(1, bias.shape[0], *rest_dim),
+                negative_slope=negative_slope,
+            )
+            * scale
+        )
+    else:
+        return F.leaky_relu(input, negative_slope=0.2) * scale
+
+
+class EqualLinear(nn.Module):
+    def __init__(self, in_dim, out_dim, bias=True, bias_init=0, lr_mul=1):
+        super().__init__()
+        self.weight = nn.Parameter(torch.randn(out_dim, in_dim).div_(lr_mul))
+        if bias:
+            self.bias = nn.Parameter(torch.zeros(out_dim).fill_(bias_init))
+        else:
+            self.bias = None
+        self.scale = (1 / math.sqrt(in_dim)) * lr_mul
+        self.lr_mul = lr_mul
+
+    def forward(self, input):
+        out = F.linear(input, self.weight * self.scale)
+        out = fused_leaky_relu(out, self.bias * self.lr_mul)
+        return out
+
+
+class RandomLatentConverter(nn.Module):
+    def __init__(self, channels):
+        super().__init__()
+        self.layers = nn.Sequential(
+            *[EqualLinear(channels, channels, lr_mul=0.1) for _ in range(5)],
+            nn.Linear(channels, channels)
+        )
+        self.channels = channels
+
+    def forward(self, ref):
+        r = torch.randn(ref.shape[0], self.channels, device=ref.device)
+        y = self.layers(r)
+        return y
+
+
+if __name__ == "__main__":
+    model = RandomLatentConverter(512)
+    model(torch.randn(5, 512))
--- a/TTS/tts/layers/tortoise/tokenizer.py
+++ b/TTS/tts/layers/tortoise/tokenizer.py
@ -0,0 +1,201 @@
+import os
+import re
+
+import inflect
+import torch
+from tokenizers import Tokenizer
+
+# Regular expression matching whitespace:
+from unidecode import unidecode
+
+_whitespace_re = re.compile(r"\s+")
+
+
+# List of (regular expression, replacement) pairs for abbreviations:
+_abbreviations = [
+    (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
+    for x in [
+        ("mrs", "misess"),
+        ("mr", "mister"),
+        ("dr", "doctor"),
+        ("st", "saint"),
+        ("co", "company"),
+        ("jr", "junior"),
+        ("maj", "major"),
+        ("gen", "general"),
+        ("drs", "doctors"),
+        ("rev", "reverend"),
+        ("lt", "lieutenant"),
+        ("hon", "honorable"),
+        ("sgt", "sergeant"),
+        ("capt", "captain"),
+        ("esq", "esquire"),
+        ("ltd", "limited"),
+        ("col", "colonel"),
+        ("ft", "fort"),
+    ]
+]
+
+
+def expand_abbreviations(text):
+    for regex, replacement in _abbreviations:
+        text = re.sub(regex, replacement, text)
+    return text
+
+
+_inflect = inflect.engine()
+_comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])")
+_decimal_number_re = re.compile(r"([0-9]+\.[0-9]+)")
+_pounds_re = re.compile(r"£([0-9\,]*[0-9]+)")
+_dollars_re = re.compile(r"\$([0-9\.\,]*[0-9]+)")
+_ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)")
+_number_re = re.compile(r"[0-9]+")
+
+
+def _remove_commas(m):
+    return m.group(1).replace(",", "")
+
+
+def _expand_decimal_point(m):
+    return m.group(1).replace(".", " point ")
+
+
+def _expand_dollars(m):
+    match = m.group(1)
+    parts = match.split(".")
+    if len(parts) > 2:
+        return match + " dollars"  # Unexpected format
+    dollars = int(parts[0]) if parts[0] else 0
+    cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
+    if dollars and cents:
+        dollar_unit = "dollar" if dollars == 1 else "dollars"
+        cent_unit = "cent" if cents == 1 else "cents"
+        return "%s %s, %s %s" % (dollars, dollar_unit, cents, cent_unit)
+    elif dollars:
+        dollar_unit = "dollar" if dollars == 1 else "dollars"
+        return "%s %s" % (dollars, dollar_unit)
+    elif cents:
+        cent_unit = "cent" if cents == 1 else "cents"
+        return "%s %s" % (cents, cent_unit)
+    else:
+        return "zero dollars"
+
+
+def _expand_ordinal(m):
+    return _inflect.number_to_words(m.group(0))
+
+
+def _expand_number(m):
+    num = int(m.group(0))
+    if num > 1000 and num < 3000:
+        if num == 2000:
+            return "two thousand"
+        elif num > 2000 and num < 2010:
+            return "two thousand " + _inflect.number_to_words(num % 100)
+        elif num % 100 == 0:
+            return _inflect.number_to_words(num // 100) + " hundred"
+        else:
+            return _inflect.number_to_words(
+                num, andword="", zero="oh", group=2
+            ).replace(", ", " ")
+    else:
+        return _inflect.number_to_words(num, andword="")
+
+
+def normalize_numbers(text):
+    text = re.sub(_comma_number_re, _remove_commas, text)
+    text = re.sub(_pounds_re, r"\1 pounds", text)
+    text = re.sub(_dollars_re, _expand_dollars, text)
+    text = re.sub(_decimal_number_re, _expand_decimal_point, text)
+    text = re.sub(_ordinal_re, _expand_ordinal, text)
+    text = re.sub(_number_re, _expand_number, text)
+    return text
+
+
+def expand_numbers(text):
+    return normalize_numbers(text)
+
+
+def lowercase(text):
+    return text.lower()
+
+
+def collapse_whitespace(text):
+    return re.sub(_whitespace_re, " ", text)
+
+
+def convert_to_ascii(text):
+    return unidecode(text)
+
+
+def basic_cleaners(text):
+    """Basic pipeline that lowercases and collapses whitespace without transliteration."""
+    text = lowercase(text)
+    text = collapse_whitespace(text)
+    return text
+
+
+def transliteration_cleaners(text):
+    """Pipeline for non-English text that transliterates to ASCII."""
+    text = convert_to_ascii(text)
+    text = lowercase(text)
+    text = collapse_whitespace(text)
+    return text
+
+
+def english_cleaners(text):
+    """Pipeline for English text, including number and abbreviation expansion."""
+    text = convert_to_ascii(text)
+    text = lowercase(text)
+    text = expand_numbers(text)
+    text = expand_abbreviations(text)
+    text = collapse_whitespace(text)
+    text = text.replace('"', "")
+    return text
+
+
+def lev_distance(s1, s2):
+    if len(s1) > len(s2):
+        s1, s2 = s2, s1
+
+    distances = range(len(s1) + 1)
+    for i2, c2 in enumerate(s2):
+        distances_ = [i2 + 1]
+        for i1, c1 in enumerate(s1):
+            if c1 == c2:
+                distances_.append(distances[i1])
+            else:
+                distances_.append(
+                    1 + min((distances[i1], distances[i1 + 1], distances_[-1]))
+                )
+        distances = distances_
+    return distances[-1]
+
+
+DEFAULT_VOCAB_FILE = os.path.join(
+    os.path.dirname(os.path.realpath(__file__)), "../../utils/assets/tortoise/tokenizer.json"
+)
+
+
+class VoiceBpeTokenizer:
+    def __init__(self, vocab_file=DEFAULT_VOCAB_FILE):
+        if vocab_file is not None:
+            self.tokenizer = Tokenizer.from_file(vocab_file)
+
+    def preprocess_text(self, txt):
+        txt = english_cleaners(txt)
+        return txt
+
+    def encode(self, txt):
+        txt = self.preprocess_text(txt)
+        txt = txt.replace(" ", "[SPACE]")
+        return self.tokenizer.encode(txt).ids
+
+    def decode(self, seq):
+        if isinstance(seq, torch.Tensor):
+            seq = seq.cpu().numpy()
+        txt = self.tokenizer.decode(seq, skip_special_tokens=False).replace(" ", "")
+        txt = txt.replace("[SPACE]", " ")
+        txt = txt.replace("[STOP]", "")
+        txt = txt.replace("[UNK]", "")
+        return txt
--- a/TTS/tts/layers/tortoise/transformer.py
+++ b/TTS/tts/layers/tortoise/transformer.py
@ -0,0 +1,237 @@
+import torch
+import torch.nn.functional as F
+from einops import rearrange
+from torch import nn
+
+# helpers
+
+
+def exists(val):
+    return val is not None
+
+
+def default(val, d):
+    return val if exists(val) else d
+
+
+def cast_tuple(val, depth=1):
+    if isinstance(val, list):
+        val = tuple(val)
+    return val if isinstance(val, tuple) else (val,) * depth
+
+
+def max_neg_value(t):
+    return -torch.finfo(t.dtype).max
+
+
+def stable_softmax(t, dim=-1, alpha=32**2):
+    t = t / alpha
+    t = t - torch.amax(t, dim=dim, keepdim=True).detach()
+    return (t * alpha).softmax(dim=dim)
+
+
+def route_args(router, args, depth):
+    routed_args = [(dict(), dict()) for _ in range(depth)]
+    matched_keys = [key for key in args.keys() if key in router]
+
+    for key in matched_keys:
+        val = args[key]
+        for depth, ((f_args, g_args), routes) in enumerate(
+            zip(routed_args, router[key])
+        ):
+            new_f_args, new_g_args = map(
+                lambda route: ({key: val} if route else {}), routes
+            )
+            routed_args[depth] = ({**f_args, **new_f_args}, {**g_args, **new_g_args})
+    return routed_args
+
+
+# classes
+class SequentialSequence(nn.Module):
+    def __init__(self, layers, args_route={}, layer_dropout=0.0):
+        super().__init__()
+        assert all(
+            len(route) == len(layers) for route in args_route.values()
+        ), "each argument route map must have the same depth as the number of sequential layers"
+        self.layers = layers
+        self.args_route = args_route
+        self.layer_dropout = layer_dropout
+
+    def forward(self, x, **kwargs):
+        args = route_args(self.args_route, kwargs, len(self.layers))
+        layers_and_args = list(zip(self.layers, args))
+
+        for (f, g), (f_args, g_args) in layers_and_args:
+            x = x + f(x, **f_args)
+            x = x + g(x, **g_args)
+        return x
+
+
+class DivideMax(nn.Module):
+    def __init__(self, dim):
+        super().__init__()
+        self.dim = dim
+
+    def forward(self, x):
+        maxes = x.amax(dim=self.dim, keepdim=True).detach()
+        return x / maxes
+
+
+# https://arxiv.org/abs/2103.17239
+class LayerScale(nn.Module):
+    def __init__(self, dim, depth, fn):
+        super().__init__()
+        if depth <= 18:
+            init_eps = 0.1
+        elif depth > 18 and depth <= 24:
+            init_eps = 1e-5
+        else:
+            init_eps = 1e-6
+
+        scale = torch.zeros(1, 1, dim).fill_(init_eps)
+        self.scale = nn.Parameter(scale)
+        self.fn = fn
+
+    def forward(self, x, **kwargs):
+        return self.fn(x, **kwargs) * self.scale
+
+
+# layer norm
+
+
+class PreNorm(nn.Module):
+    def __init__(self, dim, fn, sandwich=False):
+        super().__init__()
+        self.norm = nn.LayerNorm(dim)
+        self.norm_out = nn.LayerNorm(dim) if sandwich else nn.Identity()
+        self.fn = fn
+
+    def forward(self, x, **kwargs):
+        x = self.norm(x)
+        x = self.fn(x, **kwargs)
+        return self.norm_out(x)
+
+
+# feed forward
+
+
+class GEGLU(nn.Module):
+    def forward(self, x):
+        x, gates = x.chunk(2, dim=-1)
+        return x * F.gelu(gates)
+
+
+class FeedForward(nn.Module):
+    def __init__(self, dim, dropout=0.0, mult=4.0):
+        super().__init__()
+        self.net = nn.Sequential(
+            nn.Linear(dim, dim * mult * 2),
+            GEGLU(),
+            nn.Dropout(dropout),
+            nn.Linear(dim * mult, dim),
+        )
+
+    def forward(self, x):
+        return self.net(x)
+
+
+# Attention
+
+
+class Attention(nn.Module):
+    def __init__(self, dim, seq_len, causal=True, heads=8, dim_head=64, dropout=0.0):
+        super().__init__()
+        inner_dim = dim_head * heads
+        self.heads = heads
+        self.seq_len = seq_len
+        self.scale = dim_head**-0.5
+
+        self.causal = causal
+
+        self.to_qkv = nn.Linear(dim, inner_dim * 3, bias=False)
+        self.to_out = nn.Sequential(nn.Linear(inner_dim, dim), nn.Dropout(dropout))
+
+    def forward(self, x, mask=None):
+        b, n, _, h, device = *x.shape, self.heads, x.device
+        softmax = torch.softmax
+
+        qkv = self.to_qkv(x).chunk(3, dim=-1)
+        q, k, v = map(lambda t: rearrange(t, "b n (h d) -> b h n d", h=h), qkv)
+
+        q = q * self.scale
+
+        dots = torch.einsum("b h i d, b h j d -> b h i j", q, k)
+        mask_value = max_neg_value(dots)
+
+        if exists(mask):
+            mask = rearrange(mask, "b j -> b () () j")
+            dots.masked_fill_(~mask, mask_value)
+            del mask
+
+        if self.causal:
+            i, j = dots.shape[-2:]
+            mask = torch.ones(i, j, device=device).triu_(j - i + 1).bool()
+            dots.masked_fill_(mask, mask_value)
+
+        attn = softmax(dots, dim=-1)
+
+        out = torch.einsum("b h i j, b h j d -> b h i d", attn, v)
+        out = rearrange(out, "b h n d -> b n (h d)")
+        out = self.to_out(out)
+        return out
+
+
+# main transformer class
+class Transformer(nn.Module):
+    def __init__(
+        self,
+        *,
+        dim,
+        depth,
+        seq_len,
+        causal=True,
+        heads=8,
+        dim_head=64,
+        ff_mult=4,
+        attn_dropout=0.0,
+        ff_dropout=0.0,
+        sparse_attn=False,
+        sandwich_norm=False,
+    ):
+        super().__init__()
+        layers = nn.ModuleList([])
+        sparse_layer = cast_tuple(sparse_attn, depth)
+
+        for ind, sparse_attn in zip(range(depth), sparse_layer):
+            attn = Attention(
+                dim,
+                causal=causal,
+                seq_len=seq_len,
+                heads=heads,
+                dim_head=dim_head,
+                dropout=attn_dropout,
+            )
+
+            ff = FeedForward(dim, mult=ff_mult, dropout=ff_dropout)
+
+            layers.append(
+                nn.ModuleList(
+                    [
+                        LayerScale(
+                            dim, ind + 1, PreNorm(dim, attn, sandwich=sandwich_norm)
+                        ),
+                        LayerScale(
+                            dim, ind + 1, PreNorm(dim, ff, sandwich=sandwich_norm)
+                        ),
+                    ]
+                )
+            )
+
+        execute_type = SequentialSequence
+        route_attn = ((True, False),) * depth
+        attn_route_map = {"mask": route_attn}
+
+        self.layers = execute_type(layers, args_route=attn_route_map)
+
+    def forward(self, x, **kwargs):
+        return self.layers(x, **kwargs)
--- a/TTS/tts/layers/tortoise/utils.py
+++ b/TTS/tts/layers/tortoise/utils.py
@ -0,0 +1,75 @@
+import os
+try: import gdown
+except ImportError:
+    raise ImportError(
+        "Sorry, gdown is required in order to download the new BigVGAN vocoder.\n"
+        "Please install it with `pip install gdown` and try again."
+    )
+from urllib import request
+
+import progressbar
+
+D_STEM = "https://drive.google.com/uc?id="
+
+DEFAULT_MODELS_DIR = os.path.join(
+    os.path.expanduser("~"), ".cache", "tortoise", "models"
+)
+# MODELS_DIR = os.environ.get("TORTOISE_MODELS_DIR", DEFAULT_MODELS_DIR)
+MODELS_DIR = "/data/speech_synth/models/"
+MODELS = {
+    "autoregressive.pth": "https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/autoregressive.pth",
+    "classifier.pth": "https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/classifier.pth",
+    "clvp2.pth": "https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/clvp2.pth",
+    "cvvp.pth": "https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/cvvp.pth",
+    "diffusion_decoder.pth": "https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/diffusion_decoder.pth",
+    "vocoder.pth": "https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/vocoder.pth",
+    "rlg_auto.pth": "https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/rlg_auto.pth",
+    "rlg_diffuser.pth": "https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/rlg_diffuser.pth",
+    # these links are from the nvidia gdrive
+    "bigvgan_base_24khz_100band_g.pth": "https://drive.google.com/uc?id=1_cKskUDuvxQJUEBwdgjAxKuDTUW6kPdY",
+    "bigvgan_24khz_100band_g.pth": "https://drive.google.com/uc?id=1wmP_mAs7d00KHVfVEl8B5Gb72Kzpcavp",
+}
+
+pbar = None
+def download_models(specific_models=None):
+    """
+    Call to download all the models that Tortoise uses.
+    """
+    os.makedirs(MODELS_DIR, exist_ok=True)
+
+    def show_progress(block_num, block_size, total_size):
+        global pbar
+        if pbar is None:
+            pbar = progressbar.ProgressBar(maxval=total_size)
+            pbar.start()
+
+        downloaded = block_num * block_size
+        if downloaded < total_size:
+            pbar.update(downloaded)
+        else:
+            pbar.finish()
+            pbar = None
+
+    for model_name, url in MODELS.items():
+        if specific_models is not None and model_name not in specific_models:
+            continue
+        model_path = os.path.join(MODELS_DIR, model_name)
+        if os.path.exists(model_path):
+            continue
+        print(f"Downloading {model_name} from {url}...")
+        if D_STEM in url:
+            gdown.download(url, model_path, quiet=False)
+        else:
+            request.urlretrieve(url, model_path, show_progress)
+        print("Done.")
+
+def get_model_path(model_name, models_dir=MODELS_DIR):
+    """
+    Get path to given model, download it if it doesn't exist.
+    """
+    if model_name not in MODELS:
+        raise ValueError(f"Model {model_name} not found in available models.")
+    model_path = os.path.join(models_dir, model_name)
+    if not os.path.exists(model_path) and models_dir == MODELS_DIR:
+        download_models([model_name])
+    return model_path
--- a/TTS/tts/layers/tortoise/vocoder.py
+++ b/TTS/tts/layers/tortoise/vocoder.py
@ -0,0 +1,421 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+import json
+from enum import Enum
+from typing import Optional, Callable
+from dataclasses import dataclass
+
+MAX_WAV_VALUE = 32768.0
+
+
+class KernelPredictor(torch.nn.Module):
+    """Kernel predictor for the location-variable convolutions"""
+
+    def __init__(
+        self,
+        cond_channels,
+        conv_in_channels,
+        conv_out_channels,
+        conv_layers,
+        conv_kernel_size=3,
+        kpnet_hidden_channels=64,
+        kpnet_conv_size=3,
+        kpnet_dropout=0.0,
+        kpnet_nonlinear_activation="LeakyReLU",
+        kpnet_nonlinear_activation_params={"negative_slope": 0.1},
+    ):
+        """
+        Args:
+            cond_channels (int): number of channel for the conditioning sequence,
+            conv_in_channels (int): number of channel for the input sequence,
+            conv_out_channels (int): number of channel for the output sequence,
+            conv_layers (int): number of layers
+        """
+        super().__init__()
+
+        self.conv_in_channels = conv_in_channels
+        self.conv_out_channels = conv_out_channels
+        self.conv_kernel_size = conv_kernel_size
+        self.conv_layers = conv_layers
+
+        kpnet_kernel_channels = (
+            conv_in_channels * conv_out_channels * conv_kernel_size * conv_layers
+        )  # l_w
+        kpnet_bias_channels = conv_out_channels * conv_layers  # l_b
+
+        self.input_conv = nn.Sequential(
+            nn.utils.weight_norm(
+                nn.Conv1d(cond_channels, kpnet_hidden_channels, 5, padding=2, bias=True)
+            ),
+            getattr(nn, kpnet_nonlinear_activation)(
+                **kpnet_nonlinear_activation_params
+            ),
+        )
+
+        self.residual_convs = nn.ModuleList()
+        padding = (kpnet_conv_size - 1) // 2
+        for _ in range(3):
+            self.residual_convs.append(
+                nn.Sequential(
+                    nn.Dropout(kpnet_dropout),
+                    nn.utils.weight_norm(
+                        nn.Conv1d(
+                            kpnet_hidden_channels,
+                            kpnet_hidden_channels,
+                            kpnet_conv_size,
+                            padding=padding,
+                            bias=True,
+                        )
+                    ),
+                    getattr(nn, kpnet_nonlinear_activation)(
+                        **kpnet_nonlinear_activation_params
+                    ),
+                    nn.utils.weight_norm(
+                        nn.Conv1d(
+                            kpnet_hidden_channels,
+                            kpnet_hidden_channels,
+                            kpnet_conv_size,
+                            padding=padding,
+                            bias=True,
+                        )
+                    ),
+                    getattr(nn, kpnet_nonlinear_activation)(
+                        **kpnet_nonlinear_activation_params
+                    ),
+                )
+            )
+        self.kernel_conv = nn.utils.weight_norm(
+            nn.Conv1d(
+                kpnet_hidden_channels,
+                kpnet_kernel_channels,
+                kpnet_conv_size,
+                padding=padding,
+                bias=True,
+            )
+        )
+        self.bias_conv = nn.utils.weight_norm(
+            nn.Conv1d(
+                kpnet_hidden_channels,
+                kpnet_bias_channels,
+                kpnet_conv_size,
+                padding=padding,
+                bias=True,
+            )
+        )
+
+    def forward(self, c):
+        """
+        Args:
+            c (Tensor): the conditioning sequence (batch, cond_channels, cond_length)
+        """
+        batch, _, cond_length = c.shape
+        c = self.input_conv(c)
+        for residual_conv in self.residual_convs:
+            residual_conv.to(c.device)
+            c = c + residual_conv(c)
+        k = self.kernel_conv(c)
+        b = self.bias_conv(c)
+        kernels = k.contiguous().view(
+            batch,
+            self.conv_layers,
+            self.conv_in_channels,
+            self.conv_out_channels,
+            self.conv_kernel_size,
+            cond_length,
+        )
+        bias = b.contiguous().view(
+            batch,
+            self.conv_layers,
+            self.conv_out_channels,
+            cond_length,
+        )
+
+        return kernels, bias
+
+    def remove_weight_norm(self):
+        nn.utils.remove_weight_norm(self.input_conv[0])
+        nn.utils.remove_weight_norm(self.kernel_conv)
+        nn.utils.remove_weight_norm(self.bias_conv)
+        for block in self.residual_convs:
+            nn.utils.remove_weight_norm(block[1])
+            nn.utils.remove_weight_norm(block[3])
+
+
+class LVCBlock(torch.nn.Module):
+    """the location-variable convolutions"""
+
+    def __init__(
+        self,
+        in_channels,
+        cond_channels,
+        stride,
+        dilations=[1, 3, 9, 27],
+        lReLU_slope=0.2,
+        conv_kernel_size=3,
+        cond_hop_length=256,
+        kpnet_hidden_channels=64,
+        kpnet_conv_size=3,
+        kpnet_dropout=0.0,
+    ):
+        super().__init__()
+
+        self.cond_hop_length = cond_hop_length
+        self.conv_layers = len(dilations)
+        self.conv_kernel_size = conv_kernel_size
+
+        self.kernel_predictor = KernelPredictor(
+            cond_channels=cond_channels,
+            conv_in_channels=in_channels,
+            conv_out_channels=2 * in_channels,
+            conv_layers=len(dilations),
+            conv_kernel_size=conv_kernel_size,
+            kpnet_hidden_channels=kpnet_hidden_channels,
+            kpnet_conv_size=kpnet_conv_size,
+            kpnet_dropout=kpnet_dropout,
+            kpnet_nonlinear_activation_params={"negative_slope": lReLU_slope},
+        )
+
+        self.convt_pre = nn.Sequential(
+            nn.LeakyReLU(lReLU_slope),
+            nn.utils.weight_norm(
+                nn.ConvTranspose1d(
+                    in_channels,
+                    in_channels,
+                    2 * stride,
+                    stride=stride,
+                    padding=stride // 2 + stride % 2,
+                    output_padding=stride % 2,
+                )
+            ),
+        )
+
+        self.conv_blocks = nn.ModuleList()
+        for dilation in dilations:
+            self.conv_blocks.append(
+                nn.Sequential(
+                    nn.LeakyReLU(lReLU_slope),
+                    nn.utils.weight_norm(
+                        nn.Conv1d(
+                            in_channels,
+                            in_channels,
+                            conv_kernel_size,
+                            padding=dilation * (conv_kernel_size - 1) // 2,
+                            dilation=dilation,
+                        )
+                    ),
+                    nn.LeakyReLU(lReLU_slope),
+                )
+            )
+
+    def forward(self, x, c):
+        """forward propagation of the location-variable convolutions.
+        Args:
+            x (Tensor): the input sequence (batch, in_channels, in_length)
+            c (Tensor): the conditioning sequence (batch, cond_channels, cond_length)
+
+        Returns:
+            Tensor: the output sequence (batch, in_channels, in_length)
+        """
+        _, in_channels, _ = x.shape  # (B, c_g, L')
+
+        x = self.convt_pre(x)  # (B, c_g, stride * L')
+        kernels, bias = self.kernel_predictor(c)
+
+        for i, conv in enumerate(self.conv_blocks):
+            output = conv(x)  # (B, c_g, stride * L')
+
+            k = kernels[:, i, :, :, :, :]  # (B, 2 * c_g, c_g, kernel_size, cond_length)
+            b = bias[:, i, :, :]  # (B, 2 * c_g, cond_length)
+
+            output = self.location_variable_convolution(
+                output, k, b, hop_size=self.cond_hop_length
+            )  # (B, 2 * c_g, stride * L'): LVC
+            x = x + torch.sigmoid(output[:, :in_channels, :]) * torch.tanh(
+                output[:, in_channels:, :]
+            )  # (B, c_g, stride * L'): GAU
+
+        return x
+
+    def location_variable_convolution(self, x, kernel, bias, dilation=1, hop_size=256):
+        """perform location-variable convolution operation on the input sequence (x) using the local convolution kernl.
+        Time: 414 μs ± 309 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each), test on NVIDIA V100.
+        Args:
+            x (Tensor): the input sequence (batch, in_channels, in_length).
+            kernel (Tensor): the local convolution kernel (batch, in_channel, out_channels, kernel_size, kernel_length)
+            bias (Tensor): the bias for the local convolution (batch, out_channels, kernel_length)
+            dilation (int): the dilation of convolution.
+            hop_size (int): the hop_size of the conditioning sequence.
+        Returns:
+            (Tensor): the output sequence after performing local convolution. (batch, out_channels, in_length).
+        """
+        batch, _, in_length = x.shape
+        batch, _, out_channels, kernel_size, kernel_length = kernel.shape
+        assert in_length == (
+            kernel_length * hop_size
+        ), "length of (x, kernel) is not matched"
+
+        padding = dilation * int((kernel_size - 1) / 2)
+        x = F.pad(
+            x, (padding, padding), "constant", 0
+        )  # (batch, in_channels, in_length + 2*padding)
+        x = x.unfold(
+            2, hop_size + 2 * padding, hop_size
+        )  # (batch, in_channels, kernel_length, hop_size + 2*padding)
+
+        if hop_size < dilation:
+            x = F.pad(x, (0, dilation), "constant", 0)
+        x = x.unfold(
+            3, dilation, dilation
+        )  # (batch, in_channels, kernel_length, (hop_size + 2*padding)/dilation, dilation)
+        x = x[:, :, :, :, :hop_size]
+        x = x.transpose(
+            3, 4
+        )  # (batch, in_channels, kernel_length, dilation, (hop_size + 2*padding)/dilation)
+        x = x.unfold(
+            4, kernel_size, 1
+        )  # (batch, in_channels, kernel_length, dilation, _, kernel_size)
+
+        o = torch.einsum("bildsk,biokl->bolsd", x, kernel)
+        o = o.to(memory_format=torch.channels_last_3d)
+        bias = bias.unsqueeze(-1).unsqueeze(-1).to(memory_format=torch.channels_last_3d)
+        o = o + bias
+        o = o.contiguous().view(batch, out_channels, -1)
+
+        return o
+
+    def remove_weight_norm(self):
+        self.kernel_predictor.remove_weight_norm()
+        nn.utils.remove_weight_norm(self.convt_pre[1])
+        for block in self.conv_blocks:
+            nn.utils.remove_weight_norm(block[1])
+
+
+class UnivNetGenerator(nn.Module):
+    """
+    UnivNet Generator
+
+    Originally from https://github.com/mindslab-ai/univnet/blob/master/model/generator.py.
+    """
+
+    def __init__(
+        self,
+        noise_dim=64,
+        channel_size=32,
+        dilations=[1, 3, 9, 27],
+        strides=[8, 8, 4],
+        lReLU_slope=0.2,
+        kpnet_conv_size=3,
+        # Below are MEL configurations options that this generator requires.
+        hop_length=256,
+        n_mel_channels=100,
+    ):
+        super(UnivNetGenerator, self).__init__()
+        self.mel_channel = n_mel_channels
+        self.noise_dim = noise_dim
+        self.hop_length = hop_length
+        channel_size = channel_size
+        kpnet_conv_size = kpnet_conv_size
+
+        self.res_stack = nn.ModuleList()
+        hop_length = 1
+        for stride in strides:
+            hop_length = stride * hop_length
+            self.res_stack.append(
+                LVCBlock(
+                    channel_size,
+                    n_mel_channels,
+                    stride=stride,
+                    dilations=dilations,
+                    lReLU_slope=lReLU_slope,
+                    cond_hop_length=hop_length,
+                    kpnet_conv_size=kpnet_conv_size,
+                )
+            )
+
+        self.conv_pre = nn.utils.weight_norm(
+            nn.Conv1d(noise_dim, channel_size, 7, padding=3, padding_mode="reflect")
+        )
+
+        self.conv_post = nn.Sequential(
+            nn.LeakyReLU(lReLU_slope),
+            nn.utils.weight_norm(
+                nn.Conv1d(channel_size, 1, 7, padding=3, padding_mode="reflect")
+            ),
+            nn.Tanh(),
+        )
+
+    def forward(self, c, z):
+        """
+        Args:
+            c (Tensor): the conditioning sequence of mel-spectrogram (batch, mel_channels, in_length)
+            z (Tensor): the noise sequence (batch, noise_dim, in_length)
+
+        """
+        z = self.conv_pre(z)  # (B, c_g, L)
+
+        for res_block in self.res_stack:
+            res_block.to(z.device)
+            z = res_block(z, c)  # (B, c_g, L * s_0 * ... * s_i)
+
+        z = self.conv_post(z)  # (B, 1, L * 256)
+
+        return z
+
+    def eval(self, inference=False):
+        super(UnivNetGenerator, self).eval()
+        # don't remove weight norm while validation in training loop
+        if inference:
+            self.remove_weight_norm()
+
+    def remove_weight_norm(self):
+        nn.utils.remove_weight_norm(self.conv_pre)
+
+        for layer in self.conv_post:
+            if len(layer.state_dict()) != 0:
+                nn.utils.remove_weight_norm(layer)
+
+        for res_block in self.res_stack:
+            res_block.remove_weight_norm()
+
+    def inference(self, c, z=None):
+        # pad input mel with zeros to cut artifact
+        # see https://github.com/seungwonpark/melgan/issues/8
+        zero = torch.full((c.shape[0], self.mel_channel, 10), -11.5129).to(c.device)
+        mel = torch.cat((c, zero), dim=2)
+
+        if z is None:
+            z = torch.randn(c.shape[0], self.noise_dim, mel.size(2)).to(mel.device)
+
+        audio = self.forward(mel, z)
+        audio = audio[:, :, : -(self.hop_length * 10)]
+        audio = audio.clamp(min=-1, max=1)
+        return audio
+
+
+@dataclass
+class VocType:
+    constructor: Callable[[], nn.Module]
+    model_path: str
+    subkey: Optional[str] = None
+    def optionally_index(self, model_dict):
+        if self.subkey is not None:
+            return model_dict[self.subkey]
+        return model_dict
+class VocConf(Enum):
+    Univnet = VocType(UnivNetGenerator, "vocoder.pth", 'model_g')
+
+if __name__ == "__main__":
+    model = UnivNetGenerator()
+
+    c = torch.randn(3, 100, 10)
+    z = torch.randn(3, 64, 10)
+    print(c.shape)
+
+    y = model(c, z)
+    print(y.shape)
+    assert y.shape == torch.Size([3, 1, 2560])
+
+    pytorch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+    print(pytorch_total_params)
--- a/TTS/tts/layers/tortoise/wav2vec_alignment.py
+++ b/TTS/tts/layers/tortoise/wav2vec_alignment.py
@ -0,0 +1,164 @@
+import torch
+import torchaudio
+from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor, Wav2Vec2ForCTC
+
+
+def max_alignment(s1, s2, skip_character="~", record=None):
+    """
+    A clever function that aligns s1 to s2 as best it can. Wherever a character from s1 is not found in s2, a '~' is
+    used to replace that character.
+
+    Finally got to use my DP skills!
+    """
+    if record is None:
+        record = {}
+    assert (
+        skip_character not in s1
+    ), f"Found the skip character {skip_character} in the provided string, {s1}"
+    if len(s1) == 0:
+        return ""
+    if len(s2) == 0:
+        return skip_character * len(s1)
+    if s1 == s2:
+        return s1
+    if s1[0] == s2[0]:
+        return s1[0] + max_alignment(s1[1:], s2[1:], skip_character, record)
+
+    take_s1_key = (len(s1), len(s2) - 1)
+    if take_s1_key in record:
+        take_s1, take_s1_score = record[take_s1_key]
+    else:
+        take_s1 = max_alignment(s1, s2[1:], skip_character, record)
+        take_s1_score = len(take_s1.replace(skip_character, ""))
+        record[take_s1_key] = (take_s1, take_s1_score)
+
+    take_s2_key = (len(s1) - 1, len(s2))
+    if take_s2_key in record:
+        take_s2, take_s2_score = record[take_s2_key]
+    else:
+        take_s2 = max_alignment(s1[1:], s2, skip_character, record)
+        take_s2_score = len(take_s2.replace(skip_character, ""))
+        record[take_s2_key] = (take_s2, take_s2_score)
+
+    return take_s1 if take_s1_score > take_s2_score else skip_character + take_s2
+
+
+class Wav2VecAlignment:
+    """
+    Uses wav2vec2 to perform audio<->text alignment.
+    """
+
+    def __init__(self, device="cuda"):
+        self.model = Wav2Vec2ForCTC.from_pretrained(
+            "jbetker/wav2vec2-large-robust-ft-libritts-voxpopuli"
+        ).cpu()
+        self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
+            "facebook/wav2vec2-large-960h"
+        )
+        self.tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(
+            "jbetker/tacotron-symbols"
+        )
+        self.device = device
+
+    def align(self, audio, expected_text, audio_sample_rate=24000):
+        orig_len = audio.shape[-1]
+
+        with torch.no_grad():
+            self.model = self.model.to(self.device)
+            audio = audio.to(self.device)
+            audio = torchaudio.functional.resample(audio, audio_sample_rate, 16000)
+            clip_norm = (audio - audio.mean()) / torch.sqrt(audio.var() + 1e-7)
+            logits = self.model(clip_norm).logits
+            self.model = self.model.cpu()
+
+        logits = logits[0]
+        pred_string = self.tokenizer.decode(logits.argmax(-1).tolist())
+
+        fixed_expectation = max_alignment(expected_text.lower(), pred_string)
+        w2v_compression = orig_len // logits.shape[0]
+        expected_tokens = self.tokenizer.encode(fixed_expectation)
+        expected_chars = list(fixed_expectation)
+        if len(expected_tokens) == 1:
+            return [0]  # The alignment is simple; there is only one token.
+        expected_tokens.pop(0)  # The first token is a given.
+        expected_chars.pop(0)
+
+        alignments = [0]
+
+        def pop_till_you_win():
+            if len(expected_tokens) == 0:
+                return None
+            popped = expected_tokens.pop(0)
+            popped_char = expected_chars.pop(0)
+            while popped_char == "~":
+                alignments.append(-1)
+                if len(expected_tokens) == 0:
+                    return None
+                popped = expected_tokens.pop(0)
+                popped_char = expected_chars.pop(0)
+            return popped
+
+        next_expected_token = pop_till_you_win()
+        for i, logit in enumerate(logits):
+            top = logit.argmax()
+            if next_expected_token == top:
+                alignments.append(i * w2v_compression)
+                if len(expected_tokens) > 0:
+                    next_expected_token = pop_till_you_win()
+                else:
+                    break
+
+        pop_till_you_win()
+        if not (len(expected_tokens) == 0 and len(alignments) == len(expected_text)):
+            torch.save([audio, expected_text], "alignment_debug.pth")
+            assert False, (
+                "Something went wrong with the alignment algorithm. I've dumped a file, 'alignment_debug.pth' to"
+                "your current working directory. Please report this along with the file so it can get fixed."
+            )
+
+        # Now fix up alignments. Anything with -1 should be interpolated.
+        alignments.append(
+            orig_len
+        )  # This'll get removed but makes the algorithm below more readable.
+        for i in range(len(alignments)):
+            if alignments[i] == -1:
+                for j in range(i + 1, len(alignments)):
+                    if alignments[j] != -1:
+                        next_found_token = j
+                        break
+                for j in range(i, next_found_token):
+                    gap = alignments[next_found_token] - alignments[i - 1]
+                    alignments[j] = (j - i + 1) * gap // (
+                        next_found_token - i + 1
+                    ) + alignments[i - 1]
+
+        return alignments[:-1]
+
+    def redact(self, audio, expected_text, audio_sample_rate=24000):
+        if "[" not in expected_text:
+            return audio
+        splitted = expected_text.split("[")
+        fully_split = [splitted[0]]
+        for spl in splitted[1:]:
+            assert (
+                "]" in spl
+            ), 'Every "[" character must be paired with a "]" with no nesting.'
+            fully_split.extend(spl.split("]"))
+
+        # At this point, fully_split is a list of strings, with every other string being something that should be redacted.
+        non_redacted_intervals = []
+        last_point = 0
+        for i in range(len(fully_split)):
+            if i % 2 == 0:
+                end_interval = max(0, last_point + len(fully_split[i]) - 1)
+                non_redacted_intervals.append((last_point, end_interval))
+            last_point += len(fully_split[i])
+
+        bare_text = "".join(fully_split)
+        alignments = self.align(audio, bare_text, audio_sample_rate)
+
+        output_audio = []
+        for nri in non_redacted_intervals:
+            start, stop = nri
+            output_audio.append(audio[:, alignments[start] : alignments[stop]])
+        return torch.cat(output_audio, dim=-1)
--- a/TTS/tts/layers/vits/fram_prior_network.py
+++ b/TTS/tts/layers/vits/fram_prior_network.py
@ -0,0 +1,31 @@
+from torch import nn
+from TTS.tts.layers.generic.res_conv_bn import Conv1dBNBlock
+
+
+class FramePriorNet(nn.Module):
+    def __init__(
+        self, in_channels, out_channels, hidden_channels, kernel_size, num_res_blocks=13, num_conv_blocks=2
+    ):
+        super().__init__()
+        self.res_blocks = nn.ModuleList()
+        for idx in range(num_res_blocks):
+            block = Conv1dBNBlock(
+                in_channels if idx == 0 else hidden_channels,
+                out_channels if (idx + 1) == num_res_blocks else hidden_channels,
+                hidden_channels,
+                kernel_size,
+                1,
+                num_conv_blocks,
+            )
+            self.res_blocks.append(block)
+    def forward(self, x, x_mask=None):
+        if x_mask is None:
+            x_mask = 1.0
+        o = x * x_mask
+        for block in self.res_blocks:
+            res = o
+            o = block(o)
+            o = o + res
+            if x_mask is not None:
+                o = o * x_mask
+        return o
--- a/TTS/tts/layers/vits/reference_encoder.py
+++ b/TTS/tts/layers/vits/reference_encoder.py
@ -0,0 +1,91 @@
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+class ReferenceEncoder(nn.Module):
+    """NN module creating a fixed size prosody embedding from a spectrogram.
+
+    inputs: mel spectrograms [batch_size, num_spec_frames, num_mel]
+    outputs: [batch_size, embedding_dim]
+    """
+
+    def __init__(self, num_mel, filter):
+        super().__init__()
+        self.num_mel = num_mel
+        start_index = 2 
+        end_index = filter / 16
+        i = start_index
+        filt_len = []
+        while i <= end_index:
+            i = i * 2
+            filt_len.append(i)
+            filt_len.append(i)
+        filters = [1] + filt_len
+        num_layers = len(filters) - 1
+        convs = [
+            nn.Conv2d(
+                in_channels=filters[i], out_channels=filters[i + 1], kernel_size=(3, 3), stride=(2, 2), padding=(2, 2)
+            )
+            for i in range(num_layers)
+        ]
+        self.convs = nn.ModuleList(convs)
+        self.training = False
+        self.bns = nn.ModuleList([nn.BatchNorm2d(num_features=filter_size) for filter_size in filters[1:]])
+
+        post_conv_height = self.calculate_post_conv_height(num_mel, 3, 2, 2, num_layers)
+        self.recurrence = nn.LSTM(
+            input_size=filters[-1] * post_conv_height, hidden_size=out_dim, batch_first=True, bidirectional=False
+        )
+
+    def forward(self, inputs, input_lengths):
+        batch_size = inputs.size(0)
+        x = inputs.view(batch_size, 1, -1, self.num_mel)  # [batch_size, num_channels==1, num_frames, num_mel]
+        valid_lengths = input_lengths.float()  # [batch_size]
+        for conv, bn in zip(self.convs, self.bns):
+            x = conv(x)
+            x = bn(x)
+            x = F.relu(x)
+
+            # Create the post conv width mask based on the valid lengths of the output of the convolution.
+            # The valid lengths for the output of a convolution on varying length inputs is
+            # ceil(input_length/stride) + 1 for stride=3 and padding=2
+            # For example (kernel_size=3, stride=2, padding=2):
+            # 0 0 x x x x x 0 0 -> Input = 5, 0 is zero padding, x is valid values coming from padding=2 in conv2d
+            # _____
+            #   x _____
+            #       x _____
+            #           x  ____
+            #               x
+            # x x x x -> Output valid length = 4
+            # Since every example in te batch is zero padded and therefore have separate valid_lengths,
+            # we need to mask off all the values AFTER the valid length for each example in the batch.
+            # Otherwise, the convolutions create noise and a lot of not real information
+            valid_lengths = (valid_lengths / 2).float()
+            valid_lengths = torch.ceil(valid_lengths).to(dtype=torch.int64) + 1  # 2 is stride -- size: [batch_size]
+            post_conv_max_width = x.size(2)
+
+            mask = torch.arange(post_conv_max_width).to(inputs.device).expand(
+                len(valid_lengths), post_conv_max_width
+            ) < valid_lengths.unsqueeze(1)
+            mask = mask.expand(1, 1, -1, -1).transpose(2, 0).transpose(-1, 2)  # [batch_size, 1, post_conv_max_width, 1]
+            x = x * mask
+
+        x = x.transpose(1, 2)
+        # x: 4D tensor [batch_size, post_conv_width,
+        #               num_channels==128, post_conv_height]
+
+        post_conv_width = x.size(1)
+        x = x.contiguous().view(batch_size, post_conv_width, -1)
+        # x: 3D tensor [batch_size, post_conv_width,
+        #               num_channels*post_conv_height]
+
+        # Routine for fetching the last valid output of a dynamic LSTM with varying input lengths and padding
+        post_conv_input_lengths = valid_lengths
+        packed_seqs = nn.utils.rnn.pack_padded_sequence(
+            x, post_conv_input_lengths.tolist(), batch_first=True, enforce_sorted=False
+        )  # dynamic rnn sequence padding
+        self.recurrence.flatten_parameters()
+        _, (ht, _) = self.recurrence(packed_seqs)
+        last_output = ht[-1]
+
+        return last_output.to(inputs.device)  # [B, 128]
--- a/TTS/tts/layers/vits/vqvae.py
+++ b/TTS/tts/layers/vits/vqvae.py
@ -0,0 +1,67 @@
+import torch
+from torch.autograd import Function
+
+class VectorQuantization(Function):
+    @staticmethod
+    def forward(ctx, inputs, codebook):
+        with torch.no_grad():
+            embedding_size = codebook.size(1)
+            inputs_size = inputs.size()
+            inputs_flatten = inputs.view(-1, embedding_size)
+
+            codebook_sqr = torch.sum(codebook ** 2, dim=1)
+            inputs_sqr = torch.sum(inputs_flatten ** 2, dim=1, keepdim=True)
+
+            # Compute the distances to the codebook
+            distances = torch.addmm(codebook_sqr + inputs_sqr,
+                inputs_flatten, codebook.t(), alpha=-2.0, beta=1.0)
+
+            _, indices_flatten = torch.min(distances, dim=1)
+            indices = indices_flatten.view(*inputs_size[:-1])
+            ctx.mark_non_differentiable(indices)
+
+            return indices
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        raise RuntimeError('Trying to call `.grad()` on graph containing '
+            '`VectorQuantization`. The function `VectorQuantization` '
+            'is not differentiable. Use `VectorQuantizationStraightThrough` '
+            'if you want a straight-through estimator of the gradient.')
+
+class VectorQuantizationStraightThrough(Function):
+    @staticmethod
+    def forward(ctx, inputs, codebook):
+        indices = vq(inputs, codebook)
+        indices_flatten = indices.view(-1)
+        ctx.save_for_backward(indices_flatten, codebook)
+        ctx.mark_non_differentiable(indices_flatten)
+
+        codes_flatten = torch.index_select(codebook, dim=0,
+            index=indices_flatten)
+        codes = codes_flatten.view_as(inputs)
+
+        return (codes, indices_flatten)
+
+    @staticmethod
+    def backward(ctx, grad_output, grad_indices):
+        grad_inputs, grad_codebook = None, None
+
+        if ctx.needs_input_grad[0]:
+            # Straight-through estimator
+            grad_inputs = grad_output.clone()
+        if ctx.needs_input_grad[1]:
+            # Gradient wrt. the codebook
+            indices, codebook = ctx.saved_tensors
+            embedding_size = codebook.size(1)
+
+            grad_output_flatten = (grad_output.contiguous()
+                                              .view(-1, embedding_size))
+            grad_codebook = torch.zeros_like(codebook)
+            grad_codebook.index_add_(0, indices, grad_output_flatten)
+
+        return (grad_inputs, grad_codebook)
+
+vq = VectorQuantization.apply
+vq_st = VectorQuantizationStraightThrough.apply
+__all__ = [vq, vq_st]
--- a/TTS/tts/models/tortoise.py
+++ b/TTS/tts/models/tortoise.py
@ -0,0 +1,835 @@
+# ## AGPL: a notification must be added stating that changes have been made to that file.
+
+import os
+import random
+from time import time
+
+import torch
+import torch.nn.functional as F
+import torchaudio
+
+from tqdm import tqdm
+
+from TTS.tts.layers.tortoise.arch_utils import TorchMelSpectrogram
+from TTS.tts.layers.tortoise.audio_utils import denormalize_tacotron_mel, wav_to_univnet_mel
+from TTS.tts.layers.tortoise.autoregressive import UnifiedVoice
+from TTS.tts.layers.tortoise.classifier import AudioMiniEncoderWithClassifierHead
+from TTS.tts.layers.tortoise.clvp import CLVP
+from TTS.tts.layers.tortoise.cvvp import CVVP
+from TTS.tts.layers.tortoise.diffusion_decoder import DiffusionTts
+from TTS.tts.layers.tortoise.random_latent_generator import RandomLatentConverter
+from TTS.tts.layers.tortoise.vocoder import VocConf
+
+from TTS.tts.layers.tortoise.diffusion import (
+    SpacedDiffusion,
+    get_named_beta_schedule,
+    space_timesteps,
+)
+
+from TTS.tts.layers.tortoise.tokenizer import VoiceBpeTokenizer
+from TTS.tts.layers.tortoise.wav2vec_alignment import Wav2VecAlignment
+
+from TTS.tts.layers.tortoise.utils import MODELS_DIR, get_model_path
+
+from contextlib import contextmanager
+
+def pad_or_truncate(t, length):
+    """
+    Utility function for forcing <t> to have the specified sequence length, whether by clipping it or padding it with 0s.
+    """
+    if t.shape[-1] == length:
+        return t
+    elif t.shape[-1] < length:
+        return F.pad(t, (0, length - t.shape[-1]))
+    else:
+        return t[..., :length]
+
+
+def load_discrete_vocoder_diffuser(
+    trained_diffusion_steps=4000,
+    desired_diffusion_steps=200,
+    cond_free=True,
+    cond_free_k=1,
+    sampler="ddim",
+):
+    """
+    Helper function to load a GaussianDiffusion instance configured for use as a vocoder.
+    """
+    return SpacedDiffusion(
+        use_timesteps=space_timesteps(
+            trained_diffusion_steps, [desired_diffusion_steps]
+        ),
+        model_mean_type="epsilon",
+        model_var_type="learned_range",
+        loss_type="mse",
+        betas=get_named_beta_schedule("linear", trained_diffusion_steps),
+        conditioning_free=cond_free,
+        conditioning_free_k=cond_free_k,
+        sampler=sampler,
+    )
+
+
+def format_conditioning(clip, cond_length=132300, device="cuda"):
+    """
+    Converts the given conditioning signal to a MEL spectrogram and clips it as expected by the models.
+    """
+    gap = clip.shape[-1] - cond_length
+    if gap < 0:
+        clip = F.pad(clip, pad=(0, abs(gap)))
+    elif gap > 0:
+        rand_start = random.randint(0, gap)
+        clip = clip[:, rand_start : rand_start + cond_length]
+    mel_clip = TorchMelSpectrogram()(clip.unsqueeze(0)).squeeze(0)
+    return mel_clip.unsqueeze(0).to(device)
+
+
+def fix_autoregressive_output(codes, stop_token, complain=True):
+    """
+    This function performs some padding on coded audio that fixes a mismatch issue between what the diffusion model was
+    trained on and what the autoregressive code generator creates (which has no padding or end).
+    This is highly specific to the DVAE being used, so this particular coding will not necessarily work if used with
+    a different DVAE. This can be inferred by feeding a audio clip padded with lots of zeros on the end through the DVAE
+    and copying out the last few codes.
+
+    Failing to do this padding will produce speech with a harsh end that sounds like "BLAH" or similar.
+    """
+    # Strip off the autoregressive stop token and add padding.
+    stop_token_indices = (codes == stop_token).nonzero()
+    if len(stop_token_indices) == 0:
+        if complain:
+            print(
+                "No stop tokens found in one of the generated voice clips. This typically means the spoken audio is "
+                "too long. In some cases, the output will still be good, though. Listen to it and if it is missing words, "
+                "try breaking up your input text."
+            )
+        return codes
+    else:
+        codes[stop_token_indices] = 83
+    stm = stop_token_indices.min().item()
+    codes[stm:] = 83
+    if stm - 3 < codes.shape[0]:
+        codes[-3] = 45
+        codes[-2] = 45
+        codes[-1] = 248
+
+    return codes
+
+
+def do_spectrogram_diffusion(
+    diffusion_model,
+    diffuser,
+    latents,
+    conditioning_latents,
+    temperature=1,
+    verbose=True,
+):
+    """
+    Uses the specified diffusion model to convert discrete codes into a spectrogram.
+    """
+    with torch.no_grad():
+        output_seq_len = (
+            latents.shape[1] * 4 * 24000 // 22050
+        )  # This diffusion model converts from 22kHz spectrogram codes to a 24kHz spectrogram signal.
+        output_shape = (latents.shape[0], 100, output_seq_len)
+        precomputed_embeddings = diffusion_model.timestep_independent(
+            latents, conditioning_latents, output_seq_len, False
+        )
+
+        noise = torch.randn(output_shape, device=latents.device) * temperature
+        mel = diffuser.sample_loop(
+                diffusion_model,
+                output_shape,
+                noise=noise,
+                model_kwargs={"precomputed_aligned_embeddings": precomputed_embeddings},
+                progress=verbose
+            )
+        return denormalize_tacotron_mel(mel)[:, :, :output_seq_len]
+
+
+def classify_audio_clip(clip):
+    """
+    Returns whether or not Tortoises' classifier thinks the given clip came from Tortoise.
+    :param clip: torch tensor containing audio waveform data (get it from load_audio)
+    :return: True if the clip was classified as coming from Tortoise and false if it was classified as real.
+    """
+    classifier = AudioMiniEncoderWithClassifierHead(
+        2,
+        spec_dim=1,
+        embedding_dim=512,
+        depth=5,
+        downsample_factor=4,
+        resnet_blocks=2,
+        attn_blocks=4,
+        num_attn_heads=4,
+        base_channels=32,
+        dropout=0,
+        kernel_size=5,
+        distribute_zero_label=False,
+    )
+    classifier.load_state_dict(
+        torch.load(get_model_path("classifier.pth"), map_location=torch.device("cpu"))
+    )
+    clip = clip.cpu().unsqueeze(0)
+    results = F.softmax(classifier(clip), dim=-1)
+    return results[0][0]
+
+
+def pick_best_batch_size_for_gpu():
+    """
+    Tries to pick a batch size that will fit in your GPU. These sizes aren't guaranteed to work, but they should give
+    you a good shot.
+    """
+    if torch.cuda.is_available():
+        _, available = torch.cuda.mem_get_info()
+        availableGb = available / (1024**3)
+        if availableGb > 14:
+            return 16
+        elif availableGb > 10:
+            return 8
+        elif availableGb > 7:
+            return 4
+    return 1
+
+
+class TextToSpeech:
+    """
+    Main entry point into Tortoise.
+    """
+
+    def _config(self):
+        raise RuntimeError("This is depreciated")
+        return {
+            "high_vram": self.high_vram,
+            "models_dir": self.models_dir,
+            "kv_cache": self.autoregressive.inference_model.kv_cache,
+            "ar_checkpoint": self.ar_checkpoint,
+        }
+
+    def __init__(
+        self,
+        autoregressive_batch_size=None,
+        models_dir=MODELS_DIR,
+        enable_redaction=True,
+        device=None,
+        high_vram=False,
+        kv_cache=True,
+        ar_checkpoint=None,
+        clvp_checkpoint=None,
+        diff_checkpoint=None,
+        vocoder=VocConf.Univnet,
+    ):
+        """
+        Constructor
+        :param autoregressive_batch_size: Specifies how many samples to generate per batch. Lower this if you are seeing
+                                          GPU OOM errors. Larger numbers generates slightly faster.
+        :param models_dir: Where model weights are stored. This should only be specified if you are providing your own
+                           models, otherwise use the defaults.
+        :param enable_redaction: When true, text enclosed in brackets are automatically redacted from the spoken output
+                                 (but are still rendered by the model). This can be used for prompt engineering.
+                                 Default is true.
+        :param device: Device to use when running the model. If omitted, the device will be automatically chosen.
+        :param high_vram: If true, the model will use more VRAM but will run faster.
+        :param kv_cache: If true, the autoregressive model will cache key value attention pairs to speed up generation.
+        :param ar_checkpoint: Path to a checkpoint file for the autoregressive model. If omitted, uses default
+        :param clvp_checkpoint: Path to a checkpoint file for the CLVP model. If omitted, uses default
+        :param diff_checkpoint: Path to a checkpoint file for the diffusion model. If omitted, uses default
+        """
+        self.ar_checkpoint = ar_checkpoint
+        self.diff_checkpoint = diff_checkpoint  # TODO: check if this is even needed
+        self.models_dir = models_dir
+        self.autoregressive_batch_size = (
+            pick_best_batch_size_for_gpu()
+            if autoregressive_batch_size is None
+            else autoregressive_batch_size
+        )
+        self.enable_redaction = enable_redaction
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        if self.enable_redaction:
+            self.aligner = Wav2VecAlignment()
+
+        self.tokenizer = VoiceBpeTokenizer()
+
+        if os.path.exists(f"{models_dir}/autoregressive.ptt"):
+            # Assume this is a traced directory.
+            self.autoregressive = torch.jit.load(f"{models_dir}/autoregressive.ptt")
+            self.diffusion = torch.jit.load(f"{models_dir}/diffusion_decoder.ptt")
+        else:
+            self.autoregressive = (
+                UnifiedVoice(
+                    max_mel_tokens=604,
+                    max_text_tokens=402,
+                    max_conditioning_inputs=2,
+                    layers=30,
+                    model_dim=1024,
+                    heads=16,
+                    number_text_tokens=255,
+                    start_text_token=255,
+                    checkpointing=False,
+                    train_solo_embeddings=False,
+                )
+                .cpu()
+                .eval()
+            )
+            ar_path = ar_checkpoint or get_model_path("autoregressive.pth", models_dir)
+            self.autoregressive.load_state_dict(torch.load(ar_path))
+            self.autoregressive.post_init_gpt2_config(kv_cache)
+
+            diff_path = diff_checkpoint or get_model_path(
+                "diffusion_decoder.pth", models_dir
+            )
+            self.diffusion = (
+                DiffusionTts(
+                    model_channels=1024,
+                    num_layers=10,
+                    in_channels=100,
+                    out_channels=200,
+                    in_latent_channels=1024,
+                    in_tokens=8193,
+                    dropout=0,
+                    use_fp16=False,
+                    num_heads=16,
+                    layer_drop=0,
+                    unconditioned_percentage=0,
+                )
+                .cpu()
+                .eval()
+            )
+            self.diffusion.load_state_dict(torch.load(diff_path))
+        self.clvp = (
+            CLVP(
+                dim_text=768,
+                dim_speech=768,
+                dim_latent=768,
+                num_text_tokens=256,
+                text_enc_depth=20,
+                text_seq_len=350,
+                text_heads=12,
+                num_speech_tokens=8192,
+                speech_enc_depth=20,
+                speech_heads=12,
+                speech_seq_len=430,
+                use_xformers=True,
+            )
+            .cpu()
+            .eval()
+        )
+        clvp_path = clvp_checkpoint or get_model_path("clvp2.pth", models_dir)
+        self.clvp.load_state_dict(torch.load(clvp_path))
+        self.cvvp = None  # CVVP model is only loaded if used.
+
+        self.vocoder = vocoder.value.constructor().cpu()
+        self.vocoder.load_state_dict(
+            vocoder.value.optionally_index(
+                torch.load(
+                    get_model_path(vocoder.value.model_path, models_dir),
+                    map_location=torch.device("cpu"),
+                )
+            )
+        )
+        self.vocoder.eval(inference=True)
+
+        # Random latent generators (RLGs) are loaded lazily.
+        self.rlg_auto = None
+        self.rlg_diffusion = None
+
+        if high_vram:
+            self.autoregressive = self.autoregressive.to(self.device)
+            self.diffusion = self.diffusion.to(self.device)
+            self.clvp = self.clvp.to(self.device)
+            self.vocoder = self.vocoder.to(self.device)
+        self.high_vram = high_vram
+
+    @contextmanager
+    def temporary_cuda(self, model):
+        if self.high_vram:
+            yield model
+        else:
+            m = model.to(self.device)
+            yield m
+            m = model.cpu()
+
+    def load_cvvp(self):
+        """Load CVVP model."""
+        self.cvvp = (
+            CVVP(
+                model_dim=512,
+                transformer_heads=8,
+                dropout=0,
+                mel_codes=8192,
+                conditioning_enc_depth=8,
+                cond_mask_percentage=0,
+                speech_enc_depth=8,
+                speech_mask_percentage=0,
+                latent_multiplier=1,
+            )
+            .cpu()
+            .eval()
+        )
+        self.cvvp.load_state_dict(
+            torch.load(get_model_path("cvvp.pth", self.models_dir))
+        )
+
+    def get_conditioning_latents(
+        self,
+        voice_samples,
+        return_mels=False,
+        latent_averaging_mode=0,
+        original_tortoise=False,
+    ):
+        """
+        Transforms one or more voice_samples into a tuple (autoregressive_conditioning_latent, diffusion_conditioning_latent).
+        These are expressive learned latents that encode aspects of the provided clips like voice, intonation, and acoustic
+        properties.
+        :param voice_samples: List of arbitrary reference clips, which should be *pairs* of torch tensors containing arbitrary kHz waveform data.
+        :param latent_averaging_mode: 0/1/2 for following modes:
+            0 - latents will be generated as in original tortoise, using ~4.27s from each voice sample, averaging latent across all samples
+            1 - latents will be generated using (almost) entire voice samples, averaged across all the ~4.27s chunks
+            2 - latents will be generated using (almost) entire voice samples, averaged per voice sample
+        """
+        assert latent_averaging_mode in [
+            0,
+            1,
+            2,
+        ], "latent_averaging mode has to be one of (0, 1, 2)"
+        print("mode", latent_averaging_mode)
+        with torch.no_grad():
+            voice_samples = [[v.to(self.device) for v in ls] for ls in voice_samples]
+
+            auto_conds = []
+            for ls in voice_samples:
+                auto_conds.append(format_conditioning(ls[0], device=self.device))
+            auto_conds = torch.stack(auto_conds, dim=1)
+            with self.temporary_cuda(self.autoregressive) as ar:
+                auto_latent = ar.get_conditioning(auto_conds)
+
+            diffusion_conds = []
+
+            DURS_CONST = 102400
+            for ls in voice_samples:
+                # The diffuser operates at a sample rate of 24000 (except for the latent inputs)
+                sample = (
+                    torchaudio.functional.resample(ls[0], 22050, 24000)
+                    if original_tortoise
+                    else ls[1]
+                )
+                if latent_averaging_mode == 0:
+                    sample = pad_or_truncate(sample, DURS_CONST)
+                    cond_mel = wav_to_univnet_mel(
+                        sample.to(self.device),
+                        do_normalization=False,
+                        device=self.device,
+                    )
+                    diffusion_conds.append(cond_mel)
+                else:
+                    from math import ceil
+
+                    if latent_averaging_mode == 2:
+                        temp_diffusion_conds = []
+                    for chunk in range(ceil(sample.shape[1] / DURS_CONST)):
+                        current_sample = sample[
+                            :, chunk * DURS_CONST : (chunk + 1) * DURS_CONST
+                        ]
+                        current_sample = pad_or_truncate(current_sample, DURS_CONST)
+                        cond_mel = wav_to_univnet_mel(
+                            current_sample.to(self.device),
+                            do_normalization=False,
+                            device=self.device,
+                        )
+                        if latent_averaging_mode == 1:
+                            diffusion_conds.append(cond_mel)
+                        elif latent_averaging_mode == 2:
+                            temp_diffusion_conds.append(cond_mel)
+                    if latent_averaging_mode == 2:
+                        diffusion_conds.append(
+                            torch.stack(temp_diffusion_conds).mean(0)
+                        )
+            diffusion_conds = torch.stack(diffusion_conds, dim=1)
+
+            with self.temporary_cuda(self.diffusion) as diffusion:
+                diffusion_latent = diffusion.get_conditioning(diffusion_conds)
+
+        if return_mels:
+            return auto_latent, diffusion_latent, auto_conds, diffusion_conds
+        else:
+            return auto_latent, diffusion_latent
+
+    def get_random_conditioning_latents(self):
+        # Lazy-load the RLG models.
+        if self.rlg_auto is None:
+            self.rlg_auto = RandomLatentConverter(1024).eval()
+            self.rlg_auto.load_state_dict(
+                torch.load(
+                    get_model_path("rlg_auto.pth", self.models_dir),
+                    map_location=torch.device("cpu"),
+                )
+            )
+            self.rlg_diffusion = RandomLatentConverter(2048).eval()
+            self.rlg_diffusion.load_state_dict(
+                torch.load(
+                    get_model_path("rlg_diffuser.pth", self.models_dir),
+                    map_location=torch.device("cpu"),
+                )
+            )
+        with torch.no_grad():
+            return self.rlg_auto(torch.tensor([0.0])), self.rlg_diffusion(
+                torch.tensor([0.0])
+            )
+
+    def tts_with_preset(self, text, preset="fast", **kwargs):
+        """
+        Calls TTS with one of a set of preset generation parameters. Options:
+            'single_sample': Produces speech even faster, but only produces 1 sample.
+            'ultra_fast': Produces speech much faster than the original tortoise repo.
+            'ultra_fast_old': Produces speech at a speed which belies the name of this repo. (Not really, but it's definitely fastest).
+            'fast': Decent quality speech at a decent inference rate. A good choice for mass inference.
+            'standard': Very good quality. This is generally about as good as you are going to get.
+            'high_quality': Use if you want the absolute best. This is not really worth the compute, though.
+        """
+        # Use generally found best tuning knobs for generation.
+        settings = {
+            "temperature": 0.2,
+            "length_penalty": 1.0,
+            "repetition_penalty": 2.0,
+            "top_p": 0.8,
+            "cond_free_k": 2.0,
+            "diffusion_temperature": 1.0,
+        }
+        # Presets are defined here.
+        presets = {
+            "single_sample": {
+                "num_autoregressive_samples": 8,
+                "diffusion_iterations": 10,
+                "sampler": "ddim",
+            },
+            "ultra_fast": {
+                "num_autoregressive_samples": 16,
+                "diffusion_iterations": 10,
+                "sampler": "ddim",
+            },
+            "ultra_fast_old": {
+                "num_autoregressive_samples": 16,
+                "diffusion_iterations": 30,
+                "cond_free": False,
+            },
+            "very_fast": {
+                "num_autoregressive_samples": 32,
+                "diffusion_iterations": 30,
+                "sampler": "dpm++2m",
+            },
+            "fast": {
+                "num_autoregressive_samples": 16,
+                "diffusion_iterations": 50,
+                "sampler": "ddim",
+            },
+            "fast_old": {
+                "num_autoregressive_samples": 96, 
+                "diffusion_iterations": 80
+            },
+            "standard": {
+                "num_autoregressive_samples": 256,
+                "diffusion_iterations": 200,
+            },
+            "high_quality": {
+                "num_autoregressive_samples": 256,
+                "diffusion_iterations": 400,
+            },
+        }
+        settings.update(presets[preset])
+        settings.update(kwargs)  # allow overriding of preset settings with kwargs
+        return self.tts(text, **settings)
+
+    def tts(
+        self,
+        text,
+        voice_samples=None,
+        conditioning_latents=None,
+        k=1,
+        verbose=True,
+        use_deterministic_seed=None,
+        return_deterministic_state=False,
+        latent_averaging_mode=0,
+        # autoregressive generation parameters follow
+        num_autoregressive_samples=512,
+        temperature=0.8,
+        length_penalty=1,
+        repetition_penalty=2.0,
+        top_p=0.8,
+        max_mel_tokens=500,
+        # CVVP parameters follow
+        cvvp_amount=0.0,
+        # diffusion generation parameters follow
+        diffusion_iterations=100,
+        cond_free=True,
+        cond_free_k=2,
+        diffusion_temperature=1.0,
+        sampler="ddim",
+        half=True,
+        original_tortoise=False,
+        **hf_generate_kwargs,
+    ):
+        """
+        Produces an audio clip of the given text being spoken with the given reference voice.
+        :param text: Text to be spoken.
+        :param voice_samples: List of an arbitrary number of reference clips, which should be *tuple-pairs* of torch tensors containing arbitrary kHz waveform data.
+        :param conditioning_latents: A tuple of (autoregressive_conditioning_latent, diffusion_conditioning_latent), which
+                                     can be provided in lieu of voice_samples. This is ignored unless voice_samples=None.
+                                     Conditioning latents can be retrieved via get_conditioning_latents().
+        :param k: The number of returned clips. The most likely (as determined by Tortoises' CLVP model) clips are returned.
+        :param latent_averaging_mode: 0/1/2 for following modes:
+            0 - latents will be generated as in original tortoise, using ~4.27s from each voice sample, averaging latent across all samples
+            1 - latents will be generated using (almost) entire voice samples, averaged across all the ~4.27s chunks
+            2 - latents will be generated using (almost) entire voice samples, averaged per voice sample
+        :param verbose: Whether or not to print log messages indicating the progress of creating a clip. Default=true.
+        ~~AUTOREGRESSIVE KNOBS~~
+        :param num_autoregressive_samples: Number of samples taken from the autoregressive model, all of which are filtered using CLVP.
+               As Tortoise is a probabilistic model, more samples means a higher probability of creating something "great".
+        :param temperature: The softmax temperature of the autoregressive model.
+        :param length_penalty: A length penalty applied to the autoregressive decoder. Higher settings causes the model to produce more terse outputs.
+        :param repetition_penalty: A penalty that prevents the autoregressive decoder from repeating itself during decoding. Can be used to reduce the incidence
+                                   of long silences or "uhhhhhhs", etc.
+        :param top_p: P value used in nucleus sampling. (0,1]. Lower values mean the decoder produces more "likely" (aka boring) outputs.
+        :param max_mel_tokens: Restricts the output length. (0,600] integer. Each unit is 1/20 of a second.
+        :param typical_sampling: Turns typical sampling on or off. This sampling mode is discussed in this paper: https://arxiv.org/abs/2202.00666
+                                 I was interested in the premise, but the results were not as good as I was hoping. This is off by default, but
+                                 could use some tuning.
+        :param typical_mass: The typical_mass parameter from the typical_sampling algorithm.
+        ~~CLVP-CVVP KNOBS~~
+        :param cvvp_amount: Controls the influence of the CVVP model in selecting the best output from the autoregressive model.
+                            [0,1]. Values closer to 1 mean the CVVP model is more important, 0 disables the CVVP model.
+        ~~DIFFUSION KNOBS~~
+        :param diffusion_iterations: Number of diffusion steps to perform. [0,4000]. More steps means the network has more chances to iteratively refine
+                                     the output, which should theoretically mean a higher quality output. Generally a value above 250 is not noticeably better,
+                                     however.
+        :param cond_free: Whether or not to perform conditioning-free diffusion. Conditioning-free diffusion performs two forward passes for
+                          each diffusion step: one with the outputs of the autoregressive model and one with no conditioning priors. The output
+                          of the two is blended according to the cond_free_k value below. Conditioning-free diffusion is the real deal, and
+                          dramatically improves realism.
+        :param cond_free_k: Knob that determines how to balance the conditioning free signal with the conditioning-present signal. [0,inf].
+                            As cond_free_k increases, the output becomes dominated by the conditioning-free signal.
+                            Formula is: output=cond_present_output*(cond_free_k+1)-cond_absenct_output*cond_free_k
+        :param diffusion_temperature: Controls the variance of the noise fed into the diffusion model. [0,1]. Values at 0
+                                      are the "mean" prediction of the diffusion network and will sound bland and smeared.
+        ~~OTHER STUFF~~
+        :param hf_generate_kwargs: The huggingface Transformers generate API is used for the autoregressive transformer.
+                                   Extra keyword args fed to this function get forwarded directly to that API. Documentation
+                                   here: https://huggingface.co/docs/transformers/internal/generation_utils
+        :return: Generated audio clip(s) as a torch tensor. Shape 1,S if k=1 else, (k,1,S) where S is the sample length.
+                 Sample rate is 24kHz.
+        """
+        deterministic_seed = self.deterministic_state(seed=use_deterministic_seed)
+
+        text_tokens = (
+            torch.IntTensor(self.tokenizer.encode(text)).unsqueeze(0).to(self.device)
+        )
+        text_tokens = F.pad(text_tokens, (0, 1))  # This may not be necessary.
+        assert (
+            text_tokens.shape[-1] < 400
+        ), "Too much text provided. Break the text up into separate segments and re-try inference."
+
+        auto_conds = None
+        if voice_samples is not None:
+            (
+                auto_conditioning,
+                diffusion_conditioning,
+                auto_conds,
+                _,
+            ) = self.get_conditioning_latents(
+                voice_samples,
+                return_mels=True,
+                latent_averaging_mode=latent_averaging_mode,
+                original_tortoise=original_tortoise,
+            )
+        elif conditioning_latents is not None:
+            auto_conditioning, diffusion_conditioning = conditioning_latents
+        else:
+            (
+                auto_conditioning,
+                diffusion_conditioning,
+            ) = self.get_random_conditioning_latents()
+        auto_conditioning = auto_conditioning.to(self.device)
+        diffusion_conditioning = diffusion_conditioning.to(self.device)
+
+        diffuser = load_discrete_vocoder_diffuser(
+            desired_diffusion_steps=diffusion_iterations,
+            cond_free=cond_free,
+            cond_free_k=cond_free_k,
+            sampler=sampler
+        )
+
+        # in the case of single_sample,
+        orig_batch_size = self.autoregressive_batch_size
+        while num_autoregressive_samples % self.autoregressive_batch_size:
+            self.autoregressive_batch_size //= 2
+        with torch.no_grad():
+            samples = []
+            num_batches = num_autoregressive_samples // self.autoregressive_batch_size
+            stop_mel_token = self.autoregressive.stop_mel_token
+            calm_token = 83  # This is the token for coding silence, which is fixed in place with "fix_autoregressive_output"
+            self.autoregressive = self.autoregressive.to(self.device)
+            if verbose:
+                print("Generating autoregressive samples..")
+            with self.temporary_cuda(
+                self.autoregressive
+            ) as autoregressive, torch.autocast(
+                device_type="cuda", dtype=torch.float16, enabled=half
+            ):
+                for b in tqdm(range(num_batches), disable=not verbose):
+                    codes = autoregressive.inference_speech(
+                        auto_conditioning,
+                        text_tokens,
+                        do_sample=True,
+                        top_p=top_p,
+                        temperature=temperature,
+                        num_return_sequences=self.autoregressive_batch_size,
+                        length_penalty=length_penalty,
+                        repetition_penalty=repetition_penalty,
+                        max_generate_length=max_mel_tokens,
+                        **hf_generate_kwargs,
+                    )
+                    padding_needed = max_mel_tokens - codes.shape[1]
+                    codes = F.pad(codes, (0, padding_needed), value=stop_mel_token)
+                    samples.append(codes)
+            self.autoregressive_batch_size = (
+                orig_batch_size  # in the case of single_sample
+            )
+
+            clip_results = []
+            with self.temporary_cuda(self.clvp) as clvp, torch.autocast(
+                device_type="cuda", dtype=torch.float16, enabled=half
+            ):
+                if cvvp_amount > 0:
+                    if self.cvvp is None:
+                        self.load_cvvp()
+                    self.cvvp = self.cvvp.to(self.device)
+                if verbose:
+                    if self.cvvp is None:
+                        print("Computing best candidates using CLVP")
+                    else:
+                        print(
+                            f"Computing best candidates using CLVP {((1-cvvp_amount) * 100):2.0f}% and CVVP {(cvvp_amount * 100):2.0f}%"
+                        )
+                for batch in tqdm(samples, disable=not verbose):
+                    for i in range(batch.shape[0]):
+                        batch[i] = fix_autoregressive_output(batch[i], stop_mel_token)
+                    if cvvp_amount != 1:
+                        clvp_res = clvp(
+                            text_tokens.repeat(batch.shape[0], 1),
+                            batch,
+                            return_loss=False,
+                        )
+                    if auto_conds is not None and cvvp_amount > 0:
+                        cvvp_accumulator = 0
+                        for cl in range(auto_conds.shape[1]):
+                            cvvp_accumulator = cvvp_accumulator + self.cvvp(
+                                auto_conds[:, cl].repeat(batch.shape[0], 1, 1),
+                                batch,
+                                return_loss=False,
+                            )
+                        cvvp = cvvp_accumulator / auto_conds.shape[1]
+                        if cvvp_amount == 1:
+                            clip_results.append(cvvp)
+                        else:
+                            clip_results.append(
+                                cvvp * cvvp_amount + clvp_res * (1 - cvvp_amount)
+                            )
+                    else:
+                        clip_results.append(clvp_res)
+                clip_results = torch.cat(clip_results, dim=0)
+                samples = torch.cat(samples, dim=0)
+                best_results = samples[torch.topk(clip_results, k=k).indices]
+            if self.cvvp is not None:
+                self.cvvp = self.cvvp.cpu()
+            del samples
+
+            # The diffusion model actually wants the last hidden layer from the autoregressive model as conditioning
+            # inputs. Re-produce those for the top results. This could be made more efficient by storing all of these
+            # results, but will increase memory usage.
+            with self.temporary_cuda(
+                self.autoregressive
+            ) as autoregressive:
+                best_latents = autoregressive(
+                    auto_conditioning.repeat(k, 1),
+                    text_tokens.repeat(k, 1),
+                    torch.tensor([text_tokens.shape[-1]], device=text_tokens.device),
+                    best_results,
+                    torch.tensor(
+                        [
+                            best_results.shape[-1]
+                            * self.autoregressive.mel_length_compression
+                        ],
+                        device=text_tokens.device,
+                    ),
+                    return_latent=True,
+                    clip_inputs=False,
+                )
+            del auto_conditioning
+
+            if verbose:
+                print("Transforming autoregressive outputs into audio..")
+            wav_candidates = []
+            for b in range(best_results.shape[0]):
+                codes = best_results[b].unsqueeze(0)
+                latents = best_latents[b].unsqueeze(0)
+
+                # Find the first occurrence of the "calm" token and trim the codes to that.
+                ctokens = 0
+                for k in range(codes.shape[-1]):
+                    if codes[0, k] == calm_token:
+                        ctokens += 1
+                    else:
+                        ctokens = 0
+                    if (
+                        ctokens > 8
+                    ):  # 8 tokens gives the diffusion model some "breathing room" to terminate speech.
+                        latents = latents[:, :k]
+                        break
+                with self.temporary_cuda(self.diffusion) as diffusion:
+                    mel = do_spectrogram_diffusion(
+                        diffusion,
+                        diffuser,
+                        latents,
+                        diffusion_conditioning,
+                        temperature=diffusion_temperature,
+                        verbose=verbose,
+                    )
+                with self.temporary_cuda(self.vocoder) as vocoder:
+                    wav = vocoder.inference(mel)
+                    wav_candidates.append(wav.cpu())
+
+            def potentially_redact(clip, text):
+                if self.enable_redaction:
+                    return self.aligner.redact(clip.squeeze(1), text).unsqueeze(1)
+                return clip
+
+            wav_candidates = [
+                potentially_redact(wav_candidate, text)
+                for wav_candidate in wav_candidates
+            ]
+
+            if len(wav_candidates) > 1:
+                res = wav_candidates
+            else:
+                res = wav_candidates[0]
+
+            if return_deterministic_state:
+                return res, (
+                    deterministic_seed,
+                    text,
+                    voice_samples,
+                    conditioning_latents,
+                )
+            else:
+                return res
+
+    def deterministic_state(self, seed=None):
+        """
+        Sets the random seeds that tortoise uses to the current time() and returns that seed so results can be
+        reproduced.
+        """
+        seed = int(time()) if seed is None else seed
+        torch.manual_seed(seed)
+        random.seed(seed)
+        # Can't currently set this because of CUBLAS. TODO: potentially enable it if necessary.
+        # torch.use_deterministic_algorithms(True)
+
+        return seed
--- a/TTS/tts/utils/assets/tortoise/got.txt
+++ b/TTS/tts/utils/assets/tortoise/got.txt
@ -0,0 +1,276 @@
+Chapter One
+
+
+Bran
+
+
+The morning had dawned clear and cold, with a crispness that hinted at the end of summer. They set forth at daybreak to see a man beheaded, twenty in all, and Bran rode among them, nervous with excitement. This was the first time he had been deemed old enough to go with his lord father and his brothers to see the king's justice done. It was the ninth year of summer, and the seventh of Bran's life.
+
+
+The man had been taken outside a small holdfast in the hills. Robb thought he was a wildling, his sword sworn to Mance Rayder, the King-beyond-the-Wall. It made Bran's skin prickle to think of it. He remembered the hearth tales Old Nan told them. The wildlings were cruel men, she said, slavers and slayers and thieves. They consorted with giants and ghouls, stole girl children in the dead of night, and drank blood from polished horns. And their women lay with the Others in the Long Night to sire terrible half-human children.
+
+
+But the man they found bound hand and foot to the holdfast wall awaiting the king's justice was old and scrawny, not much taller than Robb. He had lost both ears and a finger to frostbite, and he dressed all in black, the same as a brother of the Night's Watch, except that his furs were ragged and greasy.
+
+
+The breath of man and horse mingled, steaming, in the cold morning air as his lord father had the man cut down from the wall and dragged before them. Robb and Jon sat tall and still on their horses, with Bran between them on his pony, trying to seem older than seven, trying to pretend that he'd seen all this before. A faint wind blew through the holdfast gate. Over their heads flapped the banner of the Starks of Winterfell: a grey direwolf racing across an ice-white field.
+
+Bran's father sat solemnly on his horse, long brown hair stirring in the wind. His closely trimmed beard was shot with white, making him look older than his thirty-five years. He had a grim cast to his grey eyes this day, and he seemed not at all the man who would sit before the fire in the evening and talk softly of the age of heroes and the children of the forest. He had taken off Father's face, Bran thought, and donned the face of Lord Stark of Winterfell.
+
+
+There were questions asked and answers given there in the chill of morning, but afterward Bran could not recall much of what had been said. Finally his lord father gave a command, and two of his guardsmen dragged the ragged man to the ironwood stump in the center of the square. They forced his head down onto the hard black wood. Lord Eddard Stark dismounted and his ward Theon Greyjoy brought forth the sword. "Ice," that sword was called. It was as wide across as a man's hand, and taller even than Robb. The blade was Valyrian steel, spell-forged and dark as smoke. Nothing held an edge like Valyrian steel.
+
+
+His father peeled off his gloves and handed them to Jory Cassel, the captain of his household guard. He took hold of Ice with both hands and said, "In the name of Robert of the House Baratheon, the First of his Name, King of the Andals and the Rhoynar and the First Men, Lord of the Seven Kingdoms and Protector of the Realm, by the word of Eddard of the House Stark, Lord of Winterfell and Warden of the North, I do sentence you to die." He lifted the greatsword high above his head.
+
+
+Bran's bastard brother Jon Snow moved closer. "Keep the pony well in hand," he whispered. "And don't look away. Father will know if you do."
+
+
+Bran kept his pony well in hand, and did not look away.
+
+
+His father took off the man's head with a single sure stroke. Blood sprayed out across the snow, as red as surnmerwine. One of the horses reared and had to be restrained to keep from bolting. Bran could not take his eyes off the blood. The snows around the stump drank it eagerly, reddening as he watched.
+
+The head bounced off a thick root and rolled. It came up near Greyjoy's feet. Theon was a lean, dark youth of nineteen who found everything amusing. He laughed, put his boot on the head, and kicked it away.
+
+
+"Ass," Jon muttered, low enough so Greyjoy did not hear. He put a hand on Bran's shoulder, and Bran looked over at his bastard brother. "You did well," Jon told him solemnly. Jon was fourteen, an old hand at justice.
+
+
+It seemed colder on the long ride back to Winterfell, though the wind had died by then and the sun was higher in the sky. Bran rode with his brothers, well ahead of the main party, his pony struggling hard to keep up with their horses.
+
+
+"The deserter died bravely," Robb said. He was big and broad and growing every day, with his mother's coloring, the fair skin, red-brown hair, and blue eyes of the Tullys of Riverrun. "He had courage, at the least."
+
+
+"No," Jon Snow said quietly. "It was not courage. This one was dead of fear. You could see it in his eyes, Stark." Jon's eyes were a grey so dark they seemed almost black, but there was little they did not see. He was of an age with Robb, but they did not look alike. Jon was slender where Robb was muscular, dark where Robb was fair, graceful and quick where his half brother was strong and fast.
+
+
+Robb was not impressed. "The Others take his eyes," he swore. "He died well. Race you to the bridge?"
+
+
+"Done," Jon said, kicking his horse forward. Robb cursed and followed, and they galloped off down the trail, Robb laughing and hooting, Jon silent and intent. The hooves of their horses kicked up showers of snow as they went.
+
+Bran did not try to follow. His pony could not keep up. He had seen the ragged man's eyes, and he was thinking of them now. After a while, the sound of Robb's laughter receded, and the woods grew silent again.
+
+
+So deep in thought was he that he never heard the rest of the party until his father moved up to ride beside him. "Are you well, Bran?" he asked, not unkindly.
+
+
+"Yes, Father," Bran told him. He looked up. Wrapped in his furs and leathers, mounted on his great warhorse, his lord father loomed over him like a giant. "Robb says the man died bravely, but Jon says he was afraid."
+
+
+"What do you think?" his father asked.
+
+
+Bran thought about it. "Can a man still be brave if he's afraid?"
+
+
+"That is the only time a man can be brave," his father told him. "Do you understand why I did it?"
+
+
+"He was a wildling," Bran said. "They carry off women and sell them to the Others."
+
+
+His lord father smiled. "Old Nan has been telling you stories again. In truth, the man was an oathbreaker, a deserter from the Night's Watch. No man is more dangerous. The deserter knows his life is forfeit if he is taken, so he will not flinch from any crime, no matter how vile. But you mistake me. The question was not why the man had to die, but why I must do it."
+
+
+Bran had no answer for that. "King Robert has a headsman," he said, uncertainly.
+
+
+"He does," his father admitted. "As did the Targaryen kings before him. Yet our way is the older way. The blood of the First Men still flows in the veins of the Starks, and we hold to the belief that the man who passes the sentence should swing the sword. If you would take a man's life, you owe it to him to look into his eyes and hear his final words. And if you cannot bear to do that, then perhaps the man does not deserve to die.
+
+
+"One day, Bran, you will be Robb's bannerman, holding a keep of your own for your brother and your king, and justice will fall to you. When that day comes, you must take no pleasure in the task, but neither must you look away. A ruler who hides behind paid executioners soon forgets what death is."
+
+
+That was when Jon reappeared on the crest of the hill before them. He waved and shouted down at them. "Father, Bran, come quickly, see what Robb has found!" Then he was gone again.
+
+
+Jory rode up beside them. "Trouble, my lord?"
+
+
+"Beyond a doubt," his lord father said. "Come, let us see what mischief my sons have rooted out now." He sent his horse into a trot. Jory and Bran and the rest came after.
+
+
+They found Robb on the riverbank north of the bridge, with Jon still mounted beside him. The late summer snows had been heavy this moonturn. Robb stood knee-deep in white, his hood pulled back so the sun shone in his hair. He was cradling something in his arm, while the boys talked in hushed, excited voices.
+
+
+The riders picked their way carefully through the drifts, groping for solid footing on the hidden, uneven ground . Jory Cassel and Theon Greyjoy were the first to reach the boys. Greyjoy was laughing and joking as he rode. Bran heard the breath go out of him. "Gods!" he exclaimed, struggling to keep control of his horse as he reached for his sword.
+
+
+Jory's sword was already out. "Robb, get away from it!" he called as his horse reared under him.
+
+
+Robb grinned and looked up from the bundle in his arms. "She can't hurt you," he said. "She's dead, Jory."
+
+
+Bran was afire with curiosity by then. He would have spurred the pony faster, but his father made them dismount beside the bridge and approach on foot. Bran jumped off and ran.
+
+
+By then Jon, Jory, and Theon Greyjoy had all dismounted as well. "What in the seven hells is it?" Greyjoy was saying.
+
+
+"A wolf," Robb told him.
+
+
+"A freak," Greyjoy said. "Look at the size of it."
+
+
+Bran's heart was thumping in his chest as he pushed through a waist-high drift to his brothers' side.
+
+
+Half-buried in bloodstained snow, a huge dark shape slumped in death. Ice had formed in its shaggy grey fur, and the faint smell of corruption clung to it like a woman's perfume. Bran glimpsed blind eyes crawling with maggots, a wide mouth full of yellowed teeth. But it was the size of it that made him gasp. It was bigger than his pony, twice the size of the largest hound in his father's kennel.
+
+
+"It's no freak," Jon said calmly. "That's a direwolf. They grow larger than the other kind."
+
+
+Theon Greyjoy said, "There's not been a direwolf sighted south of the Wall in two hundred years."
+
+
+"I see one now," Jon replied.
+
+
+Bran tore his eyes away from the monster. That was when he noticed the bundle in Robb's arms. He gave a cry of delight and moved closer. The pup was a tiny ball of grey-black fur, its eyes still closed. It nuzzled blindly against Robb's chest as he cradled it, searching for milk among his leathers, making a sad little whimpery sound. Bran reached out hesitantly. "Go on," Robb told him. "You can touch him."
+
+
+Bran gave the pup a quick nervous stroke, then turned as Jon said, "Here you go." His half brother put a second pup into his arms. "There are five of them." Bran sat down in the snow and hugged the wolf pup to his face. Its fur was soft and warm against his cheek.
+
+
+"Direwolves loose in the realm, after so many years," muttered Hullen, the master of horse. "I like it not."
+
+
+"It is a sign," Jory said.
+
+
+Father frowned. "This is only a dead animal, Jory," he said. Yet he seemed troubled. Snow crunched under his boots as he moved around the body. "Do we know what killed her?"
+
+
+"There's something in the throat," Robb told him, proud to have found the answer before his father even asked. "There, just under the jaw."
+
+
+His father knelt and groped under the beast's head with his hand. He gave a yank and held it up for all to see. A foot of shattered antler, tines snapped off, all wet with blood.
+
+
+A sudden silence descended over the party. The men looked at the antler uneasily, and no one dared to speak. Even Bran could sense their fear, though he did not understand.
+
+
+His father tossed the antler to the side and cleansed his hands in the snow. "I'm surprised she lived long enough to whelp," he said. His voice broke the spell.
+
+
+"Maybe she didn't," Jory said. "I've heard tales . . . maybe the bitch was already dead when the pups came."
+
+
+"Born with the dead," another man put in. "Worse luck."
+
+
+"No matter," said Hullen. "They be dead soon enough too."
+
+
+Bran gave a wordless cry of dismay.
+
+
+"The sooner the better," Theon Greyjoy agreed. He drew his sword. "Give the beast here, Bran."
+
+
+The little thing squirmed against him, as if it heard and understood. "No!" Bran cried out fiercely. "It's mine."
+
+
+"Put away your sword, Greyjoy," Robb said. For a moment he sounded as commanding as their father, like the lord he would someday be. "We will keep these pups."
+
+
+"You cannot do that, boy," said Harwin, who was Hullen's son.
+
+
+"It be a mercy to kill them," Hullen said.
+
+
+Bran looked to his lord father for rescue, but got only a frown, a furrowed brow. "Hullen speaks truly, son. Better a swift death than a hard one from cold and starvation."
+
+
+"No!" He could feel tears welling in his eyes, and he looked away. He did not want to cry in front of his father.
+
+
+Robb resisted stubbornly. "Ser Rodrik's red bitch whelped again last week," he said. "It was a small litter, only two live pups. She'll have milk enough."
+
+
+"She'll rip them apart when they try to nurse."
+
+
+"Lord Stark," Jon said. It was strange to hear him call Father that, so formal. Bran looked at him with desperate hope. "There are five pups," he told Father. "Three male, two female."
+
+
+"What of it, Jon?"
+
+
+"You have five trueborn children," Jon said. "Three sons, two daughters. The direwolf is the sigil of your House. Your children were meant to have these pups, my lord."
+
+
+Bran saw his father's face change, saw the other men exchange glances. He loved Jon with all his heart at that moment. Even at seven, Bran understood what his brother had done. The count had come right only because Jon had omitted himself. He had included the girls, included even Rickon, the baby, but not the bastard who bore the surname Snow, the name that custom decreed be given to all those in the north unlucky enough to be born with no name of their own.
+
+
+Their father understood as well. "You want no pup for yourself, Jon?" he asked softly.
+
+
+"The direwolf graces the banners of House Stark," Jon pointed out. "I am no Stark, Father."
+
+
+Their lord father regarded Jon thoughtfully. Robb rushed into the silence he left. "I will nurse him myself, Father," he promised. "I will soak a towel with warm milk, and give him suck from that."
+
+
+"Me too!" Bran echoed.
+
+
+The lord weighed his sons long and carefully with his eyes. "Easy to say, and harder to do. I will not have you wasting the servants' time with this. If you want these pups, you will feed them yourselves. Is that understood?"
+
+
+Bran nodded eagerly. The pup squirmed in his grasp, licked at his face with a warm tongue.
+
+
+"You must train them as well," their father said. "You must train them. The kennelmaster will have nothing to do with these monsters, I promise you that. And the gods help you if you neglect them, or brutalize them, or train them badly. These are not dogs to beg for treats and slink off at a kick. A direwolf will rip a man's arm off his shoulder as easily as a dog will kill a rat. Are you sure you want this?"
+
+"Yes, Father," Bran said.
+
+
+"Yes," Robb agreed.
+
+
+"The pups may die anyway, despite all you do."
+
+
+"They won't die," Robb said. "We won't let them die."
+
+
+"Keep them, then. Jory, Desmond, gather up the other pups. It's time we were back to Winterfell."
+
+
+It was not until they were mounted and on their way that Bran allowed himself to taste the sweet air of victory. By then, his pup was snuggled inside his leathers, warm against him, safe for the long ride home. Bran was wondering what to name him.
+
+
+Halfway across the bridge, Jon pulled up suddenly.
+
+
+"What is it, Jon?" their lord father asked.
+
+
+"Can't you hear it?"
+
+
+Bran could hear the wind in the trees, the clatter of their hooves on the ironwood planks, the whimpering of his hungry pup, but Jon was listening to something else.
+
+
+"There," Jon said. He swung his horse around and galloped back across the bridge. They watched him dismount where the direwolf lay dead in the snow, watched him kneel. A moment later he was riding back to them, smiling.
+
+
+"He must have crawled away from the others," Jon said.
+
+
+"Or been driven away," their father said, looking at the sixth pup. His fur was white, where the rest of the litter was grey. His eyes were as red as the blood of the ragged man who had died that morning. Bran thought it curious that this pup alone would have opened his eyes while the others were still blind.
+
+
+"An albino," Theon Greyjoy said with wry amusement. "This one will die even faster than the others."
+
+
+Jon Snow gave his father's ward a long, chilling look. "I think not, Greyjoy," he said. "This one belongs to me."
--- a/TTS/tts/utils/assets/tortoise/layman.txt
+++ b/TTS/tts/utils/assets/tortoise/layman.txt
--- a/TTS/tts/utils/assets/tortoise/riding_hood.txt
+++ b/TTS/tts/utils/assets/tortoise/riding_hood.txt
@ -0,0 +1,54 @@
+Once upon a time there lived in a certain village a little country girl, the prettiest creature who was ever seen. Her mother was excessively fond of her; and her grandmother doted on her still more. This good woman had a little red riding hood made for her. It suited the girl so extremely well that everybody called her Little Red Riding Hood.
+One day her mother, having made some cakes, said to her, "Go, my dear, and see how your grandmother is doing, for I hear she has been very ill. Take her a cake, and this little pot of butter."
+
+Little Red Riding Hood set out immediately to go to her grandmother, who lived in another village.
+
+As she was going through the wood, she met with a wolf, who had a very great mind to eat her up, but he dared not, because of some woodcutters working nearby in the forest. He asked her where she was going. The poor child, who did not know that it was dangerous to stay and talk to a wolf, said to him, "I am going to see my grandmother and carry her a cake and a little pot of butter from my mother."
+
+"Does she live far off?" said the wolf
+
+"Oh I say," answered Little Red Riding Hood; "it is beyond that mill you see there, at the first house in the village."
+
+"Well," said the wolf, "and I'll go and see her too. I'll go this way and go you that, and we shall see who will be there first."
+
+The wolf ran as fast as he could, taking the shortest path, and the little girl took a roundabout way, entertaining herself by gathering nuts, running after butterflies, and gathering bouquets of little flowers. It was not long before the wolf arrived at the old woman's house. He knocked at the door: tap, tap.
+
+"Who's there?"
+
+"Your grandchild, Little Red Riding Hood," replied the wolf, counterfeiting her voice; "who has brought you a cake and a little pot of butter sent you by mother."
+
+The good grandmother, who was in bed, because she was somewhat ill, cried out, "Pull the bobbin, and the latch will go up."
+
+The wolf pulled the bobbin, and the door opened, and then he immediately fell upon the good woman and ate her up in a moment, for it been more than three days since he had eaten. He then shut the door and got into the grandmother's bed, expecting Little Red Riding Hood, who came some time afterwards and knocked at the door: tap, tap.
+
+"Who's there?"
+
+Little Red Riding Hood, hearing the big voice of the wolf, was at first afraid; but believing her grandmother had a cold and was hoarse, answered, "It is your grandchild Little Red Riding Hood, who has brought you a cake and a little pot of butter mother sends you."
+
+The wolf cried out to her, softening his voice as much as he could, "Pull the bobbin, and the latch will go up."
+
+Little Red Riding Hood pulled the bobbin, and the door opened.
+
+The wolf, seeing her come in, said to her, hiding himself under the bedclothes, "Put the cake and the little pot of butter upon the stool, and come get into bed with me."
+
+Little Red Riding Hood took off her clothes and got into bed. She was greatly amazed to see how her grandmother looked in her nightclothes, and said to her, "Grandmother, what big arms you have!"
+
+"All the better to hug you with, my dear."
+
+"Grandmother, what big legs you have!"
+
+"All the better to run with, my child."
+
+"Grandmother, what big ears you have!"
+
+"All the better to hear with, my child."
+
+"Grandmother, what big eyes you have!"
+
+"All the better to see with, my child."
+
+"Grandmother, what big teeth you have got!"
+
+"All the better to eat you up with."
+
+And, saying these words, this wicked wolf fell upon Little Red Riding Hood, and ate her all up.
--- a/TTS/tts/utils/assets/tortoise/seal_copypasta.txt
+++ b/TTS/tts/utils/assets/tortoise/seal_copypasta.txt
@ -0,0 +1 @@
+What the fuck did you just fucking say about me, you little bitch? I'll have you know I graduated top of my class in the Navy Seals, and I've been involved in numerous secret raids on Al kayda, and I have over 300 confirmed kills. I am trained in gorilla warfare and I'm the top sniper in the entire U S armed forces. You are nothing to me but just another target. I will wipe you the fuck out with precision the likes of which has never been seen before on this Earth, mark my fucking words. You think you can get away with saying that shit to me over the Internet? Think again, fucker. As we speak I am contacting my secret network of spies across the U S A and your IP is being traced right now so you better prepare for the storm, maggot. The storm that wipes out the pathetic little thing you call your life. You're fucking dead, kid. I can be anywhere, anytime, and I can kill you in over seven hundred ways, and that's just with my bare hands. Not only am I extensively trained in unarmed combat, but I have access to the entire arsenal of the United States Marine Corps and I will use it to its full extent to wipe your miserable ass off the face of the continent, you little shit. If only you could have known what unholy retribution your little "clever" comment was about to bring down upon you, maybe you would have held your fucking tongue. But you couldn't, you didn't, and now you're paying the price, you goddamn idiot. I will shit fury all over you and you will drown in it. You're fucking dead, kiddo.
--- a/TTS/tts/utils/assets/tortoise/tokenizer.json
+++ b/TTS/tts/utils/assets/tortoise/tokenizer.json
@ -0,0 +1 @@
+{"version":"1.0","truncation":null,"padding":null,"added_tokens":[{"id":0,"special":true,"content":"[STOP]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false},{"id":1,"special":true,"content":"[UNK]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false},{"id":2,"special":true,"content":"[SPACE]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false}],"normalizer":null,"pre_tokenizer":{"type":"Whitespace"},"post_processor":null,"decoder":null,"model":{"type":"BPE","dropout":null,"unk_token":"[UNK]","continuing_subword_prefix":null,"end_of_word_suffix":null,"fuse_unk":false,"vocab":{"[STOP]":0,"[UNK]":1,"[SPACE]":2,"!":3,"'":4,"(":5,")":6,",":7,"-":8,".":9,"/":10,":":11,";":12,"?":13,"a":14,"b":15,"c":16,"d":17,"e":18,"f":19,"g":20,"h":21,"i":22,"j":23,"k":24,"l":25,"m":26,"n":27,"o":28,"p":29,"q":30,"r":31,"s":32,"t":33,"u":34,"v":35,"w":36,"x":37,"y":38,"z":39,"th":40,"in":41,"the":42,"an":43,"er":44,"ou":45,"re":46,"on":47,"at":48,"ed":49,"en":50,"to":51,"ing":52,"and":53,"is":54,"as":55,"al":56,"or":57,"of":58,"ar":59,"it":60,"es":61,"he":62,"st":63,"le":64,"om":65,"se":66,"be":67,"ad":68,"ow":69,"ly":70,"ch":71,"wh":72,"that":73,"you":74,"li":75,"ve":76,"ac":77,"ti":78,"ld":79,"me":80,"was":81,"gh":82,"id":83,"ll":84,"wi":85,"ent":86,"for":87,"ay":88,"ro":89,"ver":90,"ic":91,"her":92,"ke":93,"his":94,"no":95,"ut":96,"un":97,"ir":98,"lo":99,"we":100,"ri":101,"ha":102,"with":103,"ght":104,"out":105,"im":106,"ion":107,"all":108,"ab":109,"one":110,"ne":111,"ge":112,"ould":113,"ter":114,"mo":115,"had":116,"ce":117,"she":118,"go":119,"sh":120,"ur":121,"am":122,"so":123,"pe":124,"my":125,"de":126,"are":127,"but":128,"ome":129,"fr":130,"ther":131,"fe":132,"su":133,"do":134,"con":135,"te":136,"ain":137,"ere":138,"po":139,"if":140,"they":141,"us":142,"ag":143,"tr":144,"now":145,"oun":146,"this":147,"have":148,"not":149,"sa":150,"il":151,"up":152,"thing":153,"from":154,"ap":155,"him":156,"ack":157,"ation":158,"ant":159,"our":160,"op":161,"like":162,"ust":163,"ess":164,"bo":165,"ok":166,"ul":167,"ind":168,"ex":169,"com":170,"some":171,"there":172,"ers":173,"co":174,"res":175,"man":176,"ard":177,"pl":178,"wor":179,"way":180,"tion":181,"fo":182,"ca":183,"were":184,"by":185,"ate":186,"pro":187,"ted":188,"ound":189,"own":190,"would":191,"ts":192,"what":193,"qu":194,"ally":195,"ight":196,"ck":197,"gr":198,"when":199,"ven":200,"can":201,"ough":202,"ine":203,"end":204,"per":205,"ous":206,"od":207,"ide":208,"know":209,"ty":210,"very":211,"si":212,"ak":213,"who":214,"about":215,"ill":216,"them":217,"est":218,"red":219,"ye":220,"could":221,"ong":222,"your":223,"their":224,"em":225,"just":226,"other":227,"into":228,"any":229,"whi":230,"um":231,"tw":232,"ast":233,"der":234,"did":235,"ie":236,"been":237,"ace":238,"ink":239,"ity":240,"back":241,"ting":242,"br":243,"more":244,"ake":245,"pp":246,"then":247,"sp":248,"el":249,"use":250,"bl":251,"said":252,"over":253,"get":254},"merges":["t h","i n","th e","a n","e r","o u","r e","o n","a t","e d","e n","t o","in g","an d","i s","a s","a l","o r","o f","a r","i t","e s","h e","s t","l e","o m","s e","b e","a d","o w","l y","c h","w h","th at","y ou","l i","v e","a c","t i","l d","m e","w as","g h","i d","l l","w i","en t","f or","a y","r o","v er","i c","h er","k e","h is","n o","u t","u n","i r","l o","w e","r i","h a","wi th","gh t","ou t","i m","i on","al l","a b","on e","n e","g e","ou ld","t er","m o","h ad","c e","s he","g o","s h","u r","a m","s o","p e","m y","d e","a re","b ut","om e","f r","the r","f e","s u","d o","c on","t e","a in","er e","p o","i f","the y","u s","a g","t r","n ow","ou n","th is","ha ve","no t","s a","i l","u p","th ing","fr om","a p","h im","ac k","at ion","an t","ou r","o p","li ke","u st","es s","b o","o k","u l","in d","e x","c om","s ome","the re","er s","c o","re s","m an","ar d","p l","w or","w ay","ti on","f o","c a","w ere","b y","at e","p ro","t ed","oun d","ow n","w ould","t s","wh at","q u","al ly","i ght","c k","g r","wh en","v en","c an","ou gh","in e","en d","p er","ou s","o d","id e","k now","t y","ver y","s i","a k","wh o","ab out","i ll","the m","es t","re d","y e","c ould","on g","you r","the ir","e m","j ust","o ther","in to","an y","wh i","u m","t w","as t","d er","d id","i e","be en","ac e","in k","it y","b ack","t ing","b r","mo re","a ke","p p","the n","s p","e l","u se","b l","sa id","o ver","ge t"]}}
--- a/TTS/tts/utils/assets/tortoise/voices/angie/1.wav
+++ b/TTS/tts/utils/assets/tortoise/voices/angie/1.wav
--- a/TTS/tts/utils/assets/tortoise/voices/angie/2.wav
+++ b/TTS/tts/utils/assets/tortoise/voices/angie/2.wav
--- a/TTS/tts/utils/assets/tortoise/voices/angie/3.wav
+++ b/TTS/tts/utils/assets/tortoise/voices/angie/3.wav
--- a/TTS/tts/utils/assets/tortoise/voices/applejack/1.wav
+++ b/TTS/tts/utils/assets/tortoise/voices/applejack/1.wav
--- a/TTS/tts/utils/assets/tortoise/voices/applejack/2.wav
+++ b/TTS/tts/utils/assets/tortoise/voices/applejack/2.wav
--- a/TTS/tts/utils/assets/tortoise/voices/applejack/3.wav
+++ b/TTS/tts/utils/assets/tortoise/voices/applejack/3.wav
--- a/TTS/tts/utils/assets/tortoise/voices/daniel/1.wav
+++ b/TTS/tts/utils/assets/tortoise/voices/daniel/1.wav
--- a/TTS/tts/utils/assets/tortoise/voices/daniel/2.wav
+++ b/TTS/tts/utils/assets/tortoise/voices/daniel/2.wav
--- a/TTS/tts/utils/assets/tortoise/voices/daniel/3.wav
+++ b/TTS/tts/utils/assets/tortoise/voices/daniel/3.wav
--- a/TTS/tts/utils/assets/tortoise/voices/daniel/4.wav
+++ b/TTS/tts/utils/assets/tortoise/voices/daniel/4.wav
--- a/TTS/tts/utils/assets/tortoise/voices/deniro/1.wav
+++ b/TTS/tts/utils/assets/tortoise/voices/deniro/1.wav
--- a/TTS/tts/utils/assets/tortoise/voices/deniro/2.wav
+++ b/TTS/tts/utils/assets/tortoise/voices/deniro/2.wav
--- a/TTS/tts/utils/assets/tortoise/voices/deniro/3.wav
+++ b/TTS/tts/utils/assets/tortoise/voices/deniro/3.wav
--- a/TTS/tts/utils/assets/tortoise/voices/deniro/4.wav
+++ b/TTS/tts/utils/assets/tortoise/voices/deniro/4.wav
--- a/TTS/tts/utils/assets/tortoise/voices/emma/1.wav
+++ b/TTS/tts/utils/assets/tortoise/voices/emma/1.wav
--- a/TTS/tts/utils/assets/tortoise/voices/emma/2.wav
+++ b/TTS/tts/utils/assets/tortoise/voices/emma/2.wav
--- a/TTS/tts/utils/assets/tortoise/voices/emma/3.wav
+++ b/TTS/tts/utils/assets/tortoise/voices/emma/3.wav
--- a/TTS/tts/utils/assets/tortoise/voices/freeman/1.wav
+++ b/TTS/tts/utils/assets/tortoise/voices/freeman/1.wav
--- a/TTS/tts/utils/assets/tortoise/voices/freeman/2.wav
+++ b/TTS/tts/utils/assets/tortoise/voices/freeman/2.wav
--- a/TTS/tts/utils/assets/tortoise/voices/freeman/3.wav
+++ b/TTS/tts/utils/assets/tortoise/voices/freeman/3.wav
--- a/TTS/tts/utils/assets/tortoise/voices/geralt/1.wav
+++ b/TTS/tts/utils/assets/tortoise/voices/geralt/1.wav
--- a/TTS/tts/utils/assets/tortoise/voices/geralt/2.wav
+++ b/TTS/tts/utils/assets/tortoise/voices/geralt/2.wav
--- a/TTS/tts/utils/assets/tortoise/voices/geralt/3.wav
+++ b/TTS/tts/utils/assets/tortoise/voices/geralt/3.wav
--- a/TTS/tts/utils/assets/tortoise/voices/halle/1.wav
+++ b/TTS/tts/utils/assets/tortoise/voices/halle/1.wav
--- a/TTS/tts/utils/assets/tortoise/voices/halle/2.wav
+++ b/TTS/tts/utils/assets/tortoise/voices/halle/2.wav
--- a/TTS/tts/utils/assets/tortoise/voices/halle/3.wav
+++ b/TTS/tts/utils/assets/tortoise/voices/halle/3.wav
--- a/TTS/tts/utils/assets/tortoise/voices/jlaw/1.wav
+++ b/TTS/tts/utils/assets/tortoise/voices/jlaw/1.wav
--- a/TTS/tts/utils/assets/tortoise/voices/jlaw/2.wav
+++ b/TTS/tts/utils/assets/tortoise/voices/jlaw/2.wav
--- a/TTS/tts/utils/assets/tortoise/voices/jlaw/3.wav
+++ b/TTS/tts/utils/assets/tortoise/voices/jlaw/3.wav
--- a/TTS/tts/utils/assets/tortoise/voices/jlaw/4.wav
+++ b/TTS/tts/utils/assets/tortoise/voices/jlaw/4.wav
--- a/TTS/tts/utils/assets/tortoise/voices/lj/1.wav
+++ b/TTS/tts/utils/assets/tortoise/voices/lj/1.wav
--- a/TTS/tts/utils/assets/tortoise/voices/lj/2.wav
+++ b/TTS/tts/utils/assets/tortoise/voices/lj/2.wav
--- a/TTS/tts/utils/assets/tortoise/voices/mol/1.wav
+++ b/TTS/tts/utils/assets/tortoise/voices/mol/1.wav
--- a/TTS/tts/utils/assets/tortoise/voices/mol/2.wav
+++ b/TTS/tts/utils/assets/tortoise/voices/mol/2.wav
--- a/TTS/tts/utils/assets/tortoise/voices/myself/1.wav
+++ b/TTS/tts/utils/assets/tortoise/voices/myself/1.wav
--- a/TTS/tts/utils/assets/tortoise/voices/myself/2.wav
+++ b/TTS/tts/utils/assets/tortoise/voices/myself/2.wav
--- a/TTS/tts/utils/assets/tortoise/voices/myself/3.wav
+++ b/TTS/tts/utils/assets/tortoise/voices/myself/3.wav
--- a/TTS/tts/utils/assets/tortoise/voices/pat/1.wav
+++ b/TTS/tts/utils/assets/tortoise/voices/pat/1.wav
--- a/TTS/tts/utils/assets/tortoise/voices/pat/2.wav
+++ b/TTS/tts/utils/assets/tortoise/voices/pat/2.wav
--- a/TTS/tts/utils/assets/tortoise/voices/pat/3.wav
+++ b/TTS/tts/utils/assets/tortoise/voices/pat/3.wav
--- a/TTS/tts/utils/assets/tortoise/voices/pat/4.wav
+++ b/TTS/tts/utils/assets/tortoise/voices/pat/4.wav
--- a/TTS/tts/utils/assets/tortoise/voices/pat2/00100.mp3
+++ b/TTS/tts/utils/assets/tortoise/voices/pat2/00100.mp3
--- a/TTS/tts/utils/assets/tortoise/voices/pat2/00112.mp3
+++ b/TTS/tts/utils/assets/tortoise/voices/pat2/00112.mp3
--- a/TTS/tts/utils/assets/tortoise/voices/pat2/00130.mp3
+++ b/TTS/tts/utils/assets/tortoise/voices/pat2/00130.mp3
--- a/TTS/tts/utils/assets/tortoise/voices/pat2/00159.mp3
+++ b/TTS/tts/utils/assets/tortoise/voices/pat2/00159.mp3
--- a/TTS/tts/utils/assets/tortoise/voices/rainbow/1.wav
+++ b/TTS/tts/utils/assets/tortoise/voices/rainbow/1.wav
--- a/TTS/tts/utils/assets/tortoise/voices/rainbow/2.wav
+++ b/TTS/tts/utils/assets/tortoise/voices/rainbow/2.wav
--- a/TTS/tts/utils/assets/tortoise/voices/rainbow/3.wav
+++ b/TTS/tts/utils/assets/tortoise/voices/rainbow/3.wav
--- a/TTS/tts/utils/assets/tortoise/voices/rainbow/4.wav
+++ b/TTS/tts/utils/assets/tortoise/voices/rainbow/4.wav
--- a/TTS/tts/utils/assets/tortoise/voices/rainbow/5.wav
+++ b/TTS/tts/utils/assets/tortoise/voices/rainbow/5.wav
--- a/TTS/tts/utils/assets/tortoise/voices/snakes/00115.mp3
+++ b/TTS/tts/utils/assets/tortoise/voices/snakes/00115.mp3
--- a/TTS/tts/utils/assets/tortoise/voices/snakes/00162.mp3
+++ b/TTS/tts/utils/assets/tortoise/voices/snakes/00162.mp3
--- a/TTS/tts/utils/assets/tortoise/voices/snakes/03504.mp3
+++ b/TTS/tts/utils/assets/tortoise/voices/snakes/03504.mp3
--- a/TTS/tts/utils/assets/tortoise/voices/tim_reynolds/1.mp3
+++ b/TTS/tts/utils/assets/tortoise/voices/tim_reynolds/1.mp3
--- a/TTS/tts/utils/assets/tortoise/voices/tim_reynolds/2.mp3
+++ b/TTS/tts/utils/assets/tortoise/voices/tim_reynolds/2.mp3
--- a/TTS/tts/utils/assets/tortoise/voices/tim_reynolds/3.mp3
+++ b/TTS/tts/utils/assets/tortoise/voices/tim_reynolds/3.mp3
--- a/TTS/tts/utils/assets/tortoise/voices/tim_reynolds/4.mp3
+++ b/TTS/tts/utils/assets/tortoise/voices/tim_reynolds/4.mp3
--- a/TTS/tts/utils/assets/tortoise/voices/tom/1.wav
+++ b/TTS/tts/utils/assets/tortoise/voices/tom/1.wav
--- a/TTS/tts/utils/assets/tortoise/voices/tom/2.wav
+++ b/TTS/tts/utils/assets/tortoise/voices/tom/2.wav
--- a/TTS/tts/utils/assets/tortoise/voices/tom/3.wav
+++ b/TTS/tts/utils/assets/tortoise/voices/tom/3.wav
--- a/TTS/tts/utils/assets/tortoise/voices/tom/4.wav
+++ b/TTS/tts/utils/assets/tortoise/voices/tom/4.wav
--- a/TTS/tts/utils/assets/tortoise/voices/train_atkins/1.wav
+++ b/TTS/tts/utils/assets/tortoise/voices/train_atkins/1.wav
--- a/TTS/tts/utils/assets/tortoise/voices/train_atkins/2.wav
+++ b/TTS/tts/utils/assets/tortoise/voices/train_atkins/2.wav
--- a/TTS/tts/utils/assets/tortoise/voices/train_daws/1.mp3
+++ b/TTS/tts/utils/assets/tortoise/voices/train_daws/1.mp3
--- a/TTS/tts/utils/assets/tortoise/voices/train_daws/2.mp3
+++ b/TTS/tts/utils/assets/tortoise/voices/train_daws/2.mp3
--- a/TTS/tts/utils/assets/tortoise/voices/train_daws/3.mp3
+++ b/TTS/tts/utils/assets/tortoise/voices/train_daws/3.mp3
--- a/TTS/tts/utils/assets/tortoise/voices/train_dotrice/1.wav
+++ b/TTS/tts/utils/assets/tortoise/voices/train_dotrice/1.wav
--- a/TTS/tts/utils/assets/tortoise/voices/train_dotrice/2.wav
+++ b/TTS/tts/utils/assets/tortoise/voices/train_dotrice/2.wav
--- a/TTS/tts/utils/assets/tortoise/voices/train_dreams/1.mp3
+++ b/TTS/tts/utils/assets/tortoise/voices/train_dreams/1.mp3
--- a/TTS/tts/utils/assets/tortoise/voices/train_dreams/2.mp3
+++ b/TTS/tts/utils/assets/tortoise/voices/train_dreams/2.mp3
--- a/TTS/tts/utils/assets/tortoise/voices/train_dreams/3.mp3
+++ b/TTS/tts/utils/assets/tortoise/voices/train_dreams/3.mp3
--- a/TTS/tts/utils/assets/tortoise/voices/train_empire/1.mp3
+++ b/TTS/tts/utils/assets/tortoise/voices/train_empire/1.mp3
--- a/TTS/tts/utils/assets/tortoise/voices/train_empire/2.mp3
+++ b/TTS/tts/utils/assets/tortoise/voices/train_empire/2.mp3
--- a/TTS/tts/utils/assets/tortoise/voices/train_empire/3.mp3
+++ b/TTS/tts/utils/assets/tortoise/voices/train_empire/3.mp3
--- a/TTS/tts/utils/assets/tortoise/voices/train_grace/1.wav
+++ b/TTS/tts/utils/assets/tortoise/voices/train_grace/1.wav
--- a/Show More
+++ b/Show More
				`@ -0,0 +1 @@`
				What the fuck did you just fucking say about me, you little bitch? I'll have you know I graduated top of my class in the Navy Seals, and I've been involved in numerous secret raids on Al kayda, and I have over 300 confirmed kills. I am trained in gorilla warfare and I'm the top sniper in the entire U S armed forces. You are nothing to me but just another target. I will wipe you the fuck out with precision the likes of which has never been seen before on this Earth, mark my fucking words. You think you can get away with saying that shit to me over the Internet? Think again, fucker. As we speak I am contacting my secret network of spies across the U S A and your IP is being traced right now so you better prepare for the storm, maggot. The storm that wipes out the pathetic little thing you call your life. You're fucking dead, kid. I can be anywhere, anytime, and I can kill you in over seven hundred ways, and that's just with my bare hands. Not only am I extensively trained in unarmed combat, but I have access to the entire arsenal of the United States Marine Corps and I will use it to its full extent to wipe your miserable ass off the face of the continent, you little shit. If only you could have known what unholy retribution your little "clever" comment was about to bring down upon you, maybe you would have held your fucking tongue. But you couldn't, you didn't, and now you're paying the price, you goddamn idiot. I will shit fury all over you and you will drown in it. You're fucking dead, kiddo.
				`@ -0,0 +1 @@`
				{"version":"1.0","truncation":null,"padding":null,"added_tokens":[{"id":0,"special":true,"content":"[STOP]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false},{"id":1,"special":true,"content":"[UNK]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false},{"id":2,"special":true,"content":"[SPACE]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false}],"normalizer":null,"pre_tokenizer":{"type":"Whitespace"},"post_processor":null,"decoder":null,"model":{"type":"BPE","dropout":null,"unk_token":"[UNK]","continuing_subword_prefix":null,"end_of_word_suffix":null,"fuse_unk":false,"vocab":{"[STOP]":0,"[UNK]":1,"[SPACE]":2,"!":3,"'":4,"(":5,")":6,",":7,"-":8,".":9,"/":10,":":11,";":12,"?":13,"a":14,"b":15,"c":16,"d":17,"e":18,"f":19,"g":20,"h":21,"i":22,"j":23,"k":24,"l":25,"m":26,"n":27,"o":28,"p":29,"q":30,"r":31,"s":32,"t":33,"u":34,"v":35,"w":36,"x":37,"y":38,"z":39,"th":40,"in":41,"the":42,"an":43,"er":44,"ou":45,"re":46,"on":47,"at":48,"ed":49,"en":50,"to":51,"ing":52,"and":53,"is":54,"as":55,"al":56,"or":57,"of":58,"ar":59,"it":60,"es":61,"he":62,"st":63,"le":64,"om":65,"se":66,"be":67,"ad":68,"ow":69,"ly":70,"ch":71,"wh":72,"that":73,"you":74,"li":75,"ve":76,"ac":77,"ti":78,"ld":79,"me":80,"was":81,"gh":82,"id":83,"ll":84,"wi":85,"ent":86,"for":87,"ay":88,"ro":89,"ver":90,"ic":91,"her":92,"ke":93,"his":94,"no":95,"ut":96,"un":97,"ir":98,"lo":99,"we":100,"ri":101,"ha":102,"with":103,"ght":104,"out":105,"im":106,"ion":107,"all":108,"ab":109,"one":110,"ne":111,"ge":112,"ould":113,"ter":114,"mo":115,"had":116,"ce":117,"she":118,"go":119,"sh":120,"ur":121,"am":122,"so":123,"pe":124,"my":125,"de":126,"are":127,"but":128,"ome":129,"fr":130,"ther":131,"fe":132,"su":133,"do":134,"con":135,"te":136,"ain":137,"ere":138,"po":139,"if":140,"they":141,"us":142,"ag":143,"tr":144,"now":145,"oun":146,"this":147,"have":148,"not":149,"sa":150,"il":151,"up":152,"thing":153,"from":154,"ap":155,"him":156,"ack":157,"ation":158,"ant":159,"our":160,"op":161,"like":162,"ust":163,"ess":164,"bo":165,"ok":166,"ul":167,"ind":168,"ex":169,"com":170,"some":171,"there":172,"ers":173,"co":174,"res":175,"man":176,"ard":177,"pl":178,"wor":179,"way":180,"tion":181,"fo":182,"ca":183,"were":184,"by":185,"ate":186,"pro":187,"ted":188,"ound":189,"own":190,"would":191,"ts":192,"what":193,"qu":194,"ally":195,"ight":196,"ck":197,"gr":198,"when":199,"ven":200,"can":201,"ough":202,"ine":203,"end":204,"per":205,"ous":206,"od":207,"ide":208,"know":209,"ty":210,"very":211,"si":212,"ak":213,"who":214,"about":215,"ill":216,"them":217,"est":218,"red":219,"ye":220,"could":221,"ong":222,"your":223,"their":224,"em":225,"just":226,"other":227,"into":228,"any":229,"whi":230,"um":231,"tw":232,"ast":233,"der":234,"did":235,"ie":236,"been":237,"ace":238,"ink":239,"ity":240,"back":241,"ting":242,"br":243,"more":244,"ake":245,"pp":246,"then":247,"sp":248,"el":249,"use":250,"bl":251,"said":252,"over":253,"get":254},"merges":["t h","i n","th e","a n","e r","o u","r e","o n","a t","e d","e n","t o","in g","an d","i s","a s","a l","o r","o f","a r","i t","e s","h e","s t","l e","o m","s e","b e","a d","o w","l y","c h","w h","th at","y ou","l i","v e","a c","t i","l d","m e","w as","g h","i d","l l","w i","en t","f or","a y","r o","v er","i c","h er","k e","h is","n o","u t","u n","i r","l o","w e","r i","h a","wi th","gh t","ou t","i m","i on","al l","a b","on e","n e","g e","ou ld","t er","m o","h ad","c e","s he","g o","s h","u r","a m","s o","p e","m y","d e","a re","b ut","om e","f r","the r","f e","s u","d o","c on","t e","a in","er e","p o","i f","the y","u s","a g","t r","n ow","ou n","th is","ha ve","no t","s a","i l","u p","th ing","fr om","a p","h im","ac k","at ion","an t","ou r","o p","li ke","u st","es s","b o","o k","u l","in d","e x","c om","s ome","the re","er s","c o","re s","m an","ar d","p l","w or","w ay","ti on","f o","c a","w ere","b y","at e","p ro","t ed","oun d","ow n","w ould","t s","wh at","q u","al ly","i ght","c k","g r","wh en","v en","c an","ou gh","in e","en d","p er","ou s","o d","id e","k now","t y","ver y","s i","a k","wh o","ab out","i ll","the m","es t","re d","y e","c ould","on g","you r","the ir","e m","j ust","o ther","in to","an y","wh i","u m","t w","as t","d er","d id","i e","be en","ac e","in k","it y","b ack","t ing","b r","mo re","a ke","p p","the n","s p","e l","u se","b l","sa id","o ver","ge t"]}}