Remove unused config and args

2023-11-08 10:14:38 +01:00 · 2023-11-08 10:14:38 +01:00 · 5c81500e3e
parent 9f106034a1
commit 5c81500e3e
3 changed files with 1 addition and 59 deletions
--- a/TTS/tts/configs/xtts_config.py
+++ b/TTS/tts/configs/xtts_config.py
@ -37,29 +37,11 @@ class XttsConfig(BaseTTSConfig):
            If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.
            Defaults to `0.8`.
        cond_free_k (float):
            Knob that determines how to balance the conditioning free signal with the conditioning-present signal. [0,inf].
            As cond_free_k increases, the output becomes dominated by the conditioning-free signal.
            Formula is: output=cond_present_output*(cond_free_k+1)-cond_absent_output*cond_free_k. Defaults to `2.0`.
        diffusion_temperature (float):
            Controls the variance of the noise fed into the diffusion model. [0,1]. Values at 0
            are the "mean" prediction of the diffusion network and will sound bland and smeared.
            Defaults to `1.0`.
        num_gpt_outputs (int):
            Number of samples taken from the autoregressive model, all of which are filtered using CLVP.
            As XTTS is a probabilistic model, more samples means a higher probability of creating something "great".
            Defaults to `16`.
        decoder_iterations (int):
            Number of diffusion steps to perform. [0,4000]. More steps means the network has more chances to iteratively refine
            the output, which should theoretically mean a higher quality output. Generally a value above 250 is not noticeably better,
            however. Defaults to `30`.
        decoder_sampler (str):
            Diffusion sampler to be used. `ddim` or `dpm++2m`. Defaults to `ddim`.
        gpt_cond_len (int):
            Seconds of audio to be used as conditioning for the autoregressive model. Defaults to `3`.
@ -110,11 +92,7 @@ class XttsConfig(BaseTTSConfig):
    repetition_penalty: float = 2.0
    top_k: int = 50
    top_p: float = 0.85
    cond_free_k: float = 2.0
    diffusion_temperature: float = 1.0
    num_gpt_outputs: int = 1
    decoder_iterations: int = 30
    decoder_sampler: str = "ddim"
    # cloning
    gpt_cond_len: int = 3
--- a/TTS/tts/models/xtts.py
+++ b/TTS/tts/models/xtts.py
@ -152,19 +152,6 @@ class XttsArgs(Coqpit):
        gpt_code_stride_len (int, optional): The hop_size of dvae and consequently of the gpt output. Defaults to 1024.
        gpt_use_masking_gt_prompt_approach (bool, optional):  If True, it will use ground truth as prompt and it will mask the loss to avoid repetition. Defaults to True.
        gpt_use_perceiver_resampler (bool, optional):  If True, it will use perceiver resampler from flamingo paper - https://arxiv.org/abs/2204.14198. Defaults to False.
        For DiffTTS model:
        diff_model_channels (int, optional): The number of channels for the DiffTTS model. Defaults to 1024.
        diff_num_layers (int, optional): The number of layers for the DiffTTS model. Defaults to 10.
        diff_in_channels (int, optional): The input channels for the DiffTTS model. Defaults to 100.
        diff_out_channels (int, optional): The output channels for the DiffTTS model. Defaults to 200.
        diff_in_latent_channels (int, optional): The input latent channels for the DiffTTS model. Defaults to 1024.
        diff_in_tokens (int, optional): The input tokens for the DiffTTS model. Defaults to 8193.
        diff_dropout (int, optional): The dropout percentage for the DiffTTS model. Defaults to 0.
        diff_use_fp16 (bool, optional): Whether to use fp16 for the DiffTTS model. Defaults to False.
        diff_num_heads (int, optional): The number of heads for the DiffTTS model. Defaults to 16.
        diff_layer_drop (int, optional): The layer dropout percentage for the DiffTTS model. Defaults to 0.
        diff_unconditioned_percentage (int, optional): The percentage of unconditioned inputs for the DiffTTS model. Defaults to 0.
    """
    gpt_batch_size: int = 1
@ -193,19 +180,6 @@ class XttsArgs(Coqpit):
    gpt_use_masking_gt_prompt_approach: bool = True
    gpt_use_perceiver_resampler: bool = False
    # Diffusion Decoder params
    diff_model_channels: int = 1024
    diff_num_layers: int = 10
    diff_in_channels: int = 100
    diff_out_channels: int = 200
    diff_in_latent_channels: int = 1024
    diff_in_tokens: int = 8193
    diff_dropout: int = 0
    diff_use_fp16: bool = False
    diff_num_heads: int = 16
    diff_layer_drop: int = 0
    diff_unconditioned_percentage: int = 0
    # HifiGAN Decoder params
    input_sample_rate: int = 22050
    output_sample_rate: int = 24000
@ -426,10 +400,6 @@ class Xtts(BaseTTS):
            "repetition_penalty": config.repetition_penalty,
            "top_k": config.top_k,
            "top_p": config.top_p,
            "cond_free_k": config.cond_free_k,
            "diffusion_temperature": config.diffusion_temperature,
            "decoder_iterations": config.decoder_iterations,
            "decoder_sampler": config.decoder_sampler,
            "gpt_cond_len": config.gpt_cond_len,
            "max_ref_len": config.max_ref_len,
            "sound_norm_refs": config.sound_norm_refs,
@ -454,13 +424,6 @@ class Xtts(BaseTTS):
        gpt_cond_len=6,
        max_ref_len=10,
        sound_norm_refs=False,
        # Decoder inference
        decoder_iterations=100,
        cond_free=True,
        cond_free_k=2,
        diffusion_temperature=1.0,
        decoder_sampler="ddim",
        decoder="hifigan",
        **hf_generate_kwargs,
    ):
        """
--- a/TTS/utils/manage.py
+++ b/TTS/utils/manage.py
@ -394,6 +394,7 @@ class ModelManager(object):
            # ToDo: we need a better way to handle it
            if "xtts" in model_name:
                try:
                    #raise Exception(" > XTTS models are not supported yet.")
                    self.check_if_configs_are_equal(model_name, model_item, output_path)
                except:
                    pass