add docstrings with default value fixes

2021-05-13 16:04:49 +02:00 · 2021-05-13 16:04:49 +02:00 · 8b1014d188
parent 7e02cff924
commit 8b1014d188
16 changed files with 1091 additions and 75 deletions
--- a/TTS/config/shared_configs.py
+++ b/TTS/config/shared_configs.py
@ -13,7 +13,7 @@ class BaseAudioConfig(Coqpit):
            Number of STFT frequency levels, a.k.a. the size of the linear spectrogram frame. Defaults to 1024.
        win_length (int):
            Each frame of audio is windowed by window of length ```win_length``` and then padded with zeros to match
-            ```fft_size```. Defaults to 256.
+            ```fft_size```. Defaults to 1024.
        hop_length (int):
            Number of audio samples between adjacent STFT columns. Defaults to 1024.
        frame_shift_ms (int):
@ -21,7 +21,7 @@ class BaseAudioConfig(Coqpit):
        frame_length_ms (int):
            Set ```win_length``` based on milliseconds and sampling rate.
        stft_pad_mode (str):
-            Padding method used in STFT. 'reflect' or 'center'.
+            Padding method used in STFT. 'reflect' or 'center'. Defaults to 'reflect'.
        sample_rate (int):
            Audio sampling rate. Defaults to 22050.
        resample (bool):
@ -135,11 +135,27 @@ class BaseAudioConfig(Coqpit):

@dataclass
 class BaseDatasetConfig(Coqpit):
-    name: str = None
-    path: str = None
-    meta_file_train: Union[str, List] = None  # TODO: don't take ignored speakers for multi-speaker datasets over this. This is Union for SC-Glow compat.
-    meta_file_val: str = None
-    meta_file_attn_mask: str = None
+    """Base config for TTS datasets.
+
+    Args:
+        name (str):
+            Dataset name that defines the preprocessor in use. Defaults to None.
+        path (str):
+            Root path to the dataset files. Defaults to None.
+        meta_file_train (Union[str, List]):
+            Name of the dataset meta file. Or a list of speakers to be ignored at training for multi-speaker datasets.
+            Defaults to None.
+        meta_file_val (str):
+            Name of the dataset meta file that defines the instances used at validation.
+        meta_file_attn_mask (str):
+            Path to the file that lists the attention mask files used with models that require attention masks to
+            train the duration predictor.
+    """
+    name: str = ''
+    path: str = ''
+    meta_file_train: Union[str, List] = ''  # TODO: don't take ignored speakers for multi-speaker datasets over this. This is Union for SC-Glow compat.
+    meta_file_val: str = ''
+    meta_file_attn_mask: str = ''

    def check_values(
        self,
@ -161,12 +177,8 @@ class BaseTrainingConfig(Coqpit):
    Args:
        batch_size (int):
            Training batch size.
-        batch_group_size (int):
-            Number of batches to shuffle after bucketing.
        eval_batch_size (int):
            Validation batch size.
-        loss_masking (bool):
-            Enable / Disable masking padding segments of sequences.
        mixed_precision (bool):
            Enable / Disable mixed precision training. It reduces the VRAM use and allows larger batch sizes, however
            it may also cause numerical instability in some cases.
@ -195,34 +207,13 @@ class BaseTrainingConfig(Coqpit):
        keep_after (int):
            Number of steps to wait before saving all the best models. In use if ```keep_all_best == True```. Defaults
            to 10000.
-        text_cleaner (str):
-            Text cleaner to be used at model training. It is set to be one of the cleaners in
-            ```TTS.tts.utils.text.cleaners```.
-        enable_eos_bos_chars (bool):
-            Enable / Disable using special characters indicating end-of-sentence and begining-of-sentence.
        num_loader_workers (int):
            Number of workers for training time dataloader.
        num_val_loader_workers (int):
            Number of workers for evaluation time dataloader.
-        min_seq_len (int):
-            Minimum sequence length to be used at training.
-        max_seq_len (int):
-            Maximum sequence length to be used at training. VRAM use at training depends on this parameter. Consider to
-            decrease it if you get OOM errors.
-        compute_f0 (bool):
-            Return F0 frames from the dataloader. Defaults to ```False```.
-        compute_input_seq_cache (bool):
-            Enable / Disable computing and caching phonemes sequences from character sequences at the begining of the
-            training. It allows faster data loading times and more precise max-min sequence prunning. Defaults
-            to ```False```.
        output_path (str):
            Path for training output folder. The nonexist part of the given path is created automatically.
            All training outputs are saved there.
-        phoneme_cache_path (str):
-            Path to a folder to save the computed phoneme sequences.
-        datasets (List[BaseDatasetConfig]):
-            ist of DatasetConfig.
-
    """

    model: str = None
--- a/TTS/tts/configs/align_tts_config.py
+++ b/TTS/tts/configs/align_tts_config.py
@ -1,11 +1,69 @@
 from dataclasses import dataclass, field

-from .shared_configs import BaseTTSConfig
+from TTS.tts.configs.shared_configs import BaseTTSConfig


@dataclass
 class AlignTTSConfig(BaseTTSConfig):
-    """Defines parameters for AlignTTS model."""
+    """Defines parameters for AlignTTS model.
+
+    Example:
+
+        >>> from TTS.tts.configs import AlignTTSConfig
+        >>> config = AlignTTSConfig()
+
+    Args:
+        model(str):
+            Model name used for selecting the right model at initialization. Defaults to `align_tts`.
+        positional_encoding (bool):
+            enable / disable positional encoding applied to the encoder output. Defaults to True.
+        hidden_channels (int):
+            Base number of hidden channels. Defines all the layers except ones defined by the specific encoder or decoder
+            parameters. Defaults to 256.
+        hidden_channels_dp (int):
+            Number of hidden channels of the duration predictor's layers. Defaults to 256.
+        encoder_type (str):
+            Type of the encoder used by the model. Look at `TTS.tts.layers.feed_forward.encoder` for more details.
+            Defaults to `fftransformer`.
+        encoder_params (dict):
+            Parameters used to define the encoder network. Look at `TTS.tts.layers.feed_forward.encoder` for more details.
+            Defaults to `{"hidden_channels_ffn": 1024, "num_heads": 2, "num_layers": 6, "dropout_p": 0.1}`.
+        decoder_type (str):
+            Type of the decoder used by the model. Look at `TTS.tts.layers.feed_forward.decoder` for more details.
+            Defaults to `fftransformer`.
+        decoder_params (dict):
+            Parameters used to define the decoder network. Look at `TTS.tts.layers.feed_forward.decoder` for more details.
+            Defaults to `{"hidden_channels_ffn": 1024, "num_heads": 2, "num_layers": 6, "dropout_p": 0.1}`.
+        phase_start_steps (List[int]):
+            A list of number of steps required to start the next training phase. AlignTTS has 4 different training
+            phases. Thus you need to define 4 different values to enable phase based training. If None, it
+            trains the whole model together. Defaults to None.
+        ssim_alpha (float):
+            Weight for the SSIM loss. If set <= 0, disables the SSIM loss. Defaults to 1.0.
+        duration_loss_alpha (float):
+            Weight for the duration predictor's loss. Defaults to 1.0.
+        mdn_alpha (float):
+            Weight for the MDN loss. Defaults to 1.0.
+        spec_loss_alpha (float):
+            Weight for the MSE spectrogram loss. If set <= 0, disables the spectrogram loss. Defaults to 1.0.
+        use_speaker_embedding (bool):
+            enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
+            in the multi-speaker mode. Defaults to False.
+        use_external_speaker_embedding_file (bool):
+            enable / disable using external speaker embeddings in place of the learned embeddings. Defaults to False.
+        external_speaker_embedding_file (str):
+            Path to the file including pre-computed speaker embeddings. Defaults to None.
+        noam_schedule (bool):
+            enable / disable the use of Noam LR scheduler. Defaults to False.
+        warmup_steps (int):
+            Number of warm-up steps for the Noam scheduler. Defaults to 4000.
+        lr (float):
+            Initial learning rate. Defaults to `1e-3`.
+        wd (float):
+            Weight decay coefficient. Defaults to `1e-7`.
+        min_seq_len (int):
+            Minimum input sequence length to be used at training.
+        max_seq_len (int):
+            Maximum input sequence length to be used at training. Larger values result in more VRAM usage."""

    model: str = "align_tts"
    # model specific params
--- a/TTS/tts/configs/glow_tts_config.py
+++ b/TTS/tts/configs/glow_tts_config.py
@ -1,11 +1,64 @@
 from dataclasses import dataclass, field

-from .shared_configs import BaseTTSConfig
+from TTS.tts.configs.shared_configs import BaseTTSConfig


@dataclass
 class GlowTTSConfig(BaseTTSConfig):
-    """Defines parameters for GlowTTS model."""
+    """Defines parameters for GlowTTS model.
+
+    Example:
+
+        >>> from TTS.tts.configs import GlowTTSConfig
+        >>> config = GlowTTSConfig()
+
+    Args:
+        model(str):
+            Model name used for selecting the right model at initialization. Defaults to `glow_tts`.
+        encoder_type (str):
+            Type of the encoder used by the model. Look at `TTS.tts.layers.glow_tts.encoder` for more details.
+            Defaults to `rel_pos_transformers`.
+        encoder_params (dict):
+            Parameters used to define the encoder network. Look at `TTS.tts.layers.glow_tts.encoder` for more details.
+            Defaults to `{"kernel_size": 3, "dropout_p": 0.1, "num_layers": 6, "num_heads": 2, "hidden_channels_ffn": 768}`
+        use_encoder_prenet (bool):
+            enable / disable the use of a prenet for the encoder. Defaults to True.
+        hidden_channels_encoder (int):
+            Number of base hidden channels used by the encoder network. It defines the input and the output channel sizes,
+            and for some encoder types internal hidden channels sizes too. Defaults to 192.
+        hidden_channels_decoder (int):
+            Number of base hidden channels used by the decoder WaveNet network. Defaults to 192 as in the original work.
+        hidden_channels_duration_predictor (int):
+            Number of layer channels of the duration predictor network. Defaults to 256 as in the original work.
+        data_dep_init_steps (int):
+            Number of steps used for computing normalization parameters at the beginning of the training. GlowTTS uses
+            Activation Normalization that pre-computes normalization stats at the beginning and use the same values
+            for the rest. Defaults to 10.
+        style_wav_for_test (str):
+            Path to the wav file used for changing the style of the speech. Defaults to None.
+        inference_noise_scale (float):
+            Variance used for sampling the random noise added to the decoder's input at inference. Defaults to 0.0.
+        use_speaker_embedding (bool):
+            enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
+            in the multi-speaker mode. Defaults to False.
+        use_external_speaker_embedding_file (bool):
+            enable / disable using external speaker embeddings in place of the learned embeddings. Defaults to False.
+        external_speaker_embedding_file (str):
+            Path to the file including pre-computed speaker embeddings. Defaults to None.
+        noam_schedule (bool):
+            enable / disable the use of Noam LR scheduler. Defaults to False.
+        warmup_steps (int):
+            Number of warm-up steps for the Noam scheduler. Defaults to 4000.
+        lr (float):
+            Initial learning rate. Defaults to `1e-3`.
+        wd (float):
+            Weight decay coefficient. Defaults to `1e-7`.
+        min_seq_len (int):
+            Minimum input sequence length to be used at training.
+        max_seq_len (int):
+            Maximum input sequence length to be used at training. Larger values result in more VRAM usage.
+    """
+

    model: str = "glow_tts"

@ -47,4 +100,4 @@ class GlowTTSConfig(BaseTTSConfig):
    # overrides
    min_seq_len: int = 3
    max_seq_len: int = 500
-    r: int = 1
+    r: int = 1  # DO NOT CHANGE - TODO: make this immutable once coqpit implements it.
--- a/TTS/tts/configs/shared_configs.py
+++ b/TTS/tts/configs/shared_configs.py
@ -8,8 +8,20 @@ from TTS.config import BaseAudioConfig, BaseDatasetConfig, BaseTrainingConfig

@dataclass
 class GSTConfig(Coqpit):
-    """Defines Global Style Toke module"""
+    """Defines the Global Style Token Module

+    Args:
+        gst_style_input_wav (str):
+            Path to the wav file used to define the style of the output speech at inference. Defaults to None.
+        gst_style_input_weights (dict):
+            Defines the weights for each style token used at inference. Defaults to None.
+        gst_embedding_dim (int):
+            Defines the size of the GST embedding vector dimensions. Defaults to 256.
+        gst_num_heads (int):
+            Number of attention heads used by the multi-head attention. Defaults to 4.
+        gst_num_style_tokens (int):
+            Number of style token vectors. Defaults to 10.
+    """
    gst_style_input_wav: str = None
    gst_style_input_weights: dict = None
    gst_embedding_dim: int = 256
@ -33,7 +45,26 @@ class GSTConfig(Coqpit):

@dataclass
 class CharactersConfig(Coqpit):
-    """Defines character or phoneme set used by the model"""
+    """Defines character or phoneme set used by the model
+
+    Args:
+        pad (str):
+            characters in place of empty padding. Defaults to None.
+        eos (str):
+            characters showing the end of a sentence. Defaults to None.
+        bos (str):
+            characters showing the beginning of a sentence. Defaults to None.
+        characters (str):
+            character set used by the model. Characters not in this list are ignored when converting input text to
+            a list of sequence IDs. Defaults to None.
+        punctuations (str):
+            characters considered as punctuation when parsing the input sentence. Defaults to None.
+        phonemes (str):
+            characters considered as phonemes when parsing the input. Defaults to None.
+        unique (bool):
+            remove any duplicate characters in the character lists. It is a bandaid for compatibility with the old
+            models trained with character lists with duplicates.
+    """

    pad: str = None
    eos: str = None
@ -58,7 +89,48 @@ class CharactersConfig(Coqpit):

@dataclass
 class BaseTTSConfig(BaseTrainingConfig):
-    """Shared parameters among all the tts models."""
+    """Shared parameters among all the tts models.
+
+    Args:
+        audio (BaseAudioConfig):
+            Audio processor config object instance.
+        use_phonemes (bool):
+            enable / disable phoneme use.
+        compute_input_seq_cache (bool):
+            enable / disable precomputation of the phoneme sequences. At the expense of some delay at the beginning of
+            the training, it allows faster data loading and precise limitation with `max_seq_len` and
+            `min_seq_len`.
+        text_cleaner (str):
+            Name of the text cleaner used for cleaning and formatting transcripts.
+        enable_eos_bos_chars (bool):
+            enable / disable the use of eos and bos characters.
+        test_senteces_file (str):
+            Path to a txt file that has sentences used at test time. The file must have a sentence per line.
+        phoneme_cache_path (str):
+            Path to the output folder caching the computed phonemes for each sample.
+        characters (CharactersConfig):
+            Instance of a CharactersConfig class.
+        batch_group_size (int):
+            Size of the batch groups used for bucketing. By default, the dataloader orders samples by the sequence
+            length for a more efficient and stable training. If `batch_group_size > 1` then it performs bucketing to
+            prevent using the same batches for each epoch.
+        loss_masking (bool):
+            enable / disable masking loss values against padded segments of samples in a batch.
+        min_seq_len (int):
+            Minimum input sequence length to be used at training.
+        max_seq_len (int):
+            Maximum input sequence length to be used at training. Larger values result in more VRAM usage.
+        compute_f0 (bool):
+            (Not in use yet).
+        use_noise_augment (bool):
+            Augment the input audio with random noise.
+        add_blank (bool):
+            Add a blank character between every two characters. It improves performance for some models at the expense
+            of slower run-time due to the longer input sequence.
+        datasets (List[BaseDatasetConfig]):
+            List of datasets used for training. If multiple datasets are provided, they are merged and used together
+            for training.
+        """

    audio: BaseAudioConfig = field(default_factory=BaseAudioConfig)
    # phoneme settings
--- a/TTS/tts/configs/speedy_speech_config.py
+++ b/TTS/tts/configs/speedy_speech_config.py
@ -1,11 +1,74 @@
 from dataclasses import dataclass, field

-from .shared_configs import BaseTTSConfig
+from TTS.tts.configs.shared_configs import BaseTTSConfig


@dataclass
 class SpeedySpeechConfig(BaseTTSConfig):
-    """Defines parameters for Speedy Speech (feed-forward encoder-decoder) based models."""
+    """Defines parameters for Speedy Speech (feed-forward encoder-decoder) based models.
+
+    Example:
+
+        >>> from TTS.tts.configs import SpeedySpeechConfig
+        >>> config = SpeedySpeechConfig()
+
+    Args:
+        model (str):
+            Model name used for selecting the right model at initialization. Defaults to `speedy_speech`.
+        positional_encoding (bool):
+            enable / disable positional encoding applied to the encoder output. Defaults to True.
+        hidden_channels (int):
+            Base number of hidden channels. Defines all the layers except ones defined by the specific encoder or decoder
+            parameters. Defaults to 128.
+        encoder_type (str):
+            Type of the encoder used by the model. Look at `TTS.tts.layers.feed_forward.encoder` for more details.
+            Defaults to `residual_conv_bn`.
+        encoder_params (dict):
+            Parameters used to define the encoder network. Look at `TTS.tts.layers.feed_forward.encoder` for more details.
+            Defaults to `{"kernel_size": 4, "dilations": [1, 2, 4, 1, 2, 4, 1, 2, 4, 1, 2, 4, 1], "num_conv_blocks": 2, "num_res_blocks": 13}`
+        decoder_type (str):
+            Type of the decoder used by the model. Look at `TTS.tts.layers.feed_forward.decoder` for more details.
+            Defaults to `residual_conv_bn`.
+        decoder_params (dict):
+            Parameters used to define the decoder network. Look at `TTS.tts.layers.feed_forward.decoder` for more details.
+            Defaults to `{"kernel_size": 4, "dilations": [1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1], "num_conv_blocks": 2, "num_res_blocks": 17}`
+        hidden_channels_encoder (int):
+            Number of base hidden channels used by the encoder network. It defines the input and the output channel sizes,
+            and for some encoder types internal hidden channels sizes too. Defaults to 192.
+        hidden_channels_decoder (int):
+            Number of base hidden channels used by the decoder WaveNet network. Defaults to 192 as in the original work.
+        hidden_channels_duration_predictor (int):
+            Number of layer channels of the duration predictor network. Defaults to 256 as in the original work.
+        data_dep_init_steps (int):
+            Number of steps used for computing normalization parameters at the beginning of the training. GlowTTS uses
+            Activation Normalization that pre-computes normalization stats at the beginning and use the same values
+            for the rest. Defaults to 10.
+        use_speaker_embedding (bool):
+            enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
+            in the multi-speaker mode. Defaults to False.
+        use_external_speaker_embedding_file (bool):
+            enable / disable using external speaker embeddings in place of the learned embeddings. Defaults to False.
+        external_speaker_embedding_file (str):
+            Path to the file including pre-computed speaker embeddings. Defaults to None.
+        noam_schedule (bool):
+            enable / disable the use of Noam LR scheduler. Defaults to False.
+        warmup_steps (int):
+            Number of warm-up steps for the Noam scheduler. Defaults to 4000.
+        lr (float):
+            Initial learning rate. Defaults to `1e-3`.
+        wd (float):
+            Weight decay coefficient. Defaults to `1e-7`.
+        ssim_alpha (float):
+            Weight for the SSIM loss. If set <= 0, disables the SSIM loss. Defaults to 1.0.
+        huber_alpha (float):
+            Weight for the duration predictor's loss. Defaults to 1.0.
+        l1_alpha (float):
+            Weight for the L1 spectrogram loss. If set <= 0, disables the L1 loss. Defaults to 1.0.
+        min_seq_len (int):
+            Minimum input sequence length to be used at training.
+        max_seq_len (int):
+            Maximum input sequence length to be used at training. Larger values result in more VRAM usage.
+    """

    model: str = "speedy_speech"
    # model specific params
@ -50,4 +113,4 @@ class SpeedySpeechConfig(BaseTTSConfig):
    # overrides
    min_seq_len: int = 13
    max_seq_len: int = 200
-    r: int = 1
+    r: int = 1  #DO NOT CHANGE
--- a/TTS/tts/configs/tacotron2_config.py
+++ b/TTS/tts/configs/tacotron2_config.py
@ -5,6 +5,114 @@ from TTS.tts.configs.tacotron_config import TacotronConfig

@dataclass
 class Tacotron2Config(TacotronConfig):
-    """Defines parameters for Tacotron2 based models."""
+    """Defines parameters for Tacotron2 based models.
+
+    Example:
+
+        >>> from TTS.tts.configs import Tacotron2Config
+        >>> config = Tacotron2Config()
+
+    Args:
+        model (str):
+            Model name used to select the right model class to initialize. Defaults to `tacotron2`.
+        use_gst (bool):
+            enable / disable the use of Global Style Token modules. Defaults to False.
+        gst (GSTConfig):
+            Instance of `GSTConfig` class.
+        gst_style_input (str):
+            Path to the wav file used at inference to set the speech style through GST. If `GST` is enabled and
+            this is not defined, the model uses a zero vector as an input. Defaults to None.
+        r (int):
+            Number of output frames that the decoder computed per iteration. Larger values makes training and inference
+            faster but reduces the quality of the output frames. This needs to be tuned considering your own needs.
+            Defaults to 1.
+        gradual_training (List[List]):
+            Parameters for the gradual training schedule. It is in the form `[[a, b, c], [d, e, f], ...]` where `a` is
+            the step number to start using the rest of the values, `b` is the `r` value and `c` is the batch size.
+            If set to None, no gradual training is used. Defaults to None.
+        memory_size (int):
+            Defines the number of previous frames used by the Prenet. If set to < 0, then it uses only the last frame.
+            Defaults to -1.
+        prenet_type (str):
+            `original` or `bn`. `original` sets the default Prenet and `bn` uses Batch Normalization version of the
+            Prenet. Defaults to `original`.
+        prenet_dropout (bool):
+            enables / disables the use of dropout in the Prenet. Defaults to True.
+        prenet_dropout_at_inference (bool):
+            enable / disable the use of dropout in the Prenet at the inference time. Defaults to False.
+        stopnet (bool):
+            enable / disable the Stopnet that predicts the end of the decoder sequence. Defaults to True.
+        stopnet_pos_weight (float):
+            Weight that is applied to over-weight positive instances in the Stopnet loss. Use larger values with
+            datasets with longer sentences. Defaults to 10.
+        separate_stopnet (bool):
+            Use a distinct Stopnet which is trained separately from the rest of the model. Defaults to True.
+        attention_type (str):
+            attention type. Check ```TTS.tts.layers.attentions.init_attn```. Defaults to 'original'.
+        attention_heads (int):
+            Number of attention heads for GMM attention. Defaults to 5.
+        windowing (bool):
+            It is especially useful at inference to keep attention alignment diagonal. Defaults to False.
+        use_forward_attn (bool):
+            It is only valid if ```attention_type``` is ```original```. Defaults to False.
+        forward_attn_mask (bool):
+            enable/disable extra masking over forward attention. It is useful at inference to prevent
+            possible attention failures. Defaults to False.
+        transition_agent (bool):
+            enable/disable transition agent in forward attention. Defaults to False.
+        location_attn (bool):
+            enable/disable location sensitive attention as in the original Tacotron2 paper.
+            It is only valid if ```attention_type``` is ```original```. Defaults to True.
+        bidirectional_decoder (bool):
+            enable/disable bidirectional decoding. Defaults to False.
+        double_decoder_consistency (bool):
+            enable/disable double decoder consistency. Defaults to False.
+        ddc_r (int):
+            reduction rate used by the coarse decoder when `double_decoder_consistency` is in use. Set this
+            as a multiple of the `r` value. Defaults to 6.
+        use_speaker_embedding (bool):
+            enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
+            in the multi-speaker mode. Defaults to False.
+        use_external_speaker_embedding_file (bool):
+            enable / disable using external speaker embeddings in place of the learned embeddings. Defaults to False.
+        external_speaker_embedding_file (str):
+            Path to the file including pre-computed speaker embeddings. Defaults to None.
+        noam_schedule (bool):
+            enable / disable the use of Noam LR scheduler. Defaults to False.
+        warmup_steps (int):
+            Number of warm-up steps for the Noam scheduler. Defaults to 4000.
+        lr (float):
+            Initial learning rate. Defaults to `1e-4`.
+        wd (float):
+            Weight decay coefficient. Defaults to `1e-6`.
+        grad_clip (float):
+            Gradient clipping threshold. Defaults to `5`.
+        seq_len_norm (bool):
+            enable / disable the sequence length normalization in the loss functions. If set True, loss of a sample
+            is divided by the sequence length. Defaults to False.
+        loss_masking (bool):
+            enable / disable masking the paddings of the samples in loss computation. Defaults to True.
+        decoder_loss_alpha (float):
+            Weight for the decoder loss of the Tacotron model. If set less than or equal to zero, it disables the
+            corresponding loss function. Defaults to 0.25
+        postnet_loss_alpha (float):
+            Weight for the postnet loss of the Tacotron model. If set less than or equal to zero, it disables the
+            corresponding loss function. Defaults to 0.25
+        postnet_diff_spec_alpha (float):
+            Weight for the postnet differential loss of the Tacotron model. If set less than or equal to zero, it disables the
+            corresponding loss function. Defaults to 0.25
+        decoder_diff_spec_alpha (float):
+            Weight for the decoder differential loss of the Tacotron model. If set less than or equal to zero, it disables the
+            corresponding loss function. Defaults to 0.25
+        decoder_ssim_alpha (float):
+            Weight for the decoder SSIM loss of the Tacotron model. If set less than or equal to zero, it disables the
+            corresponding loss function. Defaults to 0.25
+        postnet_ssim_alpha (float):
+            Weight for the postnet SSIM loss of the Tacotron model. If set less than or equal to zero, it disables the
+            corresponding loss function. Defaults to 0.25
+        ga_alpha (float):
+            Weight for the guided attention loss. If set less than or equal to zero, it disables the corresponding loss
+            function. Defaults to 5.
+    """

    model: str = "tacotron2"
--- a/TTS/tts/configs/tacotron_config.py
+++ b/TTS/tts/configs/tacotron_config.py
@ -1,12 +1,120 @@
 from dataclasses import dataclass
 from typing import List

-from .shared_configs import BaseTTSConfig, GSTConfig
+from TTS.tts.configs.shared_configs import BaseTTSConfig, GSTConfig


@dataclass
 class TacotronConfig(BaseTTSConfig):
-    """Defines parameters for Tacotron based models."""
+    """Defines parameters for Tacotron based models.
+
+    Example:
+
+        >>> from TTS.tts.configs import TacotronConfig
+        >>> config = TacotronConfig()
+
+    Args:
+        model (str):
+            Model name used to select the right model class to initialize. Defaults to `tacotron`.
+        use_gst (bool):
+            enable / disable the use of Global Style Token modules. Defaults to False.
+        gst (GSTConfig):
+            Instance of `GSTConfig` class.
+        gst_style_input (str):
+            Path to the wav file used at inference to set the speech style through GST. If `GST` is enabled and
+            this is not defined, the model uses a zero vector as an input. Defaults to None.
+        r (int):
+            Number of output frames that the decoder computes per iteration. Larger values make training and inference
+            faster but reduce the quality of the output frames. This needs to be tuned considering your own needs.
+            Defaults to 1.
+        gradual_training (List[List]):
+            Parameters for the gradual training schedule. It is in the form `[[a, b, c], [d ,e ,f] ..]` where `a` is
+            the step number to start using the rest of the values, `b` is the `r` value and `c` is the batch size.
+            If set to None, no gradual training is used. Defaults to None.
+        memory_size (int):
+            Defines the number of previous frames used by the Prenet. If set to < 0, then it uses only the last frame.
+            Defaults to -1.
+        prenet_type (str):
+            `original` or `bn`. `original` sets the default Prenet and `bn` uses Batch Normalization version of the
+            Prenet. Defaults to `original`.
+        prenet_dropout (bool):
+            enables / disables the use of dropout in the Prenet. Defaults to True.
+        prenet_dropout_at_inference (bool):
+            enable / disable the use of dropout in the Prenet at the inference time. Defaults to False.
+        stopnet (bool):
+            enable /disable the Stopnet that predicts the end of the decoder sequence. Defaults to True.
+        stopnet_pos_weight (float):
+            Weight that is applied to over-weight positive instances in the Stopnet loss. Use larger values with
+            datasets with longer sentences. Defaults to 10.
+        separate_stopnet (bool):
+            Use a distinct Stopnet which is trained separately from the rest of the model. Defaults to True.
+        attention_type (str):
+            attention type. Check ```TTS.tts.layers.attentions.init_attn```. Defaults to 'original'.
+        attention_heads (int):
+            Number of attention heads for GMM attention. Defaults to 5.
+        windowing (bool):
+            It is especially useful at inference to keep attention alignment diagonal. Defaults to False.
+        use_forward_attn (bool):
+            It is only valid if ```attention_type``` is ```original```. Defaults to False.
+        forward_attn_mask (bool):
+            enable/disable extra masking over forward attention. It is useful at inference to prevent
+            possible attention failures. Defaults to False.
+        transition_agent (bool):
+            enable/disable transition agent in forward attention. Defaults to False.
+        location_attn (bool):
+            enable/disable location sensitive attention as in the original Tacotron2 paper.
+            It is only valid if ```attention_type``` is ```original```. Defaults to True.
+        bidirectional_decoder (bool):
+            enable/disable bidirectional decoding. Defaults to False.
+        double_decoder_consistency (bool):
+            enable/disable double decoder consistency. Defaults to False.
+        ddc_r (int):
+            reduction rate used by the coarse decoder when `double_decoder_consistency` is in use. Set this
+            as a multiple of the `r` value. Defaults to 6.
+        use_speaker_embedding (bool):
+            enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
+            in the multi-speaker mode. Defaults to False.
+        use_external_speaker_embedding_file (bool):
+            enable /disable using external speaker embeddings in place of the learned embeddings. Defaults to False.
+        external_speaker_embedding_file (str):
+            Path to the file including pre-computed speaker embeddings. Defaults to None.
+        noam_schedule (bool):
+            enable / disable the use of Noam LR scheduler. Defaults to False.
+        warmup_steps (int):
+            Number of warm-up steps for the Noam scheduler. Defaults to 4000.
+        lr (float):
+            Initial learning rate. Defaults to `1e-4`.
+        wd (float):
+            Weight decay coefficient. Defaults to `1e-6`.
+        grad_clip (float):
+            Gradient clipping threshold. Defaults to `5`.
+        seq_len_norm (bool):
+            enable / disable the sequence length normalization in the loss functions. If set True, loss of a sample
+            is divided by the sequence length. Defaults to False.
+        loss_masking (bool):
+            enable / disable masking the paddings of the samples in loss computation. Defaults to True.
+        decoder_loss_alpha (float):
+            Weight for the decoder loss of the Tacotron model. If set less than or equal to zero, it disables the
+            corresponding loss function. Defaults to 0.25
+        postnet_loss_alpha (float):
+            Weight for the postnet loss of the Tacotron model. If set less than or equal to zero, it disables the
+            corresponding loss function. Defaults to 0.25
+        postnet_diff_spec_alpha (float):
+            Weight for the postnet differential loss of the Tacotron model. If set less than or equal to zero, it disables the
+            corresponding loss function. Defaults to 0.25
+        decoder_diff_spec_alpha (float):
+            Weight for the decoder differential loss of the Tacotron model. If set less than or equal to zero, it disables the
+            corresponding loss function. Defaults to 0.25
+        decoder_ssim_alpha (float):
+            Weight for the decoder SSIM loss of the Tacotron model. If set less than or equal to zero, it disables the
+            corresponding loss function. Defaults to 0.25
+        postnet_ssim_alpha (float):
+            Weight for the postnet SSIM loss of the Tacotron model. If set less than or equal to zero, it disables the
+            corresponding loss function. Defaults to 0.25
+        ga_alpha (float):
+            Weight for the guided attention loss. If set less than or equal to zero, it disables the corresponding loss
+            function. Defaults to 5.
+    """

    model: str = "tacotron"
    use_gst: bool = False
--- a/TTS/tts/datasets/preprocess.py
+++ b/TTS/tts/datasets/preprocess.py
@ -52,19 +52,19 @@ def load_meta_data(datasets, eval_split=True):
        print(f" | > Found {len(meta_data_train)} files in {Path(root_path).resolve()}")
        # load evaluation split if set
        if eval_split:
-            if meta_file_val is None:
-                meta_data_eval, meta_data_train = split_dataset(meta_data_train)
-            else:
+            if meta_file_val:
                meta_data_eval = preprocessor(root_path, meta_file_val)
+            else:
+                meta_data_eval, meta_data_train = split_dataset(meta_data_train)
            meta_data_eval_all += meta_data_eval
        meta_data_train_all += meta_data_train
        # load attention masks for duration predictor training
-        if dataset.meta_file_attn_mask is not None:
+        if dataset.meta_file_attn_mask:
            meta_data = dict(load_attention_mask_meta_data(dataset["meta_file_attn_mask"]))
            for idx, ins in enumerate(meta_data_train_all):
                attn_file = meta_data[ins[1]].strip()
                meta_data_train_all[idx].append(attn_file)
-            if meta_data_eval_all is not None:
+            if meta_data_eval_all:
                for idx, ins in enumerate(meta_data_eval_all):
                    attn_file = meta_data[ins[1]].strip()
                    meta_data_eval_all[idx].append(attn_file)
--- a/TTS/vocoder/configs/fullband_melgan_config.py
+++ b/TTS/vocoder/configs/fullband_melgan_config.py
@ -5,7 +5,62 @@ from .shared_configs import BaseGANVocoderConfig

@dataclass
 class FullbandMelganConfig(BaseGANVocoderConfig):
-    """Defines parameters for FullbandMelGAN vocoder."""
+    """Defines parameters for FullBand MelGAN vocoder.
+
+    Example:
+
+        >>> from TTS.vocoder.configs import FullbandMelganConfig
+        >>> config = FullbandMelganConfig()
+
+    Args:
+        model (str):
+            Model name used for selecting the right model at initialization. Defaults to `melgan`.
+        discriminator_model (str): One of the discriminators from `TTS.vocoder.models.*_discriminator`. Defaults to
+            `melgan_multiscale_discriminator`.
+        discriminator_model_params (dict): The discriminator model parameters. Defaults to
+            `{"base_channels": 16, "max_channels": 1024, "downsample_factors": [4, 4, 4, 4]}`
+        generator_model (str): One of the generators from `TTS.vocoder.models.*`. Every other non-GAN vocoder model is
+            considered as a generator too. Defaults to `melgan_generator`.
+        batch_size (int):
+            Batch size used at training. Larger values use more memory. Defaults to 16.
+        seq_len (int):
+            Audio segment length used at training. Larger values use more memory. Defaults to 8192.
+        pad_short (int):
+            Additional padding applied to the audio samples shorter than `seq_len`. Defaults to 0.
+        use_noise_augment (bool):
+            enable / disable random noise added to the input waveform. The noise is added after computing the
+            features. Defaults to True.
+        use_cache (bool):
+            enable / disable in memory caching of the computed features. It can cause OOM error if the system RAM is
+            not large enough. Defaults to True.
+        use_stft_loss (bool):
+            enable / disable use of STFT loss originally used by ParallelWaveGAN model. Defaults to True.
+        use_subband_stft (bool):
+            enable / disable use of subband loss computation originally used by MultiBandMelgan model. Defaults to True.
+        use_mse_gan_loss (bool):
+            enable / disable using Mean Squared Error GAN loss. Defaults to True.
+        use_hinge_gan_loss (bool):
+            enable / disable using Hinge GAN loss. You should choose either Hinge or MSE loss for training GAN models.
+            Defaults to False.
+        use_feat_match_loss (bool):
+            enable / disable using Feature Matching loss originally used by MelGAN model. Defaults to True.
+        use_l1_spec_loss (bool):
+            enable / disable using L1 spectrogram loss originally used by HifiGAN model. Defaults to False.
+        stft_loss_params (dict): STFT loss parameters. Defaults to
+            `{"n_ffts": [1024, 2048, 512], "hop_lengths": [120, 240, 50], "win_lengths": [600, 1200, 240]}`
+        stft_loss_weight (float): STFT loss weight that multiplies the computed loss before summing up the total
+            model loss. Defaults to 0.5.
+        subband_stft_loss_weight (float):
+            Subband STFT loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0.
+        mse_G_loss_weight (float):
+            MSE generator loss weight that multiplies the computed loss before summing up the total loss. Defaults to 2.5.
+        hinge_G_loss_weight (float):
+            Hinge generator loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0.
+        feat_match_loss_weight (float):
+            Feature matching loss weight that multiplies the computed loss before summing up the total loss. Defaults to 108.
+        l1_spec_loss_weight (float):
+            L1 spectrogram loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0.
+    """

    model: str = "melgan"

@ -48,4 +103,4 @@ class FullbandMelganConfig(BaseGANVocoderConfig):
    mse_G_loss_weight: float = 2.5
    hinge_G_loss_weight: float = 0
    feat_match_loss_weight: float = 108
-    l1_spec_loss_weight: float = 0
+    l1_spec_loss_weight: float = 0.0
--- a/TTS/vocoder/configs/hifigan_config.py
+++ b/TTS/vocoder/configs/hifigan_config.py
@ -1,11 +1,94 @@
 from dataclasses import dataclass, field

-from .shared_configs import BaseGANVocoderConfig
+from TTS.vocoder.configs.shared_configs import BaseGANVocoderConfig


@dataclass
 class HifiganConfig(BaseGANVocoderConfig):
-    """Defines parameters for HifiGAN vocoder."""
+    """Defines parameters for HifiGAN vocoder.
+
+    Example:
+
+        >>> from TTS.vocoder.configs import HifiganConfig
+        >>> config = HifiganConfig()
+
+    Args:
+        model (str):
+            Model name used for selecting the right model at initialization. Defaults to `hifigan`.
+        discriminator_model (str): One of the discriminators from `TTS.vocoder.models.*_discriminator`. Defaults to
+            `hifigan_discriminator`.
+        generator_model (str): One of the generators from `TTS.vocoder.models.*`. Every other non-GAN vocoder model is
+            considered as a generator too. Defaults to `hifigan_generator`.
+        generator_model_params (dict): Parameters of the generator model. Defaults to
+            `
+            {
+                "use_mel": True,
+                "sample_rate": 22050,
+                "n_fft": 1024,
+                "hop_length": 256,
+                "win_length": 1024,
+                "n_mels": 80,
+                "mel_fmin": 0.0,
+                "mel_fmax": None,
+            }
+            `
+        batch_size (int):
+            Batch size used at training. Larger values use more memory. Defaults to 16.
+        seq_len (int):
+            Audio segment length used at training. Larger values use more memory. Defaults to 8192.
+        pad_short (int):
+            Additional padding applied to the audio samples shorter than `seq_len`. Defaults to 0.
+        use_noise_augment (bool):
+            enable / disable random noise added to the input waveform. The noise is added after computing the
+            features. Defaults to True.
+        use_cache (bool):
+            enable / disable in memory caching of the computed features. It can cause OOM error if the system RAM is
+            not large enough. Defaults to True.
+        use_stft_loss (bool):
+            enable / disable use of STFT loss originally used by ParallelWaveGAN model. Defaults to True.
+        use_subband_stft (bool):
+            enable / disable use of subband loss computation originally used by MultiBandMelgan model. Defaults to True.
+        use_mse_gan_loss (bool):
+            enable / disable using Mean Squared Error GAN loss. Defaults to True.
+        use_hinge_gan_loss (bool):
+            enable / disable using Hinge GAN loss. You should choose either Hinge or MSE loss for training GAN models.
+            Defaults to False.
+        use_feat_match_loss (bool):
+            enable / disable using Feature Matching loss originally used by MelGAN model. Defaults to True.
+        use_l1_spec_loss (bool):
+            enable / disable using L1 spectrogram loss originally used by HifiGAN model. Defaults to False.
+        stft_loss_params (dict):
+            STFT loss parameters. Defaults to
+            `{
+                "n_ffts": [1024, 2048, 512],
+                "hop_lengths": [120, 240, 50],
+                "win_lengths": [600, 1200, 240]
+            }`
+        l1_spec_loss_params (dict):
+            L1 spectrogram loss parameters. Defaults to
+            `{
+                "use_mel": True,
+                "sample_rate": 22050,
+                "n_fft": 1024,
+                "hop_length": 256,
+                "win_length": 1024,
+                "n_mels": 80,
+                "mel_fmin": 0.0,
+                "mel_fmax": None,
+            }`
+        stft_loss_weight (float): STFT loss weight that multiplies the computed loss before summing up the total
+            model loss. Defaults to 0.5.
+        subband_stft_loss_weight (float):
+            Subband STFT loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0.
+        mse_G_loss_weight (float):
+            MSE generator loss weight that multiplies the computed loss before summing up the total loss. Defaults to 2.5.
+        hinge_G_loss_weight (float):
+            Hinge generator loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0.
+        feat_match_loss_weight (float):
+            Feature matching loss weight that multiplies the computed loss before summing up the total loss. Defaults to 108.
+        l1_spec_loss_weight (float):
+            L1 spectrogram loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0.
+    """

    model: str = "hifigan"
    # model specific params
--- a/TTS/vocoder/configs/melgan_config.py
+++ b/TTS/vocoder/configs/melgan_config.py
@ -1,11 +1,66 @@
 from dataclasses import dataclass, field

-from .shared_configs import BaseGANVocoderConfig
+from TTS.vocoder.configs.shared_configs import BaseGANVocoderConfig


@dataclass
 class MelganConfig(BaseGANVocoderConfig):
-    """Defines parameters for MelGAN vocoder."""
+    """Defines parameters for MelGAN vocoder.
+
+    Example:
+
+        >>> from TTS.vocoder.configs import MelganConfig
+        >>> config = MelganConfig()
+
+    Args:
+        model (str):
+            Model name used for selecting the right model at initialization. Defaults to `melgan`.
+        discriminator_model (str): One of the discriminators from `TTS.vocoder.models.*_discriminator`. Defaults to
+            `melgan_multiscale_discriminator`.
+        discriminator_model_params (dict): The discriminator model parameters. Defaults to
+            `{"base_channels": 16, "max_channels": 1024, "downsample_factors": [4, 4, 4, 4]}`
+        generator_model (str): One of the generators from `TTS.vocoder.models.*`. Every other non-GAN vocoder model is
+            considered as a generator too. Defaults to `melgan_generator`.
+        batch_size (int):
+            Batch size used at training. Larger values use more memory. Defaults to 16.
+        seq_len (int):
+            Audio segment length used at training. Larger values use more memory. Defaults to 8192.
+        pad_short (int):
+            Additional padding applied to the audio samples shorter than `seq_len`. Defaults to 0.
+        use_noise_augment (bool):
+            enable / disable random noise added to the input waveform. The noise is added after computing the
+            features. Defaults to True.
+        use_cache (bool):
+            enable / disable in memory caching of the computed features. It can cause OOM error if the system RAM is
+            not large enough. Defaults to True.
+        use_stft_loss (bool):
+            enable / disable use of STFT loss originally used by ParallelWaveGAN model. Defaults to True.
+        use_subband_stft (bool):
+            enable / disable use of subband loss computation originally used by MultiBandMelgan model. Defaults to True.
+        use_mse_gan_loss (bool):
+            enable / disable using Mean Squared Error GAN loss. Defaults to True.
+        use_hinge_gan_loss (bool):
+            enable / disable using Hinge GAN loss. You should choose either Hinge or MSE loss for training GAN models.
+            Defaults to False.
+        use_feat_match_loss (bool):
+            enable / disable using Feature Matching loss originally used by MelGAN model. Defaults to True.
+        use_l1_spec_loss (bool):
+            enable / disable using L1 spectrogram loss originally used by HifiGAN model. Defaults to False.
+        stft_loss_params (dict): STFT loss parameters. Defaults to
+            `{"n_ffts": [1024, 2048, 512], "hop_lengths": [120, 240, 50], "win_lengths": [600, 1200, 240]}`
+        stft_loss_weight (float): STFT loss weight that multiplies the computed loss before summing up the total
+            model loss. Defaults to 0.5.
+        subband_stft_loss_weight (float):
+            Subband STFT loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0.
+        mse_G_loss_weight (float):
+            MSE generator loss weight that multiplies the computed loss before summing up the total loss. Defaults to 2.5.
+        hinge_G_loss_weight (float):
+            Hinge generator loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0.
+        feat_match_loss_weight (float):
+            Feature matching loss weight that multiplies the computed loss before summing up the total loss. Defaults to 108.
+        l1_spec_loss_weight (float):
+            L1 spectrogram loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0.
+    """

    model: str = "melgan"

--- a/TTS/vocoder/configs/multiband_melgan_config.py
+++ b/TTS/vocoder/configs/multiband_melgan_config.py
@ -1,11 +1,95 @@
 from dataclasses import dataclass, field

-from .shared_configs import BaseGANVocoderConfig
+from TTS.vocoder.configs.shared_configs import BaseGANVocoderConfig


@dataclass
 class MultibandMelganConfig(BaseGANVocoderConfig):
-    """Defines parameters for MultiBandMelGAN vocoder."""
+    """Defines parameters for MultiBandMelGAN vocoder.
+
+    Example:
+
+        >>> from TTS.vocoder.configs import MultibandMelganConfig
+        >>> config = MultibandMelganConfig()
+
+    Args:
+        model (str):
+            Model name used for selecting the right model at initialization. Defaults to `multiband_melgan`.
+        discriminator_model (str): One of the discriminators from `TTS.vocoder.models.*_discriminator`. Defaults to
+            `melgan_multiscale_discriminator`.
+        discriminator_model_params (dict): The discriminator model parameters. Defaults to
+            `{
+                "base_channels": 16,
+                "max_channels": 512,
+                "downsample_factors": [4, 4, 4]
+            }`
+        generator_model (str): One of the generators from `TTS.vocoder.models.*`. Every other non-GAN vocoder model is
+            considered as a generator too. Defaults to `melgan_generator`.
+        generator_model_params (dict):
+            The generator model parameters. Defaults to `{"upsample_factors": [8, 4, 2], "num_res_blocks": 4}`.
+        use_pqmf (bool):
+            enable / disable PQMF modulation for multi-band training. Defaults to True.
+        lr_gen (float):
+            Initial learning rate for the generator model. Defaults to 0.0001.
+        lr_disc (float):
+            Initial learning rate for the discriminator model. Defaults to 0.0001.
+        optimizer (torch.optim.Optimizer):
+            Optimizer used for the training. Defaults to `AdamW`.
+        optimizer_params (dict):
+            Optimizer kwargs. Defaults to `{"betas": [0.8, 0.99], "weight_decay": 0.0}`
+        lr_scheduler_gen (torch.optim.Scheduler):
+            Learning rate scheduler for the generator. Defaults to `MultiStepLR`.
+        lr_scheduler_gen_params (dict):
+            Parameters for the generator learning rate scheduler. Defaults to
+            `{"gamma": 0.5, "milestones": [100000, 200000, 300000, 400000, 500000, 600000]}`.
+        lr_scheduler_disc (torch.optim.Scheduler):
+            Learning rate scheduler for the discriminator. Defaults to `MultiStepLR`.
+        lr_scheduler_disc_params (dict):
+            Parameters for the discriminator learning rate scheduler. Defaults to
+            `{"gamma": 0.5, "milestones": [100000, 200000, 300000, 400000, 500000, 600000]}`.
+        batch_size (int):
+            Batch size used at training. Larger values use more memory. Defaults to 16.
+        seq_len (int):
+            Audio segment length used at training. Larger values use more memory. Defaults to 8192.
+        pad_short (int):
+            Additional padding applied to the audio samples shorter than `seq_len`. Defaults to 0.
+        use_noise_augment (bool):
+            enable / disable random noise added to the input waveform. The noise is added after computing the
+            features. Defaults to True.
+        use_cache (bool):
+            enable / disable in memory caching of the computed features. It can cause OOM error if the system RAM is
+            not large enough. Defaults to True.
+        steps_to_start_discriminator (int):
+            Number of steps required to start training the discriminator. Defaults to 0.
+        use_stft_loss (bool):
+            enable / disable use of STFT loss originally used by ParallelWaveGAN model. Defaults to True.
+        use_subband_stft (bool):
+            enable / disable use of subband loss computation originally used by MultiBandMelgan model. Defaults to True.
+        use_mse_gan_loss (bool):
+            enable / disable using Mean Squared Error GAN loss. Defaults to True.
+        use_hinge_gan_loss (bool):
+            enable / disable using Hinge GAN loss. You should choose either Hinge or MSE loss for training GAN models.
+            Defaults to False.
+        use_feat_match_loss (bool):
+            enable / disable using Feature Matching loss originally used by MelGAN model. Defaults to True.
+        use_l1_spec_loss (bool):
+            enable / disable using L1 spectrogram loss originally used by HifiGAN model. Defaults to False.
+        stft_loss_params (dict): STFT loss parameters. Defaults to
+            `{"n_ffts": [1024, 2048, 512], "hop_lengths": [120, 240, 50], "win_lengths": [600, 1200, 240]}`
+        stft_loss_weight (float): STFT loss weight that multiplies the computed loss before summing up the total
+            model loss. Defaults to 0.5.
+        subband_stft_loss_weight (float):
+            Subband STFT loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0.
+        mse_G_loss_weight (float):
+            MSE generator loss weight that multiplies the computed loss before summing up the total loss. Defaults to 2.5.
+        hinge_G_loss_weight (float):
+            Hinge generator loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0.
+        feat_match_loss_weight (float):
+            Feature matching loss weight that multiplies the computed loss before summing up the total loss. Defaults to 108.
+        l1_spec_loss_weight (float):
+            L1 spectrogram loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0.
+    """
+

    model: str = "multiband_melgan"

@ -59,7 +143,3 @@ class MultibandMelganConfig(BaseGANVocoderConfig):
    hinge_G_loss_weight: float = 0
    feat_match_loss_weight: float = 108
    l1_spec_loss_weight: float = 0
-
-    # optimizer parameters
-    lr: float = 1e-4
-    wd: float = 1e-6
--- a/TTS/vocoder/configs/parallel_wavegan_config.py
+++ b/TTS/vocoder/configs/parallel_wavegan_config.py
@ -5,7 +5,77 @@ from .shared_configs import BaseGANVocoderConfig

@dataclass
 class ParallelWaveganConfig(BaseGANVocoderConfig):
-    """Defines parameters for ParallelWavegan vocoder."""
+    """Defines parameters for ParallelWavegan vocoder.
+
+    Args:
+        model (str):
+            Model name used for selecting the right configuration at initialization. Defaults to `parallel_wavegan`.
+        discriminator_model (str): One of the discriminators from `TTS.vocoder.models.*_discriminator`. Defaults to
+            `parallel_wavegan_discriminator`.
+        discriminator_model_params (dict): The discriminator model kwargs. Defaults to
+            `{"num_layers": 10}`
+        generator_model (str): One of the generators from `TTS.vocoder.models.*`. Every other non-GAN vocoder model is
+            considered as a generator too. Defaults to `parallel_wavegan_generator`.
+        generator_model_params (dict):
+            The generator model kwargs. Defaults to `{"upsample_factors": [4, 4, 4, 4], "stacks": 3, "num_res_blocks": 30}`.
+        batch_size (int):
+            Batch size used at training. Larger values use more memory. Defaults to 16.
+        seq_len (int):
+            Audio segment length used at training. Larger values use more memory. Defaults to 8192.
+        pad_short (int):
+            Additional padding applied to the audio samples shorter than `seq_len`. Defaults to 0.
+        use_noise_augment (bool):
+            enable / disable random noise added to the input waveform. The noise is added after computing the
+            features. Defaults to True.
+        use_cache (bool):
+            enable / disable in memory caching of the computed features. It can cause OOM error if the system RAM is
+            not large enough. Defaults to True.
+        steps_to_start_discriminator (int):
+            Number of steps required to start training the discriminator. Defaults to 0.
+        use_stft_loss (bool):
+            enable / disable use of STFT loss originally used by ParallelWaveGAN model. Defaults to True.
+        use_subband_stft (bool):
+            enable / disable use of subband loss computation originally used by MultiBandMelgan model. Defaults to True.
+        use_mse_gan_loss (bool):
+            enable / disable using Mean Squared Error GAN loss. Defaults to True.
+        use_hinge_gan_loss (bool):
+            enable / disable using Hinge GAN loss. You should choose either Hinge or MSE loss for training GAN models.
+            Defaults to False.
+        use_feat_match_loss (bool):
+            enable / disable using Feature Matching loss originally used by MelGAN model. Defaults to True.
+        use_l1_spec_loss (bool):
+            enable / disable using L1 spectrogram loss originally used by HifiGAN model. Defaults to False.
+        stft_loss_params (dict): STFT loss parameters. Defaults to
+            `{"n_ffts": [1024, 2048, 512], "hop_lengths": [120, 240, 50], "win_lengths": [600, 1200, 240]}`
+        stft_loss_weight (float): STFT loss weight that multiplies the computed loss before summing up the total
+            model loss. Defaults to 0.5.
+        subband_stft_loss_weight (float):
+            Subband STFT loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0.
+        mse_G_loss_weight (float):
+            MSE generator loss weight that multiplies the computed loss before summing up the total loss. Defaults to 2.5.
+        hinge_G_loss_weight (float):
+            Hinge generator loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0.
+        feat_match_loss_weight (float):
+            Feature matching loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0.
+        l1_spec_loss_weight (float):
+            L1 spectrogram loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0.
+        lr_gen (float):
+            Generator model initial learning rate. Defaults to 0.0002.
+        lr_disc (float):
+            Discriminator model initial learning rate. Defaults to 0.0002.
+        optimizer (torch.optim.Optimizer):
+            Optimizer used for the training. Defaults to `AdamW`.
+        optimizer_params (dict):
+            Optimizer kwargs. Defaults to `{"betas": [0.8, 0.99], "weight_decay": 0.0}`
+        lr_scheduler_gen (torch.optim.Scheduler):
+            Learning rate scheduler for the generator. Defaults to `ExponentialLR`.
+        lr_scheduler_gen_params (dict):
+            Parameters for the generator learning rate scheduler. Defaults to `{"gamma": 0.999, "last_epoch": -1}`.
+        lr_scheduler_disc (torch.optim.Scheduler):
+            Learning rate scheduler for the discriminator. Defaults to `ExponentialLR`.
+        lr_scheduler_disc_params (dict):
+            Parameters for the discriminator learning rate scheduler. Defaults to `{"gamma": 0.999, "last_epoch": -1}`.
+    """

    model: str = "parallel_wavegan"

--- a/TTS/vocoder/configs/shared_configs.py
+++ b/TTS/vocoder/configs/shared_configs.py
@ -7,7 +7,34 @@ from TTS.config import BaseAudioConfig, BaseTrainingConfig

@dataclass
 class BaseVocoderConfig(BaseTrainingConfig):
-    """Shared parameters among all the vocoder models."""
+    """Shared parameters among all the vocoder models.
+    Args:
+        audio (BaseAudioConfig):
+            Audio processor config instance. Defaults to `BaseAudioConfig()`.
+        use_noise_augment (bool):
+            Augment the input audio with random noise. Defaults to False.
+        eval_split_size (int):
+            Number of instances used for evaluation. Defaults to 10.
+        data_path (str):
+            Root path of the training data. All the audio files found recursively from this root path are used for
+            training. Defaults to MISSING.
+        feature_path (str):
+            Root path to the precomputed feature files. Defaults to None.
+        seq_len (int):
+            Length of the waveform segments used for training. Defaults to MISSING.
+        pad_short (int):
+            Extra padding for the waveforms shorter than `seq_len`. Defaults to 0.
+        conv_pad (int):
+            Extra padding for the feature frames against convolution of the edge frames.
+            Defaults to 0.
+        use_cache (bool):
+            enable / disable in memory caching of the computed features. If the RAM is not enough, it may cause OOM.
+            Defaults to False.
+        epochs (int):
+            Number of training epochs. Defaults to 10000.
+        wd (float):
+            Weight decay.
+    """

    audio: BaseAudioConfig = field(default_factory=BaseAudioConfig)
    # dataloading
@ -19,7 +46,6 @@ class BaseVocoderConfig(BaseTrainingConfig):
    seq_len: int = MISSING  # signal length used in training.
    pad_short: int = 0  # additional padding for short wavs
    conv_pad: int = 0  # additional padding against convolutions applied to spectrograms
-    use_noise_augment: bool = False  # add noise to the audio signal for augmentation
    use_cache: bool = False  # use in memory cache to keep the computed features. This might cause OOM.
    # OPTIMIZER
    epochs: int = 10000  # total number of epochs to train.
@ -28,7 +54,78 @@ class BaseVocoderConfig(BaseTrainingConfig):

@dataclass
 class BaseGANVocoderConfig(BaseVocoderConfig):
-    """Common config interface for all the GAN based vocoder models."""
+    """Base config class used among all the GAN based vocoders.
+    Args:
+        use_stft_loss (bool):
+            enable / disable the use of STFT loss. Defaults to True.
+        use_subband_stft_loss (bool):
+            enable / disable the use of Subband STFT loss. Defaults to True.
+        use_mse_gan_loss (bool):
+            enable / disable the use of Mean Squared Error based GAN loss. Defaults to True.
+        use_hinge_gan_loss (bool):
+            enable / disable the use of Hinge GAN loss. Defaults to True.
+        use_feat_match_loss (bool):
+            enable / disable feature matching loss. Defaults to True.
+        use_l1_spec_loss (bool):
+            enable / disable L1 spectrogram loss. Defaults to True.
+        stft_loss_weight (float):
+            Loss weight that multiplies the computed loss value. Defaults to 0.
+        subband_stft_loss_weight (float):
+            Loss weight that multiplies the computed loss value. Defaults to 0.
+        mse_G_loss_weight (float):
+            Loss weight that multiplies the computed loss value. Defaults to 1.
+        hinge_G_loss_weight (float):
+            Loss weight that multiplies the computed loss value. Defaults to 0.
+        feat_match_loss_weight (float):
+            Loss weight that multiplies the computed loss value. Defaults to 100.
+        l1_spec_loss_weight (float):
+            Loss weight that multiplies the computed loss value. Defaults to 45.
+        stft_loss_params (dict):
+            Parameters for the STFT loss. Defaults to `{"n_ffts": [1024, 2048, 512], "hop_lengths": [120, 240, 50], "win_lengths": [600, 1200, 240]}`.
+        l1_spec_loss_params (dict):
+            Parameters for the L1 spectrogram loss. Defaults to
+            `{
+                "use_mel": True,
+                "sample_rate": 22050,
+                "n_fft": 1024,
+                "hop_length": 256,
+                "win_length": 1024,
+                "n_mels": 80,
+                "mel_fmin": 0.0,
+                "mel_fmax": None,
+            }`
+        target_loss (str):
+            Target loss name that defines the quality of the model. Defaults to `avg_G_loss`.
+        gen_clip_grad (float):
+            Gradient clipping threshold for the generator model. Any value less than 0 disables clipping.
+            Defaults to -1.
+        disc_clip_grad (float):
+            Gradient clipping threshold for the discriminator model. Any value less than 0 disables clipping.
+            Defaults to -1.
+        lr_gen (float):
+            Generator model initial learning rate. Defaults to 0.0002.
+        lr_disc (float):
+            Discriminator model initial learning rate. Defaults to 0.0002.
+        optimizer (torch.optim.Optimizer):
+            Optimizer used for the training. Defaults to `AdamW`.
+        optimizer_params (dict):
+            Optimizer kwargs. Defaults to `{"betas": [0.8, 0.99], "weight_decay": 0.0}`
+        lr_scheduler_gen (torch.optim.Scheduler):
+            Learning rate scheduler for the generator. Defaults to `ExponentialLR`.
+        lr_scheduler_gen_params (dict):
+            Parameters for the generator learning rate scheduler. Defaults to `{"gamma": 0.999, "last_epoch": -1}`.
+        lr_scheduler_disc (torch.optim.Scheduler):
+            Learning rate scheduler for the discriminator. Defaults to `ExponentialLR`.
+        lr_scheduler_disc_params (dict):
+            Parameters for the discriminator learning rate scheduler. Defaults to `{"gamma": 0.999, "last_epoch": -1}`.
+        use_pqmf (bool):
+            enable / disable PQMF for subband approximation at training. Defaults to False.
+        steps_to_start_discriminator (int):
+            Number of steps required to start training the discriminator. Defaults to 0.
+        diff_samples_for_G_and_D (bool):
+            enable / disable use of different training samples for the generator and the discriminator iterations.
+            Enabling it results in slower iterations but faster convergence in some cases. Defaults to False.
+    """

    # LOSS PARAMETERS
    use_stft_loss: bool = True
@ -43,7 +140,7 @@ class BaseGANVocoderConfig(BaseVocoderConfig):
    subband_stft_loss_weight: float = 0
    mse_G_loss_weight: float = 1
    hinge_G_loss_weight: float = 0
-    feat_match_loss_weight: float = 10
+    feat_match_loss_weight: float = 100
    l1_spec_loss_weight: float = 45

    stft_loss_params: dict = field(
--- a/TTS/vocoder/configs/wavegrad_config.py
+++ b/TTS/vocoder/configs/wavegrad_config.py
@ -1,12 +1,71 @@
 from dataclasses import dataclass, field

-from .shared_configs import BaseVocoderConfig
+from TTS.vocoder.configs.shared_configs import BaseVocoderConfig


@dataclass
 class WavegradConfig(BaseVocoderConfig):
-    """Defines parameters for Wavernn vocoder."""
+    """Defines parameters for WaveGrad vocoder.
+    Example:

+        >>> from TTS.vocoder.configs import WavegradConfig
+        >>> config = WavegradConfig()
+
+    Args:
+        model (str):
+            Model name used for selecting the right model at initialization. Defaults to `wavegrad`.
+        generator_model (str): One of the generators from `TTS.vocoder.models.*`. Every other non-GAN vocoder model is
+            considered as a generator too. Defaults to `wavegrad`.
+        model_params (dict):
+            WaveGrad kwargs. Defaults to
+            `
+            {
+                "use_weight_norm": True,
+                "y_conv_channels": 32,
+                "x_conv_channels": 768,
+                "ublock_out_channels": [512, 512, 256, 128, 128],
+                "dblock_out_channels": [128, 128, 256, 512],
+                "upsample_factors": [4, 4, 4, 2, 2],
+                "upsample_dilations": [[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 4, 8], [1, 2, 4, 8], [1, 2, 4, 8]],
+            }
+            `
+        target_loss (str):
+            Target loss name that defines the quality of the model. Defaults to `avg_wavegrad_loss`.
+        epochs (int):
+            Number of epochs to train the model. Defaults to 10000.
+        batch_size (int):
+            Batch size used at training. Larger values use more memory. Defaults to 96.
+        seq_len (int):
+            Audio segment length used at training. Larger values use more memory. Defaults to 6144.
+        use_cache (bool):
+            enable / disable in memory caching of the computed features. It can cause OOM error if the system RAM is
+            not large enough. Defaults to True.
+        mixed_precision (bool):
+            enable / disable mixed precision training. Default is True.
+        eval_split_size (int):
+            Number of samples used for evaluation. Defaults to 50.
+        train_noise_schedule (dict):
+            Training noise schedule. Defaults to
+            `{"min_val": 1e-6, "max_val": 1e-2, "num_steps": 1000}`
+        test_noise_schedule (dict):
+            Inference noise schedule. For a better performance, you may need to use `bin/tune_wavegrad.py` to find a
+            better schedule. Defaults to
+            `
+            {
+                "min_val": 1e-6,
+                "max_val": 1e-2,
+                "num_steps": 50,
+            }
+            `
+        grad_clip (float):
+            Gradient clipping threshold. If <= 0.0, no clipping is applied. Defaults to 1.0
+        lr (float):
+            Initial learning rate. Defaults to 1e-4.
+        lr_scheduler (str):
+            One of the learning rate schedulers from `torch.optim.lr_scheduler.*`. Defaults to `MultiStepLR`.
+        lr_scheduler_params (dict):
+            kwargs for the scheduler. Defaults to `{"gamma": 0.5, "milestones": [100000, 200000, 300000, 400000, 500000, 600000]}`
+    """
    model: str = "wavegrad"
    # Model specific params
    generator_model: str = "wavegrad"
@ -28,7 +87,6 @@ class WavegradConfig(BaseVocoderConfig):
    batch_size: int = 96
    seq_len: int = 6144
    use_cache: bool = True
-    steps_to_start_discriminator: int = 200000
    mixed_precision: bool = True
    eval_split_size: int = 50

--- a/TTS/vocoder/configs/wavernn_config.py
+++ b/TTS/vocoder/configs/wavernn_config.py
@ -1,11 +1,77 @@
 from dataclasses import dataclass, field

-from .shared_configs import BaseVocoderConfig
+from TTS.vocoder.configs.shared_configs import BaseVocoderConfig


@dataclass
 class WavernnConfig(BaseVocoderConfig):
-    """Defines parameters for Wavernn vocoder."""
+    """Defines parameters for Wavernn vocoder.
+    Example:
+
+        >>> from TTS.vocoder.configs import WavernnConfig
+        >>> config = WavernnConfig()
+
+    Args:
+        model (str):
+            Model name used for selecting the right model at initialization. Defaults to `wavernn`.
+        mode (str):
+            Output mode of the WaveRNN vocoder. `mold` for Mixture of Logistic Distribution, `gauss` for a single
+            Gaussian Distribution and `bits` for quantized bits as the model's output.
+        mulaw (bool):
+            enable / disable the use of Mulaw quantization for training. Only applicable if `mode == 'bits'`. Defaults
+            to `True`.
+        generator_model (str):
+            One of the generators from `TTS.vocoder.models.*`. Every other non-GAN vocoder model is
+            considered as a generator too. Defaults to `WaveRNN`.
+        wavernn_model_params (dict):
+            kwargs for the WaveRNN model. Defaults to
+            `{
+                "rnn_dims": 512,
+                "fc_dims": 512,
+                "compute_dims": 128,
+                "res_out_dims": 128,
+                "num_res_blocks": 10,
+                "use_aux_net": True,
+                "use_upsample_net": True,
+                "upsample_factors": [4, 8, 8]
+            }`
+        batched (bool):
+            enable / disable the batched inference. It speeds up the inference by splitting the input into segments and
+            processing the segments in a batch. Then it merges the outputs with a certain overlap and smoothing. If
+            you set it False, without CUDA, it is too slow to be practical. Defaults to True.
+        target_samples (int):
+            Size of the segments in batched mode. Defaults to 11000.
+        overlap_samples (int):
+            Size of the overlap between consecutive segments. Defaults to 550.
+        batch_size (int):
+            Batch size used at training. Larger values use more memory. Defaults to 256.
+        seq_len (int):
+            Audio segment length used at training. Larger values use more memory. Defaults to 1280.
+        padding (int):
+            Padding applied to the input feature frames against the convolution layers of the feature network.
+            Defaults to 2.
+        use_noise_augment (bool):
+            enable / disable random noise added to the input waveform. The noise is added after computing the
+            features. Defaults to False.
+        use_cache (bool):
+            enable / disable in memory caching of the computed features. It can cause OOM error if the system RAM is
+            not large enough. Defaults to True.
+        mixed_precision (bool):
+            enable / disable mixed precision training. Default is True.
+        eval_split_size (int):
+            Number of samples used for evaluation. Defaults to 50.
+        test_every_epochs (int):
+            Number of epochs waited to run the next evaluation. Since inference takes some time, it is better to
+            wait some number of epochs not to waste training time. Defaults to 10.
+        grad_clip (float):
+            Gradient clipping threshold. If <= 0.0, no clipping is applied. Defaults to 4.0
+        lr (float):
+            Initial learning rate. Defaults to 1e-4.
+        lr_scheduler (str):
+            One of the learning rate schedulers from `torch.optim.lr_scheduler.*`. Defaults to `MultiStepLR`.
+        lr_scheduler_params (dict):
+            kwargs for the scheduler. Defaults to `{"gamma": 0.5, "milestones": [200000, 400000, 600000]}`
+    """

    model: str = "wavernn"

@ -38,7 +104,6 @@ class WavernnConfig(BaseVocoderConfig):
    padding: int = 2
    use_noise_augment: bool = False
    use_cache: bool = True
-    steps_to_start_discriminator: int = 200000
    mixed_precision: bool = True
    eval_split_size: int = 50
    test_every_epochs: int = 10  # number of epochs to wait until the next test run (synthesizing a full audio clip).