From 786170fe7d5da036bbb44fa269f9e20865a9354f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?= <egolge@coqui.ai>
Date: Fri, 18 Jun 2021 13:28:58 +0200
Subject: [PATCH] Update tts model configs

---
 TTS/tts/configs/align_tts_config.py     |  21 ++---
 TTS/tts/configs/glow_tts_config.py      |  84 ++++++++++++++++---
 TTS/tts/configs/shared_configs.py       |  12 ++-
 TTS/tts/configs/speedy_speech_config.py |  56 ++-----------
 TTS/tts/configs/tacotron2_config.py     | 105 +-----------------------
 TTS/tts/configs/tacotron_config.py      |  39 +++++++--
 6 files changed, 133 insertions(+), 184 deletions(-)

diff --git a/TTS/tts/configs/align_tts_config.py b/TTS/tts/configs/align_tts_config.py
index 56622741..837cd519 100644
--- a/TTS/tts/configs/align_tts_config.py
+++ b/TTS/tts/configs/align_tts_config.py
@@ -2,6 +2,7 @@ from dataclasses import dataclass, field
 from typing import List
 
 from TTS.tts.configs.shared_configs import BaseTTSConfig
+from TTS.tts.models.align_tts import AlignTTSArgs
 
 
 @dataclass
@@ -49,9 +50,9 @@ class AlignTTSConfig(BaseTTSConfig):
         use_speaker_embedding (bool):
             enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
             in the multi-speaker mode. Defaults to False.
-        use_external_speaker_embedding_file (bool):
+        use_d_vector_file (bool):
             enable /disable using external speaker embeddings in place of the learned embeddings. Defaults to False.
-        external_speaker_embedding_file (str):
+        d_vector_file (str):
             Path to the file including pre-computed speaker embeddings. Defaults to None.
         noam_schedule (bool):
             enable / disable the use of Noam LR scheduler. Defaults to False.
@@ -68,17 +69,7 @@ class AlignTTSConfig(BaseTTSConfig):
 
     model: str = "align_tts"
     # model specific params
-    positional_encoding: bool = True
-    hidden_channels_dp: int = 256
-    hidden_channels: int = 256
-    encoder_type: str = "fftransformer"
-    encoder_params: dict = field(
-        default_factory=lambda: {"hidden_channels_ffn": 1024, "num_heads": 2, "num_layers": 6, "dropout_p": 0.1}
-    )
-    decoder_type: str = "fftransformer"
-    decoder_params: dict = field(
-        default_factory=lambda: {"hidden_channels_ffn": 1024, "num_heads": 2, "num_layers": 6, "dropout_p": 0.1}
-    )
+    model_args: AlignTTSArgs = field(default_factory=AlignTTSArgs)
     phase_start_steps: List[int] = None
 
     ssim_alpha: float = 1.0
@@ -88,8 +79,8 @@ class AlignTTSConfig(BaseTTSConfig):
 
     # multi-speaker settings
     use_speaker_embedding: bool = False
-    use_external_speaker_embedding_file: bool = False
-    external_speaker_embedding_file: str = False
+    use_d_vector_file: bool = False
+    d_vector_file: str = False
 
     # optimizer parameters
     optimizer: str = "Adam"
diff --git a/TTS/tts/configs/glow_tts_config.py b/TTS/tts/configs/glow_tts_config.py
index 925854c9..19b7abd9 100644
--- a/TTS/tts/configs/glow_tts_config.py
+++ b/TTS/tts/configs/glow_tts_config.py
@@ -23,13 +23,49 @@ class GlowTTSConfig(BaseTTSConfig):
             Defaults to `{"kernel_size": 3, "dropout_p": 0.1, "num_layers": 6, "num_heads": 2, "hidden_channels_ffn": 768}`
         use_encoder_prenet (bool):
             enable / disable the use of a prenet for the encoder. Defaults to True.
-        hidden_channels_encoder (int):
+        hidden_channels_enc (int):
             Number of base hidden channels used by the encoder network. It defines the input and the output channel sizes,
             and for some encoder types internal hidden channels sizes too. Defaults to 192.
-        hidden_channels_decoder (int):
+        hidden_channels_dec (int):
             Number of base hidden channels used by the decoder WaveNet network. Defaults to 192 as in the original work.
-        hidden_channels_duration_predictor (int):
+        hidden_channels_dp (int):
             Number of layer channels of the duration predictor network. Defaults to 256 as in the original work.
+        mean_only (bool):
+            If true predict only the mean values by the decoder flow. Defaults to True.
+        out_channels (int):
+            Number of channels of the model output tensor. Defaults to 80.
+        num_flow_blocks_dec (int):
+            Number of decoder blocks. Defaults to 12.
+        inference_noise_scale (float):
+            Noise scale used at inference. Defaults to 0.33.
+        kernel_size_dec (int):
+            Decoder kernel size. Defaults to 5.
+        dilation_rate (int):
+            Rate to increase dilation by each layer in a decoder block. Defaults to 5.
+        num_block_layers (int):
+            Number of decoder layers in each decoder block.  Defaults to 4.
+        dropout_p_dec (float):
+            Dropout rate for decoder. Defaults to 0.05.
+        num_speakers (int):
+            Number of speakers, which defines the size of the speaker embedding layer. Defaults to 0.
+        c_in_channels (int):
+            Number of speaker embedding channels. It is set to 512 if embeddings are learned. Defaults to 0.
+        num_splits (int):
+            Number of split levels in the invertible conv1x1 operation. Defaults to 4.
+        num_squeeze (int):
+            Number of squeeze levels. When squeezing channels increases and time steps reduces by the factor
+            'num_squeeze'. Defaults to 1.
+        sigmoid_scale (bool):
+            enable/disable sigmoid scaling in decoder. Defaults to False.
+        mean_only (bool):
+            If True, encoder only computes mean value and uses constant variance for each time step. Defaults to true.
+        encoder_type (str):
+            Encoder module type. Possible values are `["rel_pos_transformer", "gated_conv", "residual_conv_bn", "time_depth_separable"]`.
+            Check `TTS.tts.layers.glow_tts.encoder` for more details. Defaults to `rel_pos_transformer` as in the original paper.
+        encoder_params (dict):
+            Encoder module parameters. Defaults to the `rel_pos_transformer` settings given by the field's default factory.
+        d_vector_dim (int):
+            Channels of external speaker embedding vectors. Defaults to 0.
         data_dep_init_steps (int):
             Number of steps used for computing normalization parameters at the beginning of the training. GlowTTS uses
             Activation Normalization that pre-computes normalization stats at the beginning and use the same values
@@ -41,9 +77,9 @@ class GlowTTSConfig(BaseTTSConfig):
         use_speaker_embedding (bool):
             enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
             in the multi-speaker mode. Defaults to False.
-        use_external_speaker_embedding_file (bool):
+        use_d_vector_file (bool):
             enable /disable using external speaker embeddings in place of the learned embeddings. Defaults to False.
-        external_speaker_embedding_file (str):
+        d_vector_file (str):
             Path to the file including pre-computed speaker embeddings. Defaults to None.
         noam_schedule (bool):
             enable / disable the use of Noam LR scheduler. Defaults to False.
@@ -62,6 +98,7 @@ class GlowTTSConfig(BaseTTSConfig):
     model: str = "glow_tts"
 
     # model params
+    num_chars: int = None
     encoder_type: str = "rel_pos_transformer"
     encoder_params: dict = field(
         default_factory=lambda: {
@@ -73,9 +110,36 @@ class GlowTTSConfig(BaseTTSConfig):
         }
     )
     use_encoder_prenet: bool = True
-    hidden_channels_encoder: int = 192
-    hidden_channels_decoder: int = 192
-    hidden_channels_duration_predictor: int = 256
+    hidden_channels_enc: int = 192
+    hidden_channels_dec: int = 192
+    hidden_channels_dp: int = 256
+    dropout_p_dp: float = 0.1
+    dropout_p_dec: float = 0.05
+    mean_only: bool = True
+    out_channels: int = 80
+    num_flow_blocks_dec: int = 12
+    inference_noise_scale: float = 0.33
+    kernel_size_dec: int = 5
+    dilation_rate: int = 5
+    num_block_layers: int = 4
+    num_speakers: int = 0
+    c_in_channels: int = 0
+    num_splits: int = 4
+    num_squeeze: int = 1
+    sigmoid_scale: bool = False
+    mean_only: bool = False
+    encoder_type: str = "rel_pos_transformer"
+    encoder_params: dict = field(
+        default_factory=lambda: {
+            "kernel_size": 3,
+            "dropout_p": 0.1,
+            "num_layers": 6,
+            "num_heads": 2,
+            "hidden_channels_ffn": 768,
+            "input_length": None,
+        }
+    )
+    d_vector_dim: int = 0
 
     # training params
     data_dep_init_steps: int = 10
@@ -86,8 +150,8 @@ class GlowTTSConfig(BaseTTSConfig):
 
     # multi-speaker settings
     use_speaker_embedding: bool = False
-    use_external_speaker_embedding_file: bool = False
-    external_speaker_embedding_file: str = False
+    use_d_vector_file: bool = False
+    d_vector_file: str = False
 
     # optimizer parameters
     optimizer: str = "RAdam"
diff --git a/TTS/tts/configs/shared_configs.py b/TTS/tts/configs/shared_configs.py
index d02e58ae..4b916a17 100644
--- a/TTS/tts/configs/shared_configs.py
+++ b/TTS/tts/configs/shared_configs.py
@@ -1,7 +1,7 @@
 from dataclasses import asdict, dataclass, field
 from typing import List
 
-from coqpit import MISSING, Coqpit, check_argument
+from coqpit import Coqpit, check_argument
 
 from TTS.config import BaseAudioConfig, BaseDatasetConfig, BaseTrainingConfig
 
@@ -153,7 +153,7 @@ class BaseTTSConfig(BaseTrainingConfig):
     use_espeak_phonemes: bool = True
     phoneme_language: str = None
     compute_input_seq_cache: bool = False
-    text_cleaner: str = MISSING
+    text_cleaner: str = None
     enable_eos_bos_chars: bool = False
     test_sentences_file: str = ""
     phoneme_cache_path: str = None
@@ -171,10 +171,14 @@ class BaseTTSConfig(BaseTrainingConfig):
     # dataset
     datasets: List[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()])
     # optimizer
-    optimizer: str = MISSING
-    optimizer_params: dict = MISSING
+    optimizer: str = None
+    optimizer_params: dict = None
     # scheduler
     lr_scheduler: str = ""
     lr_scheduler_params: dict = field(default_factory=lambda: {})
     # testing
     test_sentences: List[str] = field(default_factory=lambda: [])
+    # multi-speaker
+    use_speaker_embedding: bool = False
+    use_d_vector_file: bool = False
+    d_vector_dim: int = 0
diff --git a/TTS/tts/configs/speedy_speech_config.py b/TTS/tts/configs/speedy_speech_config.py
index d76d94e2..b2641ab5 100644
--- a/TTS/tts/configs/speedy_speech_config.py
+++ b/TTS/tts/configs/speedy_speech_config.py
@@ -2,6 +2,7 @@ from dataclasses import dataclass, field
 from typing import List
 
 from TTS.tts.configs.shared_configs import BaseTTSConfig
+from TTS.tts.models.speedy_speech import SpeedySpeechArgs
 
 
 @dataclass
@@ -16,30 +17,8 @@ class SpeedySpeechConfig(BaseTTSConfig):
     Args:
         model (str):
             Model name used for selecting the right model at initialization. Defaults to `speedy_speech`.
-        positional_encoding (bool):
-            enable / disable positional encoding applied to the encoder output. Defaults to True.
-        hidden_channels (int):
-            Base number of hidden channels. Defines all the layers expect ones defined by the specific encoder or decoder
-            parameters. Defaults to 128.
-        encoder_type (str):
-            Type of the encoder used by the model. Look at `TTS.tts.layers.feed_forward.encoder` for more details.
-            Defaults to `residual_conv_bn`.
-        encoder_params (dict):
-            Parameters used to define the encoder network. Look at `TTS.tts.layers.feed_forward.encoder` for more details.
-            Defaults to `{"kernel_size": 4, "dilations": [1, 2, 4, 1, 2, 4, 1, 2, 4, 1, 2, 4, 1], "num_conv_blocks": 2, "num_res_blocks": 13}`
-        decoder_type (str):
-            Type of the decoder used by the model. Look at `TTS.tts.layers.feed_forward.decoder` for more details.
-            Defaults to `residual_conv_bn`.
-        decoder_params (dict):
-            Parameters used to define the decoder network. Look at `TTS.tts.layers.feed_forward.decoder` for more details.
-            Defaults to `{"kernel_size": 4, "dilations": [1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1], "num_conv_blocks": 2, "num_res_blocks": 17}`
-        hidden_channels_encoder (int):
-            Number of base hidden channels used by the encoder network. It defines the input and the output channel sizes,
-            and for some encoder types internal hidden channels sizes too. Defaults to 192.
-        hidden_channels_decoder (int):
-            Number of base hidden channels used by the decoder WaveNet network. Defaults to 192 as in the original work.
-        hidden_channels_duration_predictor (int):
-            Number of layer channels of the duration predictor network. Defaults to 256 as in the original work.
+        model_args (Coqpit):
+            Model class arguments. Check `SpeedySpeechArgs` for more details. Defaults to `SpeedySpeechArgs()`.
         data_dep_init_steps (int):
             Number of steps used for computing normalization parameters at the beginning of the training. GlowTTS uses
             Activation Normalization that pre-computes normalization stats at the beginning and use the same values
@@ -47,9 +26,9 @@ class SpeedySpeechConfig(BaseTTSConfig):
         use_speaker_embedding (bool):
             enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
             in the multi-speaker mode. Defaults to False.
-        use_external_speaker_embedding_file (bool):
+        use_d_vector_file (bool):
             enable /disable using external speaker embeddings in place of the learned embeddings. Defaults to False.
-        external_speaker_embedding_file (str):
+        d_vector_file (str):
             Path to the file including pre-computed speaker embeddings. Defaults to None.
         noam_schedule (bool):
             enable / disable the use of Noam LR scheduler. Defaults to False.
@@ -73,31 +52,12 @@ class SpeedySpeechConfig(BaseTTSConfig):
 
     model: str = "speedy_speech"
     # model specific params
-    positional_encoding: bool = True
-    hidden_channels: int = 128
-    encoder_type: str = "residual_conv_bn"
-    encoder_params: dict = field(
-        default_factory=lambda: {
-            "kernel_size": 4,
-            "dilations": [1, 2, 4, 1, 2, 4, 1, 2, 4, 1, 2, 4, 1],
-            "num_conv_blocks": 2,
-            "num_res_blocks": 13,
-        }
-    )
-    decoder_type: str = "residual_conv_bn"
-    decoder_params: dict = field(
-        default_factory=lambda: {
-            "kernel_size": 4,
-            "dilations": [1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1],
-            "num_conv_blocks": 2,
-            "num_res_blocks": 17,
-        }
-    )
+    model_args: SpeedySpeechArgs = field(default_factory=SpeedySpeechArgs)
 
     # multi-speaker settings
     use_speaker_embedding: bool = False
-    use_external_speaker_embedding_file: bool = False
-    external_speaker_embedding_file: str = False
+    use_d_vector_file: bool = False
+    d_vector_file: str = False
 
     # optimizer parameters
     optimizer: str = "RAdam"
diff --git a/TTS/tts/configs/tacotron2_config.py b/TTS/tts/configs/tacotron2_config.py
index ea66fae8..b622e640 100644
--- a/TTS/tts/configs/tacotron2_config.py
+++ b/TTS/tts/configs/tacotron2_config.py
@@ -12,107 +12,10 @@ class Tacotron2Config(TacotronConfig):
         >>> from TTS.tts.configs import Tacotron2Config
         >>> config = Tacotron2Config()
 
-    Args:
-        model (str):
-            Model name used to select the right model class to initilize. Defaults to `Tacotron2`.
-        use_gst (bool):
-            enable / disable the use of Global Style Token modules. Defaults to False.
-        gst (GSTConfig):
-            Instance of `GSTConfig` class.
-        gst_style_input (str):
-            Path to the wav file used at inference to set the speech style through GST. If `GST` is enabled and
-            this is not defined, the model uses a zero vector as an input. Defaults to None.
-        r (int):
-            Number of output frames that the decoder computed per iteration. Larger values makes training and inference
-            faster but reduces the quality of the output frames. This needs to be tuned considering your own needs.
-            Defaults to 1.
-        gradual_trainin (List[List]):
-            Parameters for the gradual training schedule. It is in the form `[[a, b, c], [d ,e ,f] ..]` where `a` is
-            the step number to start using the rest of the values, `b` is the `r` value and `c` is the batch size.
-            If sets None, no gradual training is used. Defaults to None.
-        memory_size (int):
-            Defines the number of previous frames used by the Prenet. If set to < 0, then it uses only the last frame.
-            Defaults to -1.
-        prenet_type (str):
-            `original` or `bn`. `original` sets the default Prenet and `bn` uses Batch Normalization version of the
-            Prenet. Defaults to `original`.
-        prenet_dropout (bool):
-            enables / disables the use of dropout in the Prenet. Defaults to True.
-        prenet_dropout_at_inference (bool):
-            enable / disable the use of dropout in the Prenet at the inference time. Defaults to False.
-        stopnet (bool):
-            enable /disable the Stopnet that predicts the end of the decoder sequence. Defaults to True.
-        stopnet_pos_weight (float):
-            Weight that is applied to over-weight positive instances in the Stopnet loss. Use larger values with
-            datasets with longer sentences. Defaults to 10.
-        separate_stopnet (bool):
-            Use a distinct Stopnet which is trained separately from the rest of the model. Defaults to True.
-        attention_type (str):
-            attention type. Check ```TTS.tts.layers.attentions.init_attn```. Defaults to 'original'.
-        attention_heads (int):
-            Number of attention heads for GMM attention. Defaults to 5.
-        windowing (bool):
-            It especially useful at inference to keep attention alignment diagonal. Defaults to False.
-        use_forward_attn (bool):
-            It is only valid if ```attn_type``` is ```original```.  Defaults to False.
-        forward_attn_mask (bool):
-            enable/disable extra masking over forward attention. It is useful at inference to prevent
-            possible attention failures. Defaults to False.
-        transition_agent (bool):
-            enable/disable transition agent in forward attention. Defaults to False.
-        location_attn (bool):
-            enable/disable location sensitive attention as in the original Tacotron2 paper.
-            It is only valid if ```attn_type``` is ```original```. Defaults to True.
-        bidirectional_decoder (bool):
-            enable/disable bidirectional decoding. Defaults to False.
-        double_decoder_consistency (bool):
-            enable/disable double decoder consistency. Defaults to False.
-        ddc_r (int):
-            reduction rate used by the coarse decoder when `double_decoder_consistency` is in use. Set this
-            as a multiple of the `r` value. Defaults to 6.
-        use_speaker_embedding (bool):
-            enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
-            in the multi-speaker mode. Defaults to False.
-        use_external_speaker_embedding_file (bool):
-            enable /disable using external speaker embeddings in place of the learned embeddings. Defaults to False.
-        external_speaker_embedding_file (str):
-            Path to the file including pre-computed speaker embeddings. Defaults to None.
-        noam_schedule (bool):
-            enable / disable the use of Noam LR scheduler. Defaults to False.
-        warmup_steps (int):
-            Number of warm-up steps for the Noam scheduler. Defaults 4000.
-        lr (float):
-            Initial learning rate. Defaults to `1e-4`.
-        wd (float):
-            Weight decay coefficient. Defaults to `1e-6`.
-        grad_clip (float):
-            Gradient clipping threshold. Defaults to `5`.
-        seq_len_notm (bool):
-            enable / disable the sequnce length normalization in the loss functions. If set True, loss of a sample
-            is divided by the sequence length. Defaults to False.
-        loss_masking (bool):
-            enable / disable masking the paddings of the samples in loss computation. Defaults to True.
-        decoder_loss_alpha (float):
-            Weight for the decoder loss of the Tacotron model. If set less than or equal to zero, it disables the
-            corresponding loss function. Defaults to 0.25
-        postnet_loss_alpha (float):
-            Weight for the postnet loss of the Tacotron model. If set less than or equal to zero, it disables the
-            corresponding loss function. Defaults to 0.25
-        postnet_diff_spec_alpha (float):
-            Weight for the postnet differential loss of the Tacotron model. If set less than or equal to zero, it disables the
-            corresponding loss function. Defaults to 0.25
-        decoder_diff_spec_alpha (float):
-            Weight for the decoder differential loss of the Tacotron model. If set less than or equal to zero, it disables the
-            corresponding loss function. Defaults to 0.25
-        decoder_ssim_alpha (float):
-            Weight for the decoder SSIM loss of the Tacotron model. If set less than or equal to zero, it disables the
-            corresponding loss function. Defaults to 0.25
-        postnet_ssim_alpha (float):
-            Weight for the postnet SSIM loss of the Tacotron model. If set less than or equal to zero, it disables the
-            corresponding loss function. Defaults to 0.25
-        ga_alpha (float):
-            Weight for the guided attention loss. If set less than or equal to zero, it disables the corresponding loss
-            function. Defaults to 5.
+    Check `TacotronConfig` for argument descriptions.
     """
 
     model: str = "tacotron2"
+    out_channels: int = 80
+    encoder_in_features: int = 512
+    decoder_in_features: int = 512
diff --git a/TTS/tts/configs/tacotron_config.py b/TTS/tts/configs/tacotron_config.py
index 2b67901c..89fb8d81 100644
--- a/TTS/tts/configs/tacotron_config.py
+++ b/TTS/tts/configs/tacotron_config.py
@@ -23,6 +23,10 @@ class TacotronConfig(BaseTTSConfig):
         gst_style_input (str):
             Path to the wav file used at inference to set the speech style through GST. If `GST` is enabled and
             this is not defined, the model uses a zero vector as an input. Defaults to None.
+        num_chars (int):
+            Number of characters used by the model. It must be defined before initializing the model. Defaults to 0.
+        num_speakers (int):
+            Number of speakers for multi-speaker models. Defaults to 1.
         r (int):
             Initial number of output frames that the decoder computed per iteration. Larger values makes training and inference
             faster but reduces the quality of the output frames. This must be equal to the largest `r` value used in
@@ -47,7 +51,13 @@ class TacotronConfig(BaseTTSConfig):
             Weight that is applied to over-weight positive instances in the Stopnet loss. Use larger values with
             datasets with longer sentences. Defaults to 10.
         max_decoder_steps (int):
-            Max number of steps allowed for the decoder. Defaults to 10000.
+            Max number of steps allowed for the decoder. Defaults to 500.
+        encoder_in_features (int):
+            Channels of encoder input and character embedding tensors. Defaults to 256.
+        decoder_in_features (int):
+            Channels of decoder input and encoder output tensors. Defaults to 256.
+        out_channels (int):
+            Channels of the final model output. It must match the spectrogram size. Defaults to 513.
         separate_stopnet (bool):
             Use a distinct Stopnet which is trained separately from the rest of the model. Defaults to True.
         attention_type (str):
@@ -76,9 +86,9 @@ class TacotronConfig(BaseTTSConfig):
         use_speaker_embedding (bool):
             enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
             in the multi-speaker mode. Defaults to False.
-        use_external_speaker_embedding_file (bool):
+        use_d_vector_file (bool):
             enable /disable using external speaker embeddings in place of the learned embeddings. Defaults to False.
-        external_speaker_embedding_file (str):
+        d_vector_file (str):
             Path to the file including pre-computed speaker embeddings. Defaults to None.
         optimizer (str):
             Optimizer used for the training. Set one from `torch.optim.Optimizer` or `TTS.utils.training`.
@@ -111,6 +121,7 @@ class TacotronConfig(BaseTTSConfig):
             Weight for the postnet differential loss of the Tacotron model. If set less than or equal to zero, it disables the
             corresponding loss function. Defaults to 0.25
         decoder_diff_spec_alpha (float):
+
             Weight for the decoder differential loss of the Tacotron model. If set less than or equal to zero, it disables the
             corresponding loss function. Defaults to 0.25
         decoder_ssim_alpha (float):
@@ -125,11 +136,14 @@ class TacotronConfig(BaseTTSConfig):
     """
 
     model: str = "tacotron"
+    # model_params: TacotronArgs = field(default_factory=lambda: TacotronArgs())
     use_gst: bool = False
     gst: GSTConfig = None
     gst_style_input: str = None
 
     # model specific params
+    num_speakers: int = 1
+    num_chars: int = 0
     r: int = 2
     gradual_training: List[List[int]] = None
     memory_size: int = -1
@@ -139,12 +153,17 @@ class TacotronConfig(BaseTTSConfig):
     stopnet: bool = True
     separate_stopnet: bool = True
     stopnet_pos_weight: float = 10.0
-    max_decoder_steps: int = 10000
+    max_decoder_steps: int = 500
+    encoder_in_features: int = 256
+    decoder_in_features: int = 256
+    decoder_output_dim: int = 80
+    out_channels: int = 513
 
     # attention layers
     attention_type: str = "original"
     attention_heads: int = None
     attention_norm: str = "sigmoid"
+    attention_win: bool = False
     windowing: bool = False
     use_forward_attn: bool = False
     forward_attn_mask: bool = False
@@ -158,8 +177,10 @@ class TacotronConfig(BaseTTSConfig):
 
     # multi-speaker settings
     use_speaker_embedding: bool = False
-    use_external_speaker_embedding_file: bool = False
-    external_speaker_embedding_file: str = False
+    speaker_embedding_dim: int = 512
+    use_d_vector_file: bool = False
+    d_vector_file: str = False
+    d_vector_dim: int = None
 
     # optimizer parameters
     optimizer: str = "RAdam"
@@ -196,3 +217,9 @@ class TacotronConfig(BaseTTSConfig):
             assert (
                 self.gradual_training[0][1] == self.r
             ), f"[!] the first scheduled gradual training `r` must be equal to the model's `r` value. {self.gradual_training[0][1]} vs {self.r}"
+        if self.model == "tacotron" and self.audio is not None:
+            assert self.out_channels == (
+                self.audio.fft_size // 2 + 1
+            ), f"{self.out_channels} vs {self.audio.fft_size // 2 + 1}"
+        if self.model == "tacotron2" and self.audio is not None:
+            assert self.out_channels == self.audio.num_mels