Update tts model configs

2021-06-18 13:28:58 +02:00 · 2021-06-18 13:28:58 +02:00 · 786170fe7d
parent 626c9d41e6
commit 786170fe7d
6 changed files with 133 additions and 184 deletions
--- a/TTS/tts/configs/align_tts_config.py
+++ b/TTS/tts/configs/align_tts_config.py
@ -2,6 +2,7 @@ from dataclasses import dataclass, field
 from typing import List
 from TTS.tts.configs.shared_configs import BaseTTSConfig
 from TTS.tts.models.align_tts import AlignTTSArgs
@dataclass
@ -49,9 +50,9 @@ class AlignTTSConfig(BaseTTSConfig):
        use_speaker_embedding (bool):
            enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
            in the multi-speaker mode. Defaults to False.
-        use_external_speaker_embedding_file (bool):
+        use_d_vector_file (bool):
            enable /disable using external speaker embeddings in place of the learned embeddings. Defaults to False.
-        external_speaker_embedding_file (str):
+        d_vector_file (str):
            Path to the file including pre-computed speaker embeddings. Defaults to None.
        noam_schedule (bool):
            enable / disable the use of Noam LR scheduler. Defaults to False.
@ -68,17 +69,7 @@ class AlignTTSConfig(BaseTTSConfig):
    model: str = "align_tts"
    # model specific params
-    positional_encoding: bool = True
+    model_args: AlignTTSArgs = field(default_factory=AlignTTSArgs)
    hidden_channels_dp: int = 256
    hidden_channels: int = 256
    encoder_type: str = "fftransformer"
    encoder_params: dict = field(
        default_factory=lambda: {"hidden_channels_ffn": 1024, "num_heads": 2, "num_layers": 6, "dropout_p": 0.1}
    )
    decoder_type: str = "fftransformer"
    decoder_params: dict = field(
        default_factory=lambda: {"hidden_channels_ffn": 1024, "num_heads": 2, "num_layers": 6, "dropout_p": 0.1}
    )
    phase_start_steps: List[int] = None
    ssim_alpha: float = 1.0
@ -88,8 +79,8 @@ class AlignTTSConfig(BaseTTSConfig):
    # multi-speaker settings
    use_speaker_embedding: bool = False
-    use_external_speaker_embedding_file: bool = False
+    use_d_vector_file: bool = False
-    external_speaker_embedding_file: str = False
+    d_vector_file: str = False
    # optimizer parameters
    optimizer: str = "Adam"
--- a/TTS/tts/configs/glow_tts_config.py
+++ b/TTS/tts/configs/glow_tts_config.py
@ -23,13 +23,49 @@ class GlowTTSConfig(BaseTTSConfig):
            Defaults to `{"kernel_size": 3, "dropout_p": 0.1, "num_layers": 6, "num_heads": 2, "hidden_channels_ffn": 768}`
        use_encoder_prenet (bool):
            enable / disable the use of a prenet for the encoder. Defaults to True.
-        hidden_channels_encoder (int):
+        hidden_channels_enc (int):
            Number of base hidden channels used by the encoder network. It defines the input and the output channel sizes,
            and for some encoder types internal hidden channels sizes too. Defaults to 192.
-        hidden_channels_decoder (int):
+        hidden_channels_dec (int):
            Number of base hidden channels used by the decoder WaveNet network. Defaults to 192 as in the original work.
-        hidden_channels_duration_predictor (int):
+        hidden_channels_dp (int):
            Number of layer channels of the duration predictor network. Defaults to 256 as in the original work.
        mean_only (bool):
            If true predict only the mean values by the decoder flow. Defaults to True.
        out_channels (int):
            Number of channels of the model output tensor. Defaults to 80.
        num_flow_blocks_dec (int):
            Number of decoder blocks. Defaults to 12.
        inference_noise_scale (float):
            Noise scale used at inference. Defaults to 0.33.
        kernel_size_dec (int):
            Decoder kernel size. Defaults to 5
        dilation_rate (int):
            Rate to increase dilation by each layer in a decoder block. Defaults to 5.
        num_block_layers (int):
            Number of decoder layers in each decoder block.  Defaults to 4.
        dropout_p_dec (float):
            Dropout rate for decoder. Defaults to 0.1.
        num_speaker (int):
            Number of speaker to define the size of speaker embedding layer. Defaults to 0.
        c_in_channels (int):
            Number of speaker embedding channels. It is set to 512 if embeddings are learned. Defaults to 0.
        num_splits (int):
            Number of split levels in inversible conv1x1 operation. Defaults to 4.
        num_squeeze (int):
            Number of squeeze levels. When squeezing channels increases and time steps reduces by the factor
            'num_squeeze'. Defaults to 1.
        sigmoid_scale (bool):
            enable/disable sigmoid scaling in decoder. Defaults to False.
        mean_only (bool):
            If True, encoder only computes mean value and uses constant variance for each time step. Defaults to true.
        encoder_type (str):
            Encoder module type. Possible values are`["rel_pos_transformer", "gated_conv", "residual_conv_bn", "time_depth_separable"]`
            Check `TTS.tts.layers.glow_tts.encoder` for more details. Defaults to `rel_pos_transformers` as in the original paper.
        encoder_params (dict):
            Encoder module parameters. Defaults to None.
        d_vector_dim (int):
            Channels of external speaker embedding vectors. Defaults to 0.
        data_dep_init_steps (int):
            Number of steps used for computing normalization parameters at the beginning of the training. GlowTTS uses
            Activation Normalization that pre-computes normalization stats at the beginning and use the same values
@ -41,9 +77,9 @@ class GlowTTSConfig(BaseTTSConfig):
        use_speaker_embedding (bool):
            enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
            in the multi-speaker mode. Defaults to False.
-        use_external_speaker_embedding_file (bool):
+        use_d_vector_file (bool):
            enable /disable using external speaker embeddings in place of the learned embeddings. Defaults to False.
-        external_speaker_embedding_file (str):
+        d_vector_file (str):
            Path to the file including pre-computed speaker embeddings. Defaults to None.
        noam_schedule (bool):
            enable / disable the use of Noam LR scheduler. Defaults to False.
@ -62,6 +98,7 @@ class GlowTTSConfig(BaseTTSConfig):
    model: str = "glow_tts"
    # model params
    num_chars: int = None
    encoder_type: str = "rel_pos_transformer"
    encoder_params: dict = field(
        default_factory=lambda: {
@ -73,9 +110,36 @@ class GlowTTSConfig(BaseTTSConfig):
        }
    )
    use_encoder_prenet: bool = True
-    hidden_channels_encoder: int = 192
+    hidden_channels_enc: int = 192
-    hidden_channels_decoder: int = 192
+    hidden_channels_dec: int = 192
-    hidden_channels_duration_predictor: int = 256
+    hidden_channels_dp: int = 256
    dropout_p_dp: float = 0.1
    dropout_p_dec: float = 0.05
    mean_only: bool = True
    out_channels: int = 80
    num_flow_blocks_dec: int = 12
    inference_noise_scale: float = 0.33
    kernel_size_dec: int = 5
    dilation_rate: int = 5
    num_block_layers: int = 4
    num_speakers: int = 0
    c_in_channels: int = 0
    num_splits: int = 4
    num_squeeze: int = 1
    sigmoid_scale: bool = False
    mean_only: bool = False
    encoder_type: str = "rel_pos_transformer"
    encoder_params: dict = field(
        default_factory=lambda: {
            "kernel_size": 3,
            "dropout_p": 0.1,
            "num_layers": 6,
            "num_heads": 2,
            "hidden_channels_ffn": 768,
            "input_length": None,
        }
    )
    d_vector_dim: int = 0
    # training params
    data_dep_init_steps: int = 10
@ -86,8 +150,8 @@ class GlowTTSConfig(BaseTTSConfig):
    # multi-speaker settings
    use_speaker_embedding: bool = False
-    use_external_speaker_embedding_file: bool = False
+    use_d_vector_file: bool = False
-    external_speaker_embedding_file: str = False
+    d_vector_file: str = False
    # optimizer parameters
    optimizer: str = "RAdam"
--- a/TTS/tts/configs/shared_configs.py
+++ b/TTS/tts/configs/shared_configs.py
@ -1,7 +1,7 @@
 from dataclasses import asdict, dataclass, field
 from typing import List
-from coqpit import MISSING, Coqpit, check_argument
+from coqpit import Coqpit, check_argument
 from TTS.config import BaseAudioConfig, BaseDatasetConfig, BaseTrainingConfig
@ -153,7 +153,7 @@ class BaseTTSConfig(BaseTrainingConfig):
    use_espeak_phonemes: bool = True
    phoneme_language: str = None
    compute_input_seq_cache: bool = False
-    text_cleaner: str = MISSING
+    text_cleaner: str = None
    enable_eos_bos_chars: bool = False
    test_sentences_file: str = ""
    phoneme_cache_path: str = None
@ -171,10 +171,14 @@ class BaseTTSConfig(BaseTrainingConfig):
    # dataset
    datasets: List[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()])
    # optimizer
-    optimizer: str = MISSING
+    optimizer: str = None
-    optimizer_params: dict = MISSING
+    optimizer_params: dict = None
    # scheduler
    lr_scheduler: str = ""
    lr_scheduler_params: dict = field(default_factory=lambda: {})
    # testing
    test_sentences: List[str] = field(default_factory=lambda: [])
    # multi-speaker
    use_speaker_embedding: bool = False
    use_d_vector_file: bool = False
    d_vector_dim: int = 0
--- a/TTS/tts/configs/speedy_speech_config.py
+++ b/TTS/tts/configs/speedy_speech_config.py
@ -2,6 +2,7 @@ from dataclasses import dataclass, field
 from typing import List
 from TTS.tts.configs.shared_configs import BaseTTSConfig
 from TTS.tts.models.speedy_speech import SpeedySpeechArgs
@dataclass
@ -16,30 +17,8 @@ class SpeedySpeechConfig(BaseTTSConfig):
    Args:
        model (str):
            Model name used for selecting the right model at initialization. Defaults to `speedy_speech`.
-        positional_encoding (bool):
+        model_args (Coqpit):
-            enable / disable positional encoding applied to the encoder output. Defaults to True.
+            Model class arguments. Check `SpeedySpeechArgs` for more details. Defaults to `SpeedySpeechArgs()`.
        hidden_channels (int):
            Base number of hidden channels. Defines all the layers expect ones defined by the specific encoder or decoder
            parameters. Defaults to 128.
        encoder_type (str):
            Type of the encoder used by the model. Look at `TTS.tts.layers.feed_forward.encoder` for more details.
            Defaults to `residual_conv_bn`.
        encoder_params (dict):
            Parameters used to define the encoder network. Look at `TTS.tts.layers.feed_forward.encoder` for more details.
            Defaults to `{"kernel_size": 4, "dilations": [1, 2, 4, 1, 2, 4, 1, 2, 4, 1, 2, 4, 1], "num_conv_blocks": 2, "num_res_blocks": 13}`
        decoder_type (str):
            Type of the decoder used by the model. Look at `TTS.tts.layers.feed_forward.decoder` for more details.
            Defaults to `residual_conv_bn`.
        decoder_params (dict):
            Parameters used to define the decoder network. Look at `TTS.tts.layers.feed_forward.decoder` for more details.
            Defaults to `{"kernel_size": 4, "dilations": [1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1], "num_conv_blocks": 2, "num_res_blocks": 17}`
        hidden_channels_encoder (int):
            Number of base hidden channels used by the encoder network. It defines the input and the output channel sizes,
            and for some encoder types internal hidden channels sizes too. Defaults to 192.
        hidden_channels_decoder (int):
            Number of base hidden channels used by the decoder WaveNet network. Defaults to 192 as in the original work.
        hidden_channels_duration_predictor (int):
            Number of layer channels of the duration predictor network. Defaults to 256 as in the original work.
        data_dep_init_steps (int):
            Number of steps used for computing normalization parameters at the beginning of the training. GlowTTS uses
            Activation Normalization that pre-computes normalization stats at the beginning and use the same values
@ -47,9 +26,9 @@ class SpeedySpeechConfig(BaseTTSConfig):
        use_speaker_embedding (bool):
            enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
            in the multi-speaker mode. Defaults to False.
-        use_external_speaker_embedding_file (bool):
+        use_d_vector_file (bool):
            enable /disable using external speaker embeddings in place of the learned embeddings. Defaults to False.
-        external_speaker_embedding_file (str):
+        d_vector_file (str):
            Path to the file including pre-computed speaker embeddings. Defaults to None.
        noam_schedule (bool):
            enable / disable the use of Noam LR scheduler. Defaults to False.
@ -73,31 +52,12 @@ class SpeedySpeechConfig(BaseTTSConfig):
    model: str = "speedy_speech"
    # model specific params
-    positional_encoding: bool = True
+    model_args: SpeedySpeechArgs = field(default_factory=SpeedySpeechArgs)
    hidden_channels: int = 128
    encoder_type: str = "residual_conv_bn"
    encoder_params: dict = field(
        default_factory=lambda: {
            "kernel_size": 4,
            "dilations": [1, 2, 4, 1, 2, 4, 1, 2, 4, 1, 2, 4, 1],
            "num_conv_blocks": 2,
            "num_res_blocks": 13,
        }
    )
    decoder_type: str = "residual_conv_bn"
    decoder_params: dict = field(
        default_factory=lambda: {
            "kernel_size": 4,
            "dilations": [1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1],
            "num_conv_blocks": 2,
            "num_res_blocks": 17,
        }
    )
    # multi-speaker settings
    use_speaker_embedding: bool = False
-    use_external_speaker_embedding_file: bool = False
+    use_d_vector_file: bool = False
-    external_speaker_embedding_file: str = False
+    d_vector_file: str = False
    # optimizer parameters
    optimizer: str = "RAdam"
--- a/TTS/tts/configs/tacotron2_config.py
+++ b/TTS/tts/configs/tacotron2_config.py
@ -12,107 +12,10 @@ class Tacotron2Config(TacotronConfig):
        >>> from TTS.tts.configs import Tacotron2Config
        >>> config = Tacotron2Config()
-    Args:
+    Check `TacotronConfig` for argument descriptions.
        model (str):
            Model name used to select the right model class to initilize. Defaults to `Tacotron2`.
        use_gst (bool):
            enable / disable the use of Global Style Token modules. Defaults to False.
        gst (GSTConfig):
            Instance of `GSTConfig` class.
        gst_style_input (str):
            Path to the wav file used at inference to set the speech style through GST. If `GST` is enabled and
            this is not defined, the model uses a zero vector as an input. Defaults to None.
        r (int):
            Number of output frames that the decoder computed per iteration. Larger values makes training and inference
            faster but reduces the quality of the output frames. This needs to be tuned considering your own needs.
            Defaults to 1.
        gradual_trainin (List[List]):
            Parameters for the gradual training schedule. It is in the form `[[a, b, c], [d ,e ,f] ..]` where `a` is
            the step number to start using the rest of the values, `b` is the `r` value and `c` is the batch size.
            If sets None, no gradual training is used. Defaults to None.
        memory_size (int):
            Defines the number of previous frames used by the Prenet. If set to < 0, then it uses only the last frame.
            Defaults to -1.
        prenet_type (str):
            `original` or `bn`. `original` sets the default Prenet and `bn` uses Batch Normalization version of the
            Prenet. Defaults to `original`.
        prenet_dropout (bool):
            enables / disables the use of dropout in the Prenet. Defaults to True.
        prenet_dropout_at_inference (bool):
            enable / disable the use of dropout in the Prenet at the inference time. Defaults to False.
        stopnet (bool):
            enable /disable the Stopnet that predicts the end of the decoder sequence. Defaults to True.
        stopnet_pos_weight (float):
            Weight that is applied to over-weight positive instances in the Stopnet loss. Use larger values with
            datasets with longer sentences. Defaults to 10.
        separate_stopnet (bool):
            Use a distinct Stopnet which is trained separately from the rest of the model. Defaults to True.
        attention_type (str):
            attention type. Check ```TTS.tts.layers.attentions.init_attn```. Defaults to 'original'.
        attention_heads (int):
            Number of attention heads for GMM attention. Defaults to 5.
        windowing (bool):
            It especially useful at inference to keep attention alignment diagonal. Defaults to False.
        use_forward_attn (bool):
            It is only valid if ```attn_type``` is ```original```.  Defaults to False.
        forward_attn_mask (bool):
            enable/disable extra masking over forward attention. It is useful at inference to prevent
            possible attention failures. Defaults to False.
        transition_agent (bool):
            enable/disable transition agent in forward attention. Defaults to False.
        location_attn (bool):
            enable/disable location sensitive attention as in the original Tacotron2 paper.
            It is only valid if ```attn_type``` is ```original```. Defaults to True.
        bidirectional_decoder (bool):
            enable/disable bidirectional decoding. Defaults to False.
        double_decoder_consistency (bool):
            enable/disable double decoder consistency. Defaults to False.
        ddc_r (int):
            reduction rate used by the coarse decoder when `double_decoder_consistency` is in use. Set this
            as a multiple of the `r` value. Defaults to 6.
        use_speaker_embedding (bool):
            enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
            in the multi-speaker mode. Defaults to False.
        use_external_speaker_embedding_file (bool):
            enable /disable using external speaker embeddings in place of the learned embeddings. Defaults to False.
        external_speaker_embedding_file (str):
            Path to the file including pre-computed speaker embeddings. Defaults to None.
        noam_schedule (bool):
            enable / disable the use of Noam LR scheduler. Defaults to False.
        warmup_steps (int):
            Number of warm-up steps for the Noam scheduler. Defaults 4000.
        lr (float):
            Initial learning rate. Defaults to `1e-4`.
        wd (float):
            Weight decay coefficient. Defaults to `1e-6`.
        grad_clip (float):
            Gradient clipping threshold. Defaults to `5`.
        seq_len_notm (bool):
            enable / disable the sequnce length normalization in the loss functions. If set True, loss of a sample
            is divided by the sequence length. Defaults to False.
        loss_masking (bool):
            enable / disable masking the paddings of the samples in loss computation. Defaults to True.
        decoder_loss_alpha (float):
            Weight for the decoder loss of the Tacotron model. If set less than or equal to zero, it disables the
            corresponding loss function. Defaults to 0.25
        postnet_loss_alpha (float):
            Weight for the postnet loss of the Tacotron model. If set less than or equal to zero, it disables the
            corresponding loss function. Defaults to 0.25
        postnet_diff_spec_alpha (float):
            Weight for the postnet differential loss of the Tacotron model. If set less than or equal to zero, it disables the
            corresponding loss function. Defaults to 0.25
        decoder_diff_spec_alpha (float):
            Weight for the decoder differential loss of the Tacotron model. If set less than or equal to zero, it disables the
            corresponding loss function. Defaults to 0.25
        decoder_ssim_alpha (float):
            Weight for the decoder SSIM loss of the Tacotron model. If set less than or equal to zero, it disables the
            corresponding loss function. Defaults to 0.25
        postnet_ssim_alpha (float):
            Weight for the postnet SSIM loss of the Tacotron model. If set less than or equal to zero, it disables the
            corresponding loss function. Defaults to 0.25
        ga_alpha (float):
            Weight for the guided attention loss. If set less than or equal to zero, it disables the corresponding loss
            function. Defaults to 5.
    """
    model: str = "tacotron2"
    out_channels: int = 80
    encoder_in_features: int = 512
    decoder_in_features: int = 512
--- a/TTS/tts/configs/tacotron_config.py
+++ b/TTS/tts/configs/tacotron_config.py
@ -23,6 +23,10 @@ class TacotronConfig(BaseTTSConfig):
        gst_style_input (str):
            Path to the wav file used at inference to set the speech style through GST. If `GST` is enabled and
            this is not defined, the model uses a zero vector as an input. Defaults to None.
        num_chars (int):
            Number of characters used by the model. It must be defined before initializing the model. Defaults to None.
        num_speakers (int):
            Number of speakers for multi-speaker models. Defaults to 1.
        r (int):
            Initial number of output frames that the decoder computed per iteration. Larger values makes training and inference
            faster but reduces the quality of the output frames. This must be equal to the largest `r` value used in
@ -47,7 +51,13 @@ class TacotronConfig(BaseTTSConfig):
            Weight that is applied to over-weight positive instances in the Stopnet loss. Use larger values with
            datasets with longer sentences. Defaults to 10.
        max_decoder_steps (int):
-            Max number of steps allowed for the decoder. Defaults to 10000.
+            Max number of steps allowed for the decoder. Defaults to 50.
        encoder_in_features (int):
            Channels of encoder input and character embedding tensors. Defaults to 256.
        decoder_in_features (int):
            Channels of decoder input and encoder output tensors. Defaults to 256.
        out_channels (int):
            Channels of the final model output. It must match the spectragram size. Defaults to 80.
        separate_stopnet (bool):
            Use a distinct Stopnet which is trained separately from the rest of the model. Defaults to True.
        attention_type (str):
@ -76,9 +86,9 @@ class TacotronConfig(BaseTTSConfig):
        use_speaker_embedding (bool):
            enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
            in the multi-speaker mode. Defaults to False.
-        use_external_speaker_embedding_file (bool):
+        use_d_vector_file (bool):
            enable /disable using external speaker embeddings in place of the learned embeddings. Defaults to False.
-        external_speaker_embedding_file (str):
+        d_vector_file (str):
            Path to the file including pre-computed speaker embeddings. Defaults to None.
        optimizer (str):
            Optimizer used for the training. Set one from `torch.optim.Optimizer` or `TTS.utils.training`.
@ -111,6 +121,7 @@ class TacotronConfig(BaseTTSConfig):
            Weight for the postnet differential loss of the Tacotron model. If set less than or equal to zero, it disables the
            corresponding loss function. Defaults to 0.25
        decoder_diff_spec_alpha (float):
            Weight for the decoder differential loss of the Tacotron model. If set less than or equal to zero, it disables the
            corresponding loss function. Defaults to 0.25
        decoder_ssim_alpha (float):
@ -125,11 +136,14 @@ class TacotronConfig(BaseTTSConfig):
    """
    model: str = "tacotron"
    # model_params: TacotronArgs = field(default_factory=lambda: TacotronArgs())
    use_gst: bool = False
    gst: GSTConfig = None
    gst_style_input: str = None
    # model specific params
    num_speakers: int = 1
    num_chars: int = 0
    r: int = 2
    gradual_training: List[List[int]] = None
    memory_size: int = -1
@ -139,12 +153,17 @@ class TacotronConfig(BaseTTSConfig):
    stopnet: bool = True
    separate_stopnet: bool = True
    stopnet_pos_weight: float = 10.0
-    max_decoder_steps: int = 10000
+    max_decoder_steps: int = 500
    encoder_in_features: int = 256
    decoder_in_features: int = 256
    decoder_output_dim: int = 80
    out_channels: int = 513
    # attention layers
    attention_type: str = "original"
    attention_heads: int = None
    attention_norm: str = "sigmoid"
    attention_win: bool = False
    windowing: bool = False
    use_forward_attn: bool = False
    forward_attn_mask: bool = False
@ -158,8 +177,10 @@ class TacotronConfig(BaseTTSConfig):
    # multi-speaker settings
    use_speaker_embedding: bool = False
-    use_external_speaker_embedding_file: bool = False
+    speaker_embedding_dim: int = 512
-    external_speaker_embedding_file: str = False
+    use_d_vector_file: bool = False
    d_vector_file: str = False
    d_vector_dim: int = None
    # optimizer parameters
    optimizer: str = "RAdam"
@ -196,3 +217,9 @@ class TacotronConfig(BaseTTSConfig):
            assert (
                self.gradual_training[0][1] == self.r
            ), f"[!] the first scheduled gradual training `r` must be equal to the model's `r` value. {self.gradual_training[0][1]} vs {self.r}"
        if self.model == "tacotron" and self.audio is not None:
            assert self.out_channels == (
                self.audio.fft_size // 2 + 1
            ), f"{self.out_channels} vs {self.audio.fft_size // 2 + 1}"
        if self.model == "tacotron2" and self.audio is not None:
            assert self.out_channels == self.audio.num_mels