coqui-tts/TTS/tts/configs/tacotron_config.py

from dataclasses import asdict, dataclass
from typing import List

from coqpit import check_argument

from .shared_configs import BaseTTSConfig, GSTConfig


@dataclass
class TacotronConfig(BaseTTSConfig):
    """Defines parameters for Tacotron based models."""

    model: str = "tacotron"
    gst: GSTConfig = None
    gst_style_input: str = None
    # model specific params
    r: int = 2
    gradual_training: List = None
    memory_size: int = -1
    prenet_type: str = "original"
    prenet_dropout: bool = True
    prenet_dropout_at_inference: bool = False
    stopnet: bool = True
    separate_stopnet: bool = True
    stopnet_pos_weight: float = 10.0

    # attention layers
    attention_type: str = "original"
    attention_heads: int = None
    attention_norm: str = "sigmoid"
    windowing: bool = False
    use_forward_attn: bool = False
    forward_attn_mask: bool = False
    transition_agent: bool = False
    location_attn: bool = True

    # advance methods
    bidirectional_decoder: bool = False
    double_decoder_consistency: bool = False
    ddc_r: int = 6

    # multi-speaker settings
    use_speaker_embedding: bool = False
    use_external_speaker_embedding_file: bool = False
    external_speaker_embedding_file: str = False

    # optimizer parameters
    noam_schedule: bool = False
    warmup_steps: int = 4000
    lr: float = 1e-4
    wd: float = 1e-6
    grad_clip: float = 5.0
    seq_len_norm: bool = False
    loss_masking: bool = True

    # loss params
    decoder_loss_alpha: float = 0.25
    postnet_loss_alpha: float = 0.25
    postnet_diff_spec_alpha: float = 0.25
    decoder_diff_spec_alpha: float = 0.25
    decoder_ssim_alpha: float = 0.25
    postnet_ssim_alpha: float = 0.25
    ga_alpha: float = 5.0


@dataclass
class Tacotron2Config(TacotronConfig):
    """Defines parameters for Tacotron2 based models."""

    model: str = "tacotron2"