coqui-tts/TTS/tts/configs/tacotron_config.py

71 lines
1.8 KiB
Python

from dataclasses import asdict, dataclass
from typing import List
from coqpit import check_argument
from .shared_configs import BaseTTSConfig, GSTConfig
@dataclass
class TacotronConfig(BaseTTSConfig):
    """Defines parameters for Tacotron based models.

    Extends ``BaseTTSConfig`` with the fields declared below. Every field
    declared here has a default value. Fields whose default is ``None`` are
    unset until a value is supplied. The section comments group the fields
    by the part of the model or training setup they configure.
    """

    model: str = "tacotron"
    gst: GSTConfig = None
    gst_style_input: str = None

    # model specific params
    r: int = 2
    gradual_training: List = None
    memory_size: int = -1
    prenet_type: str = "original"
    prenet_dropout: bool = True
    prenet_dropout_at_inference: bool = False
    stopnet: bool = True
    separate_stopnet: bool = True
    stopnet_pos_weight: float = 10.0

    # attention layers
    attention_type: str = "original"
    attention_heads: int = None
    attention_norm: str = "sigmoid"
    windowing: bool = False
    use_forward_attn: bool = False
    forward_attn_mask: bool = False
    transition_agent: bool = False
    location_attn: bool = True

    # advanced methods
    bidirectional_decoder: bool = False
    double_decoder_consistency: bool = False
    ddc_r: int = 6

    # multi-speaker settings
    use_speaker_embedding: bool = False
    use_external_speaker_embedding_file: bool = False
    # Defaults to None rather than False so the default agrees with the
    # `str` annotation and with the other unset-by-default fields above;
    # None is still falsy, so truthiness checks behave as before.
    external_speaker_embedding_file: str = None

    # optimizer parameters
    noam_schedule: bool = False
    warmup_steps: int = 4000
    lr: float = 1e-4
    wd: float = 1e-6
    grad_clip: float = 5.0
    seq_len_norm: bool = False
    loss_masking: bool = True

    # loss params
    decoder_loss_alpha: float = 0.25
    postnet_loss_alpha: float = 0.25
    postnet_diff_spec_alpha: float = 0.25
    decoder_diff_spec_alpha: float = 0.25
    decoder_ssim_alpha: float = 0.25
    postnet_ssim_alpha: float = 0.25
    ga_alpha: float = 5.0
@dataclass
class Tacotron2Config(TacotronConfig):
    """Defines parameters for Tacotron2 based models.

    Inherits every field from ``TacotronConfig``; the only thing declared
    here is a different default for ``model``.
    """

    model: str = "tacotron2"