# mirror of https://github.com/coqui-ai/TTS.git
from dataclasses import dataclass
from typing import List

from TTS.tts.configs.shared_configs import BaseTTSConfig, GSTConfig


@dataclass
class TacotronConfig(BaseTTSConfig):
    """Defines parameters for Tacotron based models.

    Example:

        >>> from TTS.tts.configs import TacotronConfig
        >>> config = TacotronConfig()
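
        A hypothetical override of a few defaults, shown as a sketch only; the
        schedule values are illustrative, not a tuned recipe. Note that `r` must
        equal the `r` value of the first `gradual_training` entry (checked by
        `check_values`):

        >>> config = TacotronConfig(
        ...     r=6,
        ...     gradual_training=[[0, 6, 64], [10000, 4, 32], [50000, 2, 32]],
        ... )
        >>> config.check_values()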

    Args:
        model (str):
            Model name used to select the right model class to initialize. Defaults to `tacotron`.
        use_gst (bool):
            Enable / disable the use of Global Style Token modules. Defaults to False.
        gst (GSTConfig):
            Instance of `GSTConfig` class. Defaults to None.
        gst_style_input (str):
            Path to the wav file used at inference to set the speech style through GST. If `GST` is enabled and
            this is not defined, the model uses a zero vector as an input. Defaults to None.
        r (int):
            Initial number of output frames that the decoder computes per iteration. Larger values make training
            and inference faster but reduce the quality of the output frames. This must be equal to the largest
            `r` value used in the `gradual_training` schedule. Defaults to 2.
        gradual_training (List[List]):
            Parameters for the gradual training schedule. It is in the form `[[a, b, c], [d, e, f], ...]` where
            `a` is the step number to start using the rest of the values, `b` is the `r` value and `c` is the
            batch size. If set to None, no gradual training is used. Defaults to None.
        memory_size (int):
            Defines the number of previous frames used by the Prenet. If set to < 0, then it uses only the last
            frame. Defaults to -1.
        prenet_type (str):
            `original` or `bn`. `original` sets the default Prenet and `bn` uses the Batch Normalization version
            of the Prenet. Defaults to `original`.
        prenet_dropout (bool):
            Enable / disable the use of dropout in the Prenet. Defaults to True.
        prenet_dropout_at_inference (bool):
            Enable / disable the use of dropout in the Prenet at inference time. Defaults to False.
        stopnet (bool):
            Enable / disable the Stopnet that predicts the end of the decoder sequence. Defaults to True.
        stopnet_pos_weight (float):
            Weight applied to over-weight positive instances in the Stopnet loss. Use larger values with
            datasets that have longer sentences. Defaults to 10.0.
        separate_stopnet (bool):
            Use a distinct Stopnet which is trained separately from the rest of the model. Defaults to True.
        attention_type (str):
            Attention type. Check ```TTS.tts.layers.attentions.init_attn```. Defaults to 'original'.
        attention_heads (int):
            Number of attention heads for GMM attention. Defaults to None.
        attention_norm (str):
            Normalization function applied to the attention weights, `sigmoid` or `softmax`. Defaults to
            `sigmoid`.
        windowing (bool):
            Enable / disable attention windowing. It is especially useful at inference to keep the attention
            alignment diagonal. Defaults to False.
        use_forward_attn (bool):
            Enable / disable forward attention. It is only valid if ```attention_type``` is ```original```.
            Defaults to False.
        forward_attn_mask (bool):
            Enable / disable extra masking over forward attention. It is useful at inference to prevent possible
            attention failures. Defaults to False.
        transition_agent (bool):
            Enable / disable the transition agent in forward attention. Defaults to False.
        location_attn (bool):
            Enable / disable location-sensitive attention as in the original Tacotron2 paper. It is only valid
            if ```attention_type``` is ```original```. Defaults to True.
        bidirectional_decoder (bool):
            Enable / disable bidirectional decoding. Defaults to False.
        double_decoder_consistency (bool):
            Enable / disable double decoder consistency. Defaults to False.
        ddc_r (int):
            Reduction rate used by the coarse decoder when `double_decoder_consistency` is in use. Set this as a
            multiple of the `r` value. Defaults to 6.
        use_speaker_embedding (bool):
            Enable / disable using speaker embeddings for multi-speaker models. If set True, the model is in the
            multi-speaker mode. Defaults to False.
        use_external_speaker_embedding_file (bool):
            Enable / disable using external speaker embeddings in place of the learned embeddings. Defaults to
            False.
        external_speaker_embedding_file (str):
            Path to the file including pre-computed speaker embeddings. Defaults to None.
        noam_schedule (bool):
            Enable / disable the use of the Noam LR scheduler. Defaults to False.
        warmup_steps (int):
            Number of warm-up steps for the Noam scheduler. Defaults to 4000.
        lr (float):
            Initial learning rate. Defaults to `1e-4`.
        wd (float):
            Weight decay coefficient. Defaults to `1e-6`.
        grad_clip (float):
            Gradient clipping threshold. Defaults to `5`.
        seq_len_norm (bool):
            Enable / disable sequence length normalization in the loss functions. If set True, the loss of a
            sample is divided by its sequence length. Defaults to False.
        loss_masking (bool):
            Enable / disable masking the padding of the samples in loss computation. Defaults to True.
        decoder_loss_alpha (float):
            Weight for the decoder loss of the Tacotron model. If set less than or equal to zero, it disables
            the corresponding loss function. Defaults to 0.25.
        postnet_loss_alpha (float):
            Weight for the postnet loss of the Tacotron model. If set less than or equal to zero, it disables
            the corresponding loss function. Defaults to 0.25.
        postnet_diff_spec_alpha (float):
            Weight for the postnet differential loss of the Tacotron model. If set less than or equal to zero,
            it disables the corresponding loss function. Defaults to 0.25.
        decoder_diff_spec_alpha (float):
            Weight for the decoder differential loss of the Tacotron model. If set less than or equal to zero,
            it disables the corresponding loss function. Defaults to 0.25.
        decoder_ssim_alpha (float):
            Weight for the decoder SSIM loss of the Tacotron model. If set less than or equal to zero, it
            disables the corresponding loss function. Defaults to 0.25.
        postnet_ssim_alpha (float):
            Weight for the postnet SSIM loss of the Tacotron model. If set less than or equal to zero, it
            disables the corresponding loss function. Defaults to 0.25.
        ga_alpha (float):
            Weight for the guided attention loss. If set less than or equal to zero, it disables the
            corresponding loss function. Defaults to 5.
    """

    model: str = "tacotron"
    use_gst: bool = False
    gst: GSTConfig = None
    gst_style_input: str = None

    # model specific params
    r: int = 2
    gradual_training: List[List] = None
    memory_size: int = -1
    prenet_type: str = "original"
    prenet_dropout: bool = True
    prenet_dropout_at_inference: bool = False
    stopnet: bool = True
    separate_stopnet: bool = True
    stopnet_pos_weight: float = 10.0

    # attention layers
    attention_type: str = "original"
    attention_heads: int = None
    attention_norm: str = "sigmoid"
    windowing: bool = False
    use_forward_attn: bool = False
    forward_attn_mask: bool = False
    transition_agent: bool = False
    location_attn: bool = True

    # advanced methods
    bidirectional_decoder: bool = False
    double_decoder_consistency: bool = False
    ddc_r: int = 6

    # multi-speaker settings
    use_speaker_embedding: bool = False
    use_external_speaker_embedding_file: bool = False
    external_speaker_embedding_file: str = None

    # optimizer parameters
    noam_schedule: bool = False
    warmup_steps: int = 4000
    lr: float = 1e-4
    wd: float = 1e-6
    grad_clip: float = 5.0
    seq_len_norm: bool = False
    loss_masking: bool = True

    # loss params
    decoder_loss_alpha: float = 0.25
    postnet_loss_alpha: float = 0.25
    postnet_diff_spec_alpha: float = 0.25
    decoder_diff_spec_alpha: float = 0.25
    decoder_ssim_alpha: float = 0.25
    postnet_ssim_alpha: float = 0.25
    ga_alpha: float = 5.0

    def check_values(self):
        if self.gradual_training:
            assert (
                self.gradual_training[0][1] == self.r
            ), f"[!] the first scheduled gradual training `r` must be equal to the model's `r` value. {self.gradual_training[0][1]} vs {self.r}"
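

# Minimal usage sketch (illustrative, not part of the upstream module). It only
# exercises the config defined above; the schedule numbers are arbitrary examples.
if __name__ == "__main__":
    # Consistent: the model `r` matches the first schedule entry's `r`.
    config = TacotronConfig(r=6, gradual_training=[[0, 6, 64], [10000, 4, 32]])
    config.check_values()

    # Inconsistent: model `r` is 2 but the schedule starts at r=6, so the
    # assertion in `check_values` fires.
    config = TacotronConfig(r=2, gradual_training=[[0, 6, 64], [10000, 4, 32]])
    try:
        config.check_values()
    except AssertionError as err:
        print(err)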