mirror of https://github.com/coqui-ai/TTS.git
Update tts model configs
This commit is contained in:
parent
626c9d41e6
commit
786170fe7d
|
@ -2,6 +2,7 @@ from dataclasses import dataclass, field
|
|||
from typing import List
|
||||
|
||||
from TTS.tts.configs.shared_configs import BaseTTSConfig
|
||||
from TTS.tts.models.align_tts import AlignTTSArgs
|
||||
|
||||
|
||||
@dataclass
|
||||
|
@ -49,9 +50,9 @@ class AlignTTSConfig(BaseTTSConfig):
|
|||
use_speaker_embedding (bool):
|
||||
enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
|
||||
in the multi-speaker mode. Defaults to False.
|
||||
use_external_speaker_embedding_file (bool):
|
||||
use_d_vector_file (bool):
|
||||
enable / disable using external speaker embeddings in place of the learned embeddings. Defaults to False.
|
||||
external_speaker_embedding_file (str):
|
||||
d_vector_file (str):
|
||||
Path to the file including pre-computed speaker embeddings. Defaults to None.
|
||||
noam_schedule (bool):
|
||||
enable / disable the use of Noam LR scheduler. Defaults to False.
|
||||
|
@ -68,17 +69,7 @@ class AlignTTSConfig(BaseTTSConfig):
|
|||
|
||||
model: str = "align_tts"
|
||||
# model specific params
|
||||
positional_encoding: bool = True
|
||||
hidden_channels_dp: int = 256
|
||||
hidden_channels: int = 256
|
||||
encoder_type: str = "fftransformer"
|
||||
encoder_params: dict = field(
|
||||
default_factory=lambda: {"hidden_channels_ffn": 1024, "num_heads": 2, "num_layers": 6, "dropout_p": 0.1}
|
||||
)
|
||||
decoder_type: str = "fftransformer"
|
||||
decoder_params: dict = field(
|
||||
default_factory=lambda: {"hidden_channels_ffn": 1024, "num_heads": 2, "num_layers": 6, "dropout_p": 0.1}
|
||||
)
|
||||
model_args: AlignTTSArgs = field(default_factory=AlignTTSArgs)
|
||||
phase_start_steps: List[int] = None
|
||||
|
||||
ssim_alpha: float = 1.0
|
||||
|
@ -88,8 +79,8 @@ class AlignTTSConfig(BaseTTSConfig):
|
|||
|
||||
# multi-speaker settings
|
||||
use_speaker_embedding: bool = False
|
||||
use_external_speaker_embedding_file: bool = False
|
||||
external_speaker_embedding_file: str = False
|
||||
use_d_vector_file: bool = False
|
||||
d_vector_file: str = False
|
||||
|
||||
# optimizer parameters
|
||||
optimizer: str = "Adam"
|
||||
|
|
|
@ -23,13 +23,49 @@ class GlowTTSConfig(BaseTTSConfig):
|
|||
Defaults to `{"kernel_size": 3, "dropout_p": 0.1, "num_layers": 6, "num_heads": 2, "hidden_channels_ffn": 768}`
|
||||
use_encoder_prenet (bool):
|
||||
enable / disable the use of a prenet for the encoder. Defaults to True.
|
||||
hidden_channels_encoder (int):
|
||||
hidden_channels_enc (int):
|
||||
Number of base hidden channels used by the encoder network. It defines the input and the output channel sizes,
|
||||
and for some encoder types internal hidden channels sizes too. Defaults to 192.
|
||||
hidden_channels_decoder (int):
|
||||
hidden_channels_dec (int):
|
||||
Number of base hidden channels used by the decoder WaveNet network. Defaults to 192 as in the original work.
|
||||
hidden_channels_duration_predictor (int):
|
||||
hidden_channels_dp (int):
|
||||
Number of layer channels of the duration predictor network. Defaults to 256 as in the original work.
|
||||
mean_only (bool):
|
||||
If true predict only the mean values by the decoder flow. Defaults to True.
|
||||
out_channels (int):
|
||||
Number of channels of the model output tensor. Defaults to 80.
|
||||
num_flow_blocks_dec (int):
|
||||
Number of decoder blocks. Defaults to 12.
|
||||
inference_noise_scale (float):
|
||||
Noise scale used at inference. Defaults to 0.33.
|
||||
kernel_size_dec (int):
|
||||
Decoder kernel size. Defaults to 5.
|
||||
dilation_rate (int):
|
||||
Rate to increase dilation by each layer in a decoder block. Defaults to 5.
|
||||
num_block_layers (int):
|
||||
Number of decoder layers in each decoder block. Defaults to 4.
|
||||
dropout_p_dec (float):
|
||||
Dropout rate for decoder. Defaults to 0.05.
|
||||
num_speakers (int):
|
||||
Number of speakers used to define the size of the speaker embedding layer. Defaults to 0.
|
||||
c_in_channels (int):
|
||||
Number of speaker embedding channels. It is set to 512 if embeddings are learned. Defaults to 0.
|
||||
num_splits (int):
|
||||
Number of split levels in the invertible conv1x1 operation. Defaults to 4.
|
||||
num_squeeze (int):
|
||||
Number of squeeze levels. When squeezing, the channels increase and the time steps are reduced by the factor
|
||||
'num_squeeze'. Defaults to 1.
|
||||
sigmoid_scale (bool):
|
||||
enable/disable sigmoid scaling in decoder. Defaults to False.
|
||||
mean_only (bool):
|
||||
If True, encoder only computes mean value and uses constant variance for each time step. Defaults to true.
|
||||
encoder_type (str):
|
||||
Encoder module type. Possible values are `["rel_pos_transformer", "gated_conv", "residual_conv_bn", "time_depth_separable"]`.
|
||||
Check `TTS.tts.layers.glow_tts.encoder` for more details. Defaults to `rel_pos_transformer` as in the original paper.
|
||||
encoder_params (dict):
|
||||
Encoder module parameters. Defaults to None.
|
||||
d_vector_dim (int):
|
||||
Channels of external speaker embedding vectors. Defaults to 0.
|
||||
data_dep_init_steps (int):
|
||||
Number of steps used for computing normalization parameters at the beginning of the training. GlowTTS uses
|
||||
Activation Normalization that pre-computes normalization stats at the beginning and use the same values
|
||||
|
@ -41,9 +77,9 @@ class GlowTTSConfig(BaseTTSConfig):
|
|||
use_speaker_embedding (bool):
|
||||
enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
|
||||
in the multi-speaker mode. Defaults to False.
|
||||
use_external_speaker_embedding_file (bool):
|
||||
use_d_vector_file (bool):
|
||||
enable / disable using external speaker embeddings in place of the learned embeddings. Defaults to False.
|
||||
external_speaker_embedding_file (str):
|
||||
d_vector_file (str):
|
||||
Path to the file including pre-computed speaker embeddings. Defaults to None.
|
||||
noam_schedule (bool):
|
||||
enable / disable the use of Noam LR scheduler. Defaults to False.
|
||||
|
@ -62,6 +98,7 @@ class GlowTTSConfig(BaseTTSConfig):
|
|||
model: str = "glow_tts"
|
||||
|
||||
# model params
|
||||
num_chars: int = None
|
||||
encoder_type: str = "rel_pos_transformer"
|
||||
encoder_params: dict = field(
|
||||
default_factory=lambda: {
|
||||
|
@ -73,9 +110,36 @@ class GlowTTSConfig(BaseTTSConfig):
|
|||
}
|
||||
)
|
||||
use_encoder_prenet: bool = True
|
||||
hidden_channels_encoder: int = 192
|
||||
hidden_channels_decoder: int = 192
|
||||
hidden_channels_duration_predictor: int = 256
|
||||
hidden_channels_enc: int = 192
|
||||
hidden_channels_dec: int = 192
|
||||
hidden_channels_dp: int = 256
|
||||
dropout_p_dp: float = 0.1
|
||||
dropout_p_dec: float = 0.05
|
||||
mean_only: bool = True
|
||||
out_channels: int = 80
|
||||
num_flow_blocks_dec: int = 12
|
||||
inference_noise_scale: float = 0.33
|
||||
kernel_size_dec: int = 5
|
||||
dilation_rate: int = 5
|
||||
num_block_layers: int = 4
|
||||
num_speakers: int = 0
|
||||
c_in_channels: int = 0
|
||||
num_splits: int = 4
|
||||
num_squeeze: int = 1
|
||||
sigmoid_scale: bool = False
|
||||
mean_only: bool = False
|
||||
encoder_type: str = "rel_pos_transformer"
|
||||
encoder_params: dict = field(
|
||||
default_factory=lambda: {
|
||||
"kernel_size": 3,
|
||||
"dropout_p": 0.1,
|
||||
"num_layers": 6,
|
||||
"num_heads": 2,
|
||||
"hidden_channels_ffn": 768,
|
||||
"input_length": None,
|
||||
}
|
||||
)
|
||||
d_vector_dim: int = 0
|
||||
|
||||
# training params
|
||||
data_dep_init_steps: int = 10
|
||||
|
@ -86,8 +150,8 @@ class GlowTTSConfig(BaseTTSConfig):
|
|||
|
||||
# multi-speaker settings
|
||||
use_speaker_embedding: bool = False
|
||||
use_external_speaker_embedding_file: bool = False
|
||||
external_speaker_embedding_file: str = False
|
||||
use_d_vector_file: bool = False
|
||||
d_vector_file: str = False
|
||||
|
||||
# optimizer parameters
|
||||
optimizer: str = "RAdam"
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
from dataclasses import asdict, dataclass, field
|
||||
from typing import List
|
||||
|
||||
from coqpit import MISSING, Coqpit, check_argument
|
||||
from coqpit import Coqpit, check_argument
|
||||
|
||||
from TTS.config import BaseAudioConfig, BaseDatasetConfig, BaseTrainingConfig
|
||||
|
||||
|
@ -153,7 +153,7 @@ class BaseTTSConfig(BaseTrainingConfig):
|
|||
use_espeak_phonemes: bool = True
|
||||
phoneme_language: str = None
|
||||
compute_input_seq_cache: bool = False
|
||||
text_cleaner: str = MISSING
|
||||
text_cleaner: str = None
|
||||
enable_eos_bos_chars: bool = False
|
||||
test_sentences_file: str = ""
|
||||
phoneme_cache_path: str = None
|
||||
|
@ -171,10 +171,14 @@ class BaseTTSConfig(BaseTrainingConfig):
|
|||
# dataset
|
||||
datasets: List[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()])
|
||||
# optimizer
|
||||
optimizer: str = MISSING
|
||||
optimizer_params: dict = MISSING
|
||||
optimizer: str = None
|
||||
optimizer_params: dict = None
|
||||
# scheduler
|
||||
lr_scheduler: str = ""
|
||||
lr_scheduler_params: dict = field(default_factory=lambda: {})
|
||||
# testing
|
||||
test_sentences: List[str] = field(default_factory=lambda: [])
|
||||
# multi-speaker
|
||||
use_speaker_embedding: bool = False
|
||||
use_d_vector_file: bool = False
|
||||
d_vector_dim: int = 0
|
||||
|
|
|
@ -2,6 +2,7 @@ from dataclasses import dataclass, field
|
|||
from typing import List
|
||||
|
||||
from TTS.tts.configs.shared_configs import BaseTTSConfig
|
||||
from TTS.tts.models.speedy_speech import SpeedySpeechArgs
|
||||
|
||||
|
||||
@dataclass
|
||||
|
@ -16,30 +17,8 @@ class SpeedySpeechConfig(BaseTTSConfig):
|
|||
Args:
|
||||
model (str):
|
||||
Model name used for selecting the right model at initialization. Defaults to `speedy_speech`.
|
||||
positional_encoding (bool):
|
||||
enable / disable positional encoding applied to the encoder output. Defaults to True.
|
||||
hidden_channels (int):
|
||||
Base number of hidden channels. Defines all the layers expect ones defined by the specific encoder or decoder
|
||||
parameters. Defaults to 128.
|
||||
encoder_type (str):
|
||||
Type of the encoder used by the model. Look at `TTS.tts.layers.feed_forward.encoder` for more details.
|
||||
Defaults to `residual_conv_bn`.
|
||||
encoder_params (dict):
|
||||
Parameters used to define the encoder network. Look at `TTS.tts.layers.feed_forward.encoder` for more details.
|
||||
Defaults to `{"kernel_size": 4, "dilations": [1, 2, 4, 1, 2, 4, 1, 2, 4, 1, 2, 4, 1], "num_conv_blocks": 2, "num_res_blocks": 13}`
|
||||
decoder_type (str):
|
||||
Type of the decoder used by the model. Look at `TTS.tts.layers.feed_forward.decoder` for more details.
|
||||
Defaults to `residual_conv_bn`.
|
||||
decoder_params (dict):
|
||||
Parameters used to define the decoder network. Look at `TTS.tts.layers.feed_forward.decoder` for more details.
|
||||
Defaults to `{"kernel_size": 4, "dilations": [1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1], "num_conv_blocks": 2, "num_res_blocks": 17}`
|
||||
hidden_channels_encoder (int):
|
||||
Number of base hidden channels used by the encoder network. It defines the input and the output channel sizes,
|
||||
and for some encoder types internal hidden channels sizes too. Defaults to 192.
|
||||
hidden_channels_decoder (int):
|
||||
Number of base hidden channels used by the decoder WaveNet network. Defaults to 192 as in the original work.
|
||||
hidden_channels_duration_predictor (int):
|
||||
Number of layer channels of the duration predictor network. Defaults to 256 as in the original work.
|
||||
model_args (Coqpit):
|
||||
Model class arguments. Check `SpeedySpeechArgs` for more details. Defaults to `SpeedySpeechArgs()`.
|
||||
data_dep_init_steps (int):
|
||||
Number of steps used for computing normalization parameters at the beginning of the training. GlowTTS uses
|
||||
Activation Normalization that pre-computes normalization stats at the beginning and use the same values
|
||||
|
@ -47,9 +26,9 @@ class SpeedySpeechConfig(BaseTTSConfig):
|
|||
use_speaker_embedding (bool):
|
||||
enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
|
||||
in the multi-speaker mode. Defaults to False.
|
||||
use_external_speaker_embedding_file (bool):
|
||||
use_d_vector_file (bool):
|
||||
enable / disable using external speaker embeddings in place of the learned embeddings. Defaults to False.
|
||||
external_speaker_embedding_file (str):
|
||||
d_vector_file (str):
|
||||
Path to the file including pre-computed speaker embeddings. Defaults to None.
|
||||
noam_schedule (bool):
|
||||
enable / disable the use of Noam LR scheduler. Defaults to False.
|
||||
|
@ -73,31 +52,12 @@ class SpeedySpeechConfig(BaseTTSConfig):
|
|||
|
||||
model: str = "speedy_speech"
|
||||
# model specific params
|
||||
positional_encoding: bool = True
|
||||
hidden_channels: int = 128
|
||||
encoder_type: str = "residual_conv_bn"
|
||||
encoder_params: dict = field(
|
||||
default_factory=lambda: {
|
||||
"kernel_size": 4,
|
||||
"dilations": [1, 2, 4, 1, 2, 4, 1, 2, 4, 1, 2, 4, 1],
|
||||
"num_conv_blocks": 2,
|
||||
"num_res_blocks": 13,
|
||||
}
|
||||
)
|
||||
decoder_type: str = "residual_conv_bn"
|
||||
decoder_params: dict = field(
|
||||
default_factory=lambda: {
|
||||
"kernel_size": 4,
|
||||
"dilations": [1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1],
|
||||
"num_conv_blocks": 2,
|
||||
"num_res_blocks": 17,
|
||||
}
|
||||
)
|
||||
model_args: SpeedySpeechArgs = field(default_factory=SpeedySpeechArgs)
|
||||
|
||||
# multi-speaker settings
|
||||
use_speaker_embedding: bool = False
|
||||
use_external_speaker_embedding_file: bool = False
|
||||
external_speaker_embedding_file: str = False
|
||||
use_d_vector_file: bool = False
|
||||
d_vector_file: str = False
|
||||
|
||||
# optimizer parameters
|
||||
optimizer: str = "RAdam"
|
||||
|
|
|
@ -12,107 +12,10 @@ class Tacotron2Config(TacotronConfig):
|
|||
>>> from TTS.tts.configs import Tacotron2Config
|
||||
>>> config = Tacotron2Config()
|
||||
|
||||
Args:
|
||||
model (str):
|
||||
Model name used to select the right model class to initilize. Defaults to `Tacotron2`.
|
||||
use_gst (bool):
|
||||
enable / disable the use of Global Style Token modules. Defaults to False.
|
||||
gst (GSTConfig):
|
||||
Instance of `GSTConfig` class.
|
||||
gst_style_input (str):
|
||||
Path to the wav file used at inference to set the speech style through GST. If `GST` is enabled and
|
||||
this is not defined, the model uses a zero vector as an input. Defaults to None.
|
||||
r (int):
|
||||
Number of output frames that the decoder computed per iteration. Larger values makes training and inference
|
||||
faster but reduces the quality of the output frames. This needs to be tuned considering your own needs.
|
||||
Defaults to 1.
|
||||
gradual_trainin (List[List]):
|
||||
Parameters for the gradual training schedule. It is in the form `[[a, b, c], [d ,e ,f] ..]` where `a` is
|
||||
the step number to start using the rest of the values, `b` is the `r` value and `c` is the batch size.
|
||||
If sets None, no gradual training is used. Defaults to None.
|
||||
memory_size (int):
|
||||
Defines the number of previous frames used by the Prenet. If set to < 0, then it uses only the last frame.
|
||||
Defaults to -1.
|
||||
prenet_type (str):
|
||||
`original` or `bn`. `original` sets the default Prenet and `bn` uses Batch Normalization version of the
|
||||
Prenet. Defaults to `original`.
|
||||
prenet_dropout (bool):
|
||||
enables / disables the use of dropout in the Prenet. Defaults to True.
|
||||
prenet_dropout_at_inference (bool):
|
||||
enable / disable the use of dropout in the Prenet at the inference time. Defaults to False.
|
||||
stopnet (bool):
|
||||
enable /disable the Stopnet that predicts the end of the decoder sequence. Defaults to True.
|
||||
stopnet_pos_weight (float):
|
||||
Weight that is applied to over-weight positive instances in the Stopnet loss. Use larger values with
|
||||
datasets with longer sentences. Defaults to 10.
|
||||
separate_stopnet (bool):
|
||||
Use a distinct Stopnet which is trained separately from the rest of the model. Defaults to True.
|
||||
attention_type (str):
|
||||
attention type. Check ```TTS.tts.layers.attentions.init_attn```. Defaults to 'original'.
|
||||
attention_heads (int):
|
||||
Number of attention heads for GMM attention. Defaults to 5.
|
||||
windowing (bool):
|
||||
It especially useful at inference to keep attention alignment diagonal. Defaults to False.
|
||||
use_forward_attn (bool):
|
||||
It is only valid if ```attn_type``` is ```original```. Defaults to False.
|
||||
forward_attn_mask (bool):
|
||||
enable/disable extra masking over forward attention. It is useful at inference to prevent
|
||||
possible attention failures. Defaults to False.
|
||||
transition_agent (bool):
|
||||
enable/disable transition agent in forward attention. Defaults to False.
|
||||
location_attn (bool):
|
||||
enable/disable location sensitive attention as in the original Tacotron2 paper.
|
||||
It is only valid if ```attn_type``` is ```original```. Defaults to True.
|
||||
bidirectional_decoder (bool):
|
||||
enable/disable bidirectional decoding. Defaults to False.
|
||||
double_decoder_consistency (bool):
|
||||
enable/disable double decoder consistency. Defaults to False.
|
||||
ddc_r (int):
|
||||
reduction rate used by the coarse decoder when `double_decoder_consistency` is in use. Set this
|
||||
as a multiple of the `r` value. Defaults to 6.
|
||||
use_speaker_embedding (bool):
|
||||
enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
|
||||
in the multi-speaker mode. Defaults to False.
|
||||
use_external_speaker_embedding_file (bool):
|
||||
enable /disable using external speaker embeddings in place of the learned embeddings. Defaults to False.
|
||||
external_speaker_embedding_file (str):
|
||||
Path to the file including pre-computed speaker embeddings. Defaults to None.
|
||||
noam_schedule (bool):
|
||||
enable / disable the use of Noam LR scheduler. Defaults to False.
|
||||
warmup_steps (int):
|
||||
Number of warm-up steps for the Noam scheduler. Defaults 4000.
|
||||
lr (float):
|
||||
Initial learning rate. Defaults to `1e-4`.
|
||||
wd (float):
|
||||
Weight decay coefficient. Defaults to `1e-6`.
|
||||
grad_clip (float):
|
||||
Gradient clipping threshold. Defaults to `5`.
|
||||
seq_len_notm (bool):
|
||||
enable / disable the sequnce length normalization in the loss functions. If set True, loss of a sample
|
||||
is divided by the sequence length. Defaults to False.
|
||||
loss_masking (bool):
|
||||
enable / disable masking the paddings of the samples in loss computation. Defaults to True.
|
||||
decoder_loss_alpha (float):
|
||||
Weight for the decoder loss of the Tacotron model. If set less than or equal to zero, it disables the
|
||||
corresponding loss function. Defaults to 0.25
|
||||
postnet_loss_alpha (float):
|
||||
Weight for the postnet loss of the Tacotron model. If set less than or equal to zero, it disables the
|
||||
corresponding loss function. Defaults to 0.25
|
||||
postnet_diff_spec_alpha (float):
|
||||
Weight for the postnet differential loss of the Tacotron model. If set less than or equal to zero, it disables the
|
||||
corresponding loss function. Defaults to 0.25
|
||||
decoder_diff_spec_alpha (float):
|
||||
Weight for the decoder differential loss of the Tacotron model. If set less than or equal to zero, it disables the
|
||||
corresponding loss function. Defaults to 0.25
|
||||
decoder_ssim_alpha (float):
|
||||
Weight for the decoder SSIM loss of the Tacotron model. If set less than or equal to zero, it disables the
|
||||
corresponding loss function. Defaults to 0.25
|
||||
postnet_ssim_alpha (float):
|
||||
Weight for the postnet SSIM loss of the Tacotron model. If set less than or equal to zero, it disables the
|
||||
corresponding loss function. Defaults to 0.25
|
||||
ga_alpha (float):
|
||||
Weight for the guided attention loss. If set less than or equal to zero, it disables the corresponding loss
|
||||
function. Defaults to 5.
|
||||
Check `TacotronConfig` for argument descriptions.
|
||||
"""
|
||||
|
||||
model: str = "tacotron2"
|
||||
out_channels: int = 80
|
||||
encoder_in_features: int = 512
|
||||
decoder_in_features: int = 512
|
||||
|
|
|
@ -23,6 +23,10 @@ class TacotronConfig(BaseTTSConfig):
|
|||
gst_style_input (str):
|
||||
Path to the wav file used at inference to set the speech style through GST. If `GST` is enabled and
|
||||
this is not defined, the model uses a zero vector as an input. Defaults to None.
|
||||
num_chars (int):
|
||||
Number of characters used by the model. It must be defined before initializing the model. Defaults to 0.
|
||||
num_speakers (int):
|
||||
Number of speakers for multi-speaker models. Defaults to 1.
|
||||
r (int):
|
||||
Initial number of output frames that the decoder computes per iteration. Larger values make training and inference
|
||||
faster but reduces the quality of the output frames. This must be equal to the largest `r` value used in
|
||||
|
@ -47,7 +51,13 @@ class TacotronConfig(BaseTTSConfig):
|
|||
Weight that is applied to over-weight positive instances in the Stopnet loss. Use larger values with
|
||||
datasets with longer sentences. Defaults to 10.
|
||||
max_decoder_steps (int):
|
||||
Max number of steps allowed for the decoder. Defaults to 10000.
|
||||
Max number of steps allowed for the decoder. Defaults to 500.
|
||||
encoder_in_features (int):
|
||||
Channels of encoder input and character embedding tensors. Defaults to 256.
|
||||
decoder_in_features (int):
|
||||
Channels of decoder input and encoder output tensors. Defaults to 256.
|
||||
out_channels (int):
|
||||
Channels of the final model output. It must match the spectrogram size. Defaults to 513.
|
||||
separate_stopnet (bool):
|
||||
Use a distinct Stopnet which is trained separately from the rest of the model. Defaults to True.
|
||||
attention_type (str):
|
||||
|
@ -76,9 +86,9 @@ class TacotronConfig(BaseTTSConfig):
|
|||
use_speaker_embedding (bool):
|
||||
enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
|
||||
in the multi-speaker mode. Defaults to False.
|
||||
use_external_speaker_embedding_file (bool):
|
||||
use_d_vector_file (bool):
|
||||
enable / disable using external speaker embeddings in place of the learned embeddings. Defaults to False.
|
||||
external_speaker_embedding_file (str):
|
||||
d_vector_file (str):
|
||||
Path to the file including pre-computed speaker embeddings. Defaults to None.
|
||||
optimizer (str):
|
||||
Optimizer used for the training. Set one from `torch.optim.Optimizer` or `TTS.utils.training`.
|
||||
|
@ -111,6 +121,7 @@ class TacotronConfig(BaseTTSConfig):
|
|||
Weight for the postnet differential loss of the Tacotron model. If set less than or equal to zero, it disables the
|
||||
corresponding loss function. Defaults to 0.25
|
||||
decoder_diff_spec_alpha (float):
|
||||
|
||||
Weight for the decoder differential loss of the Tacotron model. If set less than or equal to zero, it disables the
|
||||
corresponding loss function. Defaults to 0.25
|
||||
decoder_ssim_alpha (float):
|
||||
|
@ -125,11 +136,14 @@ class TacotronConfig(BaseTTSConfig):
|
|||
"""
|
||||
|
||||
model: str = "tacotron"
|
||||
# model_params: TacotronArgs = field(default_factory=lambda: TacotronArgs())
|
||||
use_gst: bool = False
|
||||
gst: GSTConfig = None
|
||||
gst_style_input: str = None
|
||||
|
||||
# model specific params
|
||||
num_speakers: int = 1
|
||||
num_chars: int = 0
|
||||
r: int = 2
|
||||
gradual_training: List[List[int]] = None
|
||||
memory_size: int = -1
|
||||
|
@ -139,12 +153,17 @@ class TacotronConfig(BaseTTSConfig):
|
|||
stopnet: bool = True
|
||||
separate_stopnet: bool = True
|
||||
stopnet_pos_weight: float = 10.0
|
||||
max_decoder_steps: int = 10000
|
||||
max_decoder_steps: int = 500
|
||||
encoder_in_features: int = 256
|
||||
decoder_in_features: int = 256
|
||||
decoder_output_dim: int = 80
|
||||
out_channels: int = 513
|
||||
|
||||
# attention layers
|
||||
attention_type: str = "original"
|
||||
attention_heads: int = None
|
||||
attention_norm: str = "sigmoid"
|
||||
attention_win: bool = False
|
||||
windowing: bool = False
|
||||
use_forward_attn: bool = False
|
||||
forward_attn_mask: bool = False
|
||||
|
@ -158,8 +177,10 @@ class TacotronConfig(BaseTTSConfig):
|
|||
|
||||
# multi-speaker settings
|
||||
use_speaker_embedding: bool = False
|
||||
use_external_speaker_embedding_file: bool = False
|
||||
external_speaker_embedding_file: str = False
|
||||
speaker_embedding_dim: int = 512
|
||||
use_d_vector_file: bool = False
|
||||
d_vector_file: str = False
|
||||
d_vector_dim: int = None
|
||||
|
||||
# optimizer parameters
|
||||
optimizer: str = "RAdam"
|
||||
|
@ -196,3 +217,9 @@ class TacotronConfig(BaseTTSConfig):
|
|||
assert (
|
||||
self.gradual_training[0][1] == self.r
|
||||
), f"[!] the first scheduled gradual training `r` must be equal to the model's `r` value. {self.gradual_training[0][1]} vs {self.r}"
|
||||
if self.model == "tacotron" and self.audio is not None:
|
||||
assert self.out_channels == (
|
||||
self.audio.fft_size // 2 + 1
|
||||
), f"{self.out_channels} vs {self.audio.fft_size // 2 + 1}"
|
||||
if self.model == "tacotron2" and self.audio is not None:
|
||||
assert self.out_channels == self.audio.num_mels
|
||||
|
|
Loading…
Reference in New Issue