mirror of https://github.com/coqui-ai/TTS.git
Update tts model configs
This commit is contained in:
parent
626c9d41e6
commit
786170fe7d
|
@ -2,6 +2,7 @@ from dataclasses import dataclass, field
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
from TTS.tts.configs.shared_configs import BaseTTSConfig
|
from TTS.tts.configs.shared_configs import BaseTTSConfig
|
||||||
|
from TTS.tts.models.align_tts import AlignTTSArgs
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
|
@ -49,9 +50,9 @@ class AlignTTSConfig(BaseTTSConfig):
|
||||||
use_speaker_embedding (bool):
|
use_speaker_embedding (bool):
|
||||||
enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
|
enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
|
||||||
in the multi-speaker mode. Defaults to False.
|
in the multi-speaker mode. Defaults to False.
|
||||||
use_external_speaker_embedding_file (bool):
|
use_d_vector_file (bool):
|
||||||
enable /disable using external speaker embeddings in place of the learned embeddings. Defaults to False.
|
enable /disable using external speaker embeddings in place of the learned embeddings. Defaults to False.
|
||||||
external_speaker_embedding_file (str):
|
d_vector_file (str):
|
||||||
Path to the file including pre-computed speaker embeddings. Defaults to None.
|
Path to the file including pre-computed speaker embeddings. Defaults to None.
|
||||||
noam_schedule (bool):
|
noam_schedule (bool):
|
||||||
enable / disable the use of Noam LR scheduler. Defaults to False.
|
enable / disable the use of Noam LR scheduler. Defaults to False.
|
||||||
|
@ -68,17 +69,7 @@ class AlignTTSConfig(BaseTTSConfig):
|
||||||
|
|
||||||
model: str = "align_tts"
|
model: str = "align_tts"
|
||||||
# model specific params
|
# model specific params
|
||||||
positional_encoding: bool = True
|
model_args: AlignTTSArgs = field(default_factory=AlignTTSArgs)
|
||||||
hidden_channels_dp: int = 256
|
|
||||||
hidden_channels: int = 256
|
|
||||||
encoder_type: str = "fftransformer"
|
|
||||||
encoder_params: dict = field(
|
|
||||||
default_factory=lambda: {"hidden_channels_ffn": 1024, "num_heads": 2, "num_layers": 6, "dropout_p": 0.1}
|
|
||||||
)
|
|
||||||
decoder_type: str = "fftransformer"
|
|
||||||
decoder_params: dict = field(
|
|
||||||
default_factory=lambda: {"hidden_channels_ffn": 1024, "num_heads": 2, "num_layers": 6, "dropout_p": 0.1}
|
|
||||||
)
|
|
||||||
phase_start_steps: List[int] = None
|
phase_start_steps: List[int] = None
|
||||||
|
|
||||||
ssim_alpha: float = 1.0
|
ssim_alpha: float = 1.0
|
||||||
|
@ -88,8 +79,8 @@ class AlignTTSConfig(BaseTTSConfig):
|
||||||
|
|
||||||
# multi-speaker settings
|
# multi-speaker settings
|
||||||
use_speaker_embedding: bool = False
|
use_speaker_embedding: bool = False
|
||||||
use_external_speaker_embedding_file: bool = False
|
use_d_vector_file: bool = False
|
||||||
external_speaker_embedding_file: str = False
|
d_vector_file: str = False
|
||||||
|
|
||||||
# optimizer parameters
|
# optimizer parameters
|
||||||
optimizer: str = "Adam"
|
optimizer: str = "Adam"
|
||||||
|
|
|
@ -23,13 +23,49 @@ class GlowTTSConfig(BaseTTSConfig):
|
||||||
Defaults to `{"kernel_size": 3, "dropout_p": 0.1, "num_layers": 6, "num_heads": 2, "hidden_channels_ffn": 768}`
|
Defaults to `{"kernel_size": 3, "dropout_p": 0.1, "num_layers": 6, "num_heads": 2, "hidden_channels_ffn": 768}`
|
||||||
use_encoder_prenet (bool):
|
use_encoder_prenet (bool):
|
||||||
enable / disable the use of a prenet for the encoder. Defaults to True.
|
enable / disable the use of a prenet for the encoder. Defaults to True.
|
||||||
hidden_channels_encoder (int):
|
hidden_channels_enc (int):
|
||||||
Number of base hidden channels used by the encoder network. It defines the input and the output channel sizes,
|
Number of base hidden channels used by the encoder network. It defines the input and the output channel sizes,
|
||||||
and for some encoder types internal hidden channels sizes too. Defaults to 192.
|
and for some encoder types internal hidden channels sizes too. Defaults to 192.
|
||||||
hidden_channels_decoder (int):
|
hidden_channels_dec (int):
|
||||||
Number of base hidden channels used by the decoder WaveNet network. Defaults to 192 as in the original work.
|
Number of base hidden channels used by the decoder WaveNet network. Defaults to 192 as in the original work.
|
||||||
hidden_channels_duration_predictor (int):
|
hidden_channels_dp (int):
|
||||||
Number of layer channels of the duration predictor network. Defaults to 256 as in the original work.
|
Number of layer channels of the duration predictor network. Defaults to 256 as in the original work.
|
||||||
|
mean_only (bool):
|
||||||
|
If true predict only the mean values by the decoder flow. Defaults to True.
|
||||||
|
out_channels (int):
|
||||||
|
Number of channels of the model output tensor. Defaults to 80.
|
||||||
|
num_flow_blocks_dec (int):
|
||||||
|
Number of decoder blocks. Defaults to 12.
|
||||||
|
inference_noise_scale (float):
|
||||||
|
Noise scale used at inference. Defaults to 0.33.
|
||||||
|
kernel_size_dec (int):
|
||||||
|
Decoder kernel size. Defaults to 5
|
||||||
|
dilation_rate (int):
|
||||||
|
Rate to increase dilation by each layer in a decoder block. Defaults to 5.
|
||||||
|
num_block_layers (int):
|
||||||
|
Number of decoder layers in each decoder block. Defaults to 4.
|
||||||
|
dropout_p_dec (float):
|
||||||
|
Dropout rate for decoder. Defaults to 0.05.
|
||||||
|
num_speakers (int):
|
||||||
|
Number of speakers used to define the size of the speaker embedding layer. Defaults to 0.
|
||||||
|
c_in_channels (int):
|
||||||
|
Number of speaker embedding channels. It is set to 512 if embeddings are learned. Defaults to 0.
|
||||||
|
num_splits (int):
|
||||||
|
Number of split levels in the invertible conv1x1 operation. Defaults to 4.
|
||||||
|
num_squeeze (int):
|
||||||
|
Number of squeeze levels. When squeezing, the number of channels increases and the number of time steps decreases by the factor
|
||||||
|
'num_squeeze'. Defaults to 1.
|
||||||
|
sigmoid_scale (bool):
|
||||||
|
enable/disable sigmoid scaling in decoder. Defaults to False.
|
||||||
|
mean_only (bool):
|
||||||
|
If True, encoder only computes mean value and uses constant variance for each time step. Defaults to true.
|
||||||
|
encoder_type (str):
|
||||||
|
Encoder module type. Possible values are `["rel_pos_transformer", "gated_conv", "residual_conv_bn", "time_depth_separable"]`.
|
||||||
|
Check `TTS.tts.layers.glow_tts.encoder` for more details. Defaults to `rel_pos_transformer` as in the original paper.
|
||||||
|
encoder_params (dict):
|
||||||
|
Encoder module parameters. Defaults to None.
|
||||||
|
d_vector_dim (int):
|
||||||
|
Channels of external speaker embedding vectors. Defaults to 0.
|
||||||
data_dep_init_steps (int):
|
data_dep_init_steps (int):
|
||||||
Number of steps used for computing normalization parameters at the beginning of the training. GlowTTS uses
|
Number of steps used for computing normalization parameters at the beginning of the training. GlowTTS uses
|
||||||
Activation Normalization that pre-computes normalization stats at the beginning and use the same values
|
Activation Normalization that pre-computes normalization stats at the beginning and use the same values
|
||||||
|
@ -41,9 +77,9 @@ class GlowTTSConfig(BaseTTSConfig):
|
||||||
use_speaker_embedding (bool):
|
use_speaker_embedding (bool):
|
||||||
enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
|
enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
|
||||||
in the multi-speaker mode. Defaults to False.
|
in the multi-speaker mode. Defaults to False.
|
||||||
use_external_speaker_embedding_file (bool):
|
use_d_vector_file (bool):
|
||||||
enable /disable using external speaker embeddings in place of the learned embeddings. Defaults to False.
|
enable /disable using external speaker embeddings in place of the learned embeddings. Defaults to False.
|
||||||
external_speaker_embedding_file (str):
|
d_vector_file (str):
|
||||||
Path to the file including pre-computed speaker embeddings. Defaults to None.
|
Path to the file including pre-computed speaker embeddings. Defaults to None.
|
||||||
noam_schedule (bool):
|
noam_schedule (bool):
|
||||||
enable / disable the use of Noam LR scheduler. Defaults to False.
|
enable / disable the use of Noam LR scheduler. Defaults to False.
|
||||||
|
@ -62,6 +98,7 @@ class GlowTTSConfig(BaseTTSConfig):
|
||||||
model: str = "glow_tts"
|
model: str = "glow_tts"
|
||||||
|
|
||||||
# model params
|
# model params
|
||||||
|
num_chars: int = None
|
||||||
encoder_type: str = "rel_pos_transformer"
|
encoder_type: str = "rel_pos_transformer"
|
||||||
encoder_params: dict = field(
|
encoder_params: dict = field(
|
||||||
default_factory=lambda: {
|
default_factory=lambda: {
|
||||||
|
@ -73,9 +110,36 @@ class GlowTTSConfig(BaseTTSConfig):
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
use_encoder_prenet: bool = True
|
use_encoder_prenet: bool = True
|
||||||
hidden_channels_encoder: int = 192
|
hidden_channels_enc: int = 192
|
||||||
hidden_channels_decoder: int = 192
|
hidden_channels_dec: int = 192
|
||||||
hidden_channels_duration_predictor: int = 256
|
hidden_channels_dp: int = 256
|
||||||
|
dropout_p_dp: float = 0.1
|
||||||
|
dropout_p_dec: float = 0.05
|
||||||
|
mean_only: bool = True
|
||||||
|
out_channels: int = 80
|
||||||
|
num_flow_blocks_dec: int = 12
|
||||||
|
inference_noise_scale: float = 0.33
|
||||||
|
kernel_size_dec: int = 5
|
||||||
|
dilation_rate: int = 5
|
||||||
|
num_block_layers: int = 4
|
||||||
|
num_speakers: int = 0
|
||||||
|
c_in_channels: int = 0
|
||||||
|
num_splits: int = 4
|
||||||
|
num_squeeze: int = 1
|
||||||
|
sigmoid_scale: bool = False
|
||||||
|
mean_only: bool = False
|
||||||
|
encoder_type: str = "rel_pos_transformer"
|
||||||
|
encoder_params: dict = field(
|
||||||
|
default_factory=lambda: {
|
||||||
|
"kernel_size": 3,
|
||||||
|
"dropout_p": 0.1,
|
||||||
|
"num_layers": 6,
|
||||||
|
"num_heads": 2,
|
||||||
|
"hidden_channels_ffn": 768,
|
||||||
|
"input_length": None,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
d_vector_dim: int = 0
|
||||||
|
|
||||||
# training params
|
# training params
|
||||||
data_dep_init_steps: int = 10
|
data_dep_init_steps: int = 10
|
||||||
|
@ -86,8 +150,8 @@ class GlowTTSConfig(BaseTTSConfig):
|
||||||
|
|
||||||
# multi-speaker settings
|
# multi-speaker settings
|
||||||
use_speaker_embedding: bool = False
|
use_speaker_embedding: bool = False
|
||||||
use_external_speaker_embedding_file: bool = False
|
use_d_vector_file: bool = False
|
||||||
external_speaker_embedding_file: str = False
|
d_vector_file: str = False
|
||||||
|
|
||||||
# optimizer parameters
|
# optimizer parameters
|
||||||
optimizer: str = "RAdam"
|
optimizer: str = "RAdam"
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
from dataclasses import asdict, dataclass, field
|
from dataclasses import asdict, dataclass, field
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
from coqpit import MISSING, Coqpit, check_argument
|
from coqpit import Coqpit, check_argument
|
||||||
|
|
||||||
from TTS.config import BaseAudioConfig, BaseDatasetConfig, BaseTrainingConfig
|
from TTS.config import BaseAudioConfig, BaseDatasetConfig, BaseTrainingConfig
|
||||||
|
|
||||||
|
@ -153,7 +153,7 @@ class BaseTTSConfig(BaseTrainingConfig):
|
||||||
use_espeak_phonemes: bool = True
|
use_espeak_phonemes: bool = True
|
||||||
phoneme_language: str = None
|
phoneme_language: str = None
|
||||||
compute_input_seq_cache: bool = False
|
compute_input_seq_cache: bool = False
|
||||||
text_cleaner: str = MISSING
|
text_cleaner: str = None
|
||||||
enable_eos_bos_chars: bool = False
|
enable_eos_bos_chars: bool = False
|
||||||
test_sentences_file: str = ""
|
test_sentences_file: str = ""
|
||||||
phoneme_cache_path: str = None
|
phoneme_cache_path: str = None
|
||||||
|
@ -171,10 +171,14 @@ class BaseTTSConfig(BaseTrainingConfig):
|
||||||
# dataset
|
# dataset
|
||||||
datasets: List[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()])
|
datasets: List[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()])
|
||||||
# optimizer
|
# optimizer
|
||||||
optimizer: str = MISSING
|
optimizer: str = None
|
||||||
optimizer_params: dict = MISSING
|
optimizer_params: dict = None
|
||||||
# scheduler
|
# scheduler
|
||||||
lr_scheduler: str = ""
|
lr_scheduler: str = ""
|
||||||
lr_scheduler_params: dict = field(default_factory=lambda: {})
|
lr_scheduler_params: dict = field(default_factory=lambda: {})
|
||||||
# testing
|
# testing
|
||||||
test_sentences: List[str] = field(default_factory=lambda: [])
|
test_sentences: List[str] = field(default_factory=lambda: [])
|
||||||
|
# multi-speaker
|
||||||
|
use_speaker_embedding: bool = False
|
||||||
|
use_d_vector_file: bool = False
|
||||||
|
d_vector_dim: int = 0
|
||||||
|
|
|
@ -2,6 +2,7 @@ from dataclasses import dataclass, field
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
from TTS.tts.configs.shared_configs import BaseTTSConfig
|
from TTS.tts.configs.shared_configs import BaseTTSConfig
|
||||||
|
from TTS.tts.models.speedy_speech import SpeedySpeechArgs
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
|
@ -16,30 +17,8 @@ class SpeedySpeechConfig(BaseTTSConfig):
|
||||||
Args:
|
Args:
|
||||||
model (str):
|
model (str):
|
||||||
Model name used for selecting the right model at initialization. Defaults to `speedy_speech`.
|
Model name used for selecting the right model at initialization. Defaults to `speedy_speech`.
|
||||||
positional_encoding (bool):
|
model_args (Coqpit):
|
||||||
enable / disable positional encoding applied to the encoder output. Defaults to True.
|
Model class arguments. Check `SpeedySpeechArgs` for more details. Defaults to `SpeedySpeechArgs()`.
|
||||||
hidden_channels (int):
|
|
||||||
Base number of hidden channels. Defines all the layers expect ones defined by the specific encoder or decoder
|
|
||||||
parameters. Defaults to 128.
|
|
||||||
encoder_type (str):
|
|
||||||
Type of the encoder used by the model. Look at `TTS.tts.layers.feed_forward.encoder` for more details.
|
|
||||||
Defaults to `residual_conv_bn`.
|
|
||||||
encoder_params (dict):
|
|
||||||
Parameters used to define the encoder network. Look at `TTS.tts.layers.feed_forward.encoder` for more details.
|
|
||||||
Defaults to `{"kernel_size": 4, "dilations": [1, 2, 4, 1, 2, 4, 1, 2, 4, 1, 2, 4, 1], "num_conv_blocks": 2, "num_res_blocks": 13}`
|
|
||||||
decoder_type (str):
|
|
||||||
Type of the decoder used by the model. Look at `TTS.tts.layers.feed_forward.decoder` for more details.
|
|
||||||
Defaults to `residual_conv_bn`.
|
|
||||||
decoder_params (dict):
|
|
||||||
Parameters used to define the decoder network. Look at `TTS.tts.layers.feed_forward.decoder` for more details.
|
|
||||||
Defaults to `{"kernel_size": 4, "dilations": [1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1], "num_conv_blocks": 2, "num_res_blocks": 17}`
|
|
||||||
hidden_channels_encoder (int):
|
|
||||||
Number of base hidden channels used by the encoder network. It defines the input and the output channel sizes,
|
|
||||||
and for some encoder types internal hidden channels sizes too. Defaults to 192.
|
|
||||||
hidden_channels_decoder (int):
|
|
||||||
Number of base hidden channels used by the decoder WaveNet network. Defaults to 192 as in the original work.
|
|
||||||
hidden_channels_duration_predictor (int):
|
|
||||||
Number of layer channels of the duration predictor network. Defaults to 256 as in the original work.
|
|
||||||
data_dep_init_steps (int):
|
data_dep_init_steps (int):
|
||||||
Number of steps used for computing normalization parameters at the beginning of the training. GlowTTS uses
|
Number of steps used for computing normalization parameters at the beginning of the training. GlowTTS uses
|
||||||
Activation Normalization that pre-computes normalization stats at the beginning and use the same values
|
Activation Normalization that pre-computes normalization stats at the beginning and use the same values
|
||||||
|
@ -47,9 +26,9 @@ class SpeedySpeechConfig(BaseTTSConfig):
|
||||||
use_speaker_embedding (bool):
|
use_speaker_embedding (bool):
|
||||||
enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
|
enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
|
||||||
in the multi-speaker mode. Defaults to False.
|
in the multi-speaker mode. Defaults to False.
|
||||||
use_external_speaker_embedding_file (bool):
|
use_d_vector_file (bool):
|
||||||
enable /disable using external speaker embeddings in place of the learned embeddings. Defaults to False.
|
enable /disable using external speaker embeddings in place of the learned embeddings. Defaults to False.
|
||||||
external_speaker_embedding_file (str):
|
d_vector_file (str):
|
||||||
Path to the file including pre-computed speaker embeddings. Defaults to None.
|
Path to the file including pre-computed speaker embeddings. Defaults to None.
|
||||||
noam_schedule (bool):
|
noam_schedule (bool):
|
||||||
enable / disable the use of Noam LR scheduler. Defaults to False.
|
enable / disable the use of Noam LR scheduler. Defaults to False.
|
||||||
|
@ -73,31 +52,12 @@ class SpeedySpeechConfig(BaseTTSConfig):
|
||||||
|
|
||||||
model: str = "speedy_speech"
|
model: str = "speedy_speech"
|
||||||
# model specific params
|
# model specific params
|
||||||
positional_encoding: bool = True
|
model_args: SpeedySpeechArgs = field(default_factory=SpeedySpeechArgs)
|
||||||
hidden_channels: int = 128
|
|
||||||
encoder_type: str = "residual_conv_bn"
|
|
||||||
encoder_params: dict = field(
|
|
||||||
default_factory=lambda: {
|
|
||||||
"kernel_size": 4,
|
|
||||||
"dilations": [1, 2, 4, 1, 2, 4, 1, 2, 4, 1, 2, 4, 1],
|
|
||||||
"num_conv_blocks": 2,
|
|
||||||
"num_res_blocks": 13,
|
|
||||||
}
|
|
||||||
)
|
|
||||||
decoder_type: str = "residual_conv_bn"
|
|
||||||
decoder_params: dict = field(
|
|
||||||
default_factory=lambda: {
|
|
||||||
"kernel_size": 4,
|
|
||||||
"dilations": [1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1],
|
|
||||||
"num_conv_blocks": 2,
|
|
||||||
"num_res_blocks": 17,
|
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
# multi-speaker settings
|
# multi-speaker settings
|
||||||
use_speaker_embedding: bool = False
|
use_speaker_embedding: bool = False
|
||||||
use_external_speaker_embedding_file: bool = False
|
use_d_vector_file: bool = False
|
||||||
external_speaker_embedding_file: str = False
|
d_vector_file: str = False
|
||||||
|
|
||||||
# optimizer parameters
|
# optimizer parameters
|
||||||
optimizer: str = "RAdam"
|
optimizer: str = "RAdam"
|
||||||
|
|
|
@ -12,107 +12,10 @@ class Tacotron2Config(TacotronConfig):
|
||||||
>>> from TTS.tts.configs import Tacotron2Config
|
>>> from TTS.tts.configs import Tacotron2Config
|
||||||
>>> config = Tacotron2Config()
|
>>> config = Tacotron2Config()
|
||||||
|
|
||||||
Args:
|
Check `TacotronConfig` for argument descriptions.
|
||||||
model (str):
|
|
||||||
Model name used to select the right model class to initilize. Defaults to `Tacotron2`.
|
|
||||||
use_gst (bool):
|
|
||||||
enable / disable the use of Global Style Token modules. Defaults to False.
|
|
||||||
gst (GSTConfig):
|
|
||||||
Instance of `GSTConfig` class.
|
|
||||||
gst_style_input (str):
|
|
||||||
Path to the wav file used at inference to set the speech style through GST. If `GST` is enabled and
|
|
||||||
this is not defined, the model uses a zero vector as an input. Defaults to None.
|
|
||||||
r (int):
|
|
||||||
Number of output frames that the decoder computed per iteration. Larger values makes training and inference
|
|
||||||
faster but reduces the quality of the output frames. This needs to be tuned considering your own needs.
|
|
||||||
Defaults to 1.
|
|
||||||
gradual_trainin (List[List]):
|
|
||||||
Parameters for the gradual training schedule. It is in the form `[[a, b, c], [d ,e ,f] ..]` where `a` is
|
|
||||||
the step number to start using the rest of the values, `b` is the `r` value and `c` is the batch size.
|
|
||||||
If sets None, no gradual training is used. Defaults to None.
|
|
||||||
memory_size (int):
|
|
||||||
Defines the number of previous frames used by the Prenet. If set to < 0, then it uses only the last frame.
|
|
||||||
Defaults to -1.
|
|
||||||
prenet_type (str):
|
|
||||||
`original` or `bn`. `original` sets the default Prenet and `bn` uses Batch Normalization version of the
|
|
||||||
Prenet. Defaults to `original`.
|
|
||||||
prenet_dropout (bool):
|
|
||||||
enables / disables the use of dropout in the Prenet. Defaults to True.
|
|
||||||
prenet_dropout_at_inference (bool):
|
|
||||||
enable / disable the use of dropout in the Prenet at the inference time. Defaults to False.
|
|
||||||
stopnet (bool):
|
|
||||||
enable /disable the Stopnet that predicts the end of the decoder sequence. Defaults to True.
|
|
||||||
stopnet_pos_weight (float):
|
|
||||||
Weight that is applied to over-weight positive instances in the Stopnet loss. Use larger values with
|
|
||||||
datasets with longer sentences. Defaults to 10.
|
|
||||||
separate_stopnet (bool):
|
|
||||||
Use a distinct Stopnet which is trained separately from the rest of the model. Defaults to True.
|
|
||||||
attention_type (str):
|
|
||||||
attention type. Check ```TTS.tts.layers.attentions.init_attn```. Defaults to 'original'.
|
|
||||||
attention_heads (int):
|
|
||||||
Number of attention heads for GMM attention. Defaults to 5.
|
|
||||||
windowing (bool):
|
|
||||||
It especially useful at inference to keep attention alignment diagonal. Defaults to False.
|
|
||||||
use_forward_attn (bool):
|
|
||||||
It is only valid if ```attn_type``` is ```original```. Defaults to False.
|
|
||||||
forward_attn_mask (bool):
|
|
||||||
enable/disable extra masking over forward attention. It is useful at inference to prevent
|
|
||||||
possible attention failures. Defaults to False.
|
|
||||||
transition_agent (bool):
|
|
||||||
enable/disable transition agent in forward attention. Defaults to False.
|
|
||||||
location_attn (bool):
|
|
||||||
enable/disable location sensitive attention as in the original Tacotron2 paper.
|
|
||||||
It is only valid if ```attn_type``` is ```original```. Defaults to True.
|
|
||||||
bidirectional_decoder (bool):
|
|
||||||
enable/disable bidirectional decoding. Defaults to False.
|
|
||||||
double_decoder_consistency (bool):
|
|
||||||
enable/disable double decoder consistency. Defaults to False.
|
|
||||||
ddc_r (int):
|
|
||||||
reduction rate used by the coarse decoder when `double_decoder_consistency` is in use. Set this
|
|
||||||
as a multiple of the `r` value. Defaults to 6.
|
|
||||||
use_speaker_embedding (bool):
|
|
||||||
enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
|
|
||||||
in the multi-speaker mode. Defaults to False.
|
|
||||||
use_external_speaker_embedding_file (bool):
|
|
||||||
enable /disable using external speaker embeddings in place of the learned embeddings. Defaults to False.
|
|
||||||
external_speaker_embedding_file (str):
|
|
||||||
Path to the file including pre-computed speaker embeddings. Defaults to None.
|
|
||||||
noam_schedule (bool):
|
|
||||||
enable / disable the use of Noam LR scheduler. Defaults to False.
|
|
||||||
warmup_steps (int):
|
|
||||||
Number of warm-up steps for the Noam scheduler. Defaults 4000.
|
|
||||||
lr (float):
|
|
||||||
Initial learning rate. Defaults to `1e-4`.
|
|
||||||
wd (float):
|
|
||||||
Weight decay coefficient. Defaults to `1e-6`.
|
|
||||||
grad_clip (float):
|
|
||||||
Gradient clipping threshold. Defaults to `5`.
|
|
||||||
seq_len_notm (bool):
|
|
||||||
enable / disable the sequnce length normalization in the loss functions. If set True, loss of a sample
|
|
||||||
is divided by the sequence length. Defaults to False.
|
|
||||||
loss_masking (bool):
|
|
||||||
enable / disable masking the paddings of the samples in loss computation. Defaults to True.
|
|
||||||
decoder_loss_alpha (float):
|
|
||||||
Weight for the decoder loss of the Tacotron model. If set less than or equal to zero, it disables the
|
|
||||||
corresponding loss function. Defaults to 0.25
|
|
||||||
postnet_loss_alpha (float):
|
|
||||||
Weight for the postnet loss of the Tacotron model. If set less than or equal to zero, it disables the
|
|
||||||
corresponding loss function. Defaults to 0.25
|
|
||||||
postnet_diff_spec_alpha (float):
|
|
||||||
Weight for the postnet differential loss of the Tacotron model. If set less than or equal to zero, it disables the
|
|
||||||
corresponding loss function. Defaults to 0.25
|
|
||||||
decoder_diff_spec_alpha (float):
|
|
||||||
Weight for the decoder differential loss of the Tacotron model. If set less than or equal to zero, it disables the
|
|
||||||
corresponding loss function. Defaults to 0.25
|
|
||||||
decoder_ssim_alpha (float):
|
|
||||||
Weight for the decoder SSIM loss of the Tacotron model. If set less than or equal to zero, it disables the
|
|
||||||
corresponding loss function. Defaults to 0.25
|
|
||||||
postnet_ssim_alpha (float):
|
|
||||||
Weight for the postnet SSIM loss of the Tacotron model. If set less than or equal to zero, it disables the
|
|
||||||
corresponding loss function. Defaults to 0.25
|
|
||||||
ga_alpha (float):
|
|
||||||
Weight for the guided attention loss. If set less than or equal to zero, it disables the corresponding loss
|
|
||||||
function. Defaults to 5.
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
model: str = "tacotron2"
|
model: str = "tacotron2"
|
||||||
|
out_channels: int = 80
|
||||||
|
encoder_in_features: int = 512
|
||||||
|
decoder_in_features: int = 512
|
||||||
|
|
|
@ -23,6 +23,10 @@ class TacotronConfig(BaseTTSConfig):
|
||||||
gst_style_input (str):
|
gst_style_input (str):
|
||||||
Path to the wav file used at inference to set the speech style through GST. If `GST` is enabled and
|
Path to the wav file used at inference to set the speech style through GST. If `GST` is enabled and
|
||||||
this is not defined, the model uses a zero vector as an input. Defaults to None.
|
this is not defined, the model uses a zero vector as an input. Defaults to None.
|
||||||
|
num_chars (int):
|
||||||
|
Number of characters used by the model. It must be defined before initializing the model. Defaults to 0.
|
||||||
|
num_speakers (int):
|
||||||
|
Number of speakers for multi-speaker models. Defaults to 1.
|
||||||
r (int):
|
r (int):
|
||||||
Initial number of output frames that the decoder computed per iteration. Larger values makes training and inference
|
Initial number of output frames that the decoder computed per iteration. Larger values makes training and inference
|
||||||
faster but reduces the quality of the output frames. This must be equal to the largest `r` value used in
|
faster but reduces the quality of the output frames. This must be equal to the largest `r` value used in
|
||||||
|
@ -47,7 +51,13 @@ class TacotronConfig(BaseTTSConfig):
|
||||||
Weight that is applied to over-weight positive instances in the Stopnet loss. Use larger values with
|
Weight that is applied to over-weight positive instances in the Stopnet loss. Use larger values with
|
||||||
datasets with longer sentences. Defaults to 10.
|
datasets with longer sentences. Defaults to 10.
|
||||||
max_decoder_steps (int):
|
max_decoder_steps (int):
|
||||||
Max number of steps allowed for the decoder. Defaults to 10000.
|
Max number of steps allowed for the decoder. Defaults to 500.
|
||||||
|
encoder_in_features (int):
|
||||||
|
Channels of encoder input and character embedding tensors. Defaults to 256.
|
||||||
|
decoder_in_features (int):
|
||||||
|
Channels of decoder input and encoder output tensors. Defaults to 256.
|
||||||
|
out_channels (int):
|
||||||
|
Channels of the final model output. It must match the spectrogram size. Defaults to 513.
|
||||||
separate_stopnet (bool):
|
separate_stopnet (bool):
|
||||||
Use a distinct Stopnet which is trained separately from the rest of the model. Defaults to True.
|
Use a distinct Stopnet which is trained separately from the rest of the model. Defaults to True.
|
||||||
attention_type (str):
|
attention_type (str):
|
||||||
|
@ -76,9 +86,9 @@ class TacotronConfig(BaseTTSConfig):
|
||||||
use_speaker_embedding (bool):
|
use_speaker_embedding (bool):
|
||||||
enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
|
enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
|
||||||
in the multi-speaker mode. Defaults to False.
|
in the multi-speaker mode. Defaults to False.
|
||||||
use_external_speaker_embedding_file (bool):
|
use_d_vector_file (bool):
|
||||||
enable /disable using external speaker embeddings in place of the learned embeddings. Defaults to False.
|
enable /disable using external speaker embeddings in place of the learned embeddings. Defaults to False.
|
||||||
external_speaker_embedding_file (str):
|
d_vector_file (str):
|
||||||
Path to the file including pre-computed speaker embeddings. Defaults to None.
|
Path to the file including pre-computed speaker embeddings. Defaults to None.
|
||||||
optimizer (str):
|
optimizer (str):
|
||||||
Optimizer used for the training. Set one from `torch.optim.Optimizer` or `TTS.utils.training`.
|
Optimizer used for the training. Set one from `torch.optim.Optimizer` or `TTS.utils.training`.
|
||||||
|
@ -111,6 +121,7 @@ class TacotronConfig(BaseTTSConfig):
|
||||||
Weight for the postnet differential loss of the Tacotron model. If set less than or equal to zero, it disables the
|
Weight for the postnet differential loss of the Tacotron model. If set less than or equal to zero, it disables the
|
||||||
corresponding loss function. Defaults to 0.25
|
corresponding loss function. Defaults to 0.25
|
||||||
decoder_diff_spec_alpha (float):
|
decoder_diff_spec_alpha (float):
|
||||||
|
|
||||||
Weight for the decoder differential loss of the Tacotron model. If set less than or equal to zero, it disables the
|
Weight for the decoder differential loss of the Tacotron model. If set less than or equal to zero, it disables the
|
||||||
corresponding loss function. Defaults to 0.25
|
corresponding loss function. Defaults to 0.25
|
||||||
decoder_ssim_alpha (float):
|
decoder_ssim_alpha (float):
|
||||||
|
@ -125,11 +136,14 @@ class TacotronConfig(BaseTTSConfig):
|
||||||
"""
|
"""
|
||||||
|
|
||||||
model: str = "tacotron"
|
model: str = "tacotron"
|
||||||
|
# model_params: TacotronArgs = field(default_factory=lambda: TacotronArgs())
|
||||||
use_gst: bool = False
|
use_gst: bool = False
|
||||||
gst: GSTConfig = None
|
gst: GSTConfig = None
|
||||||
gst_style_input: str = None
|
gst_style_input: str = None
|
||||||
|
|
||||||
# model specific params
|
# model specific params
|
||||||
|
num_speakers: int = 1
|
||||||
|
num_chars: int = 0
|
||||||
r: int = 2
|
r: int = 2
|
||||||
gradual_training: List[List[int]] = None
|
gradual_training: List[List[int]] = None
|
||||||
memory_size: int = -1
|
memory_size: int = -1
|
||||||
|
@ -139,12 +153,17 @@ class TacotronConfig(BaseTTSConfig):
|
||||||
stopnet: bool = True
|
stopnet: bool = True
|
||||||
separate_stopnet: bool = True
|
separate_stopnet: bool = True
|
||||||
stopnet_pos_weight: float = 10.0
|
stopnet_pos_weight: float = 10.0
|
||||||
max_decoder_steps: int = 10000
|
max_decoder_steps: int = 500
|
||||||
|
encoder_in_features: int = 256
|
||||||
|
decoder_in_features: int = 256
|
||||||
|
decoder_output_dim: int = 80
|
||||||
|
out_channels: int = 513
|
||||||
|
|
||||||
# attention layers
|
# attention layers
|
||||||
attention_type: str = "original"
|
attention_type: str = "original"
|
||||||
attention_heads: int = None
|
attention_heads: int = None
|
||||||
attention_norm: str = "sigmoid"
|
attention_norm: str = "sigmoid"
|
||||||
|
attention_win: bool = False
|
||||||
windowing: bool = False
|
windowing: bool = False
|
||||||
use_forward_attn: bool = False
|
use_forward_attn: bool = False
|
||||||
forward_attn_mask: bool = False
|
forward_attn_mask: bool = False
|
||||||
|
@ -158,8 +177,10 @@ class TacotronConfig(BaseTTSConfig):
|
||||||
|
|
||||||
# multi-speaker settings
|
# multi-speaker settings
|
||||||
use_speaker_embedding: bool = False
|
use_speaker_embedding: bool = False
|
||||||
use_external_speaker_embedding_file: bool = False
|
speaker_embedding_dim: int = 512
|
||||||
external_speaker_embedding_file: str = False
|
use_d_vector_file: bool = False
|
||||||
|
d_vector_file: str = False
|
||||||
|
d_vector_dim: int = None
|
||||||
|
|
||||||
# optimizer parameters
|
# optimizer parameters
|
||||||
optimizer: str = "RAdam"
|
optimizer: str = "RAdam"
|
||||||
|
@ -196,3 +217,9 @@ class TacotronConfig(BaseTTSConfig):
|
||||||
assert (
|
assert (
|
||||||
self.gradual_training[0][1] == self.r
|
self.gradual_training[0][1] == self.r
|
||||||
), f"[!] the first scheduled gradual training `r` must be equal to the model's `r` value. {self.gradual_training[0][1]} vs {self.r}"
|
), f"[!] the first scheduled gradual training `r` must be equal to the model's `r` value. {self.gradual_training[0][1]} vs {self.r}"
|
||||||
|
if self.model == "tacotron" and self.audio is not None:
|
||||||
|
assert self.out_channels == (
|
||||||
|
self.audio.fft_size // 2 + 1
|
||||||
|
), f"{self.out_channels} vs {self.audio.fft_size // 2 + 1}"
|
||||||
|
if self.model == "tacotron2" and self.audio is not None:
|
||||||
|
assert self.out_channels == self.audio.num_mels
|
||||||
|
|
Loading…
Reference in New Issue