Update tts model configs

This commit is contained in:
Eren Gölge 2021-06-18 13:28:58 +02:00
parent 626c9d41e6
commit 786170fe7d
6 changed files with 133 additions and 184 deletions

View File

@ -2,6 +2,7 @@ from dataclasses import dataclass, field
from typing import List
from TTS.tts.configs.shared_configs import BaseTTSConfig
from TTS.tts.models.align_tts import AlignTTSArgs
@dataclass
@ -49,9 +50,9 @@ class AlignTTSConfig(BaseTTSConfig):
use_speaker_embedding (bool):
enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
in the multi-speaker mode. Defaults to False.
use_external_speaker_embedding_file (bool):
use_d_vector_file (bool):
enable /disable using external speaker embeddings in place of the learned embeddings. Defaults to False.
external_speaker_embedding_file (str):
d_vector_file (str):
Path to the file including pre-computed speaker embeddings. Defaults to None.
noam_schedule (bool):
enable / disable the use of Noam LR scheduler. Defaults to False.
@ -68,17 +69,7 @@ class AlignTTSConfig(BaseTTSConfig):
model: str = "align_tts"
# model specific params
positional_encoding: bool = True
hidden_channels_dp: int = 256
hidden_channels: int = 256
encoder_type: str = "fftransformer"
encoder_params: dict = field(
default_factory=lambda: {"hidden_channels_ffn": 1024, "num_heads": 2, "num_layers": 6, "dropout_p": 0.1}
)
decoder_type: str = "fftransformer"
decoder_params: dict = field(
default_factory=lambda: {"hidden_channels_ffn": 1024, "num_heads": 2, "num_layers": 6, "dropout_p": 0.1}
)
model_args: AlignTTSArgs = field(default_factory=AlignTTSArgs)
phase_start_steps: List[int] = None
ssim_alpha: float = 1.0
@ -88,8 +79,8 @@ class AlignTTSConfig(BaseTTSConfig):
# multi-speaker settings
use_speaker_embedding: bool = False
use_external_speaker_embedding_file: bool = False
external_speaker_embedding_file: str = False
use_d_vector_file: bool = False
d_vector_file: str = False
# optimizer parameters
optimizer: str = "Adam"

View File

@ -23,13 +23,49 @@ class GlowTTSConfig(BaseTTSConfig):
Defaults to `{"kernel_size": 3, "dropout_p": 0.1, "num_layers": 6, "num_heads": 2, "hidden_channels_ffn": 768}`
use_encoder_prenet (bool):
enable / disable the use of a prenet for the encoder. Defaults to True.
hidden_channels_encoder (int):
hidden_channels_enc (int):
Number of base hidden channels used by the encoder network. It defines the input and the output channel sizes,
and for some encoder types internal hidden channels sizes too. Defaults to 192.
hidden_channels_decoder (int):
hidden_channels_dec (int):
Number of base hidden channels used by the decoder WaveNet network. Defaults to 192 as in the original work.
hidden_channels_duration_predictor (int):
hidden_channels_dp (int):
Number of layer channels of the duration predictor network. Defaults to 256 as in the original work.
mean_only (bool):
If true predict only the mean values by the decoder flow. Defaults to True.
out_channels (int):
Number of channels of the model output tensor. Defaults to 80.
num_flow_blocks_dec (int):
Number of decoder blocks. Defaults to 12.
inference_noise_scale (float):
Noise scale used at inference. Defaults to 0.33.
kernel_size_dec (int):
Decoder kernel size. Defaults to 5.
dilation_rate (int):
Rate to increase dilation by each layer in a decoder block. Defaults to 5.
num_block_layers (int):
Number of decoder layers in each decoder block. Defaults to 4.
dropout_p_dec (float):
Dropout rate for decoder. Defaults to 0.05.
num_speakers (int):
Number of speakers, used to define the size of the speaker embedding layer. Defaults to 0.
c_in_channels (int):
Number of speaker embedding channels. It is set to 512 if embeddings are learned. Defaults to 0.
num_splits (int):
Number of split levels in the invertible conv1x1 operation. Defaults to 4.
num_squeeze (int):
Number of squeeze levels. When squeezing, the number of channels increases and the number of time steps decreases
by the factor 'num_squeeze'. Defaults to 1.
sigmoid_scale (bool):
enable/disable sigmoid scaling in decoder. Defaults to False.
mean_only (bool):
If True, encoder only computes mean value and uses constant variance for each time step. Defaults to true.
encoder_type (str):
Encoder module type. Possible values are `["rel_pos_transformer", "gated_conv", "residual_conv_bn", "time_depth_separable"]`.
Check `TTS.tts.layers.glow_tts.encoder` for more details. Defaults to `rel_pos_transformer` as in the original paper.
encoder_params (dict):
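Encoder module parameters. Defaults to `{"kernel_size": 3, "dropout_p": 0.1, "num_layers": 6, "num_heads": 2, "hidden_channels_ffn": 768, "input_length": None}`.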
d_vector_dim (int):
Channels of external speaker embedding vectors. Defaults to 0.
data_dep_init_steps (int):
Number of steps used for computing normalization parameters at the beginning of the training. GlowTTS uses
Activation Normalization that pre-computes normalization stats at the beginning and use the same values
@ -41,9 +77,9 @@ class GlowTTSConfig(BaseTTSConfig):
use_speaker_embedding (bool):
enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
in the multi-speaker mode. Defaults to False.
use_external_speaker_embedding_file (bool):
use_d_vector_file (bool):
enable /disable using external speaker embeddings in place of the learned embeddings. Defaults to False.
external_speaker_embedding_file (str):
d_vector_file (str):
Path to the file including pre-computed speaker embeddings. Defaults to None.
noam_schedule (bool):
enable / disable the use of Noam LR scheduler. Defaults to False.
@ -62,6 +98,7 @@ class GlowTTSConfig(BaseTTSConfig):
model: str = "glow_tts"
# model params
num_chars: int = None
encoder_type: str = "rel_pos_transformer"
encoder_params: dict = field(
default_factory=lambda: {
@ -73,9 +110,36 @@ class GlowTTSConfig(BaseTTSConfig):
}
)
use_encoder_prenet: bool = True
hidden_channels_encoder: int = 192
hidden_channels_decoder: int = 192
hidden_channels_duration_predictor: int = 256
hidden_channels_enc: int = 192
hidden_channels_dec: int = 192
hidden_channels_dp: int = 256
dropout_p_dp: float = 0.1
dropout_p_dec: float = 0.05
mean_only: bool = True
out_channels: int = 80
num_flow_blocks_dec: int = 12
inference_noise_scale: float = 0.33
kernel_size_dec: int = 5
dilation_rate: int = 5
num_block_layers: int = 4
num_speakers: int = 0
c_in_channels: int = 0
num_splits: int = 4
num_squeeze: int = 1
sigmoid_scale: bool = False
mean_only: bool = False
encoder_type: str = "rel_pos_transformer"
encoder_params: dict = field(
default_factory=lambda: {
"kernel_size": 3,
"dropout_p": 0.1,
"num_layers": 6,
"num_heads": 2,
"hidden_channels_ffn": 768,
"input_length": None,
}
)
d_vector_dim: int = 0
# training params
data_dep_init_steps: int = 10
@ -86,8 +150,8 @@ class GlowTTSConfig(BaseTTSConfig):
# multi-speaker settings
use_speaker_embedding: bool = False
use_external_speaker_embedding_file: bool = False
external_speaker_embedding_file: str = False
use_d_vector_file: bool = False
d_vector_file: str = False
# optimizer parameters
optimizer: str = "RAdam"

View File

@ -1,7 +1,7 @@
from dataclasses import asdict, dataclass, field
from typing import List
from coqpit import MISSING, Coqpit, check_argument
from coqpit import Coqpit, check_argument
from TTS.config import BaseAudioConfig, BaseDatasetConfig, BaseTrainingConfig
@ -153,7 +153,7 @@ class BaseTTSConfig(BaseTrainingConfig):
use_espeak_phonemes: bool = True
phoneme_language: str = None
compute_input_seq_cache: bool = False
text_cleaner: str = MISSING
text_cleaner: str = None
enable_eos_bos_chars: bool = False
test_sentences_file: str = ""
phoneme_cache_path: str = None
@ -171,10 +171,14 @@ class BaseTTSConfig(BaseTrainingConfig):
# dataset
datasets: List[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()])
# optimizer
optimizer: str = MISSING
optimizer_params: dict = MISSING
optimizer: str = None
optimizer_params: dict = None
# scheduler
lr_scheduler: str = ""
lr_scheduler_params: dict = field(default_factory=lambda: {})
# testing
test_sentences: List[str] = field(default_factory=lambda: [])
# multi-speaker
use_speaker_embedding: bool = False
use_d_vector_file: bool = False
d_vector_dim: int = 0

View File

@ -2,6 +2,7 @@ from dataclasses import dataclass, field
from typing import List
from TTS.tts.configs.shared_configs import BaseTTSConfig
from TTS.tts.models.speedy_speech import SpeedySpeechArgs
@dataclass
@ -16,30 +17,8 @@ class SpeedySpeechConfig(BaseTTSConfig):
Args:
model (str):
Model name used for selecting the right model at initialization. Defaults to `speedy_speech`.
positional_encoding (bool):
enable / disable positional encoding applied to the encoder output. Defaults to True.
hidden_channels (int):
Base number of hidden channels. Defines all the layers expect ones defined by the specific encoder or decoder
parameters. Defaults to 128.
encoder_type (str):
Type of the encoder used by the model. Look at `TTS.tts.layers.feed_forward.encoder` for more details.
Defaults to `residual_conv_bn`.
encoder_params (dict):
Parameters used to define the encoder network. Look at `TTS.tts.layers.feed_forward.encoder` for more details.
Defaults to `{"kernel_size": 4, "dilations": [1, 2, 4, 1, 2, 4, 1, 2, 4, 1, 2, 4, 1], "num_conv_blocks": 2, "num_res_blocks": 13}`
decoder_type (str):
Type of the decoder used by the model. Look at `TTS.tts.layers.feed_forward.decoder` for more details.
Defaults to `residual_conv_bn`.
decoder_params (dict):
Parameters used to define the decoder network. Look at `TTS.tts.layers.feed_forward.decoder` for more details.
Defaults to `{"kernel_size": 4, "dilations": [1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1], "num_conv_blocks": 2, "num_res_blocks": 17}`
hidden_channels_encoder (int):
Number of base hidden channels used by the encoder network. It defines the input and the output channel sizes,
and for some encoder types internal hidden channels sizes too. Defaults to 192.
hidden_channels_decoder (int):
Number of base hidden channels used by the decoder WaveNet network. Defaults to 192 as in the original work.
hidden_channels_duration_predictor (int):
Number of layer channels of the duration predictor network. Defaults to 256 as in the original work.
model_args (Coqpit):
Model class arguments. Check `SpeedySpeechArgs` for more details. Defaults to `SpeedySpeechArgs()`.
data_dep_init_steps (int):
Number of steps used for computing normalization parameters at the beginning of the training. GlowTTS uses
Activation Normalization that pre-computes normalization stats at the beginning and use the same values
@ -47,9 +26,9 @@ class SpeedySpeechConfig(BaseTTSConfig):
use_speaker_embedding (bool):
enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
in the multi-speaker mode. Defaults to False.
use_external_speaker_embedding_file (bool):
use_d_vector_file (bool):
enable /disable using external speaker embeddings in place of the learned embeddings. Defaults to False.
external_speaker_embedding_file (str):
d_vector_file (str):
Path to the file including pre-computed speaker embeddings. Defaults to None.
noam_schedule (bool):
enable / disable the use of Noam LR scheduler. Defaults to False.
@ -73,31 +52,12 @@ class SpeedySpeechConfig(BaseTTSConfig):
model: str = "speedy_speech"
# model specific params
positional_encoding: bool = True
hidden_channels: int = 128
encoder_type: str = "residual_conv_bn"
encoder_params: dict = field(
default_factory=lambda: {
"kernel_size": 4,
"dilations": [1, 2, 4, 1, 2, 4, 1, 2, 4, 1, 2, 4, 1],
"num_conv_blocks": 2,
"num_res_blocks": 13,
}
)
decoder_type: str = "residual_conv_bn"
decoder_params: dict = field(
default_factory=lambda: {
"kernel_size": 4,
"dilations": [1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1],
"num_conv_blocks": 2,
"num_res_blocks": 17,
}
)
model_args: SpeedySpeechArgs = field(default_factory=SpeedySpeechArgs)
# multi-speaker settings
use_speaker_embedding: bool = False
use_external_speaker_embedding_file: bool = False
external_speaker_embedding_file: str = False
use_d_vector_file: bool = False
d_vector_file: str = False
# optimizer parameters
optimizer: str = "RAdam"

View File

@ -12,107 +12,10 @@ class Tacotron2Config(TacotronConfig):
>>> from TTS.tts.configs import Tacotron2Config
>>> config = Tacotron2Config()
Args:
model (str):
Model name used to select the right model class to initilize. Defaults to `Tacotron2`.
use_gst (bool):
enable / disable the use of Global Style Token modules. Defaults to False.
gst (GSTConfig):
Instance of `GSTConfig` class.
gst_style_input (str):
Path to the wav file used at inference to set the speech style through GST. If `GST` is enabled and
this is not defined, the model uses a zero vector as an input. Defaults to None.
r (int):
Number of output frames that the decoder computed per iteration. Larger values makes training and inference
faster but reduces the quality of the output frames. This needs to be tuned considering your own needs.
Defaults to 1.
gradual_trainin (List[List]):
Parameters for the gradual training schedule. It is in the form `[[a, b, c], [d ,e ,f] ..]` where `a` is
the step number to start using the rest of the values, `b` is the `r` value and `c` is the batch size.
If sets None, no gradual training is used. Defaults to None.
memory_size (int):
Defines the number of previous frames used by the Prenet. If set to < 0, then it uses only the last frame.
Defaults to -1.
prenet_type (str):
`original` or `bn`. `original` sets the default Prenet and `bn` uses Batch Normalization version of the
Prenet. Defaults to `original`.
prenet_dropout (bool):
enables / disables the use of dropout in the Prenet. Defaults to True.
prenet_dropout_at_inference (bool):
enable / disable the use of dropout in the Prenet at the inference time. Defaults to False.
stopnet (bool):
enable /disable the Stopnet that predicts the end of the decoder sequence. Defaults to True.
stopnet_pos_weight (float):
Weight that is applied to over-weight positive instances in the Stopnet loss. Use larger values with
datasets with longer sentences. Defaults to 10.
separate_stopnet (bool):
Use a distinct Stopnet which is trained separately from the rest of the model. Defaults to True.
attention_type (str):
attention type. Check ```TTS.tts.layers.attentions.init_attn```. Defaults to 'original'.
attention_heads (int):
Number of attention heads for GMM attention. Defaults to 5.
windowing (bool):
It especially useful at inference to keep attention alignment diagonal. Defaults to False.
use_forward_attn (bool):
It is only valid if ```attn_type``` is ```original```. Defaults to False.
forward_attn_mask (bool):
enable/disable extra masking over forward attention. It is useful at inference to prevent
possible attention failures. Defaults to False.
transition_agent (bool):
enable/disable transition agent in forward attention. Defaults to False.
location_attn (bool):
enable/disable location sensitive attention as in the original Tacotron2 paper.
It is only valid if ```attn_type``` is ```original```. Defaults to True.
bidirectional_decoder (bool):
enable/disable bidirectional decoding. Defaults to False.
double_decoder_consistency (bool):
enable/disable double decoder consistency. Defaults to False.
ddc_r (int):
reduction rate used by the coarse decoder when `double_decoder_consistency` is in use. Set this
as a multiple of the `r` value. Defaults to 6.
use_speaker_embedding (bool):
enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
in the multi-speaker mode. Defaults to False.
use_external_speaker_embedding_file (bool):
enable /disable using external speaker embeddings in place of the learned embeddings. Defaults to False.
external_speaker_embedding_file (str):
Path to the file including pre-computed speaker embeddings. Defaults to None.
noam_schedule (bool):
enable / disable the use of Noam LR scheduler. Defaults to False.
warmup_steps (int):
Number of warm-up steps for the Noam scheduler. Defaults 4000.
lr (float):
Initial learning rate. Defaults to `1e-4`.
wd (float):
Weight decay coefficient. Defaults to `1e-6`.
grad_clip (float):
Gradient clipping threshold. Defaults to `5`.
seq_len_notm (bool):
enable / disable the sequnce length normalization in the loss functions. If set True, loss of a sample
is divided by the sequence length. Defaults to False.
loss_masking (bool):
enable / disable masking the paddings of the samples in loss computation. Defaults to True.
decoder_loss_alpha (float):
Weight for the decoder loss of the Tacotron model. If set less than or equal to zero, it disables the
corresponding loss function. Defaults to 0.25
postnet_loss_alpha (float):
Weight for the postnet loss of the Tacotron model. If set less than or equal to zero, it disables the
corresponding loss function. Defaults to 0.25
postnet_diff_spec_alpha (float):
Weight for the postnet differential loss of the Tacotron model. If set less than or equal to zero, it disables the
corresponding loss function. Defaults to 0.25
decoder_diff_spec_alpha (float):
Weight for the decoder differential loss of the Tacotron model. If set less than or equal to zero, it disables the
corresponding loss function. Defaults to 0.25
decoder_ssim_alpha (float):
Weight for the decoder SSIM loss of the Tacotron model. If set less than or equal to zero, it disables the
corresponding loss function. Defaults to 0.25
postnet_ssim_alpha (float):
Weight for the postnet SSIM loss of the Tacotron model. If set less than or equal to zero, it disables the
corresponding loss function. Defaults to 0.25
ga_alpha (float):
Weight for the guided attention loss. If set less than or equal to zero, it disables the corresponding loss
function. Defaults to 5.
Check `TacotronConfig` for argument descriptions.
"""
model: str = "tacotron2"
out_channels: int = 80
encoder_in_features: int = 512
decoder_in_features: int = 512

View File

@ -23,6 +23,10 @@ class TacotronConfig(BaseTTSConfig):
gst_style_input (str):
Path to the wav file used at inference to set the speech style through GST. If `GST` is enabled and
this is not defined, the model uses a zero vector as an input. Defaults to None.
num_chars (int):
Number of characters used by the model. It must be defined before initializing the model. Defaults to 0.
num_speakers (int):
Number of speakers for multi-speaker models. Defaults to 1.
r (int):
Initial number of output frames that the decoder computed per iteration. Larger values makes training and inference
faster but reduces the quality of the output frames. This must be equal to the largest `r` value used in
@ -47,7 +51,13 @@ class TacotronConfig(BaseTTSConfig):
Weight that is applied to over-weight positive instances in the Stopnet loss. Use larger values with
datasets with longer sentences. Defaults to 10.
max_decoder_steps (int):
Max number of steps allowed for the decoder. Defaults to 10000.
Max number of steps allowed for the decoder. Defaults to 500.
encoder_in_features (int):
Channels of encoder input and character embedding tensors. Defaults to 256.
decoder_in_features (int):
Channels of decoder input and encoder output tensors. Defaults to 256.
out_channels (int):
Channels of the final model output. It must match the spectrogram size. Defaults to 513.
separate_stopnet (bool):
Use a distinct Stopnet which is trained separately from the rest of the model. Defaults to True.
attention_type (str):
@ -76,9 +86,9 @@ class TacotronConfig(BaseTTSConfig):
use_speaker_embedding (bool):
enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
in the multi-speaker mode. Defaults to False.
use_external_speaker_embedding_file (bool):
use_d_vector_file (bool):
enable /disable using external speaker embeddings in place of the learned embeddings. Defaults to False.
external_speaker_embedding_file (str):
d_vector_file (str):
Path to the file including pre-computed speaker embeddings. Defaults to None.
optimizer (str):
Optimizer used for the training. Set one from `torch.optim.Optimizer` or `TTS.utils.training`.
@ -111,6 +121,7 @@ class TacotronConfig(BaseTTSConfig):
Weight for the postnet differential loss of the Tacotron model. If set less than or equal to zero, it disables the
corresponding loss function. Defaults to 0.25
decoder_diff_spec_alpha (float):
Weight for the decoder differential loss of the Tacotron model. If set less than or equal to zero, it disables the
corresponding loss function. Defaults to 0.25
decoder_ssim_alpha (float):
@ -125,11 +136,14 @@ class TacotronConfig(BaseTTSConfig):
"""
model: str = "tacotron"
# model_params: TacotronArgs = field(default_factory=lambda: TacotronArgs())
use_gst: bool = False
gst: GSTConfig = None
gst_style_input: str = None
# model specific params
num_speakers: int = 1
num_chars: int = 0
r: int = 2
gradual_training: List[List[int]] = None
memory_size: int = -1
@ -139,12 +153,17 @@ class TacotronConfig(BaseTTSConfig):
stopnet: bool = True
separate_stopnet: bool = True
stopnet_pos_weight: float = 10.0
max_decoder_steps: int = 10000
max_decoder_steps: int = 500
encoder_in_features: int = 256
decoder_in_features: int = 256
decoder_output_dim: int = 80
out_channels: int = 513
# attention layers
attention_type: str = "original"
attention_heads: int = None
attention_norm: str = "sigmoid"
attention_win: bool = False
windowing: bool = False
use_forward_attn: bool = False
forward_attn_mask: bool = False
@ -158,8 +177,10 @@ class TacotronConfig(BaseTTSConfig):
# multi-speaker settings
use_speaker_embedding: bool = False
use_external_speaker_embedding_file: bool = False
external_speaker_embedding_file: str = False
speaker_embedding_dim: int = 512
use_d_vector_file: bool = False
d_vector_file: str = False
d_vector_dim: int = None
# optimizer parameters
optimizer: str = "RAdam"
@ -196,3 +217,9 @@ class TacotronConfig(BaseTTSConfig):
assert (
self.gradual_training[0][1] == self.r
), f"[!] the first scheduled gradual training `r` must be equal to the model's `r` value. {self.gradual_training[0][1]} vs {self.r}"
if self.model == "tacotron" and self.audio is not None:
assert self.out_channels == (
self.audio.fft_size // 2 + 1
), f"{self.out_channels} vs {self.audio.fft_size // 2 + 1}"
if self.model == "tacotron2" and self.audio is not None:
assert self.out_channels == self.audio.num_mels