From 786170fe7d5da036bbb44fa269f9e20865a9354f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 18 Jun 2021 13:28:58 +0200 Subject: [PATCH] Update tts model configs --- TTS/tts/configs/align_tts_config.py | 21 ++--- TTS/tts/configs/glow_tts_config.py | 84 ++++++++++++++++--- TTS/tts/configs/shared_configs.py | 12 ++- TTS/tts/configs/speedy_speech_config.py | 56 ++----------- TTS/tts/configs/tacotron2_config.py | 105 +----------------------- TTS/tts/configs/tacotron_config.py | 39 +++++++-- 6 files changed, 133 insertions(+), 184 deletions(-) diff --git a/TTS/tts/configs/align_tts_config.py b/TTS/tts/configs/align_tts_config.py index 56622741..837cd519 100644 --- a/TTS/tts/configs/align_tts_config.py +++ b/TTS/tts/configs/align_tts_config.py @@ -2,6 +2,7 @@ from dataclasses import dataclass, field from typing import List from TTS.tts.configs.shared_configs import BaseTTSConfig +from TTS.tts.models.align_tts import AlignTTSArgs @dataclass @@ -49,9 +50,9 @@ class AlignTTSConfig(BaseTTSConfig): use_speaker_embedding (bool): enable / disable using speaker embeddings for multi-speaker models. If set True, the model is in the multi-speaker mode. Defaults to False. - use_external_speaker_embedding_file (bool): + use_d_vector_file (bool): enable /disable using external speaker embeddings in place of the learned embeddings. Defaults to False. - external_speaker_embedding_file (str): + d_vector_file (str): Path to the file including pre-computed speaker embeddings. Defaults to None. noam_schedule (bool): enable / disable the use of Noam LR scheduler. Defaults to False. 
@@ -68,17 +69,7 @@ class AlignTTSConfig(BaseTTSConfig): model: str = "align_tts" # model specific params - positional_encoding: bool = True - hidden_channels_dp: int = 256 - hidden_channels: int = 256 - encoder_type: str = "fftransformer" - encoder_params: dict = field( - default_factory=lambda: {"hidden_channels_ffn": 1024, "num_heads": 2, "num_layers": 6, "dropout_p": 0.1} - ) - decoder_type: str = "fftransformer" - decoder_params: dict = field( - default_factory=lambda: {"hidden_channels_ffn": 1024, "num_heads": 2, "num_layers": 6, "dropout_p": 0.1} - ) + model_args: AlignTTSArgs = field(default_factory=AlignTTSArgs) phase_start_steps: List[int] = None ssim_alpha: float = 1.0 @@ -88,8 +79,8 @@ class AlignTTSConfig(BaseTTSConfig): # multi-speaker settings use_speaker_embedding: bool = False - use_external_speaker_embedding_file: bool = False - external_speaker_embedding_file: str = False + use_d_vector_file: bool = False + d_vector_file: str = False # optimizer parameters optimizer: str = "Adam" diff --git a/TTS/tts/configs/glow_tts_config.py b/TTS/tts/configs/glow_tts_config.py index 925854c9..19b7abd9 100644 --- a/TTS/tts/configs/glow_tts_config.py +++ b/TTS/tts/configs/glow_tts_config.py @@ -23,13 +23,49 @@ class GlowTTSConfig(BaseTTSConfig): Defaults to `{"kernel_size": 3, "dropout_p": 0.1, "num_layers": 6, "num_heads": 2, "hidden_channels_ffn": 768}` use_encoder_prenet (bool): enable / disable the use of a prenet for the encoder. Defaults to True. - hidden_channels_encoder (int): + hidden_channels_enc (int): Number of base hidden channels used by the encoder network. It defines the input and the output channel sizes, and for some encoder types internal hidden channels sizes too. Defaults to 192. - hidden_channels_decoder (int): + hidden_channels_dec (int): Number of base hidden channels used by the decoder WaveNet network. Defaults to 192 as in the original work. 
- hidden_channels_duration_predictor (int): + hidden_channels_dp (int): Number of layer channels of the duration predictor network. Defaults to 256 as in the original work. + mean_only (bool): + If true predict only the mean values by the decoder flow. Defaults to True. + out_channels (int): + Number of channels of the model output tensor. Defaults to 80. + num_flow_blocks_dec (int): + Number of decoder blocks. Defaults to 12. + inference_noise_scale (float): + Noise scale used at inference. Defaults to 0.33. + kernel_size_dec (int): + Decoder kernel size. Defaults to 5. + dilation_rate (int): + Rate to increase dilation by each layer in a decoder block. Defaults to 5. + num_block_layers (int): + Number of decoder layers in each decoder block. Defaults to 4. + dropout_p_dec (float): + Dropout rate for decoder. Defaults to 0.05. + num_speakers (int): + Number of speakers used to define the size of speaker embedding layer. Defaults to 0. + c_in_channels (int): + Number of speaker embedding channels. It is set to 512 if embeddings are learned. Defaults to 0. + num_splits (int): + Number of split levels in invertible conv1x1 operation. Defaults to 4. + num_squeeze (int): + Number of squeeze levels. When squeezing, channels increase and time steps reduce by the factor + 'num_squeeze'. Defaults to 1. + sigmoid_scale (bool): + enable/disable sigmoid scaling in decoder. Defaults to False. + mean_only (bool): + If True, encoder only computes mean value and uses constant variance for each time step. Defaults to true. + encoder_type (str): + Encoder module type. Possible values are `["rel_pos_transformer", "gated_conv", "residual_conv_bn", "time_depth_separable"]` + Check `TTS.tts.layers.glow_tts.encoder` for more details. Defaults to `rel_pos_transformer` as in the original paper. + encoder_params (dict): + Encoder module parameters. Defaults to None. + d_vector_dim (int): + Channels of external speaker embedding vectors. Defaults to 0. 
data_dep_init_steps (int): Number of steps used for computing normalization parameters at the beginning of the training. GlowTTS uses Activation Normalization that pre-computes normalization stats at the beginning and use the same values @@ -41,9 +77,9 @@ class GlowTTSConfig(BaseTTSConfig): use_speaker_embedding (bool): enable / disable using speaker embeddings for multi-speaker models. If set True, the model is in the multi-speaker mode. Defaults to False. - use_external_speaker_embedding_file (bool): + use_d_vector_file (bool): enable /disable using external speaker embeddings in place of the learned embeddings. Defaults to False. - external_speaker_embedding_file (str): + d_vector_file (str): Path to the file including pre-computed speaker embeddings. Defaults to None. noam_schedule (bool): enable / disable the use of Noam LR scheduler. Defaults to False. @@ -62,6 +98,7 @@ class GlowTTSConfig(BaseTTSConfig): model: str = "glow_tts" # model params + num_chars: int = None encoder_type: str = "rel_pos_transformer" encoder_params: dict = field( default_factory=lambda: { @@ -73,9 +110,36 @@ class GlowTTSConfig(BaseTTSConfig): } ) use_encoder_prenet: bool = True - hidden_channels_encoder: int = 192 - hidden_channels_decoder: int = 192 - hidden_channels_duration_predictor: int = 256 + hidden_channels_enc: int = 192 + hidden_channels_dec: int = 192 + hidden_channels_dp: int = 256 + dropout_p_dp: float = 0.1 + dropout_p_dec: float = 0.05 + mean_only: bool = True + out_channels: int = 80 + num_flow_blocks_dec: int = 12 + inference_noise_scale: float = 0.33 + kernel_size_dec: int = 5 + dilation_rate: int = 5 + num_block_layers: int = 4 + num_speakers: int = 0 + c_in_channels: int = 0 + num_splits: int = 4 + num_squeeze: int = 1 + sigmoid_scale: bool = False + mean_only: bool = False + encoder_type: str = "rel_pos_transformer" + encoder_params: dict = field( + default_factory=lambda: { + "kernel_size": 3, + "dropout_p": 0.1, + "num_layers": 6, + "num_heads": 2, + 
"hidden_channels_ffn": 768, + "input_length": None, + } + ) + d_vector_dim: int = 0 # training params data_dep_init_steps: int = 10 @@ -86,8 +150,8 @@ class GlowTTSConfig(BaseTTSConfig): # multi-speaker settings use_speaker_embedding: bool = False - use_external_speaker_embedding_file: bool = False - external_speaker_embedding_file: str = False + use_d_vector_file: bool = False + d_vector_file: str = False # optimizer parameters optimizer: str = "RAdam" diff --git a/TTS/tts/configs/shared_configs.py b/TTS/tts/configs/shared_configs.py index d02e58ae..4b916a17 100644 --- a/TTS/tts/configs/shared_configs.py +++ b/TTS/tts/configs/shared_configs.py @@ -1,7 +1,7 @@ from dataclasses import asdict, dataclass, field from typing import List -from coqpit import MISSING, Coqpit, check_argument +from coqpit import Coqpit, check_argument from TTS.config import BaseAudioConfig, BaseDatasetConfig, BaseTrainingConfig @@ -153,7 +153,7 @@ class BaseTTSConfig(BaseTrainingConfig): use_espeak_phonemes: bool = True phoneme_language: str = None compute_input_seq_cache: bool = False - text_cleaner: str = MISSING + text_cleaner: str = None enable_eos_bos_chars: bool = False test_sentences_file: str = "" phoneme_cache_path: str = None @@ -171,10 +171,14 @@ class BaseTTSConfig(BaseTrainingConfig): # dataset datasets: List[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()]) # optimizer - optimizer: str = MISSING - optimizer_params: dict = MISSING + optimizer: str = None + optimizer_params: dict = None # scheduler lr_scheduler: str = "" lr_scheduler_params: dict = field(default_factory=lambda: {}) # testing test_sentences: List[str] = field(default_factory=lambda: []) + # multi-speaker + use_speaker_embedding: bool = False + use_d_vector_file: bool = False + d_vector_dim: int = 0 diff --git a/TTS/tts/configs/speedy_speech_config.py b/TTS/tts/configs/speedy_speech_config.py index d76d94e2..b2641ab5 100644 --- a/TTS/tts/configs/speedy_speech_config.py +++ 
b/TTS/tts/configs/speedy_speech_config.py @@ -2,6 +2,7 @@ from dataclasses import dataclass, field from typing import List from TTS.tts.configs.shared_configs import BaseTTSConfig +from TTS.tts.models.speedy_speech import SpeedySpeechArgs @dataclass @@ -16,30 +17,8 @@ class SpeedySpeechConfig(BaseTTSConfig): Args: model (str): Model name used for selecting the right model at initialization. Defaults to `speedy_speech`. - positional_encoding (bool): - enable / disable positional encoding applied to the encoder output. Defaults to True. - hidden_channels (int): - Base number of hidden channels. Defines all the layers expect ones defined by the specific encoder or decoder - parameters. Defaults to 128. - encoder_type (str): - Type of the encoder used by the model. Look at `TTS.tts.layers.feed_forward.encoder` for more details. - Defaults to `residual_conv_bn`. - encoder_params (dict): - Parameters used to define the encoder network. Look at `TTS.tts.layers.feed_forward.encoder` for more details. - Defaults to `{"kernel_size": 4, "dilations": [1, 2, 4, 1, 2, 4, 1, 2, 4, 1, 2, 4, 1], "num_conv_blocks": 2, "num_res_blocks": 13}` - decoder_type (str): - Type of the decoder used by the model. Look at `TTS.tts.layers.feed_forward.decoder` for more details. - Defaults to `residual_conv_bn`. - decoder_params (dict): - Parameters used to define the decoder network. Look at `TTS.tts.layers.feed_forward.decoder` for more details. - Defaults to `{"kernel_size": 4, "dilations": [1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1], "num_conv_blocks": 2, "num_res_blocks": 17}` - hidden_channels_encoder (int): - Number of base hidden channels used by the encoder network. It defines the input and the output channel sizes, - and for some encoder types internal hidden channels sizes too. Defaults to 192. - hidden_channels_decoder (int): - Number of base hidden channels used by the decoder WaveNet network. Defaults to 192 as in the original work. 
- hidden_channels_duration_predictor (int): - Number of layer channels of the duration predictor network. Defaults to 256 as in the original work. + model_args (Coqpit): + Model class arguments. Check `SpeedySpeechArgs` for more details. Defaults to `SpeedySpeechArgs()`. data_dep_init_steps (int): Number of steps used for computing normalization parameters at the beginning of the training. GlowTTS uses Activation Normalization that pre-computes normalization stats at the beginning and use the same values @@ -47,9 +26,9 @@ class SpeedySpeechConfig(BaseTTSConfig): use_speaker_embedding (bool): enable / disable using speaker embeddings for multi-speaker models. If set True, the model is in the multi-speaker mode. Defaults to False. - use_external_speaker_embedding_file (bool): + use_d_vector_file (bool): enable /disable using external speaker embeddings in place of the learned embeddings. Defaults to False. - external_speaker_embedding_file (str): + d_vector_file (str): Path to the file including pre-computed speaker embeddings. Defaults to None. noam_schedule (bool): enable / disable the use of Noam LR scheduler. Defaults to False. 
@@ -73,31 +52,12 @@ class SpeedySpeechConfig(BaseTTSConfig): model: str = "speedy_speech" # model specific params - positional_encoding: bool = True - hidden_channels: int = 128 - encoder_type: str = "residual_conv_bn" - encoder_params: dict = field( - default_factory=lambda: { - "kernel_size": 4, - "dilations": [1, 2, 4, 1, 2, 4, 1, 2, 4, 1, 2, 4, 1], - "num_conv_blocks": 2, - "num_res_blocks": 13, - } - ) - decoder_type: str = "residual_conv_bn" - decoder_params: dict = field( - default_factory=lambda: { - "kernel_size": 4, - "dilations": [1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1], - "num_conv_blocks": 2, - "num_res_blocks": 17, - } - ) + model_args: SpeedySpeechArgs = field(default_factory=SpeedySpeechArgs) # multi-speaker settings use_speaker_embedding: bool = False - use_external_speaker_embedding_file: bool = False - external_speaker_embedding_file: str = False + use_d_vector_file: bool = False + d_vector_file: str = False # optimizer parameters optimizer: str = "RAdam" diff --git a/TTS/tts/configs/tacotron2_config.py b/TTS/tts/configs/tacotron2_config.py index ea66fae8..b622e640 100644 --- a/TTS/tts/configs/tacotron2_config.py +++ b/TTS/tts/configs/tacotron2_config.py @@ -12,107 +12,10 @@ class Tacotron2Config(TacotronConfig): >>> from TTS.tts.configs import Tacotron2Config >>> config = Tacotron2Config() - Args: - model (str): - Model name used to select the right model class to initilize. Defaults to `Tacotron2`. - use_gst (bool): - enable / disable the use of Global Style Token modules. Defaults to False. - gst (GSTConfig): - Instance of `GSTConfig` class. - gst_style_input (str): - Path to the wav file used at inference to set the speech style through GST. If `GST` is enabled and - this is not defined, the model uses a zero vector as an input. Defaults to None. - r (int): - Number of output frames that the decoder computed per iteration. Larger values makes training and inference - faster but reduces the quality of the output frames. 
This needs to be tuned considering your own needs. - Defaults to 1. - gradual_trainin (List[List]): - Parameters for the gradual training schedule. It is in the form `[[a, b, c], [d ,e ,f] ..]` where `a` is - the step number to start using the rest of the values, `b` is the `r` value and `c` is the batch size. - If sets None, no gradual training is used. Defaults to None. - memory_size (int): - Defines the number of previous frames used by the Prenet. If set to < 0, then it uses only the last frame. - Defaults to -1. - prenet_type (str): - `original` or `bn`. `original` sets the default Prenet and `bn` uses Batch Normalization version of the - Prenet. Defaults to `original`. - prenet_dropout (bool): - enables / disables the use of dropout in the Prenet. Defaults to True. - prenet_dropout_at_inference (bool): - enable / disable the use of dropout in the Prenet at the inference time. Defaults to False. - stopnet (bool): - enable /disable the Stopnet that predicts the end of the decoder sequence. Defaults to True. - stopnet_pos_weight (float): - Weight that is applied to over-weight positive instances in the Stopnet loss. Use larger values with - datasets with longer sentences. Defaults to 10. - separate_stopnet (bool): - Use a distinct Stopnet which is trained separately from the rest of the model. Defaults to True. - attention_type (str): - attention type. Check ```TTS.tts.layers.attentions.init_attn```. Defaults to 'original'. - attention_heads (int): - Number of attention heads for GMM attention. Defaults to 5. - windowing (bool): - It especially useful at inference to keep attention alignment diagonal. Defaults to False. - use_forward_attn (bool): - It is only valid if ```attn_type``` is ```original```. Defaults to False. - forward_attn_mask (bool): - enable/disable extra masking over forward attention. It is useful at inference to prevent - possible attention failures. Defaults to False. 
- transition_agent (bool): - enable/disable transition agent in forward attention. Defaults to False. - location_attn (bool): - enable/disable location sensitive attention as in the original Tacotron2 paper. - It is only valid if ```attn_type``` is ```original```. Defaults to True. - bidirectional_decoder (bool): - enable/disable bidirectional decoding. Defaults to False. - double_decoder_consistency (bool): - enable/disable double decoder consistency. Defaults to False. - ddc_r (int): - reduction rate used by the coarse decoder when `double_decoder_consistency` is in use. Set this - as a multiple of the `r` value. Defaults to 6. - use_speaker_embedding (bool): - enable / disable using speaker embeddings for multi-speaker models. If set True, the model is - in the multi-speaker mode. Defaults to False. - use_external_speaker_embedding_file (bool): - enable /disable using external speaker embeddings in place of the learned embeddings. Defaults to False. - external_speaker_embedding_file (str): - Path to the file including pre-computed speaker embeddings. Defaults to None. - noam_schedule (bool): - enable / disable the use of Noam LR scheduler. Defaults to False. - warmup_steps (int): - Number of warm-up steps for the Noam scheduler. Defaults 4000. - lr (float): - Initial learning rate. Defaults to `1e-4`. - wd (float): - Weight decay coefficient. Defaults to `1e-6`. - grad_clip (float): - Gradient clipping threshold. Defaults to `5`. - seq_len_notm (bool): - enable / disable the sequnce length normalization in the loss functions. If set True, loss of a sample - is divided by the sequence length. Defaults to False. - loss_masking (bool): - enable / disable masking the paddings of the samples in loss computation. Defaults to True. - decoder_loss_alpha (float): - Weight for the decoder loss of the Tacotron model. If set less than or equal to zero, it disables the - corresponding loss function. 
Defaults to 0.25 - postnet_loss_alpha (float): - Weight for the postnet loss of the Tacotron model. If set less than or equal to zero, it disables the - corresponding loss function. Defaults to 0.25 - postnet_diff_spec_alpha (float): - Weight for the postnet differential loss of the Tacotron model. If set less than or equal to zero, it disables the - corresponding loss function. Defaults to 0.25 - decoder_diff_spec_alpha (float): - Weight for the decoder differential loss of the Tacotron model. If set less than or equal to zero, it disables the - corresponding loss function. Defaults to 0.25 - decoder_ssim_alpha (float): - Weight for the decoder SSIM loss of the Tacotron model. If set less than or equal to zero, it disables the - corresponding loss function. Defaults to 0.25 - postnet_ssim_alpha (float): - Weight for the postnet SSIM loss of the Tacotron model. If set less than or equal to zero, it disables the - corresponding loss function. Defaults to 0.25 - ga_alpha (float): - Weight for the guided attention loss. If set less than or equal to zero, it disables the corresponding loss - function. Defaults to 5. + Check `TacotronConfig` for argument descriptions. """ model: str = "tacotron2" + out_channels: int = 80 + encoder_in_features: int = 512 + decoder_in_features: int = 512 diff --git a/TTS/tts/configs/tacotron_config.py b/TTS/tts/configs/tacotron_config.py index 2b67901c..89fb8d81 100644 --- a/TTS/tts/configs/tacotron_config.py +++ b/TTS/tts/configs/tacotron_config.py @@ -23,6 +23,10 @@ class TacotronConfig(BaseTTSConfig): gst_style_input (str): Path to the wav file used at inference to set the speech style through GST. If `GST` is enabled and this is not defined, the model uses a zero vector as an input. Defaults to None. + num_chars (int): + Number of characters used by the model. It must be defined before initializing the model. Defaults to None. + num_speakers (int): + Number of speakers for multi-speaker models. Defaults to 1. 
r (int): Initial number of output frames that the decoder computed per iteration. Larger values makes training and inference faster but reduces the quality of the output frames. This must be equal to the largest `r` value used in @@ -47,7 +51,13 @@ class TacotronConfig(BaseTTSConfig): Weight that is applied to over-weight positive instances in the Stopnet loss. Use larger values with datasets with longer sentences. Defaults to 10. max_decoder_steps (int): - Max number of steps allowed for the decoder. Defaults to 10000. + Max number of steps allowed for the decoder. Defaults to 500. + encoder_in_features (int): + Channels of encoder input and character embedding tensors. Defaults to 256. + decoder_in_features (int): + Channels of decoder input and encoder output tensors. Defaults to 256. + out_channels (int): + Channels of the final model output. It must match the spectrogram size. Defaults to 513. separate_stopnet (bool): Use a distinct Stopnet which is trained separately from the rest of the model. Defaults to True. attention_type (str): @@ -76,9 +86,9 @@ class TacotronConfig(BaseTTSConfig): use_speaker_embedding (bool): enable / disable using speaker embeddings for multi-speaker models. If set True, the model is in the multi-speaker mode. Defaults to False. - use_external_speaker_embedding_file (bool): + use_d_vector_file (bool): enable /disable using external speaker embeddings in place of the learned embeddings. Defaults to False. - external_speaker_embedding_file (str): + d_vector_file (str): Path to the file including pre-computed speaker embeddings. Defaults to None. optimizer (str): Optimizer used for the training. Set one from `torch.optim.Optimizer` or `TTS.utils.training`. @@ -111,6 +121,7 @@ class TacotronConfig(BaseTTSConfig): Weight for the postnet differential loss of the Tacotron model. If set less than or equal to zero, it disables the corresponding loss function. 
Defaults to 0.25 decoder_diff_spec_alpha (float): + Weight for the decoder differential loss of the Tacotron model. If set less than or equal to zero, it disables the corresponding loss function. Defaults to 0.25 decoder_ssim_alpha (float): @@ -125,11 +136,14 @@ class TacotronConfig(BaseTTSConfig): """ model: str = "tacotron" + # model_params: TacotronArgs = field(default_factory=lambda: TacotronArgs()) use_gst: bool = False gst: GSTConfig = None gst_style_input: str = None # model specific params + num_speakers: int = 1 + num_chars: int = 0 r: int = 2 gradual_training: List[List[int]] = None memory_size: int = -1 @@ -139,12 +153,17 @@ class TacotronConfig(BaseTTSConfig): stopnet: bool = True separate_stopnet: bool = True stopnet_pos_weight: float = 10.0 - max_decoder_steps: int = 10000 + max_decoder_steps: int = 500 + encoder_in_features: int = 256 + decoder_in_features: int = 256 + decoder_output_dim: int = 80 + out_channels: int = 513 # attention layers attention_type: str = "original" attention_heads: int = None attention_norm: str = "sigmoid" + attention_win: bool = False windowing: bool = False use_forward_attn: bool = False forward_attn_mask: bool = False @@ -158,8 +177,10 @@ class TacotronConfig(BaseTTSConfig): # multi-speaker settings use_speaker_embedding: bool = False - use_external_speaker_embedding_file: bool = False - external_speaker_embedding_file: str = False + speaker_embedding_dim: int = 512 + use_d_vector_file: bool = False + d_vector_file: str = False + d_vector_dim: int = None # optimizer parameters optimizer: str = "RAdam" @@ -196,3 +217,9 @@ class TacotronConfig(BaseTTSConfig): assert ( self.gradual_training[0][1] == self.r ), f"[!] the first scheduled gradual training `r` must be equal to the model's `r` value. 
{self.gradual_training[0][1]} vs {self.r}" + if self.model == "tacotron" and self.audio is not None: + assert self.out_channels == ( + self.audio.fft_size // 2 + 1 + ), f"{self.out_channels} vs {self.audio.fft_size // 2 + 1}" + if self.model == "tacotron2" and self.audio is not None: + assert self.out_channels == self.audio.num_mels