diff --git a/TTS/config/shared_configs.py b/TTS/config/shared_configs.py index 153b3279..7df1582f 100644 --- a/TTS/config/shared_configs.py +++ b/TTS/config/shared_configs.py @@ -13,7 +13,7 @@ class BaseAudioConfig(Coqpit): Number of STFT frequency levels aka.size of the linear spectogram frame. Defaults to 1024. win_length (int): Each frame of audio is windowed by window of length ```win_length``` and then padded with zeros to match - ```fft_size```. Defaults to 256. + ```fft_size```. Defaults to 1024. hop_length (int): Number of audio samples between adjacent STFT columns. Defaults to 1024. frame_shift_ms (int): @@ -21,7 +21,7 @@ class BaseAudioConfig(Coqpit): frame_length_ms (int): Set ```win_length``` based on milliseconds and sampling rate. stft_pad_mode (str): - Padding method used in STFT. 'reflect' or 'center'. + Padding method used in STFT. 'reflect' or 'center'. Defaults to 'reflect'. sample_rate (int): Audio sampling rate. Defaults to 22050. resample (bool): @@ -135,11 +135,27 @@ class BaseAudioConfig(Coqpit): @dataclass class BaseDatasetConfig(Coqpit): - name: str = None - path: str = None - meta_file_train: Union[str, List] = None # TODO: don't take ignored speakers for multi-speaker datasets over this. This is Union for SC-Glow compat. - meta_file_val: str = None - meta_file_attn_mask: str = None + """Base config for TTS datasets. + + Args: + name (str): + Dataset name that defines the preprocessor in use. Defaults to None. + path (str): + Root path to the dataset files. Defaults to None. + meta_file_train (Union[str, List]): + Name of the dataset meta file. Or a list of speakers to be ignored at training for multi-speaker datasets. + Defaults to None. + meta_file_val (str): + Name of the dataset meta file that defines the instances used at validation. + meta_file_attn_mask (str): + Path to the file that lists the attention mask files used with models that require attention masks to + train the duration predictor. + """ + name: str = '' + path: str = '' + meta_file_train: Union[str, List] = '' # TODO: don't take ignored speakers for multi-speaker datasets over this. This is Union for SC-Glow compat. + meta_file_val: str = '' + meta_file_attn_mask: str = '' def check_values( self, @@ -161,12 +177,8 @@ class BaseTrainingConfig(Coqpit): Args: batch_size (int): Training batch size. - batch_group_size (int): - Number of batches to shuffle after bucketing. eval_batch_size (int): Validation batch size. - loss_masking (bool): - Enable / Disable masking padding segments of sequences. mixed_precision (bool): Enable / Disable mixed precision training. It reduces the VRAM use and allows larger batch sizes, however it may also cause numerical unstability in some cases. @@ -195,34 +207,13 @@ class BaseTrainingConfig(Coqpit): keep_after (int): Number of steps to wait before saving all the best models. In use if ```keep_all_best == True```. Defaults to 10000. - text_cleaner (str): - Text cleaner to be used at model training. It is set to be one of the cleaners in - ```TTS.tts.utils.text.cleaners```. - enable_eos_bos_chars (bool): - Enable / Disable using special characters indicating end-of-sentence and begining-of-sentence. num_loader_workers (int): Number of workers for training time dataloader. num_val_loader_workers (int): Number of workers for evaluation time dataloader. - min_seq_len (int): - Minimum sequence length to be used at training. - max_seq_len (int): - Maximum sequence length to be used at training. VRAM use at training depends on this parameter. Consider to - decrease it if you get OOM errors. - compute_f0 (bool): - Return F0 frames from the dataloader. Defaults to ```False```. - compute_input_seq_cache (bool): - Enable / Disable computing and caching phonemes sequences from character sequences at the begining of the - training. It allows faster data loading times and more precise max-min sequence prunning. Defaults - to ```False```. output_path (str): Path for training output folder. The nonexist part of the given path is created automatically. All training outputs are saved there. - phoneme_cache_path (str): - Path to a folder to save the computed phoneme sequences. - datasets (List[BaseDatasetConfig]): - ist of DatasetConfig. - """ model: str = None diff --git a/TTS/tts/configs/align_tts_config.py b/TTS/tts/configs/align_tts_config.py index 6e09e398..84e0ba13 100644 --- a/TTS/tts/configs/align_tts_config.py +++ b/TTS/tts/configs/align_tts_config.py @@ -1,11 +1,69 @@ from dataclasses import dataclass, field -from .shared_configs import BaseTTSConfig +from TTS.tts.configs.shared_configs import BaseTTSConfig @dataclass class AlignTTSConfig(BaseTTSConfig): - """Defines parameters for AlignTTS model.""" + """Defines parameters for AlignTTS model. + Example: + + >>> from TTS.tts.configs import AlignTTSConfig + >>> config = AlignTTSConfig() + + Args: + model(str): + Model name used for selecting the right model at initialization. Defaults to `align_tts`. + positional_encoding (bool): + enable / disable positional encoding applied to the encoder output. Defaults to True. + hidden_channels (int): + Base number of hidden channels. Defines all the layers expect ones defined by the specific encoder or decoder + parameters. Defaults to 256. + hidden_channels_dp (int): + Number of hidden channels of the duration predictor's layers. Defaults to 256. + encoder_type (str): + Type of the encoder used by the model. Look at `TTS.tts.layers.feed_forward.encoder` for more details. + Defaults to `fftransformer`. + encoder_params (dict): + Parameters used to define the encoder network. Look at `TTS.tts.layers.feed_forward.encoder` for more details. + Defaults to `{"hidden_channels_ffn": 1024, "num_heads": 2, "num_layers": 6, "dropout_p": 0.1}`. + decoder_type (str): + Type of the decoder used by the model. Look at `TTS.tts.layers.feed_forward.decoder` for more details. + Defaults to `fftransformer`. + decoder_params (dict): + Parameters used to define the decoder network. Look at `TTS.tts.layers.feed_forward.decoder` for more details. + Defaults to `{"hidden_channels_ffn": 1024, "num_heads": 2, "num_layers": 6, "dropout_p": 0.1}`. + phase_start_steps (List[int]): + A list of number of steps required to start the next training phase. AlignTTS has 4 different training + phases. Thus you need to define 4 different values to enable phase based training. If None, it + trains the whole model together. Defaults to None. + ssim_alpha (float): + Weight for the SSIM loss. If set <= 0, disables the SSIM loss. Defaults to 1.0. + duration_loss_alpha (float): + Weight for the duration predictor's loss. Defaults to 1.0. + mdn_alpha (float): + Weight for the MDN loss. Defaults to 1.0. + spec_loss_alpha (float): + Weight for the MSE spectrogram loss. If set <= 0, disables the L1 loss. Defaults to 1.0. + use_speaker_embedding (bool): + enable / disable using speaker embeddings for multi-speaker models. If set True, the model is + in the multi-speaker mode. Defaults to False. + use_external_speaker_embedding_file (bool): + enable /disable using external speaker embeddings in place of the learned embeddings. Defaults to False. + external_speaker_embedding_file (str): + Path to the file including pre-computed speaker embeddings. Defaults to None. + noam_schedule (bool): + enable / disable the use of Noam LR scheduler. Defaults to False. + warmup_steps (int): + Number of warm-up steps for the Noam scheduler. Defaults 4000. + lr (float): + Initial learning rate. Defaults to `1e-3`. + wd (float): + Weight decay coefficient. Defaults to `1e-7`. + min_seq_len (int): + Minimum input sequence length to be used at training. + max_seq_len (int): + Maximum input sequence length to be used at training. Larger values result in more VRAM usage.""" model: str = "align_tts" # model specific params diff --git a/TTS/tts/configs/glow_tts_config.py b/TTS/tts/configs/glow_tts_config.py index 8474caae..c0eadb1f 100644 --- a/TTS/tts/configs/glow_tts_config.py +++ b/TTS/tts/configs/glow_tts_config.py @@ -1,11 +1,64 @@ from dataclasses import dataclass, field -from .shared_configs import BaseTTSConfig +from TTS.tts.configs.shared_configs import BaseTTSConfig @dataclass class GlowTTSConfig(BaseTTSConfig): - """Defines parameters for GlowTTS model.""" + """Defines parameters for GlowTTS model. + + Example: + + >>> from TTS.tts.configs import GlowTTSConfig + >>> config = GlowTTSConfig() + + Args: + model(str): + Model name used for selecting the right model at initialization. Defaults to `glow_tts`. + encoder_type (str): + Type of the encoder used by the model. Look at `TTS.tts.layers.glow_tts.encoder` for more details. + Defaults to `rel_pos_transformers`. + encoder_params (dict): + Parameters used to define the encoder network. Look at `TTS.tts.layers.glow_tts.encoder` for more details. + Defaults to `{"kernel_size": 3, "dropout_p": 0.1, "num_layers": 6, "num_heads": 2, "hidden_channels_ffn": 768}` + use_encoder_prenet (bool): + enable / disable the use of a prenet for the encoder. Defaults to True. + hidden_channels_encoder (int): + Number of base hidden channels used by the encoder network. It defines the input and the output channel sizes, + and for some encoder types internal hidden channels sizes too. Defaults to 192. + hidden_channels_decoder (int): + Number of base hidden channels used by the decoder WaveNet network. Defaults to 192 as in the original work. + hidden_channels_duration_predictor (int): + Number of layer channels of the duration predictor network. Defaults to 256 as in the original work. + data_dep_init_steps (int): + Number of steps used for computing normalization parameters at the beginning of the training. GlowTTS uses + Activation Normalization that pre-computes normalization stats at the beginning and use the same values + for the rest. Defaults to 10. + style_wav_for_test (str): + Path to the wav file used for changing the style of the speech. Defaults to None. + inference_noise_scale (float): + Variance used for sampling the random noise added to the decoder's input at inference. Defaults to 0.0. + use_speaker_embedding (bool): + enable / disable using speaker embeddings for multi-speaker models. If set True, the model is + in the multi-speaker mode. Defaults to False. + use_external_speaker_embedding_file (bool): + enable /disable using external speaker embeddings in place of the learned embeddings. Defaults to False. + external_speaker_embedding_file (str): + Path to the file including pre-computed speaker embeddings. Defaults to None. + noam_schedule (bool): + enable / disable the use of Noam LR scheduler. Defaults to False. + warmup_steps (int): + Number of warm-up steps for the Noam scheduler. Defaults 4000. + lr (float): + Initial learning rate. Defaults to `1e-3`. + wd (float): + Weight decay coefficient. Defaults to `1e-7`. + min_seq_len (int): + Minimum input sequence length to be used at training. + max_seq_len (int): + Maximum input sequence length to be used at training. Larger values result in more VRAM usage. + """ + model: str = "glow_tts" @@ -47,4 +100,4 @@ class GlowTTSConfig(BaseTTSConfig): # overrides min_seq_len: int = 3 max_seq_len: int = 500 - r: int = 1 + r: int = 1 # DO NOT CHANGE - TODO: make this immutable once coqpit implements it. diff --git a/TTS/tts/configs/shared_configs.py b/TTS/tts/configs/shared_configs.py index f3d0a528..885896c7 100644 --- a/TTS/tts/configs/shared_configs.py +++ b/TTS/tts/configs/shared_configs.py @@ -8,8 +8,20 @@ from TTS.config import BaseAudioConfig, BaseDatasetConfig, BaseTrainingConfig @dataclass class GSTConfig(Coqpit): - """Defines Global Style Toke module""" + """Defines the Global Style Token Module + Args: + gst_style_input_wav (str): + Path to the wav file used to define the style of the output speech at inference. Defaults to None. + gst_style_input_weights (dict): + Defines the weights for each style token used at inference. Defaults to None. + gst_embedding_dim (int): + Defines the size of the GST embedding vector dimensions. Defaults to 256. + gst_num_heads (int): + Number of attention heads used by the multi-head attention. Defaults to 4. + gst_num_style_tokens (int): + Number of style token vectors. Defaults to 10. + """ gst_style_input_wav: str = None gst_style_input_weights: dict = None gst_embedding_dim: int = 256 @@ -33,7 +45,26 @@ class GSTConfig(Coqpit): @dataclass class CharactersConfig(Coqpit): - """Defines character or phoneme set used by the model""" + """Defines character or phoneme set used by the model + + Args: + pad (str): + characters in place of empty padding. Defaults to None. + eos (str): + characters showing the end of a sentence. Defaults to None. + bos (str): + characters showing the beginning of a sentence. Defaults to None. + characters (str): + character set used by the model. Characters not in this list are ignored when converting input text to + a list of sequence IDs. Defaults to None. + punctuations (str): + characters considered as punctuation as parsing the input sentence. Defaults to None. + phonemes (str): + characters considered as parsing phonemes. Defaults to None. + unique (bool): + remove any duplicate characters in the character lists. It is a bandaid for compatibility with the old + models trained with character lists with duplicates. + """ pad: str = None eos: str = None @@ -58,7 +89,48 @@ class CharactersConfig(Coqpit): @dataclass class BaseTTSConfig(BaseTrainingConfig): - """Shared parameters among all the tts models.""" + """Shared parameters among all the tts models. + + Args: + audio (BaseAudioConfig): + Audio processor config object instance. + use_phonemes (bool): + enable / disable phoneme use. + compute_input_seq_cache (bool): + enable / disable precomputation of the phoneme sequences. At the expense of some delay at the beginning of + the training, It allows faster data loader time and precise limitation with `max_seq_len` and + `min_seq_len`. + text_cleaner (str): + Name of the text cleaner used for cleaning and formatting transcripts. + enable_eos_bos_chars (bool): + enable / disable the use of eos and bos characters. + test_senteces_file (str): + Path to a txt file that has sentences used at test time. The file must have a sentence per line. + phoneme_cache_path (str): + Path to the output folder caching the computed phonemes for each sample. + characters (CharactersConfig): + Instance of a CharactersConfig class. + batch_group_size (int): + Size of the batch groups used for bucketing. By default, the dataloader orders samples by the sequence + length for a more efficient and stable training. If `batch_group_size > 1` then it performs bucketing to + prevent using the same batches for each epoch. + loss_masking (bool): + enable / disable masking loss values against padded segments of samples in a batch. + min_seq_len (int): + Minimum input sequence length to be used at training. + max_seq_len (int): + Maximum input sequence length to be used at training. Larger values result in more VRAM usage. + compute_f0 (int): + (Not in use yet). + use_noise_augment (bool): + Augment the input audio with random noise. + add_blank (bool): + Add blank characters between each other two characters. It improves performance for some models at expense + of slower run-time due to the longer input sequence. + datasets (List[BaseDatasetConfig]): + List of datasets used for training. If multiple datasets are provided, they are merged and used together + for training. + """ audio: BaseAudioConfig = field(default_factory=BaseAudioConfig) # phoneme settings diff --git a/TTS/tts/configs/speedy_speech_config.py b/TTS/tts/configs/speedy_speech_config.py index a2e90cb8..85df543d 100644 --- a/TTS/tts/configs/speedy_speech_config.py +++ b/TTS/tts/configs/speedy_speech_config.py @@ -1,11 +1,74 @@ from dataclasses import dataclass, field -from .shared_configs import BaseTTSConfig +from TTS.tts.configs.shared_configs import BaseTTSConfig @dataclass class SpeedySpeechConfig(BaseTTSConfig): - """Defines parameters for Speedy Speech (feed-forward encoder-decoder) based models.""" + """Defines parameters for Speedy Speech (feed-forward encoder-decoder) based models. + + Example: + + >>> from TTS.tts.configs import SpeedySpeechConfig + >>> config = SpeedySpeechConfig() + + Args: + model (str): + Model name used for selecting the right model at initialization. Defaults to `speedy_speech`. + positional_encoding (bool): + enable / disable positional encoding applied to the encoder output. Defaults to True. + hidden_channels (int): + Base number of hidden channels. Defines all the layers expect ones defined by the specific encoder or decoder + parameters. Defaults to 128. + encoder_type (str): + Type of the encoder used by the model. Look at `TTS.tts.layers.feed_forward.encoder` for more details. + Defaults to `residual_conv_bn`. + encoder_params (dict): + Parameters used to define the encoder network. Look at `TTS.tts.layers.feed_forward.encoder` for more details. + Defaults to `{"kernel_size": 4, "dilations": [1, 2, 4, 1, 2, 4, 1, 2, 4, 1, 2, 4, 1], "num_conv_blocks": 2, "num_res_blocks": 13}` + decoder_type (str): + Type of the decoder used by the model. Look at `TTS.tts.layers.feed_forward.decoder` for more details. + Defaults to `residual_conv_bn`. + decoder_params (dict): + Parameters used to define the decoder network. Look at `TTS.tts.layers.feed_forward.decoder` for more details. + Defaults to `{"kernel_size": 4, "dilations": [1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1], "num_conv_blocks": 2, "num_res_blocks": 17}` + hidden_channels_encoder (int): + Number of base hidden channels used by the encoder network. It defines the input and the output channel sizes, + and for some encoder types internal hidden channels sizes too. Defaults to 192. + hidden_channels_decoder (int): + Number of base hidden channels used by the decoder WaveNet network. Defaults to 192 as in the original work. + hidden_channels_duration_predictor (int): + Number of layer channels of the duration predictor network. Defaults to 256 as in the original work. + data_dep_init_steps (int): + Number of steps used for computing normalization parameters at the beginning of the training. GlowTTS uses + Activation Normalization that pre-computes normalization stats at the beginning and use the same values + for the rest. Defaults to 10. + use_speaker_embedding (bool): + enable / disable using speaker embeddings for multi-speaker models. If set True, the model is + in the multi-speaker mode. Defaults to False. + use_external_speaker_embedding_file (bool): + enable /disable using external speaker embeddings in place of the learned embeddings. Defaults to False. + external_speaker_embedding_file (str): + Path to the file including pre-computed speaker embeddings. Defaults to None. + noam_schedule (bool): + enable / disable the use of Noam LR scheduler. Defaults to False. + warmup_steps (int): + Number of warm-up steps for the Noam scheduler. Defaults 4000. + lr (float): + Initial learning rate. Defaults to `1e-3`. + wd (float): + Weight decay coefficient. Defaults to `1e-7`. + ssim_alpha (float): + Weight for the SSIM loss. If set <= 0, disables the SSIM loss. Defaults to 1.0. + huber_alpha (float): + Weight for the duration predictor's loss. Defaults to 1.0. + l1_alpha (float): + Weight for the L1 spectrogram loss. If set <= 0, disables the L1 loss. Defaults to 1.0. + min_seq_len (int): + Minimum input sequence length to be used at training. + max_seq_len (int): + Maximum input sequence length to be used at training. Larger values result in more VRAM usage. + """ model: str = "speedy_speech" # model specific params @@ -50,4 +113,4 @@ class SpeedySpeechConfig(BaseTTSConfig): # overrides min_seq_len: int = 13 max_seq_len: int = 200 - r: int = 1 + r: int = 1 #DO NOT CHANGE diff --git a/TTS/tts/configs/tacotron2_config.py b/TTS/tts/configs/tacotron2_config.py index e6767d41..ea66fae8 100644 --- a/TTS/tts/configs/tacotron2_config.py +++ b/TTS/tts/configs/tacotron2_config.py @@ -5,6 +5,114 @@ from TTS.tts.configs.tacotron_config import TacotronConfig @dataclass class Tacotron2Config(TacotronConfig): - """Defines parameters for Tacotron2 based models.""" + """Defines parameters for Tacotron2 based models. + + Example: + + >>> from TTS.tts.configs import Tacotron2Config + >>> config = Tacotron2Config() + + Args: + model (str): + Model name used to select the right model class to initilize. Defaults to `Tacotron2`. + use_gst (bool): + enable / disable the use of Global Style Token modules. Defaults to False. + gst (GSTConfig): + Instance of `GSTConfig` class. + gst_style_input (str): + Path to the wav file used at inference to set the speech style through GST. If `GST` is enabled and + this is not defined, the model uses a zero vector as an input. Defaults to None. + r (int): + Number of output frames that the decoder computed per iteration. Larger values makes training and inference + faster but reduces the quality of the output frames. This needs to be tuned considering your own needs. + Defaults to 1. + gradual_trainin (List[List]): + Parameters for the gradual training schedule. It is in the form `[[a, b, c], [d ,e ,f] ..]` where `a` is + the step number to start using the rest of the values, `b` is the `r` value and `c` is the batch size. + If sets None, no gradual training is used. Defaults to None. + memory_size (int): + Defines the number of previous frames used by the Prenet. If set to < 0, then it uses only the last frame. + Defaults to -1. + prenet_type (str): + `original` or `bn`. `original` sets the default Prenet and `bn` uses Batch Normalization version of the + Prenet. Defaults to `original`. + prenet_dropout (bool): + enables / disables the use of dropout in the Prenet. Defaults to True. + prenet_dropout_at_inference (bool): + enable / disable the use of dropout in the Prenet at the inference time. Defaults to False. + stopnet (bool): + enable /disable the Stopnet that predicts the end of the decoder sequence. Defaults to True. + stopnet_pos_weight (float): + Weight that is applied to over-weight positive instances in the Stopnet loss. Use larger values with + datasets with longer sentences. Defaults to 10. + separate_stopnet (bool): + Use a distinct Stopnet which is trained separately from the rest of the model. Defaults to True. + attention_type (str): + attention type. Check ```TTS.tts.layers.attentions.init_attn```. Defaults to 'original'. + attention_heads (int): + Number of attention heads for GMM attention. Defaults to 5. + windowing (bool): + It especially useful at inference to keep attention alignment diagonal. Defaults to False. + use_forward_attn (bool): + It is only valid if ```attn_type``` is ```original```. Defaults to False. + forward_attn_mask (bool): + enable/disable extra masking over forward attention. It is useful at inference to prevent + possible attention failures. Defaults to False. + transition_agent (bool): + enable/disable transition agent in forward attention. Defaults to False. + location_attn (bool): + enable/disable location sensitive attention as in the original Tacotron2 paper. + It is only valid if ```attn_type``` is ```original```. Defaults to True. + bidirectional_decoder (bool): + enable/disable bidirectional decoding. Defaults to False. + double_decoder_consistency (bool): + enable/disable double decoder consistency. Defaults to False. + ddc_r (int): + reduction rate used by the coarse decoder when `double_decoder_consistency` is in use. Set this + as a multiple of the `r` value. Defaults to 6. + use_speaker_embedding (bool): + enable / disable using speaker embeddings for multi-speaker models. If set True, the model is + in the multi-speaker mode. Defaults to False. + use_external_speaker_embedding_file (bool): + enable /disable using external speaker embeddings in place of the learned embeddings. Defaults to False. + external_speaker_embedding_file (str): + Path to the file including pre-computed speaker embeddings. Defaults to None. + noam_schedule (bool): + enable / disable the use of Noam LR scheduler. Defaults to False. + warmup_steps (int): + Number of warm-up steps for the Noam scheduler. Defaults 4000. + lr (float): + Initial learning rate. Defaults to `1e-4`. + wd (float): + Weight decay coefficient. Defaults to `1e-6`. + grad_clip (float): + Gradient clipping threshold. Defaults to `5`. + seq_len_notm (bool): + enable / disable the sequnce length normalization in the loss functions. If set True, loss of a sample + is divided by the sequence length. Defaults to False. + loss_masking (bool): + enable / disable masking the paddings of the samples in loss computation. Defaults to True. + decoder_loss_alpha (float): + Weight for the decoder loss of the Tacotron model. If set less than or equal to zero, it disables the + corresponding loss function. Defaults to 0.25 + postnet_loss_alpha (float): + Weight for the postnet loss of the Tacotron model. If set less than or equal to zero, it disables the + corresponding loss function. Defaults to 0.25 + postnet_diff_spec_alpha (float): + Weight for the postnet differential loss of the Tacotron model. If set less than or equal to zero, it disables the + corresponding loss function. Defaults to 0.25 + decoder_diff_spec_alpha (float): + Weight for the decoder differential loss of the Tacotron model. If set less than or equal to zero, it disables the + corresponding loss function. Defaults to 0.25 + decoder_ssim_alpha (float): + Weight for the decoder SSIM loss of the Tacotron model. If set less than or equal to zero, it disables the + corresponding loss function. Defaults to 0.25 + postnet_ssim_alpha (float): + Weight for the postnet SSIM loss of the Tacotron model. If set less than or equal to zero, it disables the + corresponding loss function. Defaults to 0.25 + ga_alpha (float): + Weight for the guided attention loss. If set less than or equal to zero, it disables the corresponding loss + function. Defaults to 5. + """ model: str = "tacotron2" diff --git a/TTS/tts/configs/tacotron_config.py b/TTS/tts/configs/tacotron_config.py index 5c509927..53f5739e 100644 --- a/TTS/tts/configs/tacotron_config.py +++ b/TTS/tts/configs/tacotron_config.py @@ -1,12 +1,120 @@ from dataclasses import dataclass from typing import List -from .shared_configs import BaseTTSConfig, GSTConfig +from TTS.tts.configs.shared_configs import BaseTTSConfig, GSTConfig @dataclass class TacotronConfig(BaseTTSConfig): - """Defines parameters for Tacotron based models.""" + """Defines parameters for Tacotron based models. + + Example: + + >>> from TTS.tts.configs import TacotronConfig + >>> config = TacotronConfig() + + Args: + model (str): + Model name used to select the right model class to initilize. Defaults to `Tacotron`. + use_gst (bool): + enable / disable the use of Global Style Token modules. Defaults to False. + gst (GSTConfig): + Instance of `GSTConfig` class. + gst_style_input (str): + Path to the wav file used at inference to set the speech style through GST. If `GST` is enabled and + this is not defined, the model uses a zero vector as an input. Defaults to None. + r (int): + Number of output frames that the decoder computed per iteration. Larger values makes training and inference + faster but reduces the quality of the output frames. This needs to be tuned considering your own needs. + Defaults to 1. + gradual_trainin (List[List]): + Parameters for the gradual training schedule. It is in the form `[[a, b, c], [d ,e ,f] ..]` where `a` is + the step number to start using the rest of the values, `b` is the `r` value and `c` is the batch size. + If sets None, no gradual training is used. Defaults to None. + memory_size (int): + Defines the number of previous frames used by the Prenet. If set to < 0, then it uses only the last frame. + Defaults to -1. + prenet_type (str): + `original` or `bn`. `original` sets the default Prenet and `bn` uses Batch Normalization version of the + Prenet. Defaults to `original`. + prenet_dropout (bool): + enables / disables the use of dropout in the Prenet. Defaults to True. + prenet_dropout_at_inference (bool): + enable / disable the use of dropout in the Prenet at the inference time. Defaults to False. + stopnet (bool): + enable /disable the Stopnet that predicts the end of the decoder sequence. Defaults to True. + stopnet_pos_weight (float): + Weight that is applied to over-weight positive instances in the Stopnet loss. Use larger values with + datasets with longer sentences. Defaults to 10. + separate_stopnet (bool): + Use a distinct Stopnet which is trained separately from the rest of the model. Defaults to True. + attention_type (str): + attention type. Check ```TTS.tts.layers.attentions.init_attn```. Defaults to 'original'. + attention_heads (int): + Number of attention heads for GMM attention. Defaults to 5. + windowing (bool): + It especially useful at inference to keep attention alignment diagonal. Defaults to False. + use_forward_attn (bool): + It is only valid if ```attn_type``` is ```original```. Defaults to False. + forward_attn_mask (bool): + enable/disable extra masking over forward attention. It is useful at inference to prevent + possible attention failures. Defaults to False. + transition_agent (bool): + enable/disable transition agent in forward attention. Defaults to False. + location_attn (bool): + enable/disable location sensitive attention as in the original Tacotron2 paper. + It is only valid if ```attn_type``` is ```original```. Defaults to True. + bidirectional_decoder (bool): + enable/disable bidirectional decoding. Defaults to False. + double_decoder_consistency (bool): + enable/disable double decoder consistency. Defaults to False. + ddc_r (int): + reduction rate used by the coarse decoder when `double_decoder_consistency` is in use. Set this + as a multiple of the `r` value. Defaults to 6. + use_speaker_embedding (bool): + enable / disable using speaker embeddings for multi-speaker models. If set True, the model is + in the multi-speaker mode. Defaults to False. + use_external_speaker_embedding_file (bool): + enable /disable using external speaker embeddings in place of the learned embeddings. Defaults to False. + external_speaker_embedding_file (str): + Path to the file including pre-computed speaker embeddings. Defaults to None. + noam_schedule (bool): + enable / disable the use of Noam LR scheduler. Defaults to False. + warmup_steps (int): + Number of warm-up steps for the Noam scheduler. Defaults 4000. + lr (float): + Initial learning rate. Defaults to `1e-4`. + wd (float): + Weight decay coefficient. Defaults to `1e-6`. + grad_clip (float): + Gradient clipping threshold. Defaults to `5`. + seq_len_notm (bool): + enable / disable the sequnce length normalization in the loss functions. If set True, loss of a sample + is divided by the sequence length. Defaults to False. + loss_masking (bool): + enable / disable masking the paddings of the samples in loss computation. Defaults to True. + decoder_loss_alpha (float): + Weight for the decoder loss of the Tacotron model. If set less than or equal to zero, it disables the + corresponding loss function. Defaults to 0.25 + postnet_loss_alpha (float): + Weight for the postnet loss of the Tacotron model. If set less than or equal to zero, it disables the + corresponding loss function. Defaults to 0.25 + postnet_diff_spec_alpha (float): + Weight for the postnet differential loss of the Tacotron model. If set less than or equal to zero, it disables the + corresponding loss function. Defaults to 0.25 + decoder_diff_spec_alpha (float): + Weight for the decoder differential loss of the Tacotron model. If set less than or equal to zero, it disables the + corresponding loss function. Defaults to 0.25 + decoder_ssim_alpha (float): + Weight for the decoder SSIM loss of the Tacotron model. If set less than or equal to zero, it disables the + corresponding loss function. Defaults to 0.25 + postnet_ssim_alpha (float): + Weight for the postnet SSIM loss of the Tacotron model. If set less than or equal to zero, it disables the + corresponding loss function. Defaults to 0.25 + ga_alpha (float): + Weight for the guided attention loss. If set less than or equal to zero, it disables the corresponding loss + function. Defaults to 5. + """ model: str = "tacotron" use_gst: bool = False diff --git a/TTS/tts/datasets/preprocess.py b/TTS/tts/datasets/preprocess.py index 4523d70b..72ab160e 100644 --- a/TTS/tts/datasets/preprocess.py +++ b/TTS/tts/datasets/preprocess.py @@ -52,19 +52,19 @@ def load_meta_data(datasets, eval_split=True): print(f" | > Found {len(meta_data_train)} files in {Path(root_path).resolve()}") # load evaluation split if set if eval_split: - if meta_file_val is None: - meta_data_eval, meta_data_train = split_dataset(meta_data_train) - else: + if meta_file_val: meta_data_eval = preprocessor(root_path, meta_file_val) + else: + meta_data_eval, meta_data_train = split_dataset(meta_data_train) meta_data_eval_all += meta_data_eval meta_data_train_all += meta_data_train # load attention masks for duration predictor training - if dataset.meta_file_attn_mask is not None: + if dataset.meta_file_attn_mask: meta_data = dict(load_attention_mask_meta_data(dataset["meta_file_attn_mask"])) for idx, ins in enumerate(meta_data_train_all): attn_file = meta_data[ins[1]].strip() meta_data_train_all[idx].append(attn_file) - if meta_data_eval_all is not None: + if meta_data_eval_all: for idx, ins in enumerate(meta_data_eval_all): attn_file = meta_data[ins[1]].strip() meta_data_eval_all[idx].append(attn_file) diff --git a/TTS/vocoder/configs/fullband_melgan_config.py b/TTS/vocoder/configs/fullband_melgan_config.py index 7baa68ee..53444214 100644 --- a/TTS/vocoder/configs/fullband_melgan_config.py +++ b/TTS/vocoder/configs/fullband_melgan_config.py @@ -5,7 +5,62 @@ from .shared_configs import BaseGANVocoderConfig @dataclass class FullbandMelganConfig(BaseGANVocoderConfig): - """Defines parameters for FullbandMelGAN vocoder.""" + """Defines parameters for FullBand MelGAN vocoder. + + Example: + + >>> from TTS.vocoder.configs import FullbandMelganConfig + >>> config = FullbandMelganConfig() + + Args: + model (str): + Model name used for selecting the right model at initialization. Defaults to `melgan`. + discriminator_model (str): One of the discriminators from `TTS.vocoder.models.*_discriminator`. Defaults to + 'melgan_multiscale_discriminator`. + discriminator_model_params (dict): The discriminator model parameters. Defaults to + '{"base_channels": 16, "max_channels": 1024, "downsample_factors": [4, 4, 4, 4]}` + generator_model (str): One of the generators from TTS.vocoder.models.*`. Every other non-GAN vocoder model is + considered as a generator too. Defaults to `melgan_generator`. + batch_size (int): + Batch size used at training. Larger values use more memory. Defaults to 16. + seq_len (int): + Audio segment length used at training. Larger values use more memory. Defaults to 8192. + pad_short (int): + Additional padding applied to the audio samples shorter than `seq_len`. Defaults to 0. + use_noise_augment (bool): + enable / disable random noise added to the input waveform. The noise is added after computing the + features. Defaults to True. + use_cache (bool): + enable / disable in memory caching of the computed features. It can cause OOM error if the system RAM is + not large enough. Defaults to True. + use_stft_loss (bool): + enable / disable use of STFT loss originally used by ParallelWaveGAN model. Defaults to True. + use_subband_stft (bool): + enable / disable use of subband loss computation originally used by MultiBandMelgan model. Defaults to True. + use_mse_gan_loss (bool): + enable / disable using Mean Squeare Error GAN loss. Defaults to True. + use_hinge_gan_loss (bool): + enable / disable using Hinge GAN loss. You should choose either Hinge or MSE loss for training GAN models. + Defaults to False. + use_feat_match_loss (bool): + enable / disable using Feature Matching loss originally used by MelGAN model. Defaults to True. + use_l1_spec_loss (bool): + enable / disable using L1 spectrogram loss originally used by HifiGAN model. Defaults to False. + stft_loss_params (dict): STFT loss parameters. Default to + `{"n_ffts": [1024, 2048, 512], "hop_lengths": [120, 240, 50], "win_lengths": [600, 1200, 240]}` + stft_loss_weight (float): STFT loss weight that multiplies the computed loss before summing up the total + model loss. Defaults to 0.5. + subband_stft_loss_weight (float): + Subband STFT loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0. + mse_G_loss_weight (float): + MSE generator loss weight that multiplies the computed loss before summing up the total loss. faults to 2.5. + hinge_G_loss_weight (float): + Hinge generator loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0. + feat_match_loss_weight (float): + Feature matching loss weight that multiplies the computed loss before summing up the total loss. faults to 108. + l1_spec_loss_weight (float): + L1 spectrogram loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0. + """ model: str = "melgan" @@ -48,4 +103,4 @@ class FullbandMelganConfig(BaseGANVocoderConfig): mse_G_loss_weight: float = 2.5 hinge_G_loss_weight: float = 0 feat_match_loss_weight: float = 108 - l1_spec_loss_weight: float = 0 + l1_spec_loss_weight: float = 0.0 diff --git a/TTS/vocoder/configs/hifigan_config.py b/TTS/vocoder/configs/hifigan_config.py index cfbf5510..f76bb14c 100644 --- a/TTS/vocoder/configs/hifigan_config.py +++ b/TTS/vocoder/configs/hifigan_config.py @@ -1,11 +1,94 @@ from dataclasses import dataclass, field -from .shared_configs import BaseGANVocoderConfig +from TTS.vocoder.configs.shared_configs import BaseGANVocoderConfig @dataclass class HifiganConfig(BaseGANVocoderConfig): - """Defines parameters for HifiGAN vocoder.""" + """Defines parameters for FullBand MelGAN vocoder. + + Example: + + >>> from TTS.vocoder.configs import HifiganConfig + >>> config = HifiganConfig() + + Args: + model (str): + Model name used for selecting the right model at initialization. Defaults to `hifigan`. + discriminator_model (str): One of the discriminators from `TTS.vocoder.models.*_discriminator`. Defaults to + 'hifigan_discriminator`. + generator_model (str): One of the generators from TTS.vocoder.models.*`. Every other non-GAN vocoder model is + considered as a generator too. Defaults to `hifigan_generator`. + generator_model_params (dict): Parameters of the generator model. Defaults to + ` + { + "use_mel": True, + "sample_rate": 22050, + "n_fft": 1024, + "hop_length": 256, + "win_length": 1024, + "n_mels": 80, + "mel_fmin": 0.0, + "mel_fmax": None, + } + ` + batch_size (int): + Batch size used at training. Larger values use more memory. Defaults to 16. + seq_len (int): + Audio segment length used at training. Larger values use more memory. Defaults to 8192. + pad_short (int): + Additional padding applied to the audio samples shorter than `seq_len`. Defaults to 0. + use_noise_augment (bool): + enable / disable random noise added to the input waveform. The noise is added after computing the + features. Defaults to True. + use_cache (bool): + enable / disable in memory caching of the computed features. It can cause OOM error if the system RAM is + not large enough. Defaults to True. + use_stft_loss (bool): + enable / disable use of STFT loss originally used by ParallelWaveGAN model. Defaults to True. + use_subband_stft (bool): + enable / disable use of subband loss computation originally used by MultiBandMelgan model. Defaults to True. + use_mse_gan_loss (bool): + enable / disable using Mean Squeare Error GAN loss. Defaults to True. + use_hinge_gan_loss (bool): + enable / disable using Hinge GAN loss. You should choose either Hinge or MSE loss for training GAN models. + Defaults to False. + use_feat_match_loss (bool): + enable / disable using Feature Matching loss originally used by MelGAN model. Defaults to True. + use_l1_spec_loss (bool): + enable / disable using L1 spectrogram loss originally used by HifiGAN model. Defaults to False. + stft_loss_params (dict): + STFT loss parameters. Default to + `{ + "n_ffts": [1024, 2048, 512], + "hop_lengths": [120, 240, 50], + "win_lengths": [600, 1200, 240] + }` + l1_spec_loss_params (dict): + L1 spectrogram loss parameters. Default to + `{ + "use_mel": True, + "sample_rate": 22050, + "n_fft": 1024, + "hop_length": 256, + "win_length": 1024, + "n_mels": 80, + "mel_fmin": 0.0, + "mel_fmax": None, + }` + stft_loss_weight (float): STFT loss weight that multiplies the computed loss before summing up the total + model loss. Defaults to 0.5. + subband_stft_loss_weight (float): + Subband STFT loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0. + mse_G_loss_weight (float): + MSE generator loss weight that multiplies the computed loss before summing up the total loss. faults to 2.5. + hinge_G_loss_weight (float): + Hinge generator loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0. + feat_match_loss_weight (float): + Feature matching loss weight that multiplies the computed loss before summing up the total loss. faults to 108. + l1_spec_loss_weight (float): + L1 spectrogram loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0. + """ model: str = "hifigan" # model specific params diff --git a/TTS/vocoder/configs/melgan_config.py b/TTS/vocoder/configs/melgan_config.py index cee64330..dc35b6f8 100644 --- a/TTS/vocoder/configs/melgan_config.py +++ b/TTS/vocoder/configs/melgan_config.py @@ -1,11 +1,66 @@ from dataclasses import dataclass, field -from .shared_configs import BaseGANVocoderConfig +from TTS.vocoder.configs.shared_configs import BaseGANVocoderConfig @dataclass class MelganConfig(BaseGANVocoderConfig): - """Defines parameters for MelGAN vocoder.""" + """Defines parameters for MelGAN vocoder. + + Example: + + >>> from TTS.vocoder.configs import MelganConfig + >>> config = MelganConfig() + + Args: + model (str): + Model name used for selecting the right model at initialization. Defaults to `melgan`. + discriminator_model (str): One of the discriminators from `TTS.vocoder.models.*_discriminator`. Defaults to + 'melgan_multiscale_discriminator`. + discriminator_model_params (dict): The discriminator model parameters. Defaults to + '{"base_channels": 16, "max_channels": 1024, "downsample_factors": [4, 4, 4, 4]}` + generator_model (str): One of the generators from TTS.vocoder.models.*`. Every other non-GAN vocoder model is + considered as a generator too. Defaults to `melgan_generator`. + batch_size (int): + Batch size used at training. Larger values use more memory. Defaults to 16. + seq_len (int): + Audio segment length used at training. Larger values use more memory. Defaults to 8192. + pad_short (int): + Additional padding applied to the audio samples shorter than `seq_len`. Defaults to 0. + use_noise_augment (bool): + enable / disable random noise added to the input waveform. The noise is added after computing the + features. Defaults to True. + use_cache (bool): + enable / disable in memory caching of the computed features. It can cause OOM error if the system RAM is + not large enough. Defaults to True. + use_stft_loss (bool): + enable / disable use of STFT loss originally used by ParallelWaveGAN model. Defaults to True. + use_subband_stft (bool): + enable / disable use of subband loss computation originally used by MultiBandMelgan model. Defaults to True. + use_mse_gan_loss (bool): + enable / disable using Mean Squeare Error GAN loss. Defaults to True. + use_hinge_gan_loss (bool): + enable / disable using Hinge GAN loss. You should choose either Hinge or MSE loss for training GAN models. + Defaults to False. + use_feat_match_loss (bool): + enable / disable using Feature Matching loss originally used by MelGAN model. Defaults to True. + use_l1_spec_loss (bool): + enable / disable using L1 spectrogram loss originally used by HifiGAN model. Defaults to False. + stft_loss_params (dict): STFT loss parameters. Default to + `{"n_ffts": [1024, 2048, 512], "hop_lengths": [120, 240, 50], "win_lengths": [600, 1200, 240]}` + stft_loss_weight (float): STFT loss weight that multiplies the computed loss before summing up the total + model loss. Defaults to 0.5. + subband_stft_loss_weight (float): + Subband STFT loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0. + mse_G_loss_weight (float): + MSE generator loss weight that multiplies the computed loss before summing up the total loss. faults to 2.5. + hinge_G_loss_weight (float): + Hinge generator loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0. + feat_match_loss_weight (float): + Feature matching loss weight that multiplies the computed loss before summing up the total loss. faults to 108. + l1_spec_loss_weight (float): + L1 spectrogram loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0. + """ model: str = "melgan" diff --git a/TTS/vocoder/configs/multiband_melgan_config.py b/TTS/vocoder/configs/multiband_melgan_config.py index 98f1d353..f7aabdb3 100644 --- a/TTS/vocoder/configs/multiband_melgan_config.py +++ b/TTS/vocoder/configs/multiband_melgan_config.py @@ -1,11 +1,95 @@ from dataclasses import dataclass, field -from .shared_configs import BaseGANVocoderConfig +from TTS.vocoder.configs.shared_configs import BaseGANVocoderConfig @dataclass class MultibandMelganConfig(BaseGANVocoderConfig): - """Defines parameters for MultiBandMelGAN vocoder.""" + """Defines parameters for MultiBandMelGAN vocoder. + + Example: + + >>> from TTS.vocoder.configs import MultibandMelganConfig + >>> config = MultibandMelganConfig() + + Args: + model (str): + Model name used for selecting the right model at initialization. Defaults to `melgan`. + discriminator_model (str): One of the discriminators from `TTS.vocoder.models.*_discriminator`. Defaults to + 'melgan_multiscale_discriminator`. + discriminator_model_params (dict): The discriminator model parameters. Defaults to + '{ + "base_channels": 16, + "max_channels": 512, + "downsample_factors": [4, 4, 4] + }` + generator_model (str): One of the generators from TTS.vocoder.models.*`. Every other non-GAN vocoder model is + considered as a generator too. Defaults to `melgan_generator`. + generator_model_param (dict): + The generator model parameters. Defaults to `{"upsample_factors": [8, 4, 2], "num_res_blocks": 4}`. + use_pqmf (bool): + enable / disable PQMF modulation for multi-band training. Defaults to True. + lr_gen (float): + Initial learning rate for the generator model. Defaults to 0.0001. + lr_disc (float): + Initial learning rate for the discriminator model. Defaults to 0.0001. + optimizer (torch.optim.Optimizer): + Optimizer used for the training. Defaults to `AdamW`. + optimizer_params (dict): + Optimizer kwargs. Defaults to `{"betas": [0.8, 0.99], "weight_decay": 0.0}` + lr_scheduler_gen (torch.optim.Scheduler): + Learning rate scheduler for the generator. Defaults to `MultiStepLR`. + lr_scheduler_gen_params (dict): + Parameters for the generator learning rate scheduler. Defaults to + `{"gamma": 0.5, "milestones": [100000, 200000, 300000, 400000, 500000, 600000]}`. + lr_scheduler_disc (torch.optim.Scheduler): + Learning rate scheduler for the discriminator. Defaults to `MultiStepLR`. + lr_scheduler_dict_params (dict): + Parameters for the discriminator learning rate scheduler. Defaults to + `{"gamma": 0.5, "milestones": [100000, 200000, 300000, 400000, 500000, 600000]}`. + batch_size (int): + Batch size used at training. Larger values use more memory. Defaults to 16. + seq_len (int): + Audio segment length used at training. Larger values use more memory. Defaults to 8192. + pad_short (int): + Additional padding applied to the audio samples shorter than `seq_len`. Defaults to 0. + use_noise_augment (bool): + enable / disable random noise added to the input waveform. The noise is added after computing the + features. Defaults to True. + use_cache (bool): + enable / disable in memory caching of the computed features. It can cause OOM error if the system RAM is + not large enough. Defaults to True. + steps_to_start_discriminator (int): + Number of steps required to start training the discriminator. Defaults to 0. + use_stft_loss (bool):` + enable / disable use of STFT loss originally used by ParallelWaveGAN model. Defaults to True. + use_subband_stft (bool): + enable / disable use of subband loss computation originally used by MultiBandMelgan model. Defaults to True. + use_mse_gan_loss (bool): + enable / disable using Mean Squeare Error GAN loss. Defaults to True. + use_hinge_gan_loss (bool): + enable / disable using Hinge GAN loss. You should choose either Hinge or MSE loss for training GAN models. + Defaults to False. + use_feat_match_loss (bool): + enable / disable using Feature Matching loss originally used by MelGAN model. Defaults to True. + use_l1_spec_loss (bool): + enable / disable using L1 spectrogram loss originally used by HifiGAN model. Defaults to False. + stft_loss_params (dict): STFT loss parameters. Default to + `{"n_ffts": [1024, 2048, 512], "hop_lengths": [120, 240, 50], "win_lengths": [600, 1200, 240]}` + stft_loss_weight (float): STFT loss weight that multiplies the computed loss before summing up the total + model loss. Defaults to 0.5. + subband_stft_loss_weight (float): + Subband STFT loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0. + mse_G_loss_weight (float): + MSE generator loss weight that multiplies the computed loss before summing up the total loss. faults to 2.5. + hinge_G_loss_weight (float): + Hinge generator loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0. + feat_match_loss_weight (float): + Feature matching loss weight that multiplies the computed loss before summing up the total loss. faults to 108. + l1_spec_loss_weight (float): + L1 spectrogram loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0. + """ + model: str = "multiband_melgan" @@ -58,8 +142,4 @@ class MultibandMelganConfig(BaseGANVocoderConfig): mse_G_loss_weight: float = 2.5 hinge_G_loss_weight: float = 0 feat_match_loss_weight: float = 108 - l1_spec_loss_weight: float = 0 - - # optimizer parameters - lr: float = 1e-4 - wd: float = 1e-6 + l1_spec_loss_weight: float = 0 \ No newline at end of file diff --git a/TTS/vocoder/configs/parallel_wavegan_config.py b/TTS/vocoder/configs/parallel_wavegan_config.py index b8a489a3..d132d2e1 100644 --- a/TTS/vocoder/configs/parallel_wavegan_config.py +++ b/TTS/vocoder/configs/parallel_wavegan_config.py @@ -5,7 +5,77 @@ from .shared_configs import BaseGANVocoderConfig @dataclass class ParallelWaveganConfig(BaseGANVocoderConfig): - """Defines parameters for ParallelWavegan vocoder.""" + """Defines parameters for ParallelWavegan vocoder. + + Args: + model (str): + Model name used for selecting the right configuration at initialization. Defaults to `parallel_wavegan`. + discriminator_model (str): One of the discriminators from `TTS.vocoder.models.*_discriminator`. Defaults to + 'parallel_wavegan_discriminator`. + discriminator_model_params (dict): The discriminator model kwargs. Defaults to + '{"num_layers": 10}` + generator_model (str): One of the generators from TTS.vocoder.models.*`. Every other non-GAN vocoder model is + considered as a generator too. Defaults to `parallel_wavegan_generator`. + generator_model_param (dict): + The generator model kwargs. Defaults to `{"upsample_factors": [4, 4, 4, 4], "stacks": 3, "num_res_blocks": 30}`. + batch_size (int): + Batch size used at training. Larger values use more memory. Defaults to 16. + seq_len (int): + Audio segment length used at training. Larger values use more memory. Defaults to 8192. + pad_short (int): + Additional padding applied to the audio samples shorter than `seq_len`. Defaults to 0. + use_noise_augment (bool): + enable / disable random noise added to the input waveform. The noise is added after computing the + features. Defaults to True. + use_cache (bool): + enable / disable in memory caching of the computed features. It can cause OOM error if the system RAM is + not large enough. Defaults to True. + steps_to_start_discriminator (int): + Number of steps required to start training the discriminator. Defaults to 0. + use_stft_loss (bool):` + enable / disable use of STFT loss originally used by ParallelWaveGAN model. Defaults to True. + use_subband_stft (bool): + enable / disable use of subband loss computation originally used by MultiBandMelgan model. Defaults to True. + use_mse_gan_loss (bool): + enable / disable using Mean Squeare Error GAN loss. Defaults to True. + use_hinge_gan_loss (bool): + enable / disable using Hinge GAN loss. You should choose either Hinge or MSE loss for training GAN models. + Defaults to False. + use_feat_match_loss (bool): + enable / disable using Feature Matching loss originally used by MelGAN model. Defaults to True. + use_l1_spec_loss (bool): + enable / disable using L1 spectrogram loss originally used by HifiGAN model. Defaults to False. + stft_loss_params (dict): STFT loss parameters. Default to + `{"n_ffts": [1024, 2048, 512], "hop_lengths": [120, 240, 50], "win_lengths": [600, 1200, 240]}` + stft_loss_weight (float): STFT loss weight that multiplies the computed loss before summing up the total + model loss. Defaults to 0.5. + subband_stft_loss_weight (float): + Subband STFT loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0. + mse_G_loss_weight (float): + MSE generator loss weight that multiplies the computed loss before summing up the total loss. faults to 2.5. + hinge_G_loss_weight (float): + Hinge generator loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0. + feat_match_loss_weight (float): + Feature matching loss weight that multiplies the computed loss before summing up the total loss. faults to 0. + l1_spec_loss_weight (float): + L1 spectrogram loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0. + lr_gen (float): + Generator model initial learning rate. Defaults to 0.0002. + lr_disc (float): + Discriminator model initial learning rate. Defaults to 0.0002. + optimizer (torch.optim.Optimizer): + Optimizer used for the training. Defaults to `AdamW`. + optimizer_params (dict): + Optimizer kwargs. Defaults to `{"betas": [0.8, 0.99], "weight_decay": 0.0}` + lr_scheduler_gen (torch.optim.Scheduler): + Learning rate scheduler for the generator. Defaults to `ExponentialLR`. + lr_scheduler_gen_params (dict): + Parameters for the generator learning rate scheduler. Defaults to `{"gamma": 0.999, "last_epoch": -1}`. + lr_scheduler_disc (torch.optim.Scheduler): + Learning rate scheduler for the discriminator. Defaults to `ExponentialLR`. + lr_scheduler_dict_params (dict): + Parameters for the discriminator learning rate scheduler. Defaults to `{"gamma": 0.999, "last_epoch": -1}`. + """ model: str = "parallel_wavegan" diff --git a/TTS/vocoder/configs/shared_configs.py b/TTS/vocoder/configs/shared_configs.py index b335c36f..664032d2 100644 --- a/TTS/vocoder/configs/shared_configs.py +++ b/TTS/vocoder/configs/shared_configs.py @@ -7,7 +7,34 @@ from TTS.config import BaseAudioConfig, BaseTrainingConfig @dataclass class BaseVocoderConfig(BaseTrainingConfig): - """Shared parameters among all the vocoder models.""" + """Shared parameters among all the vocoder models. + Args: + audio (BaseAudioConfig): + Audio processor config instance. Defaultsto `BaseAudioConfig()`. + use_noise_augment (bool): + Augment the input audio with random noise. Defaults to False/ + eval_split_size (int): + Number of instances used for evaluation. Defaults to 10. + data_path (str): + Root path of the training data. All the audio files found recursively from this root path are used for + training. Defaults to MISSING. + feature_path (str): + Root path to the precomputed feature files. Defaults to None. + seq_len (int): + Length of the waveform segments used for training. Defaults to MISSING. + pad_short (int): + Extra padding for the waveforms shorter than `seq_len`. Defaults to 0. + conv_path (int): + Extra padding for the feature frames against convolution of the edge frames. Defaults to MISSING. + Defaults to 0. + use_cache (bool): + enable / disable in memory caching of the computed features. If the RAM is not enough, if may cause OOM. + Defaults to False. + epochs (int): + Number of training epochs to. Defaults to 10000. + wd (float): + Weight decay. + """ audio: BaseAudioConfig = field(default_factory=BaseAudioConfig) # dataloading @@ -19,7 +46,6 @@ class BaseVocoderConfig(BaseTrainingConfig): seq_len: int = MISSING # signal length used in training. pad_short: int = 0 # additional padding for short wavs conv_pad: int = 0 # additional padding against convolutions applied to spectrograms - use_noise_augment: bool = False # add noise to the audio signal for augmentation use_cache: bool = False # use in memory cache to keep the computed features. This might cause OOM. # OPTIMIZER epochs: int = 10000 # total number of epochs to train. @@ -28,7 +54,78 @@ class BaseVocoderConfig(BaseTrainingConfig): @dataclass class BaseGANVocoderConfig(BaseVocoderConfig): - """Common config interface for all the GAN based vocoder models.""" + """Base config class used among all the GAN based vocoders. + Args: + use_stft_loss (bool): + enable / disable the use of STFT loss. Defaults to True. + use_subband_stft_loss (bool): + enable / disable the use of Subband STFT loss. Defaults to True. + use_mse_gan_loss (bool): + enable / disable the use of Mean Squared Error based GAN loss. Defaults to True. + use_hinge_gan_loss (bool): + enable / disable the use of Hinge GAN loss. Defaults to True. + use_feat_match_loss (bool): + enable / disable feature matching loss. Defaults to True. + use_l1_spec_loss (bool): + enable / disable L1 spectrogram loss. Defaults to True. + stft_loss_weight (float): + Loss weight that multiplies the computed loss value. Defaults to 0. + subband_stft_loss_weight (float): + Loss weight that multiplies the computed loss value. Defaults to 0. + mse_G_loss_weight (float): + Loss weight that multiplies the computed loss value. Defaults to 1. + hinge_G_loss_weight (float): + Loss weight that multiplies the computed loss value. Defaults to 0. + feat_match_loss_weight (float): + Loss weight that multiplies the computed loss value. Defaults to 100. + l1_spec_loss_weight (float): + Loss weight that multiplies the computed loss value. Defaults to 45. + stft_loss_params (dict): + Parameters for the STFT loss. Defaults to `{"n_ffts": [1024, 2048, 512], "hop_lengths": [120, 240, 50], "win_lengths": [600, 1200, 240]}`. + l1_spec_loss_params (dict): + Parameters for the L1 spectrogram loss. Defaults to + `{ + "use_mel": True, + "sample_rate": 22050, + "n_fft": 1024, + "hop_length": 256, + "win_length": 1024, + "n_mels": 80, + "mel_fmin": 0.0, + "mel_fmax": None, + }` + target_loss (str): + Target loss name that defines the quality of the model. Defaults to `avg_G_loss`. + gen_clip_grad (float): + Gradient clipping threshold for the generator model. Any value less than 0 disables clipping. + Defaults to -1. + disc_clip_grad (float): + Gradient clipping threshold for the discriminator model. Any value less than 0 disables clipping. + Defaults to -1. + lr_gen (float): + Generator model initial learning rate. Defaults to 0.0002. + lr_disc (float): + Discriminator model initial learning rate. Defaults to 0.0002. + optimizer (torch.optim.Optimizer): + Optimizer used for the training. Defaults to `AdamW`. + optimizer_params (dict): + Optimizer kwargs. Defaults to `{"betas": [0.8, 0.99], "weight_decay": 0.0}` + lr_scheduler_gen (torch.optim.Scheduler): + Learning rate scheduler for the generator. Defaults to `ExponentialLR`. + lr_scheduler_gen_params (dict): + Parameters for the generator learning rate scheduler. Defaults to `{"gamma": 0.999, "last_epoch": -1}`. + lr_scheduler_disc (torch.optim.Scheduler): + Learning rate scheduler for the discriminator. Defaults to `ExponentialLR`. + lr_scheduler_dict_params (dict): + Parameters for the discriminator learning rate scheduler. Defaults to `{"gamma": 0.999, "last_epoch": -1}`. + use_pqmf (bool): + enable / disable PQMF for subband approximation at training. Defaults to False. + steps_to_start_discriminator (int): + Number of steps required to start training the discriminator. Defaults to 0. + diff_samples_for_G_and_D (bool): + enable / disable use of different training samples for the generator and the discriminator iterations. + Enabling it results in slower iterations but faster convergance in some cases. Defaults to False. + """ # LOSS PARAMETERS use_stft_loss: bool = True @@ -43,7 +140,7 @@ class BaseGANVocoderConfig(BaseVocoderConfig): subband_stft_loss_weight: float = 0 mse_G_loss_weight: float = 1 hinge_G_loss_weight: float = 0 - feat_match_loss_weight: float = 10 + feat_match_loss_weight: float = 100 l1_spec_loss_weight: float = 45 stft_loss_params: dict = field( diff --git a/TTS/vocoder/configs/wavegrad_config.py b/TTS/vocoder/configs/wavegrad_config.py index 48e3aba2..9d5cc683 100644 --- a/TTS/vocoder/configs/wavegrad_config.py +++ b/TTS/vocoder/configs/wavegrad_config.py @@ -1,12 +1,71 @@ from dataclasses import dataclass, field -from .shared_configs import BaseVocoderConfig +from TTS.vocoder.configs.shared_configs import BaseVocoderConfig @dataclass class WavegradConfig(BaseVocoderConfig): - """Defines parameters for Wavernn vocoder.""" + """Defines parameters for WaveGrad vocoder. + Example: + >>> from TTS.vocoder.configs import WavegradConfig + >>> config = WavegradConfig() + + Args: + model (str): + Model name used for selecting the right model at initialization. Defaults to `wavegrad`. + generator_model (str): One of the generators from TTS.vocoder.models.*`. Every other non-GAN vocoder model is + considered as a generator too. Defaults to `wavegrad`. + model_params (dict): + WaveGrad kwargs. Defaults to + ` + { + "use_weight_norm": True, + "y_conv_channels": 32, + "x_conv_channels": 768, + "ublock_out_channels": [512, 512, 256, 128, 128], + "dblock_out_channels": [128, 128, 256, 512], + "upsample_factors": [4, 4, 4, 2, 2], + "upsample_dilations": [[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 4, 8], [1, 2, 4, 8], [1, 2, 4, 8]], + } + ` + target_loss (str): + Target loss name that defines the quality of the model. Defaults to `avg_wavegrad_loss`. + epochs (int): + Number of epochs to traing the model. Defaults to 10000. + batch_size (int): + Batch size used at training. Larger values use more memory. Defaults to 96. + seq_len (int): + Audio segment length used at training. Larger values use more memory. Defaults to 6144. + use_cache (bool): + enable / disable in memory caching of the computed features. It can cause OOM error if the system RAM is + not large enough. Defaults to True. + mixed_precision (bool): + enable / disable mixed precision training. Default is True. + eval_split_size (int): + Number of samples used for evalutaion. Defaults to 50. + train_noise_schedule (dict): + Training noise schedule. Defaults to + `{"min_val": 1e-6, "max_val": 1e-2, "num_steps": 1000}` + test_noise_schedule (dict): + Inference noise schedule. For a better performance, you may need to use `bin/tune_wavegrad.py` to find a + better schedule. Defaults to + ` + { + "min_val": 1e-6, + "max_val": 1e-2, + "num_steps": 50, + } + ` + grad_clip (float): + Gradient clipping threshold. If <= 0.0, no clipping is applied. Defaults to 1.0 + lr (float): + Initila leraning rate. Defaults to 1e-4. + lr_scheduler (str): + One of the learning rate schedulers from `torch.optim.scheduler.*`. Defaults to `MultiStepLR`. + lr_scheduler_params (dict): + kwargs for the scheduler. Defaults to `{"gamma": 0.5, "milestones": [100000, 200000, 300000, 400000, 500000, 600000]}` + """ model: str = "wavegrad" # Model specific params generator_model: str = "wavegrad" @@ -28,7 +87,6 @@ class WavegradConfig(BaseVocoderConfig): batch_size: int = 96 seq_len: int = 6144 use_cache: bool = True - steps_to_start_discriminator: int = 200000 mixed_precision: bool = True eval_split_size: int = 50 diff --git a/TTS/vocoder/configs/wavernn_config.py b/TTS/vocoder/configs/wavernn_config.py index 0b546a91..95a3cfc4 100644 --- a/TTS/vocoder/configs/wavernn_config.py +++ b/TTS/vocoder/configs/wavernn_config.py @@ -1,11 +1,77 @@ from dataclasses import dataclass, field -from .shared_configs import BaseVocoderConfig +from TTS.vocoder.configs.shared_configs import BaseVocoderConfig @dataclass class WavernnConfig(BaseVocoderConfig): - """Defines parameters for Wavernn vocoder.""" + """Defines parameters for Wavernn vocoder. + Example: + + >>> from TTS.vocoder.configs import WavernnConfig + >>> config = WavernnConfig() + + Args: + model (str): + Model name used for selecting the right model at initialization. Defaults to `wavernn`. + mode (str): + Output mode of the WaveRNN vocoder. `mold` for Mixture of Logistic Distribution, `gauss` for a single + Gaussian Distribution and `bits` for quantized bits as the model's output. + mulaw (bool): + enable / disable the use of Mulaw quantization for training. Only applicable if `mode == 'bits'`. Defaults + to `True`. + generator_model (str): + One of the generators from TTS.vocoder.models.*`. Every other non-GAN vocoder model is + considered as a generator too. Defaults to `WaveRNN`. + wavernn_model_params (dict): + kwargs for the WaveRNN model. Defaults to + `{ + "rnn_dims": 512, + "fc_dims": 512, + "compute_dims": 128, + "res_out_dims": 128, + "num_res_blocks": 10, + "use_aux_net": True, + "use_upsample_net": True, + "upsample_factors": [4, 8, 8] + }` + batched (bool): + enable / disable the batched inference. It speeds up the inference by splitting the input into segments and + processing the segments in a batch. Then it merges the outputs with a certain overlap and smoothing. If + you set it False, without CUDA, it is too slow to be practical. Defaults to True. + target_samples (int): + Size of the segments in batched mode. Defaults to 11000. + overlap_sampels (int): + Size of the overlap between consecutive segments. Defaults to 550. + batch_size (int): + Batch size used at training. Larger values use more memory. Defaults to 256. + seq_len (int): + Audio segment length used at training. Larger values use more memory. Defaults to 1280. + padding (int): + Padding applied to the input feature frames against the convolution layers of the feature network. + Defaults to 2. + use_noise_augment (bool): + enable / disable random noise added to the input waveform. The noise is added after computing the + features. Defaults to True. + use_cache (bool): + enable / disable in memory caching of the computed features. It can cause OOM error if the system RAM is + not large enough. Defaults to True. + mixed_precision (bool): + enable / disable mixed precision training. Default is True. + eval_split_size (int): + Number of samples used for evalutaion. Defaults to 50. + test_every_epoch (int): + Number of epochs waited to run the next evalution. Since inference takes some time, it is better to + wait some number of epochs not ot waste training time. Defaults to 10. + grad_clip (float): + Gradient clipping threshold. If <= 0.0, no clipping is applied. Defaults to 4.0 + lr (float): + Initila leraning rate. Defaults to 1e-4. + lr_scheduler (str): + One of the learning rate schedulers from `torch.optim.scheduler.*`. Defaults to `MultiStepLR`. + lr_scheduler_params (dict): + kwargs for the scheduler. Defaults to `{"gamma": 0.5, "milestones": [200000, 400000, 600000]}` + """ model: str = "wavernn" @@ -38,7 +104,6 @@ class WavernnConfig(BaseVocoderConfig): padding: int = 2 use_noise_augment: bool = False use_cache: bool = True - steps_to_start_discriminator: int = 200000 mixed_precision: bool = True eval_split_size: int = 50 test_every_epochs: int = 10 # number of epochs to wait until the next test run (synthesizing a full audio clip).