add docstrings with default value fixes

This commit is contained in:
Eren Gölge 2021-05-13 16:04:49 +02:00
parent 7e02cff924
commit 8b1014d188
16 changed files with 1091 additions and 75 deletions

View File

@ -13,7 +13,7 @@ class BaseAudioConfig(Coqpit):
Number of STFT frequency levels, a.k.a. the size of the linear spectrogram frame. Defaults to 1024.
win_length (int):
Each frame of audio is windowed by window of length ```win_length``` and then padded with zeros to match
```fft_size```. Defaults to 256.
```fft_size```. Defaults to 1024.
hop_length (int):
Number of audio samples between adjacent STFT columns. Defaults to 1024.
frame_shift_ms (int):
@ -21,7 +21,7 @@ class BaseAudioConfig(Coqpit):
frame_length_ms (int):
Set ```win_length``` based on milliseconds and sampling rate.
stft_pad_mode (str):
Padding method used in STFT. 'reflect' or 'center'.
Padding method used in STFT. 'reflect' or 'center'. Defaults to 'reflect'.
sample_rate (int):
Audio sampling rate. Defaults to 22050.
resample (bool):
@ -135,11 +135,27 @@ class BaseAudioConfig(Coqpit):
@dataclass
class BaseDatasetConfig(Coqpit):
name: str = None
path: str = None
meta_file_train: Union[str, List] = None # TODO: don't take ignored speakers for multi-speaker datasets over this. This is Union for SC-Glow compat.
meta_file_val: str = None
meta_file_attn_mask: str = None
"""Base config for TTS datasets.
Args:
name (str):
Dataset name that defines the preprocessor in use. Defaults to an empty string.
path (str):
Root path to the dataset files. Defaults to an empty string.
meta_file_train (Union[str, List]):
Name of the dataset meta file, or a list of speakers to be ignored at training for multi-speaker datasets.
Defaults to an empty string.
meta_file_val (str):
Name of the dataset meta file that defines the instances used at validation.
meta_file_attn_mask (str):
Path to the file that lists the attention mask files used with models that require attention masks to
train the duration predictor.
"""
name: str = ''
path: str = ''
meta_file_train: Union[str, List] = '' # TODO: don't take ignored speakers for multi-speaker datasets over this. This is Union for SC-Glow compat.
meta_file_val: str = ''
meta_file_attn_mask: str = ''
def check_values(
self,
@ -161,12 +177,8 @@ class BaseTrainingConfig(Coqpit):
Args:
batch_size (int):
Training batch size.
batch_group_size (int):
Number of batches to shuffle after bucketing.
eval_batch_size (int):
Validation batch size.
loss_masking (bool):
Enable / Disable masking padding segments of sequences.
mixed_precision (bool):
Enable / Disable mixed precision training. It reduces the VRAM use and allows larger batch sizes, however
it may also cause numerical instability in some cases.
@ -195,34 +207,13 @@ class BaseTrainingConfig(Coqpit):
keep_after (int):
Number of steps to wait before saving all the best models. In use if ```keep_all_best == True```. Defaults
to 10000.
text_cleaner (str):
Text cleaner to be used at model training. It is set to be one of the cleaners in
```TTS.tts.utils.text.cleaners```.
enable_eos_bos_chars (bool):
Enable / Disable using special characters indicating end-of-sentence and begining-of-sentence.
num_loader_workers (int):
Number of workers for training time dataloader.
num_val_loader_workers (int):
Number of workers for evaluation time dataloader.
min_seq_len (int):
Minimum sequence length to be used at training.
max_seq_len (int):
Maximum sequence length to be used at training. VRAM use at training depends on this parameter. Consider to
decrease it if you get OOM errors.
compute_f0 (bool):
Return F0 frames from the dataloader. Defaults to ```False```.
compute_input_seq_cache (bool):
Enable / Disable computing and caching phonemes sequences from character sequences at the begining of the
training. It allows faster data loading times and more precise max-min sequence prunning. Defaults
to ```False```.
output_path (str):
Path for training output folder. The nonexist part of the given path is created automatically.
All training outputs are saved there.
phoneme_cache_path (str):
Path to a folder to save the computed phoneme sequences.
datasets (List[BaseDatasetConfig]):
ist of DatasetConfig.
"""
model: str = None

View File

@ -1,11 +1,69 @@
from dataclasses import dataclass, field
from .shared_configs import BaseTTSConfig
from TTS.tts.configs.shared_configs import BaseTTSConfig
@dataclass
class AlignTTSConfig(BaseTTSConfig):
"""Defines parameters for AlignTTS model."""
"""Defines parameters for AlignTTS model.
Example:
>>> from TTS.tts.configs import AlignTTSConfig
>>> config = AlignTTSConfig()
Args:
model(str):
Model name used for selecting the right model at initialization. Defaults to `align_tts`.
positional_encoding (bool):
enable / disable positional encoding applied to the encoder output. Defaults to True.
hidden_channels (int):
Base number of hidden channels. Defines all the layers except the ones defined by the specific encoder or decoder
parameters. Defaults to 256.
hidden_channels_dp (int):
Number of hidden channels of the duration predictor's layers. Defaults to 256.
encoder_type (str):
Type of the encoder used by the model. Look at `TTS.tts.layers.feed_forward.encoder` for more details.
Defaults to `fftransformer`.
encoder_params (dict):
Parameters used to define the encoder network. Look at `TTS.tts.layers.feed_forward.encoder` for more details.
Defaults to `{"hidden_channels_ffn": 1024, "num_heads": 2, "num_layers": 6, "dropout_p": 0.1}`.
decoder_type (str):
Type of the decoder used by the model. Look at `TTS.tts.layers.feed_forward.decoder` for more details.
Defaults to `fftransformer`.
decoder_params (dict):
Parameters used to define the decoder network. Look at `TTS.tts.layers.feed_forward.decoder` for more details.
Defaults to `{"hidden_channels_ffn": 1024, "num_heads": 2, "num_layers": 6, "dropout_p": 0.1}`.
phase_start_steps (List[int]):
A list of number of steps required to start the next training phase. AlignTTS has 4 different training
phases. Thus you need to define 4 different values to enable phase based training. If None, it
trains the whole model together. Defaults to None.
ssim_alpha (float):
Weight for the SSIM loss. If set <= 0, disables the SSIM loss. Defaults to 1.0.
duration_loss_alpha (float):
Weight for the duration predictor's loss. Defaults to 1.0.
mdn_alpha (float):
Weight for the MDN loss. Defaults to 1.0.
spec_loss_alpha (float):
Weight for the MSE spectrogram loss. If set <= 0, disables the spectrogram loss. Defaults to 1.0.
use_speaker_embedding (bool):
enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
in the multi-speaker mode. Defaults to False.
use_external_speaker_embedding_file (bool):
enable / disable using external speaker embeddings in place of the learned embeddings. Defaults to False.
external_speaker_embedding_file (str):
Path to the file including pre-computed speaker embeddings. Defaults to None.
noam_schedule (bool):
enable / disable the use of Noam LR scheduler. Defaults to False.
warmup_steps (int):
Number of warm-up steps for the Noam scheduler. Defaults to 4000.
lr (float):
Initial learning rate. Defaults to `1e-3`.
wd (float):
Weight decay coefficient. Defaults to `1e-7`.
min_seq_len (int):
Minimum input sequence length to be used at training.
max_seq_len (int):
Maximum input sequence length to be used at training. Larger values result in more VRAM usage."""
model: str = "align_tts"
# model specific params

View File

@ -1,11 +1,64 @@
from dataclasses import dataclass, field
from .shared_configs import BaseTTSConfig
from TTS.tts.configs.shared_configs import BaseTTSConfig
@dataclass
class GlowTTSConfig(BaseTTSConfig):
"""Defines parameters for GlowTTS model."""
"""Defines parameters for GlowTTS model.
Example:
>>> from TTS.tts.configs import GlowTTSConfig
>>> config = GlowTTSConfig()
Args:
model(str):
Model name used for selecting the right model at initialization. Defaults to `glow_tts`.
encoder_type (str):
Type of the encoder used by the model. Look at `TTS.tts.layers.glow_tts.encoder` for more details.
Defaults to `rel_pos_transformers`.
encoder_params (dict):
Parameters used to define the encoder network. Look at `TTS.tts.layers.glow_tts.encoder` for more details.
Defaults to `{"kernel_size": 3, "dropout_p": 0.1, "num_layers": 6, "num_heads": 2, "hidden_channels_ffn": 768}`
use_encoder_prenet (bool):
enable / disable the use of a prenet for the encoder. Defaults to True.
hidden_channels_encoder (int):
Number of base hidden channels used by the encoder network. It defines the input and the output channel sizes,
and for some encoder types internal hidden channels sizes too. Defaults to 192.
hidden_channels_decoder (int):
Number of base hidden channels used by the decoder WaveNet network. Defaults to 192 as in the original work.
hidden_channels_duration_predictor (int):
Number of layer channels of the duration predictor network. Defaults to 256 as in the original work.
data_dep_init_steps (int):
Number of steps used for computing normalization parameters at the beginning of the training. GlowTTS uses
Activation Normalization that pre-computes normalization stats at the beginning and use the same values
for the rest. Defaults to 10.
style_wav_for_test (str):
Path to the wav file used for changing the style of the speech. Defaults to None.
inference_noise_scale (float):
Variance used for sampling the random noise added to the decoder's input at inference. Defaults to 0.0.
use_speaker_embedding (bool):
enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
in the multi-speaker mode. Defaults to False.
use_external_speaker_embedding_file (bool):
enable / disable using external speaker embeddings in place of the learned embeddings. Defaults to False.
external_speaker_embedding_file (str):
Path to the file including pre-computed speaker embeddings. Defaults to None.
noam_schedule (bool):
enable / disable the use of Noam LR scheduler. Defaults to False.
warmup_steps (int):
Number of warm-up steps for the Noam scheduler. Defaults to 4000.
lr (float):
Initial learning rate. Defaults to `1e-3`.
wd (float):
Weight decay coefficient. Defaults to `1e-7`.
min_seq_len (int):
Minimum input sequence length to be used at training.
max_seq_len (int):
Maximum input sequence length to be used at training. Larger values result in more VRAM usage.
"""
model: str = "glow_tts"
@ -47,4 +100,4 @@ class GlowTTSConfig(BaseTTSConfig):
# overrides
min_seq_len: int = 3
max_seq_len: int = 500
r: int = 1
r: int = 1 # DO NOT CHANGE - TODO: make this immutable once coqpit implements it.

View File

@ -8,8 +8,20 @@ from TTS.config import BaseAudioConfig, BaseDatasetConfig, BaseTrainingConfig
@dataclass
class GSTConfig(Coqpit):
"""Defines Global Style Toke module"""
"""Defines the Global Style Token Module
Args:
gst_style_input_wav (str):
Path to the wav file used to define the style of the output speech at inference. Defaults to None.
gst_style_input_weights (dict):
Defines the weights for each style token used at inference. Defaults to None.
gst_embedding_dim (int):
Defines the size of the GST embedding vector dimensions. Defaults to 256.
gst_num_heads (int):
Number of attention heads used by the multi-head attention. Defaults to 4.
gst_num_style_tokens (int):
Number of style token vectors. Defaults to 10.
"""
gst_style_input_wav: str = None
gst_style_input_weights: dict = None
gst_embedding_dim: int = 256
@ -33,7 +45,26 @@ class GSTConfig(Coqpit):
@dataclass
class CharactersConfig(Coqpit):
"""Defines character or phoneme set used by the model"""
"""Defines character or phoneme set used by the model
Args:
pad (str):
characters in place of empty padding. Defaults to None.
eos (str):
characters showing the end of a sentence. Defaults to None.
bos (str):
characters showing the beginning of a sentence. Defaults to None.
characters (str):
character set used by the model. Characters not in this list are ignored when converting input text to
a list of sequence IDs. Defaults to None.
punctuations (str):
characters considered as punctuation as parsing the input sentence. Defaults to None.
phonemes (str):
characters considered as parsing phonemes. Defaults to None.
unique (bool):
remove any duplicate characters in the character lists. It is a bandaid for compatibility with the old
models trained with character lists with duplicates.
"""
pad: str = None
eos: str = None
@ -58,7 +89,48 @@ class CharactersConfig(Coqpit):
@dataclass
class BaseTTSConfig(BaseTrainingConfig):
"""Shared parameters among all the tts models."""
"""Shared parameters among all the tts models.
Args:
audio (BaseAudioConfig):
Audio processor config object instance.
use_phonemes (bool):
enable / disable phoneme use.
compute_input_seq_cache (bool):
enable / disable precomputation of the phoneme sequences. At the expense of some delay at the beginning of
the training, it allows faster data loading and precise limitation with `max_seq_len` and
`min_seq_len`.
text_cleaner (str):
Name of the text cleaner used for cleaning and formatting transcripts.
enable_eos_bos_chars (bool):
enable / disable the use of eos and bos characters.
test_sentences_file (str):
Path to a txt file that has sentences used at test time. The file must have a sentence per line.
phoneme_cache_path (str):
Path to the output folder caching the computed phonemes for each sample.
characters (CharactersConfig):
Instance of a CharactersConfig class.
batch_group_size (int):
Size of the batch groups used for bucketing. By default, the dataloader orders samples by the sequence
length for a more efficient and stable training. If `batch_group_size > 1` then it performs bucketing to
prevent using the same batches for each epoch.
loss_masking (bool):
enable / disable masking loss values against padded segments of samples in a batch.
min_seq_len (int):
Minimum input sequence length to be used at training.
max_seq_len (int):
Maximum input sequence length to be used at training. Larger values result in more VRAM usage.
compute_f0 (bool):
(Not in use yet).
use_noise_augment (bool):
Augment the input audio with random noise.
add_blank (bool):
Add a blank character between every two characters. It improves performance for some models at the expense
of slower run-time due to the longer input sequence.
datasets (List[BaseDatasetConfig]):
List of datasets used for training. If multiple datasets are provided, they are merged and used together
for training.
"""
audio: BaseAudioConfig = field(default_factory=BaseAudioConfig)
# phoneme settings

View File

@ -1,11 +1,74 @@
from dataclasses import dataclass, field
from .shared_configs import BaseTTSConfig
from TTS.tts.configs.shared_configs import BaseTTSConfig
@dataclass
class SpeedySpeechConfig(BaseTTSConfig):
"""Defines parameters for Speedy Speech (feed-forward encoder-decoder) based models."""
"""Defines parameters for Speedy Speech (feed-forward encoder-decoder) based models.
Example:
>>> from TTS.tts.configs import SpeedySpeechConfig
>>> config = SpeedySpeechConfig()
Args:
model (str):
Model name used for selecting the right model at initialization. Defaults to `speedy_speech`.
positional_encoding (bool):
enable / disable positional encoding applied to the encoder output. Defaults to True.
hidden_channels (int):
Base number of hidden channels. Defines all the layers except the ones defined by the specific encoder or decoder
parameters. Defaults to 128.
encoder_type (str):
Type of the encoder used by the model. Look at `TTS.tts.layers.feed_forward.encoder` for more details.
Defaults to `residual_conv_bn`.
encoder_params (dict):
Parameters used to define the encoder network. Look at `TTS.tts.layers.feed_forward.encoder` for more details.
Defaults to `{"kernel_size": 4, "dilations": [1, 2, 4, 1, 2, 4, 1, 2, 4, 1, 2, 4, 1], "num_conv_blocks": 2, "num_res_blocks": 13}`
decoder_type (str):
Type of the decoder used by the model. Look at `TTS.tts.layers.feed_forward.decoder` for more details.
Defaults to `residual_conv_bn`.
decoder_params (dict):
Parameters used to define the decoder network. Look at `TTS.tts.layers.feed_forward.decoder` for more details.
Defaults to `{"kernel_size": 4, "dilations": [1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1], "num_conv_blocks": 2, "num_res_blocks": 17}`
hidden_channels_encoder (int):
Number of base hidden channels used by the encoder network. It defines the input and the output channel sizes,
and for some encoder types internal hidden channels sizes too. Defaults to 192.
hidden_channels_decoder (int):
Number of base hidden channels used by the decoder WaveNet network. Defaults to 192 as in the original work.
hidden_channels_duration_predictor (int):
Number of layer channels of the duration predictor network. Defaults to 256 as in the original work.
data_dep_init_steps (int):
Number of steps used for computing normalization parameters at the beginning of the training. GlowTTS uses
Activation Normalization that pre-computes normalization stats at the beginning and use the same values
for the rest. Defaults to 10.
use_speaker_embedding (bool):
enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
in the multi-speaker mode. Defaults to False.
use_external_speaker_embedding_file (bool):
enable / disable using external speaker embeddings in place of the learned embeddings. Defaults to False.
external_speaker_embedding_file (str):
Path to the file including pre-computed speaker embeddings. Defaults to None.
noam_schedule (bool):
enable / disable the use of Noam LR scheduler. Defaults to False.
warmup_steps (int):
Number of warm-up steps for the Noam scheduler. Defaults to 4000.
lr (float):
Initial learning rate. Defaults to `1e-3`.
wd (float):
Weight decay coefficient. Defaults to `1e-7`.
ssim_alpha (float):
Weight for the SSIM loss. If set <= 0, disables the SSIM loss. Defaults to 1.0.
huber_alpha (float):
Weight for the duration predictor's loss. Defaults to 1.0.
l1_alpha (float):
Weight for the L1 spectrogram loss. If set <= 0, disables the L1 loss. Defaults to 1.0.
min_seq_len (int):
Minimum input sequence length to be used at training.
max_seq_len (int):
Maximum input sequence length to be used at training. Larger values result in more VRAM usage.
"""
model: str = "speedy_speech"
# model specific params
@ -50,4 +113,4 @@ class SpeedySpeechConfig(BaseTTSConfig):
# overrides
min_seq_len: int = 13
max_seq_len: int = 200
r: int = 1
r: int = 1 #DO NOT CHANGE

View File

@ -5,6 +5,114 @@ from TTS.tts.configs.tacotron_config import TacotronConfig
@dataclass
class Tacotron2Config(TacotronConfig):
"""Defines parameters for Tacotron2 based models."""
"""Defines parameters for Tacotron2 based models.
Example:
>>> from TTS.tts.configs import Tacotron2Config
>>> config = Tacotron2Config()
Args:
model (str):
Model name used to select the right model class to initialize. Defaults to `tacotron2`.
use_gst (bool):
enable / disable the use of Global Style Token modules. Defaults to False.
gst (GSTConfig):
Instance of `GSTConfig` class.
gst_style_input (str):
Path to the wav file used at inference to set the speech style through GST. If `GST` is enabled and
this is not defined, the model uses a zero vector as an input. Defaults to None.
r (int):
Number of output frames that the decoder computes per iteration. Larger values make training and inference
faster but reduces the quality of the output frames. This needs to be tuned considering your own needs.
Defaults to 1.
gradual_training (List[List]):
Parameters for the gradual training schedule. It is in the form `[[a, b, c], [d ,e ,f] ..]` where `a` is
the step number to start using the rest of the values, `b` is the `r` value and `c` is the batch size.
If set to None, no gradual training is used. Defaults to None.
memory_size (int):
Defines the number of previous frames used by the Prenet. If set to < 0, then it uses only the last frame.
Defaults to -1.
prenet_type (str):
`original` or `bn`. `original` sets the default Prenet and `bn` uses Batch Normalization version of the
Prenet. Defaults to `original`.
prenet_dropout (bool):
enables / disables the use of dropout in the Prenet. Defaults to True.
prenet_dropout_at_inference (bool):
enable / disable the use of dropout in the Prenet at the inference time. Defaults to False.
stopnet (bool):
enable / disable the Stopnet that predicts the end of the decoder sequence. Defaults to True.
stopnet_pos_weight (float):
Weight that is applied to over-weight positive instances in the Stopnet loss. Use larger values with
datasets with longer sentences. Defaults to 10.
separate_stopnet (bool):
Use a distinct Stopnet which is trained separately from the rest of the model. Defaults to True.
attention_type (str):
attention type. Check ```TTS.tts.layers.attentions.init_attn```. Defaults to 'original'.
attention_heads (int):
Number of attention heads for GMM attention. Defaults to 5.
windowing (bool):
It is especially useful at inference to keep attention alignment diagonal. Defaults to False.
use_forward_attn (bool):
It is only valid if ```attention_type``` is ```original```. Defaults to False.
forward_attn_mask (bool):
enable/disable extra masking over forward attention. It is useful at inference to prevent
possible attention failures. Defaults to False.
transition_agent (bool):
enable/disable transition agent in forward attention. Defaults to False.
location_attn (bool):
enable/disable location sensitive attention as in the original Tacotron2 paper.
It is only valid if ```attention_type``` is ```original```. Defaults to True.
bidirectional_decoder (bool):
enable/disable bidirectional decoding. Defaults to False.
double_decoder_consistency (bool):
enable/disable double decoder consistency. Defaults to False.
ddc_r (int):
reduction rate used by the coarse decoder when `double_decoder_consistency` is in use. Set this
as a multiple of the `r` value. Defaults to 6.
use_speaker_embedding (bool):
enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
in the multi-speaker mode. Defaults to False.
use_external_speaker_embedding_file (bool):
enable / disable using external speaker embeddings in place of the learned embeddings. Defaults to False.
external_speaker_embedding_file (str):
Path to the file including pre-computed speaker embeddings. Defaults to None.
noam_schedule (bool):
enable / disable the use of Noam LR scheduler. Defaults to False.
warmup_steps (int):
Number of warm-up steps for the Noam scheduler. Defaults to 4000.
lr (float):
Initial learning rate. Defaults to `1e-4`.
wd (float):
Weight decay coefficient. Defaults to `1e-6`.
grad_clip (float):
Gradient clipping threshold. Defaults to `5`.
seq_len_norm (bool):
enable / disable the sequence length normalization in the loss functions. If set True, loss of a sample
is divided by the sequence length. Defaults to False.
loss_masking (bool):
enable / disable masking the paddings of the samples in loss computation. Defaults to True.
decoder_loss_alpha (float):
Weight for the decoder loss of the Tacotron model. If set less than or equal to zero, it disables the
corresponding loss function. Defaults to 0.25
postnet_loss_alpha (float):
Weight for the postnet loss of the Tacotron model. If set less than or equal to zero, it disables the
corresponding loss function. Defaults to 0.25
postnet_diff_spec_alpha (float):
Weight for the postnet differential loss of the Tacotron model. If set less than or equal to zero, it disables the
corresponding loss function. Defaults to 0.25
decoder_diff_spec_alpha (float):
Weight for the decoder differential loss of the Tacotron model. If set less than or equal to zero, it disables the
corresponding loss function. Defaults to 0.25
decoder_ssim_alpha (float):
Weight for the decoder SSIM loss of the Tacotron model. If set less than or equal to zero, it disables the
corresponding loss function. Defaults to 0.25
postnet_ssim_alpha (float):
Weight for the postnet SSIM loss of the Tacotron model. If set less than or equal to zero, it disables the
corresponding loss function. Defaults to 0.25
ga_alpha (float):
Weight for the guided attention loss. If set less than or equal to zero, it disables the corresponding loss
function. Defaults to 5.
"""
model: str = "tacotron2"

View File

@ -1,12 +1,120 @@
from dataclasses import dataclass
from typing import List
from .shared_configs import BaseTTSConfig, GSTConfig
from TTS.tts.configs.shared_configs import BaseTTSConfig, GSTConfig
@dataclass
class TacotronConfig(BaseTTSConfig):
"""Defines parameters for Tacotron based models."""
"""Defines parameters for Tacotron based models.
Example:
>>> from TTS.tts.configs import TacotronConfig
>>> config = TacotronConfig()
Args:
model (str):
Model name used to select the right model class to initialize. Defaults to `tacotron`.
use_gst (bool):
enable / disable the use of Global Style Token modules. Defaults to False.
gst (GSTConfig):
Instance of `GSTConfig` class.
gst_style_input (str):
Path to the wav file used at inference to set the speech style through GST. If `GST` is enabled and
this is not defined, the model uses a zero vector as an input. Defaults to None.
r (int):
Number of output frames that the decoder computes per iteration. Larger values make training and inference
faster but reduces the quality of the output frames. This needs to be tuned considering your own needs.
Defaults to 1.
gradual_training (List[List]):
Parameters for the gradual training schedule. It is in the form `[[a, b, c], [d ,e ,f] ..]` where `a` is
the step number to start using the rest of the values, `b` is the `r` value and `c` is the batch size.
If set to None, no gradual training is used. Defaults to None.
memory_size (int):
Defines the number of previous frames used by the Prenet. If set to < 0, then it uses only the last frame.
Defaults to -1.
prenet_type (str):
`original` or `bn`. `original` sets the default Prenet and `bn` uses Batch Normalization version of the
Prenet. Defaults to `original`.
prenet_dropout (bool):
enables / disables the use of dropout in the Prenet. Defaults to True.
prenet_dropout_at_inference (bool):
enable / disable the use of dropout in the Prenet at the inference time. Defaults to False.
stopnet (bool):
enable / disable the Stopnet that predicts the end of the decoder sequence. Defaults to True.
stopnet_pos_weight (float):
Weight that is applied to over-weight positive instances in the Stopnet loss. Use larger values with
datasets with longer sentences. Defaults to 10.
separate_stopnet (bool):
Use a distinct Stopnet which is trained separately from the rest of the model. Defaults to True.
attention_type (str):
attention type. Check ```TTS.tts.layers.attentions.init_attn```. Defaults to 'original'.
attention_heads (int):
Number of attention heads for GMM attention. Defaults to 5.
windowing (bool):
It is especially useful at inference to keep attention alignment diagonal. Defaults to False.
use_forward_attn (bool):
It is only valid if ```attention_type``` is ```original```. Defaults to False.
forward_attn_mask (bool):
enable/disable extra masking over forward attention. It is useful at inference to prevent
possible attention failures. Defaults to False.
transition_agent (bool):
enable/disable transition agent in forward attention. Defaults to False.
location_attn (bool):
enable/disable location sensitive attention as in the original Tacotron2 paper.
It is only valid if ```attention_type``` is ```original```. Defaults to True.
bidirectional_decoder (bool):
enable/disable bidirectional decoding. Defaults to False.
double_decoder_consistency (bool):
enable/disable double decoder consistency. Defaults to False.
ddc_r (int):
reduction rate used by the coarse decoder when `double_decoder_consistency` is in use. Set this
as a multiple of the `r` value. Defaults to 6.
use_speaker_embedding (bool):
enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
in the multi-speaker mode. Defaults to False.
use_external_speaker_embedding_file (bool):
enable / disable using external speaker embeddings in place of the learned embeddings. Defaults to False.
external_speaker_embedding_file (str):
Path to the file including pre-computed speaker embeddings. Defaults to None.
noam_schedule (bool):
enable / disable the use of Noam LR scheduler. Defaults to False.
warmup_steps (int):
Number of warm-up steps for the Noam scheduler. Defaults to 4000.
lr (float):
Initial learning rate. Defaults to `1e-4`.
wd (float):
Weight decay coefficient. Defaults to `1e-6`.
grad_clip (float):
Gradient clipping threshold. Defaults to `5`.
seq_len_norm (bool):
enable / disable the sequence length normalization in the loss functions. If set True, loss of a sample
is divided by the sequence length. Defaults to False.
loss_masking (bool):
enable / disable masking the paddings of the samples in loss computation. Defaults to True.
decoder_loss_alpha (float):
Weight for the decoder loss of the Tacotron model. If set less than or equal to zero, it disables the
corresponding loss function. Defaults to 0.25
postnet_loss_alpha (float):
Weight for the postnet loss of the Tacotron model. If set less than or equal to zero, it disables the
corresponding loss function. Defaults to 0.25
postnet_diff_spec_alpha (float):
Weight for the postnet differential loss of the Tacotron model. If set less than or equal to zero, it disables the
corresponding loss function. Defaults to 0.25
decoder_diff_spec_alpha (float):
Weight for the decoder differential loss of the Tacotron model. If set less than or equal to zero, it disables the
corresponding loss function. Defaults to 0.25
decoder_ssim_alpha (float):
Weight for the decoder SSIM loss of the Tacotron model. If set less than or equal to zero, it disables the
corresponding loss function. Defaults to 0.25
postnet_ssim_alpha (float):
Weight for the postnet SSIM loss of the Tacotron model. If set less than or equal to zero, it disables the
corresponding loss function. Defaults to 0.25
ga_alpha (float):
Weight for the guided attention loss. If set less than or equal to zero, it disables the corresponding loss
function. Defaults to 5.
"""
model: str = "tacotron"
use_gst: bool = False

View File

@ -52,19 +52,19 @@ def load_meta_data(datasets, eval_split=True):
print(f" | > Found {len(meta_data_train)} files in {Path(root_path).resolve()}")
# load evaluation split if set
if eval_split:
if meta_file_val is None:
meta_data_eval, meta_data_train = split_dataset(meta_data_train)
else:
if meta_file_val:
meta_data_eval = preprocessor(root_path, meta_file_val)
else:
meta_data_eval, meta_data_train = split_dataset(meta_data_train)
meta_data_eval_all += meta_data_eval
meta_data_train_all += meta_data_train
# load attention masks for duration predictor training
if dataset.meta_file_attn_mask is not None:
if dataset.meta_file_attn_mask:
meta_data = dict(load_attention_mask_meta_data(dataset["meta_file_attn_mask"]))
for idx, ins in enumerate(meta_data_train_all):
attn_file = meta_data[ins[1]].strip()
meta_data_train_all[idx].append(attn_file)
if meta_data_eval_all is not None:
if meta_data_eval_all:
for idx, ins in enumerate(meta_data_eval_all):
attn_file = meta_data[ins[1]].strip()
meta_data_eval_all[idx].append(attn_file)

View File

@ -5,7 +5,62 @@ from .shared_configs import BaseGANVocoderConfig
@dataclass
class FullbandMelganConfig(BaseGANVocoderConfig):
"""Defines parameters for FullbandMelGAN vocoder."""
"""Defines parameters for FullBand MelGAN vocoder.
Example:
>>> from TTS.vocoder.configs import FullbandMelganConfig
>>> config = FullbandMelganConfig()
Args:
model (str):
Model name used for selecting the right model at initialization. Defaults to `melgan`.
discriminator_model (str): One of the discriminators from `TTS.vocoder.models.*_discriminator`. Defaults to
`melgan_multiscale_discriminator`.
discriminator_model_params (dict): The discriminator model parameters. Defaults to
`{"base_channels": 16, "max_channels": 1024, "downsample_factors": [4, 4, 4, 4]}`
generator_model (str): One of the generators from `TTS.vocoder.models.*`. Every other non-GAN vocoder model is
considered as a generator too. Defaults to `melgan_generator`.
batch_size (int):
Batch size used at training. Larger values use more memory. Defaults to 16.
seq_len (int):
Audio segment length used at training. Larger values use more memory. Defaults to 8192.
pad_short (int):
Additional padding applied to the audio samples shorter than `seq_len`. Defaults to 0.
use_noise_augment (bool):
enable / disable random noise added to the input waveform. The noise is added after computing the
features. Defaults to True.
use_cache (bool):
enable / disable in memory caching of the computed features. It can cause OOM error if the system RAM is
not large enough. Defaults to True.
use_stft_loss (bool):
enable / disable use of STFT loss originally used by ParallelWaveGAN model. Defaults to True.
use_subband_stft (bool):
enable / disable use of subband loss computation originally used by MultiBandMelgan model. Defaults to True.
use_mse_gan_loss (bool):
enable / disable using Mean Squared Error GAN loss. Defaults to True.
use_hinge_gan_loss (bool):
enable / disable using Hinge GAN loss. You should choose either Hinge or MSE loss for training GAN models.
Defaults to False.
use_feat_match_loss (bool):
enable / disable using Feature Matching loss originally used by MelGAN model. Defaults to True.
use_l1_spec_loss (bool):
enable / disable using L1 spectrogram loss originally used by HifiGAN model. Defaults to False.
stft_loss_params (dict): STFT loss parameters. Default to
`{"n_ffts": [1024, 2048, 512], "hop_lengths": [120, 240, 50], "win_lengths": [600, 1200, 240]}`
stft_loss_weight (float): STFT loss weight that multiplies the computed loss before summing up the total
model loss. Defaults to 0.5.
subband_stft_loss_weight (float):
Subband STFT loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0.
mse_G_loss_weight (float):
MSE generator loss weight that multiplies the computed loss before summing up the total loss. Defaults to 2.5.
hinge_G_loss_weight (float):
Hinge generator loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0.
feat_match_loss_weight (float):
Feature matching loss weight that multiplies the computed loss before summing up the total loss. Defaults to 108.
l1_spec_loss_weight (float):
L1 spectrogram loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0.
"""
model: str = "melgan"
@ -48,4 +103,4 @@ class FullbandMelganConfig(BaseGANVocoderConfig):
mse_G_loss_weight: float = 2.5
hinge_G_loss_weight: float = 0
feat_match_loss_weight: float = 108
l1_spec_loss_weight: float = 0
l1_spec_loss_weight: float = 0.0

View File

@ -1,11 +1,94 @@
from dataclasses import dataclass, field
from .shared_configs import BaseGANVocoderConfig
from TTS.vocoder.configs.shared_configs import BaseGANVocoderConfig
@dataclass
class HifiganConfig(BaseGANVocoderConfig):
"""Defines parameters for HifiGAN vocoder."""
"""Defines parameters for HifiGAN vocoder.
Example:
>>> from TTS.vocoder.configs import HifiganConfig
>>> config = HifiganConfig()
Args:
model (str):
Model name used for selecting the right model at initialization. Defaults to `hifigan`.
discriminator_model (str): One of the discriminators from `TTS.vocoder.models.*_discriminator`. Defaults to
`hifigan_discriminator`.
generator_model (str): One of the generators from `TTS.vocoder.models.*`. Every other non-GAN vocoder model is
considered as a generator too. Defaults to `hifigan_generator`.
generator_model_params (dict): Parameters of the generator model. Defaults to
`
{
"use_mel": True,
"sample_rate": 22050,
"n_fft": 1024,
"hop_length": 256,
"win_length": 1024,
"n_mels": 80,
"mel_fmin": 0.0,
"mel_fmax": None,
}
`
batch_size (int):
Batch size used at training. Larger values use more memory. Defaults to 16.
seq_len (int):
Audio segment length used at training. Larger values use more memory. Defaults to 8192.
pad_short (int):
Additional padding applied to the audio samples shorter than `seq_len`. Defaults to 0.
use_noise_augment (bool):
enable / disable random noise added to the input waveform. The noise is added after computing the
features. Defaults to True.
use_cache (bool):
enable / disable in memory caching of the computed features. It can cause OOM error if the system RAM is
not large enough. Defaults to True.
use_stft_loss (bool):
enable / disable use of STFT loss originally used by ParallelWaveGAN model. Defaults to True.
use_subband_stft (bool):
enable / disable use of subband loss computation originally used by MultiBandMelgan model. Defaults to True.
use_mse_gan_loss (bool):
enable / disable using Mean Squared Error GAN loss. Defaults to True.
use_hinge_gan_loss (bool):
enable / disable using Hinge GAN loss. You should choose either Hinge or MSE loss for training GAN models.
Defaults to False.
use_feat_match_loss (bool):
enable / disable using Feature Matching loss originally used by MelGAN model. Defaults to True.
use_l1_spec_loss (bool):
enable / disable using L1 spectrogram loss originally used by HifiGAN model. Defaults to False.
stft_loss_params (dict):
STFT loss parameters. Default to
`{
"n_ffts": [1024, 2048, 512],
"hop_lengths": [120, 240, 50],
"win_lengths": [600, 1200, 240]
}`
l1_spec_loss_params (dict):
L1 spectrogram loss parameters. Default to
`{
"use_mel": True,
"sample_rate": 22050,
"n_fft": 1024,
"hop_length": 256,
"win_length": 1024,
"n_mels": 80,
"mel_fmin": 0.0,
"mel_fmax": None,
}`
stft_loss_weight (float): STFT loss weight that multiplies the computed loss before summing up the total
model loss. Defaults to 0.5.
subband_stft_loss_weight (float):
Subband STFT loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0.
mse_G_loss_weight (float):
MSE generator loss weight that multiplies the computed loss before summing up the total loss. Defaults to 2.5.
hinge_G_loss_weight (float):
Hinge generator loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0.
feat_match_loss_weight (float):
Feature matching loss weight that multiplies the computed loss before summing up the total loss. Defaults to 108.
l1_spec_loss_weight (float):
L1 spectrogram loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0.
"""
model: str = "hifigan"
# model specific params

View File

@ -1,11 +1,66 @@
from dataclasses import dataclass, field
from .shared_configs import BaseGANVocoderConfig
from TTS.vocoder.configs.shared_configs import BaseGANVocoderConfig
@dataclass
class MelganConfig(BaseGANVocoderConfig):
"""Defines parameters for MelGAN vocoder."""
"""Defines parameters for MelGAN vocoder.
Example:
>>> from TTS.vocoder.configs import MelganConfig
>>> config = MelganConfig()
Args:
model (str):
Model name used for selecting the right model at initialization. Defaults to `melgan`.
discriminator_model (str): One of the discriminators from `TTS.vocoder.models.*_discriminator`. Defaults to
`melgan_multiscale_discriminator`.
discriminator_model_params (dict): The discriminator model parameters. Defaults to
`{"base_channels": 16, "max_channels": 1024, "downsample_factors": [4, 4, 4, 4]}`
generator_model (str): One of the generators from `TTS.vocoder.models.*`. Every other non-GAN vocoder model is
considered as a generator too. Defaults to `melgan_generator`.
batch_size (int):
Batch size used at training. Larger values use more memory. Defaults to 16.
seq_len (int):
Audio segment length used at training. Larger values use more memory. Defaults to 8192.
pad_short (int):
Additional padding applied to the audio samples shorter than `seq_len`. Defaults to 0.
use_noise_augment (bool):
enable / disable random noise added to the input waveform. The noise is added after computing the
features. Defaults to True.
use_cache (bool):
enable / disable in memory caching of the computed features. It can cause OOM error if the system RAM is
not large enough. Defaults to True.
use_stft_loss (bool):
enable / disable use of STFT loss originally used by ParallelWaveGAN model. Defaults to True.
use_subband_stft (bool):
enable / disable use of subband loss computation originally used by MultiBandMelgan model. Defaults to True.
use_mse_gan_loss (bool):
enable / disable using Mean Squared Error GAN loss. Defaults to True.
use_hinge_gan_loss (bool):
enable / disable using Hinge GAN loss. You should choose either Hinge or MSE loss for training GAN models.
Defaults to False.
use_feat_match_loss (bool):
enable / disable using Feature Matching loss originally used by MelGAN model. Defaults to True.
use_l1_spec_loss (bool):
enable / disable using L1 spectrogram loss originally used by HifiGAN model. Defaults to False.
stft_loss_params (dict): STFT loss parameters. Default to
`{"n_ffts": [1024, 2048, 512], "hop_lengths": [120, 240, 50], "win_lengths": [600, 1200, 240]}`
stft_loss_weight (float): STFT loss weight that multiplies the computed loss before summing up the total
model loss. Defaults to 0.5.
subband_stft_loss_weight (float):
Subband STFT loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0.
mse_G_loss_weight (float):
MSE generator loss weight that multiplies the computed loss before summing up the total loss. Defaults to 2.5.
hinge_G_loss_weight (float):
Hinge generator loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0.
feat_match_loss_weight (float):
Feature matching loss weight that multiplies the computed loss before summing up the total loss. Defaults to 108.
l1_spec_loss_weight (float):
L1 spectrogram loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0.
"""
model: str = "melgan"

View File

@ -1,11 +1,95 @@
from dataclasses import dataclass, field
from .shared_configs import BaseGANVocoderConfig
from TTS.vocoder.configs.shared_configs import BaseGANVocoderConfig
@dataclass
class MultibandMelganConfig(BaseGANVocoderConfig):
"""Defines parameters for MultiBandMelGAN vocoder."""
"""Defines parameters for MultiBandMelGAN vocoder.
Example:
>>> from TTS.vocoder.configs import MultibandMelganConfig
>>> config = MultibandMelganConfig()
Args:
model (str):
Model name used for selecting the right model at initialization. Defaults to `multiband_melgan`.
discriminator_model (str): One of the discriminators from `TTS.vocoder.models.*_discriminator`. Defaults to
`melgan_multiscale_discriminator`.
discriminator_model_params (dict): The discriminator model parameters. Defaults to
`{
"base_channels": 16,
"max_channels": 512,
"downsample_factors": [4, 4, 4]
}`
generator_model (str): One of the generators from `TTS.vocoder.models.*`. Every other non-GAN vocoder model is
considered as a generator too. Defaults to `melgan_generator`.
generator_model_params (dict):
The generator model parameters. Defaults to `{"upsample_factors": [8, 4, 2], "num_res_blocks": 4}`.
use_pqmf (bool):
enable / disable PQMF modulation for multi-band training. Defaults to True.
lr_gen (float):
Initial learning rate for the generator model. Defaults to 0.0001.
lr_disc (float):
Initial learning rate for the discriminator model. Defaults to 0.0001.
optimizer (torch.optim.Optimizer):
Optimizer used for the training. Defaults to `AdamW`.
optimizer_params (dict):
Optimizer kwargs. Defaults to `{"betas": [0.8, 0.99], "weight_decay": 0.0}`
lr_scheduler_gen (torch.optim.Scheduler):
Learning rate scheduler for the generator. Defaults to `MultiStepLR`.
lr_scheduler_gen_params (dict):
Parameters for the generator learning rate scheduler. Defaults to
`{"gamma": 0.5, "milestones": [100000, 200000, 300000, 400000, 500000, 600000]}`.
lr_scheduler_disc (torch.optim.Scheduler):
Learning rate scheduler for the discriminator. Defaults to `MultiStepLR`.
lr_scheduler_disc_params (dict):
Parameters for the discriminator learning rate scheduler. Defaults to
`{"gamma": 0.5, "milestones": [100000, 200000, 300000, 400000, 500000, 600000]}`.
batch_size (int):
Batch size used at training. Larger values use more memory. Defaults to 16.
seq_len (int):
Audio segment length used at training. Larger values use more memory. Defaults to 8192.
pad_short (int):
Additional padding applied to the audio samples shorter than `seq_len`. Defaults to 0.
use_noise_augment (bool):
enable / disable random noise added to the input waveform. The noise is added after computing the
features. Defaults to True.
use_cache (bool):
enable / disable in memory caching of the computed features. It can cause OOM error if the system RAM is
not large enough. Defaults to True.
steps_to_start_discriminator (int):
Number of steps required to start training the discriminator. Defaults to 0.
use_stft_loss (bool):
enable / disable use of STFT loss originally used by ParallelWaveGAN model. Defaults to True.
use_subband_stft (bool):
enable / disable use of subband loss computation originally used by MultiBandMelgan model. Defaults to True.
use_mse_gan_loss (bool):
enable / disable using Mean Squared Error GAN loss. Defaults to True.
use_hinge_gan_loss (bool):
enable / disable using Hinge GAN loss. You should choose either Hinge or MSE loss for training GAN models.
Defaults to False.
use_feat_match_loss (bool):
enable / disable using Feature Matching loss originally used by MelGAN model. Defaults to True.
use_l1_spec_loss (bool):
enable / disable using L1 spectrogram loss originally used by HifiGAN model. Defaults to False.
stft_loss_params (dict): STFT loss parameters. Default to
`{"n_ffts": [1024, 2048, 512], "hop_lengths": [120, 240, 50], "win_lengths": [600, 1200, 240]}`
stft_loss_weight (float): STFT loss weight that multiplies the computed loss before summing up the total
model loss. Defaults to 0.5.
subband_stft_loss_weight (float):
Subband STFT loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0.
mse_G_loss_weight (float):
MSE generator loss weight that multiplies the computed loss before summing up the total loss. Defaults to 2.5.
hinge_G_loss_weight (float):
Hinge generator loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0.
feat_match_loss_weight (float):
Feature matching loss weight that multiplies the computed loss before summing up the total loss. Defaults to 108.
l1_spec_loss_weight (float):
L1 spectrogram loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0.
"""
model: str = "multiband_melgan"
@ -59,7 +143,3 @@ class MultibandMelganConfig(BaseGANVocoderConfig):
hinge_G_loss_weight: float = 0
feat_match_loss_weight: float = 108
l1_spec_loss_weight: float = 0
# optimizer parameters
lr: float = 1e-4
wd: float = 1e-6

View File

@ -5,7 +5,77 @@ from .shared_configs import BaseGANVocoderConfig
@dataclass
class ParallelWaveganConfig(BaseGANVocoderConfig):
"""Defines parameters for ParallelWavegan vocoder."""
"""Defines parameters for ParallelWavegan vocoder.
Args:
model (str):
Model name used for selecting the right configuration at initialization. Defaults to `parallel_wavegan`.
discriminator_model (str): One of the discriminators from `TTS.vocoder.models.*_discriminator`. Defaults to
`parallel_wavegan_discriminator`.
discriminator_model_params (dict): The discriminator model kwargs. Defaults to
`{"num_layers": 10}`
generator_model (str): One of the generators from `TTS.vocoder.models.*`. Every other non-GAN vocoder model is
considered as a generator too. Defaults to `parallel_wavegan_generator`.
generator_model_params (dict):
The generator model kwargs. Defaults to `{"upsample_factors": [4, 4, 4, 4], "stacks": 3, "num_res_blocks": 30}`.
batch_size (int):
Batch size used at training. Larger values use more memory. Defaults to 16.
seq_len (int):
Audio segment length used at training. Larger values use more memory. Defaults to 8192.
pad_short (int):
Additional padding applied to the audio samples shorter than `seq_len`. Defaults to 0.
use_noise_augment (bool):
enable / disable random noise added to the input waveform. The noise is added after computing the
features. Defaults to True.
use_cache (bool):
enable / disable in memory caching of the computed features. It can cause OOM error if the system RAM is
not large enough. Defaults to True.
steps_to_start_discriminator (int):
Number of steps required to start training the discriminator. Defaults to 0.
use_stft_loss (bool):
enable / disable use of STFT loss originally used by ParallelWaveGAN model. Defaults to True.
use_subband_stft (bool):
enable / disable use of subband loss computation originally used by MultiBandMelgan model. Defaults to True.
use_mse_gan_loss (bool):
enable / disable using Mean Squared Error GAN loss. Defaults to True.
use_hinge_gan_loss (bool):
enable / disable using Hinge GAN loss. You should choose either Hinge or MSE loss for training GAN models.
Defaults to False.
use_feat_match_loss (bool):
enable / disable using Feature Matching loss originally used by MelGAN model. Defaults to True.
use_l1_spec_loss (bool):
enable / disable using L1 spectrogram loss originally used by HifiGAN model. Defaults to False.
stft_loss_params (dict): STFT loss parameters. Default to
`{"n_ffts": [1024, 2048, 512], "hop_lengths": [120, 240, 50], "win_lengths": [600, 1200, 240]}`
stft_loss_weight (float): STFT loss weight that multiplies the computed loss before summing up the total
model loss. Defaults to 0.5.
subband_stft_loss_weight (float):
Subband STFT loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0.
mse_G_loss_weight (float):
MSE generator loss weight that multiplies the computed loss before summing up the total loss. Defaults to 2.5.
hinge_G_loss_weight (float):
Hinge generator loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0.
feat_match_loss_weight (float):
Feature matching loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0.
l1_spec_loss_weight (float):
L1 spectrogram loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0.
lr_gen (float):
Generator model initial learning rate. Defaults to 0.0002.
lr_disc (float):
Discriminator model initial learning rate. Defaults to 0.0002.
optimizer (torch.optim.Optimizer):
Optimizer used for the training. Defaults to `AdamW`.
optimizer_params (dict):
Optimizer kwargs. Defaults to `{"betas": [0.8, 0.99], "weight_decay": 0.0}`
lr_scheduler_gen (torch.optim.Scheduler):
Learning rate scheduler for the generator. Defaults to `ExponentialLR`.
lr_scheduler_gen_params (dict):
Parameters for the generator learning rate scheduler. Defaults to `{"gamma": 0.999, "last_epoch": -1}`.
lr_scheduler_disc (torch.optim.Scheduler):
Learning rate scheduler for the discriminator. Defaults to `ExponentialLR`.
lr_scheduler_disc_params (dict):
Parameters for the discriminator learning rate scheduler. Defaults to `{"gamma": 0.999, "last_epoch": -1}`.
"""
model: str = "parallel_wavegan"

View File

@ -7,7 +7,34 @@ from TTS.config import BaseAudioConfig, BaseTrainingConfig
@dataclass
class BaseVocoderConfig(BaseTrainingConfig):
"""Shared parameters among all the vocoder models."""
"""Shared parameters among all the vocoder models.
Args:
audio (BaseAudioConfig):
Audio processor config instance. Defaults to `BaseAudioConfig()`.
use_noise_augment (bool):
Augment the input audio with random noise. Defaults to False.
eval_split_size (int):
Number of instances used for evaluation. Defaults to 10.
data_path (str):
Root path of the training data. All the audio files found recursively from this root path are used for
training. Defaults to MISSING.
feature_path (str):
Root path to the precomputed feature files. Defaults to None.
seq_len (int):
Length of the waveform segments used for training. Defaults to MISSING.
pad_short (int):
Extra padding for the waveforms shorter than `seq_len`. Defaults to 0.
conv_pad (int):
Extra padding for the feature frames against convolution of the edge frames. Defaults to 0.
use_cache (bool):
enable / disable in memory caching of the computed features. If the RAM is not enough, it may cause OOM.
Defaults to False.
epochs (int):
Number of training epochs. Defaults to 10000.
wd (float):
Weight decay.
"""
audio: BaseAudioConfig = field(default_factory=BaseAudioConfig)
# dataloading
@ -19,7 +46,6 @@ class BaseVocoderConfig(BaseTrainingConfig):
seq_len: int = MISSING # signal length used in training.
pad_short: int = 0 # additional padding for short wavs
conv_pad: int = 0 # additional padding against convolutions applied to spectrograms
use_noise_augment: bool = False # add noise to the audio signal for augmentation
use_cache: bool = False # use in memory cache to keep the computed features. This might cause OOM.
# OPTIMIZER
epochs: int = 10000 # total number of epochs to train.
@ -28,7 +54,78 @@ class BaseVocoderConfig(BaseTrainingConfig):
@dataclass
class BaseGANVocoderConfig(BaseVocoderConfig):
"""Common config interface for all the GAN based vocoder models."""
"""Base config class used among all the GAN based vocoders.
Args:
use_stft_loss (bool):
enable / disable the use of STFT loss. Defaults to True.
use_subband_stft_loss (bool):
enable / disable the use of Subband STFT loss. Defaults to True.
use_mse_gan_loss (bool):
enable / disable the use of Mean Squared Error based GAN loss. Defaults to True.
use_hinge_gan_loss (bool):
enable / disable the use of Hinge GAN loss. Defaults to True.
use_feat_match_loss (bool):
enable / disable feature matching loss. Defaults to True.
use_l1_spec_loss (bool):
enable / disable L1 spectrogram loss. Defaults to True.
stft_loss_weight (float):
Loss weight that multiplies the computed loss value. Defaults to 0.
subband_stft_loss_weight (float):
Loss weight that multiplies the computed loss value. Defaults to 0.
mse_G_loss_weight (float):
Loss weight that multiplies the computed loss value. Defaults to 1.
hinge_G_loss_weight (float):
Loss weight that multiplies the computed loss value. Defaults to 0.
feat_match_loss_weight (float):
Loss weight that multiplies the computed loss value. Defaults to 100.
l1_spec_loss_weight (float):
Loss weight that multiplies the computed loss value. Defaults to 45.
stft_loss_params (dict):
Parameters for the STFT loss. Defaults to `{"n_ffts": [1024, 2048, 512], "hop_lengths": [120, 240, 50], "win_lengths": [600, 1200, 240]}`.
l1_spec_loss_params (dict):
Parameters for the L1 spectrogram loss. Defaults to
`{
"use_mel": True,
"sample_rate": 22050,
"n_fft": 1024,
"hop_length": 256,
"win_length": 1024,
"n_mels": 80,
"mel_fmin": 0.0,
"mel_fmax": None,
}`
target_loss (str):
Target loss name that defines the quality of the model. Defaults to `avg_G_loss`.
gen_clip_grad (float):
Gradient clipping threshold for the generator model. Any value less than 0 disables clipping.
Defaults to -1.
disc_clip_grad (float):
Gradient clipping threshold for the discriminator model. Any value less than 0 disables clipping.
Defaults to -1.
lr_gen (float):
Generator model initial learning rate. Defaults to 0.0002.
lr_disc (float):
Discriminator model initial learning rate. Defaults to 0.0002.
optimizer (torch.optim.Optimizer):
Optimizer used for the training. Defaults to `AdamW`.
optimizer_params (dict):
Optimizer kwargs. Defaults to `{"betas": [0.8, 0.99], "weight_decay": 0.0}`
lr_scheduler_gen (torch.optim.Scheduler):
Learning rate scheduler for the generator. Defaults to `ExponentialLR`.
lr_scheduler_gen_params (dict):
Parameters for the generator learning rate scheduler. Defaults to `{"gamma": 0.999, "last_epoch": -1}`.
lr_scheduler_disc (torch.optim.Scheduler):
Learning rate scheduler for the discriminator. Defaults to `ExponentialLR`.
lr_scheduler_disc_params (dict):
Parameters for the discriminator learning rate scheduler. Defaults to `{"gamma": 0.999, "last_epoch": -1}`.
use_pqmf (bool):
enable / disable PQMF for subband approximation at training. Defaults to False.
steps_to_start_discriminator (int):
Number of steps required to start training the discriminator. Defaults to 0.
diff_samples_for_G_and_D (bool):
enable / disable use of different training samples for the generator and the discriminator iterations.
Enabling it results in slower iterations but faster convergence in some cases. Defaults to False.
"""
# LOSS PARAMETERS
use_stft_loss: bool = True
@ -43,7 +140,7 @@ class BaseGANVocoderConfig(BaseVocoderConfig):
subband_stft_loss_weight: float = 0
mse_G_loss_weight: float = 1
hinge_G_loss_weight: float = 0
feat_match_loss_weight: float = 10
feat_match_loss_weight: float = 100
l1_spec_loss_weight: float = 45
stft_loss_params: dict = field(

View File

@ -1,12 +1,71 @@
from dataclasses import dataclass, field
from .shared_configs import BaseVocoderConfig
from TTS.vocoder.configs.shared_configs import BaseVocoderConfig
@dataclass
class WavegradConfig(BaseVocoderConfig):
"""Defines parameters for Wavernn vocoder."""
"""Defines parameters for WaveGrad vocoder.
Example:
>>> from TTS.vocoder.configs import WavegradConfig
>>> config = WavegradConfig()
Args:
model (str):
Model name used for selecting the right model at initialization. Defaults to `wavegrad`.
generator_model (str): One of the generators from `TTS.vocoder.models.*`. Every other non-GAN vocoder model is
considered as a generator too. Defaults to `wavegrad`.
model_params (dict):
WaveGrad kwargs. Defaults to
`
{
"use_weight_norm": True,
"y_conv_channels": 32,
"x_conv_channels": 768,
"ublock_out_channels": [512, 512, 256, 128, 128],
"dblock_out_channels": [128, 128, 256, 512],
"upsample_factors": [4, 4, 4, 2, 2],
"upsample_dilations": [[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 4, 8], [1, 2, 4, 8], [1, 2, 4, 8]],
}
`
target_loss (str):
Target loss name that defines the quality of the model. Defaults to `avg_wavegrad_loss`.
epochs (int):
Number of epochs to train the model. Defaults to 10000.
batch_size (int):
Batch size used at training. Larger values use more memory. Defaults to 96.
seq_len (int):
Audio segment length used at training. Larger values use more memory. Defaults to 6144.
use_cache (bool):
enable / disable in memory caching of the computed features. It can cause OOM error if the system RAM is
not large enough. Defaults to True.
mixed_precision (bool):
enable / disable mixed precision training. Default is True.
eval_split_size (int):
Number of samples used for evaluation. Defaults to 50.
train_noise_schedule (dict):
Training noise schedule. Defaults to
`{"min_val": 1e-6, "max_val": 1e-2, "num_steps": 1000}`
test_noise_schedule (dict):
Inference noise schedule. For a better performance, you may need to use `bin/tune_wavegrad.py` to find a
better schedule. Defaults to
`
{
"min_val": 1e-6,
"max_val": 1e-2,
"num_steps": 50,
}
`
grad_clip (float):
Gradient clipping threshold. If <= 0.0, no clipping is applied. Defaults to 1.0
lr (float):
Initial learning rate. Defaults to 1e-4.
lr_scheduler (str):
One of the learning rate schedulers from `torch.optim.lr_scheduler.*`. Defaults to `MultiStepLR`.
lr_scheduler_params (dict):
kwargs for the scheduler. Defaults to `{"gamma": 0.5, "milestones": [100000, 200000, 300000, 400000, 500000, 600000]}`
"""
model: str = "wavegrad"
# Model specific params
generator_model: str = "wavegrad"
@ -28,7 +87,6 @@ class WavegradConfig(BaseVocoderConfig):
batch_size: int = 96
seq_len: int = 6144
use_cache: bool = True
steps_to_start_discriminator: int = 200000
mixed_precision: bool = True
eval_split_size: int = 50

View File

@ -1,11 +1,77 @@
from dataclasses import dataclass, field
from .shared_configs import BaseVocoderConfig
from TTS.vocoder.configs.shared_configs import BaseVocoderConfig
@dataclass
class WavernnConfig(BaseVocoderConfig):
"""Defines parameters for Wavernn vocoder."""
"""Defines parameters for Wavernn vocoder.
Example:
>>> from TTS.vocoder.configs import WavernnConfig
>>> config = WavernnConfig()
Args:
model (str):
Model name used for selecting the right model at initialization. Defaults to `wavernn`.
mode (str):
Output mode of the WaveRNN vocoder. `mold` for Mixture of Logistic Distribution, `gauss` for a single
Gaussian Distribution and `bits` for quantized bits as the model's output.
mulaw (bool):
enable / disable the use of Mulaw quantization for training. Only applicable if `mode == 'bits'`. Defaults
to `True`.
generator_model (str):
One of the generators from `TTS.vocoder.models.*`. Every other non-GAN vocoder model is
considered as a generator too. Defaults to `WaveRNN`.
wavernn_model_params (dict):
kwargs for the WaveRNN model. Defaults to
`{
"rnn_dims": 512,
"fc_dims": 512,
"compute_dims": 128,
"res_out_dims": 128,
"num_res_blocks": 10,
"use_aux_net": True,
"use_upsample_net": True,
"upsample_factors": [4, 8, 8]
}`
batched (bool):
enable / disable the batched inference. It speeds up the inference by splitting the input into segments and
processing the segments in a batch. Then it merges the outputs with a certain overlap and smoothing. If
you set it False, without CUDA, it is too slow to be practical. Defaults to True.
target_samples (int):
Size of the segments in batched mode. Defaults to 11000.
overlap_samples (int):
Size of the overlap between consecutive segments. Defaults to 550.
batch_size (int):
Batch size used at training. Larger values use more memory. Defaults to 256.
seq_len (int):
Audio segment length used at training. Larger values use more memory. Defaults to 1280.
padding (int):
Padding applied to the input feature frames against the convolution layers of the feature network.
Defaults to 2.
use_noise_augment (bool):
enable / disable random noise added to the input waveform. The noise is added after computing the
features. Defaults to False.
use_cache (bool):
enable / disable in memory caching of the computed features. It can cause OOM error if the system RAM is
not large enough. Defaults to True.
mixed_precision (bool):
enable / disable mixed precision training. Default is True.
eval_split_size (int):
Number of samples used for evaluation. Defaults to 50.
test_every_epochs (int):
Number of epochs waited to run the next evaluation. Since inference takes some time, it is better to
wait some number of epochs not to waste training time. Defaults to 10.
grad_clip (float):
Gradient clipping threshold. If <= 0.0, no clipping is applied. Defaults to 4.0
lr (float):
Initial learning rate. Defaults to 1e-4.
lr_scheduler (str):
One of the learning rate schedulers from `torch.optim.lr_scheduler.*`. Defaults to `MultiStepLR`.
lr_scheduler_params (dict):
kwargs for the scheduler. Defaults to `{"gamma": 0.5, "milestones": [200000, 400000, 600000]}`
"""
model: str = "wavernn"
@ -38,7 +104,6 @@ class WavernnConfig(BaseVocoderConfig):
padding: int = 2
use_noise_augment: bool = False
use_cache: bool = True
steps_to_start_discriminator: int = 200000
mixed_precision: bool = True
eval_split_size: int = 50
test_every_epochs: int = 10 # number of epochs to wait until the next test run (synthesizing a full audio clip).