mirror of https://github.com/coqui-ai/TTS.git
add docstrings with default value fixes
This commit is contained in:
parent
7e02cff924
commit
8b1014d188
|
@ -13,7 +13,7 @@ class BaseAudioConfig(Coqpit):
|
|||
Number of STFT frequency levels, a.k.a. the size of the linear spectrogram frame. Defaults to 1024.
|
||||
win_length (int):
|
||||
Each frame of audio is windowed by window of length ```win_length``` and then padded with zeros to match
|
||||
```fft_size```. Defaults to 256.
|
||||
```fft_size```. Defaults to 1024.
|
||||
hop_length (int):
|
||||
Number of audio samples between adjacent STFT columns. Defaults to 1024.
|
||||
frame_shift_ms (int):
|
||||
|
@ -21,7 +21,7 @@ class BaseAudioConfig(Coqpit):
|
|||
frame_length_ms (int):
|
||||
Set ```win_length``` based on milliseconds and sampling rate.
|
||||
stft_pad_mode (str):
|
||||
Padding method used in STFT. 'reflect' or 'center'.
|
||||
Padding method used in STFT. 'reflect' or 'center'. Defaults to 'reflect'.
|
||||
sample_rate (int):
|
||||
Audio sampling rate. Defaults to 22050.
|
||||
resample (bool):
|
||||
|
@ -135,11 +135,27 @@ class BaseAudioConfig(Coqpit):
|
|||
|
||||
@dataclass
|
||||
class BaseDatasetConfig(Coqpit):
|
||||
name: str = None
|
||||
path: str = None
|
||||
meta_file_train: Union[str, List] = None # TODO: don't take ignored speakers for multi-speaker datasets over this. This is Union for SC-Glow compat.
|
||||
meta_file_val: str = None
|
||||
meta_file_attn_mask: str = None
|
||||
"""Base config for TTS datasets.
|
||||
|
||||
Args:
|
||||
name (str):
|
||||
Dataset name that defines the preprocessor in use. Defaults to None.
|
||||
path (str):
|
||||
Root path to the dataset files. Defaults to None.
|
||||
meta_file_train (Union[str, List]):
|
||||
Name of the dataset meta file. Or a list of speakers to be ignored at training for multi-speaker datasets.
|
||||
Defaults to None.
|
||||
meta_file_val (str):
|
||||
Name of the dataset meta file that defines the instances used at validation.
|
||||
meta_file_attn_mask (str):
|
||||
Path to the file that lists the attention mask files used with models that require attention masks to
|
||||
train the duration predictor.
|
||||
"""
|
||||
name: str = ''
|
||||
path: str = ''
|
||||
meta_file_train: Union[str, List] = '' # TODO: don't take ignored speakers for multi-speaker datasets over this. This is Union for SC-Glow compat.
|
||||
meta_file_val: str = ''
|
||||
meta_file_attn_mask: str = ''
|
||||
|
||||
def check_values(
|
||||
self,
|
||||
|
@ -161,12 +177,8 @@ class BaseTrainingConfig(Coqpit):
|
|||
Args:
|
||||
batch_size (int):
|
||||
Training batch size.
|
||||
batch_group_size (int):
|
||||
Number of batches to shuffle after bucketing.
|
||||
eval_batch_size (int):
|
||||
Validation batch size.
|
||||
loss_masking (bool):
|
||||
Enable / Disable masking padding segments of sequences.
|
||||
mixed_precision (bool):
|
||||
Enable / Disable mixed precision training. It reduces the VRAM use and allows larger batch sizes, however
|
||||
it may also cause numerical instability in some cases.
|
||||
|
@ -195,34 +207,13 @@ class BaseTrainingConfig(Coqpit):
|
|||
keep_after (int):
|
||||
Number of steps to wait before saving all the best models. In use if ```keep_all_best == True```. Defaults
|
||||
to 10000.
|
||||
text_cleaner (str):
|
||||
Text cleaner to be used at model training. It is set to be one of the cleaners in
|
||||
```TTS.tts.utils.text.cleaners```.
|
||||
enable_eos_bos_chars (bool):
|
||||
Enable / Disable using special characters indicating end-of-sentence and beginning-of-sentence.
|
||||
num_loader_workers (int):
|
||||
Number of workers for training time dataloader.
|
||||
num_val_loader_workers (int):
|
||||
Number of workers for evaluation time dataloader.
|
||||
min_seq_len (int):
|
||||
Minimum sequence length to be used at training.
|
||||
max_seq_len (int):
|
||||
Maximum sequence length to be used at training. VRAM use at training depends on this parameter. Consider to
|
||||
decrease it if you get OOM errors.
|
||||
compute_f0 (bool):
|
||||
Return F0 frames from the dataloader. Defaults to ```False```.
|
||||
compute_input_seq_cache (bool):
|
||||
Enable / Disable computing and caching phoneme sequences from character sequences at the beginning of the
|
||||
training. It allows faster data loading times and more precise max-min sequence pruning. Defaults
|
||||
to ```False```.
|
||||
output_path (str):
|
||||
Path for training output folder. Any non-existent part of the given path is created automatically.
|
||||
All training outputs are saved there.
|
||||
phoneme_cache_path (str):
|
||||
Path to a folder to save the computed phoneme sequences.
|
||||
datasets (List[BaseDatasetConfig]):
|
||||
List of DatasetConfig.
|
||||
|
||||
"""
|
||||
|
||||
model: str = None
|
||||
|
|
|
@ -1,11 +1,69 @@
|
|||
from dataclasses import dataclass, field
|
||||
|
||||
from .shared_configs import BaseTTSConfig
|
||||
from TTS.tts.configs.shared_configs import BaseTTSConfig
|
||||
|
||||
|
||||
@dataclass
|
||||
class AlignTTSConfig(BaseTTSConfig):
|
||||
"""Defines parameters for AlignTTS model."""
|
||||
"""Defines parameters for AlignTTS model.
|
||||
Example:
|
||||
|
||||
>>> from TTS.tts.configs import AlignTTSConfig
|
||||
>>> config = AlignTTSConfig()
|
||||
|
||||
Args:
|
||||
model(str):
|
||||
Model name used for selecting the right model at initialization. Defaults to `align_tts`.
|
||||
positional_encoding (bool):
|
||||
enable / disable positional encoding applied to the encoder output. Defaults to True.
|
||||
hidden_channels (int):
|
||||
Base number of hidden channels. Defines all the layers except the ones defined by the specific encoder or decoder
|
||||
parameters. Defaults to 256.
|
||||
hidden_channels_dp (int):
|
||||
Number of hidden channels of the duration predictor's layers. Defaults to 256.
|
||||
encoder_type (str):
|
||||
Type of the encoder used by the model. Look at `TTS.tts.layers.feed_forward.encoder` for more details.
|
||||
Defaults to `fftransformer`.
|
||||
encoder_params (dict):
|
||||
Parameters used to define the encoder network. Look at `TTS.tts.layers.feed_forward.encoder` for more details.
|
||||
Defaults to `{"hidden_channels_ffn": 1024, "num_heads": 2, "num_layers": 6, "dropout_p": 0.1}`.
|
||||
decoder_type (str):
|
||||
Type of the decoder used by the model. Look at `TTS.tts.layers.feed_forward.decoder` for more details.
|
||||
Defaults to `fftransformer`.
|
||||
decoder_params (dict):
|
||||
Parameters used to define the decoder network. Look at `TTS.tts.layers.feed_forward.decoder` for more details.
|
||||
Defaults to `{"hidden_channels_ffn": 1024, "num_heads": 2, "num_layers": 6, "dropout_p": 0.1}`.
|
||||
phase_start_steps (List[int]):
|
||||
A list of number of steps required to start the next training phase. AlignTTS has 4 different training
|
||||
phases. Thus you need to define 4 different values to enable phase based training. If None, it
|
||||
trains the whole model together. Defaults to None.
|
||||
ssim_alpha (float):
|
||||
Weight for the SSIM loss. If set <= 0, disables the SSIM loss. Defaults to 1.0.
|
||||
duration_loss_alpha (float):
|
||||
Weight for the duration predictor's loss. Defaults to 1.0.
|
||||
mdn_alpha (float):
|
||||
Weight for the MDN loss. Defaults to 1.0.
|
||||
spec_loss_alpha (float):
|
||||
Weight for the MSE spectrogram loss. If set <= 0, disables the MSE spectrogram loss. Defaults to 1.0.
|
||||
use_speaker_embedding (bool):
|
||||
enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
|
||||
in the multi-speaker mode. Defaults to False.
|
||||
use_external_speaker_embedding_file (bool):
|
||||
enable /disable using external speaker embeddings in place of the learned embeddings. Defaults to False.
|
||||
external_speaker_embedding_file (str):
|
||||
Path to the file including pre-computed speaker embeddings. Defaults to None.
|
||||
noam_schedule (bool):
|
||||
enable / disable the use of Noam LR scheduler. Defaults to False.
|
||||
warmup_steps (int):
|
||||
Number of warm-up steps for the Noam scheduler. Defaults to 4000.
|
||||
lr (float):
|
||||
Initial learning rate. Defaults to `1e-3`.
|
||||
wd (float):
|
||||
Weight decay coefficient. Defaults to `1e-7`.
|
||||
min_seq_len (int):
|
||||
Minimum input sequence length to be used at training.
|
||||
max_seq_len (int):
|
||||
Maximum input sequence length to be used at training. Larger values result in more VRAM usage."""
|
||||
|
||||
model: str = "align_tts"
|
||||
# model specific params
|
||||
|
|
|
@ -1,11 +1,64 @@
|
|||
from dataclasses import dataclass, field
|
||||
|
||||
from .shared_configs import BaseTTSConfig
|
||||
from TTS.tts.configs.shared_configs import BaseTTSConfig
|
||||
|
||||
|
||||
@dataclass
|
||||
class GlowTTSConfig(BaseTTSConfig):
|
||||
"""Defines parameters for GlowTTS model."""
|
||||
"""Defines parameters for GlowTTS model.
|
||||
|
||||
Example:
|
||||
|
||||
>>> from TTS.tts.configs import GlowTTSConfig
|
||||
>>> config = GlowTTSConfig()
|
||||
|
||||
Args:
|
||||
model(str):
|
||||
Model name used for selecting the right model at initialization. Defaults to `glow_tts`.
|
||||
encoder_type (str):
|
||||
Type of the encoder used by the model. Look at `TTS.tts.layers.glow_tts.encoder` for more details.
|
||||
Defaults to `rel_pos_transformers`.
|
||||
encoder_params (dict):
|
||||
Parameters used to define the encoder network. Look at `TTS.tts.layers.glow_tts.encoder` for more details.
|
||||
Defaults to `{"kernel_size": 3, "dropout_p": 0.1, "num_layers": 6, "num_heads": 2, "hidden_channels_ffn": 768}`
|
||||
use_encoder_prenet (bool):
|
||||
enable / disable the use of a prenet for the encoder. Defaults to True.
|
||||
hidden_channels_encoder (int):
|
||||
Number of base hidden channels used by the encoder network. It defines the input and the output channel sizes,
|
||||
and for some encoder types internal hidden channels sizes too. Defaults to 192.
|
||||
hidden_channels_decoder (int):
|
||||
Number of base hidden channels used by the decoder WaveNet network. Defaults to 192 as in the original work.
|
||||
hidden_channels_duration_predictor (int):
|
||||
Number of layer channels of the duration predictor network. Defaults to 256 as in the original work.
|
||||
data_dep_init_steps (int):
|
||||
Number of steps used for computing normalization parameters at the beginning of the training. GlowTTS uses
|
||||
Activation Normalization that pre-computes normalization stats at the beginning and use the same values
|
||||
for the rest. Defaults to 10.
|
||||
style_wav_for_test (str):
|
||||
Path to the wav file used for changing the style of the speech. Defaults to None.
|
||||
inference_noise_scale (float):
|
||||
Variance used for sampling the random noise added to the decoder's input at inference. Defaults to 0.0.
|
||||
use_speaker_embedding (bool):
|
||||
enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
|
||||
in the multi-speaker mode. Defaults to False.
|
||||
use_external_speaker_embedding_file (bool):
|
||||
enable /disable using external speaker embeddings in place of the learned embeddings. Defaults to False.
|
||||
external_speaker_embedding_file (str):
|
||||
Path to the file including pre-computed speaker embeddings. Defaults to None.
|
||||
noam_schedule (bool):
|
||||
enable / disable the use of Noam LR scheduler. Defaults to False.
|
||||
warmup_steps (int):
|
||||
Number of warm-up steps for the Noam scheduler. Defaults to 4000.
|
||||
lr (float):
|
||||
Initial learning rate. Defaults to `1e-3`.
|
||||
wd (float):
|
||||
Weight decay coefficient. Defaults to `1e-7`.
|
||||
min_seq_len (int):
|
||||
Minimum input sequence length to be used at training.
|
||||
max_seq_len (int):
|
||||
Maximum input sequence length to be used at training. Larger values result in more VRAM usage.
|
||||
"""
|
||||
|
||||
|
||||
model: str = "glow_tts"
|
||||
|
||||
|
@ -47,4 +100,4 @@ class GlowTTSConfig(BaseTTSConfig):
|
|||
# overrides
|
||||
min_seq_len: int = 3
|
||||
max_seq_len: int = 500
|
||||
r: int = 1
|
||||
r: int = 1 # DO NOT CHANGE - TODO: make this immutable once coqpit implements it.
|
||||
|
|
|
@ -8,8 +8,20 @@ from TTS.config import BaseAudioConfig, BaseDatasetConfig, BaseTrainingConfig
|
|||
|
||||
@dataclass
|
||||
class GSTConfig(Coqpit):
|
||||
"""Defines Global Style Toke module"""
|
||||
"""Defines the Global Style Token Module
|
||||
|
||||
Args:
|
||||
gst_style_input_wav (str):
|
||||
Path to the wav file used to define the style of the output speech at inference. Defaults to None.
|
||||
gst_style_input_weights (dict):
|
||||
Defines the weights for each style token used at inference. Defaults to None.
|
||||
gst_embedding_dim (int):
|
||||
Defines the size of the GST embedding vector dimensions. Defaults to 256.
|
||||
gst_num_heads (int):
|
||||
Number of attention heads used by the multi-head attention. Defaults to 4.
|
||||
gst_num_style_tokens (int):
|
||||
Number of style token vectors. Defaults to 10.
|
||||
"""
|
||||
gst_style_input_wav: str = None
|
||||
gst_style_input_weights: dict = None
|
||||
gst_embedding_dim: int = 256
|
||||
|
@ -33,7 +45,26 @@ class GSTConfig(Coqpit):
|
|||
|
||||
@dataclass
|
||||
class CharactersConfig(Coqpit):
|
||||
"""Defines character or phoneme set used by the model"""
|
||||
"""Defines character or phoneme set used by the model
|
||||
|
||||
Args:
|
||||
pad (str):
|
||||
characters in place of empty padding. Defaults to None.
|
||||
eos (str):
|
||||
characters showing the end of a sentence. Defaults to None.
|
||||
bos (str):
|
||||
characters showing the beginning of a sentence. Defaults to None.
|
||||
characters (str):
|
||||
character set used by the model. Characters not in this list are ignored when converting input text to
|
||||
a list of sequence IDs. Defaults to None.
|
||||
punctuations (str):
|
||||
characters considered as punctuation when parsing the input sentence. Defaults to None.
|
||||
phonemes (str):
|
||||
characters considered as parsing phonemes. Defaults to None.
|
||||
unique (bool):
|
||||
remove any duplicate characters in the character lists. It is a bandaid for compatibility with the old
|
||||
models trained with character lists with duplicates.
|
||||
"""
|
||||
|
||||
pad: str = None
|
||||
eos: str = None
|
||||
|
@ -58,7 +89,48 @@ class CharactersConfig(Coqpit):
|
|||
|
||||
@dataclass
|
||||
class BaseTTSConfig(BaseTrainingConfig):
|
||||
"""Shared parameters among all the tts models."""
|
||||
"""Shared parameters among all the tts models.
|
||||
|
||||
Args:
|
||||
audio (BaseAudioConfig):
|
||||
Audio processor config object instance.
|
||||
use_phonemes (bool):
|
||||
enable / disable phoneme use.
|
||||
compute_input_seq_cache (bool):
|
||||
enable / disable precomputation of the phoneme sequences. At the expense of some delay at the beginning of
|
||||
the training, it allows faster data loader time and precise limitation with `max_seq_len` and
|
||||
`min_seq_len`.
|
||||
text_cleaner (str):
|
||||
Name of the text cleaner used for cleaning and formatting transcripts.
|
||||
enable_eos_bos_chars (bool):
|
||||
enable / disable the use of eos and bos characters.
|
||||
test_senteces_file (str):
|
||||
Path to a txt file that has sentences used at test time. The file must have a sentence per line.
|
||||
phoneme_cache_path (str):
|
||||
Path to the output folder caching the computed phonemes for each sample.
|
||||
characters (CharactersConfig):
|
||||
Instance of a CharactersConfig class.
|
||||
batch_group_size (int):
|
||||
Size of the batch groups used for bucketing. By default, the dataloader orders samples by the sequence
|
||||
length for a more efficient and stable training. If `batch_group_size > 1` then it performs bucketing to
|
||||
prevent using the same batches for each epoch.
|
||||
loss_masking (bool):
|
||||
enable / disable masking loss values against padded segments of samples in a batch.
|
||||
min_seq_len (int):
|
||||
Minimum input sequence length to be used at training.
|
||||
max_seq_len (int):
|
||||
Maximum input sequence length to be used at training. Larger values result in more VRAM usage.
|
||||
compute_f0 (int):
|
||||
(Not in use yet).
|
||||
use_noise_augment (bool):
|
||||
Augment the input audio with random noise.
|
||||
add_blank (bool):
|
||||
Add a blank character between every two characters. It improves performance for some models at the expense
|
||||
of slower run-time due to the longer input sequence.
|
||||
datasets (List[BaseDatasetConfig]):
|
||||
List of datasets used for training. If multiple datasets are provided, they are merged and used together
|
||||
for training.
|
||||
"""
|
||||
|
||||
audio: BaseAudioConfig = field(default_factory=BaseAudioConfig)
|
||||
# phoneme settings
|
||||
|
|
|
@ -1,11 +1,74 @@
|
|||
from dataclasses import dataclass, field
|
||||
|
||||
from .shared_configs import BaseTTSConfig
|
||||
from TTS.tts.configs.shared_configs import BaseTTSConfig
|
||||
|
||||
|
||||
@dataclass
|
||||
class SpeedySpeechConfig(BaseTTSConfig):
|
||||
"""Defines parameters for Speedy Speech (feed-forward encoder-decoder) based models."""
|
||||
"""Defines parameters for Speedy Speech (feed-forward encoder-decoder) based models.
|
||||
|
||||
Example:
|
||||
|
||||
>>> from TTS.tts.configs import SpeedySpeechConfig
|
||||
>>> config = SpeedySpeechConfig()
|
||||
|
||||
Args:
|
||||
model (str):
|
||||
Model name used for selecting the right model at initialization. Defaults to `speedy_speech`.
|
||||
positional_encoding (bool):
|
||||
enable / disable positional encoding applied to the encoder output. Defaults to True.
|
||||
hidden_channels (int):
|
||||
Base number of hidden channels. Defines all the layers except the ones defined by the specific encoder or decoder
|
||||
parameters. Defaults to 128.
|
||||
encoder_type (str):
|
||||
Type of the encoder used by the model. Look at `TTS.tts.layers.feed_forward.encoder` for more details.
|
||||
Defaults to `residual_conv_bn`.
|
||||
encoder_params (dict):
|
||||
Parameters used to define the encoder network. Look at `TTS.tts.layers.feed_forward.encoder` for more details.
|
||||
Defaults to `{"kernel_size": 4, "dilations": [1, 2, 4, 1, 2, 4, 1, 2, 4, 1, 2, 4, 1], "num_conv_blocks": 2, "num_res_blocks": 13}`
|
||||
decoder_type (str):
|
||||
Type of the decoder used by the model. Look at `TTS.tts.layers.feed_forward.decoder` for more details.
|
||||
Defaults to `residual_conv_bn`.
|
||||
decoder_params (dict):
|
||||
Parameters used to define the decoder network. Look at `TTS.tts.layers.feed_forward.decoder` for more details.
|
||||
Defaults to `{"kernel_size": 4, "dilations": [1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1], "num_conv_blocks": 2, "num_res_blocks": 17}`
|
||||
hidden_channels_encoder (int):
|
||||
Number of base hidden channels used by the encoder network. It defines the input and the output channel sizes,
|
||||
and for some encoder types internal hidden channels sizes too. Defaults to 192.
|
||||
hidden_channels_decoder (int):
|
||||
Number of base hidden channels used by the decoder WaveNet network. Defaults to 192 as in the original work.
|
||||
hidden_channels_duration_predictor (int):
|
||||
Number of layer channels of the duration predictor network. Defaults to 256 as in the original work.
|
||||
data_dep_init_steps (int):
|
||||
Number of steps used for computing normalization parameters at the beginning of the training. GlowTTS uses
|
||||
Activation Normalization that pre-computes normalization stats at the beginning and use the same values
|
||||
for the rest. Defaults to 10.
|
||||
use_speaker_embedding (bool):
|
||||
enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
|
||||
in the multi-speaker mode. Defaults to False.
|
||||
use_external_speaker_embedding_file (bool):
|
||||
enable /disable using external speaker embeddings in place of the learned embeddings. Defaults to False.
|
||||
external_speaker_embedding_file (str):
|
||||
Path to the file including pre-computed speaker embeddings. Defaults to None.
|
||||
noam_schedule (bool):
|
||||
enable / disable the use of Noam LR scheduler. Defaults to False.
|
||||
warmup_steps (int):
|
||||
Number of warm-up steps for the Noam scheduler. Defaults to 4000.
|
||||
lr (float):
|
||||
Initial learning rate. Defaults to `1e-3`.
|
||||
wd (float):
|
||||
Weight decay coefficient. Defaults to `1e-7`.
|
||||
ssim_alpha (float):
|
||||
Weight for the SSIM loss. If set <= 0, disables the SSIM loss. Defaults to 1.0.
|
||||
huber_alpha (float):
|
||||
Weight for the duration predictor's loss. Defaults to 1.0.
|
||||
l1_alpha (float):
|
||||
Weight for the L1 spectrogram loss. If set <= 0, disables the L1 loss. Defaults to 1.0.
|
||||
min_seq_len (int):
|
||||
Minimum input sequence length to be used at training.
|
||||
max_seq_len (int):
|
||||
Maximum input sequence length to be used at training. Larger values result in more VRAM usage.
|
||||
"""
|
||||
|
||||
model: str = "speedy_speech"
|
||||
# model specific params
|
||||
|
@ -50,4 +113,4 @@ class SpeedySpeechConfig(BaseTTSConfig):
|
|||
# overrides
|
||||
min_seq_len: int = 13
|
||||
max_seq_len: int = 200
|
||||
r: int = 1
|
||||
r: int = 1 #DO NOT CHANGE
|
||||
|
|
|
@ -5,6 +5,114 @@ from TTS.tts.configs.tacotron_config import TacotronConfig
|
|||
|
||||
@dataclass
|
||||
class Tacotron2Config(TacotronConfig):
|
||||
"""Defines parameters for Tacotron2 based models."""
|
||||
"""Defines parameters for Tacotron2 based models.
|
||||
|
||||
Example:
|
||||
|
||||
>>> from TTS.tts.configs import Tacotron2Config
|
||||
>>> config = Tacotron2Config()
|
||||
|
||||
Args:
|
||||
model (str):
|
||||
Model name used to select the right model class to initialize. Defaults to `Tacotron2`.
|
||||
use_gst (bool):
|
||||
enable / disable the use of Global Style Token modules. Defaults to False.
|
||||
gst (GSTConfig):
|
||||
Instance of `GSTConfig` class.
|
||||
gst_style_input (str):
|
||||
Path to the wav file used at inference to set the speech style through GST. If `GST` is enabled and
|
||||
this is not defined, the model uses a zero vector as an input. Defaults to None.
|
||||
r (int):
|
||||
Number of output frames that the decoder computes per iteration. Larger values make training and inference
|
||||
faster but reduces the quality of the output frames. This needs to be tuned considering your own needs.
|
||||
Defaults to 1.
|
||||
gradual_training (List[List]):
|
||||
Parameters for the gradual training schedule. It is in the form `[[a, b, c], [d ,e ,f] ..]` where `a` is
|
||||
the step number to start using the rest of the values, `b` is the `r` value and `c` is the batch size.
|
||||
If set to None, no gradual training is used. Defaults to None.
|
||||
memory_size (int):
|
||||
Defines the number of previous frames used by the Prenet. If set to < 0, then it uses only the last frame.
|
||||
Defaults to -1.
|
||||
prenet_type (str):
|
||||
`original` or `bn`. `original` sets the default Prenet and `bn` uses Batch Normalization version of the
|
||||
Prenet. Defaults to `original`.
|
||||
prenet_dropout (bool):
|
||||
enables / disables the use of dropout in the Prenet. Defaults to True.
|
||||
prenet_dropout_at_inference (bool):
|
||||
enable / disable the use of dropout in the Prenet at the inference time. Defaults to False.
|
||||
stopnet (bool):
|
||||
enable /disable the Stopnet that predicts the end of the decoder sequence. Defaults to True.
|
||||
stopnet_pos_weight (float):
|
||||
Weight that is applied to over-weight positive instances in the Stopnet loss. Use larger values with
|
||||
datasets with longer sentences. Defaults to 10.
|
||||
separate_stopnet (bool):
|
||||
Use a distinct Stopnet which is trained separately from the rest of the model. Defaults to True.
|
||||
attention_type (str):
|
||||
attention type. Check ```TTS.tts.layers.attentions.init_attn```. Defaults to 'original'.
|
||||
attention_heads (int):
|
||||
Number of attention heads for GMM attention. Defaults to 5.
|
||||
windowing (bool):
|
||||
It is especially useful at inference to keep attention alignment diagonal. Defaults to False.
|
||||
use_forward_attn (bool):
|
||||
It is only valid if ```attn_type``` is ```original```. Defaults to False.
|
||||
forward_attn_mask (bool):
|
||||
enable/disable extra masking over forward attention. It is useful at inference to prevent
|
||||
possible attention failures. Defaults to False.
|
||||
transition_agent (bool):
|
||||
enable/disable transition agent in forward attention. Defaults to False.
|
||||
location_attn (bool):
|
||||
enable/disable location sensitive attention as in the original Tacotron2 paper.
|
||||
It is only valid if ```attn_type``` is ```original```. Defaults to True.
|
||||
bidirectional_decoder (bool):
|
||||
enable/disable bidirectional decoding. Defaults to False.
|
||||
double_decoder_consistency (bool):
|
||||
enable/disable double decoder consistency. Defaults to False.
|
||||
ddc_r (int):
|
||||
reduction rate used by the coarse decoder when `double_decoder_consistency` is in use. Set this
|
||||
as a multiple of the `r` value. Defaults to 6.
|
||||
use_speaker_embedding (bool):
|
||||
enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
|
||||
in the multi-speaker mode. Defaults to False.
|
||||
use_external_speaker_embedding_file (bool):
|
||||
enable /disable using external speaker embeddings in place of the learned embeddings. Defaults to False.
|
||||
external_speaker_embedding_file (str):
|
||||
Path to the file including pre-computed speaker embeddings. Defaults to None.
|
||||
noam_schedule (bool):
|
||||
enable / disable the use of Noam LR scheduler. Defaults to False.
|
||||
warmup_steps (int):
|
||||
Number of warm-up steps for the Noam scheduler. Defaults to 4000.
|
||||
lr (float):
|
||||
Initial learning rate. Defaults to `1e-4`.
|
||||
wd (float):
|
||||
Weight decay coefficient. Defaults to `1e-6`.
|
||||
grad_clip (float):
|
||||
Gradient clipping threshold. Defaults to `5`.
|
||||
seq_len_norm (bool):
|
||||
enable / disable the sequence length normalization in the loss functions. If set True, loss of a sample
|
||||
is divided by the sequence length. Defaults to False.
|
||||
loss_masking (bool):
|
||||
enable / disable masking the paddings of the samples in loss computation. Defaults to True.
|
||||
decoder_loss_alpha (float):
|
||||
Weight for the decoder loss of the Tacotron model. If set less than or equal to zero, it disables the
|
||||
corresponding loss function. Defaults to 0.25
|
||||
postnet_loss_alpha (float):
|
||||
Weight for the postnet loss of the Tacotron model. If set less than or equal to zero, it disables the
|
||||
corresponding loss function. Defaults to 0.25
|
||||
postnet_diff_spec_alpha (float):
|
||||
Weight for the postnet differential loss of the Tacotron model. If set less than or equal to zero, it disables the
|
||||
corresponding loss function. Defaults to 0.25
|
||||
decoder_diff_spec_alpha (float):
|
||||
Weight for the decoder differential loss of the Tacotron model. If set less than or equal to zero, it disables the
|
||||
corresponding loss function. Defaults to 0.25
|
||||
decoder_ssim_alpha (float):
|
||||
Weight for the decoder SSIM loss of the Tacotron model. If set less than or equal to zero, it disables the
|
||||
corresponding loss function. Defaults to 0.25
|
||||
postnet_ssim_alpha (float):
|
||||
Weight for the postnet SSIM loss of the Tacotron model. If set less than or equal to zero, it disables the
|
||||
corresponding loss function. Defaults to 0.25
|
||||
ga_alpha (float):
|
||||
Weight for the guided attention loss. If set less than or equal to zero, it disables the corresponding loss
|
||||
function. Defaults to 5.
|
||||
"""
|
||||
|
||||
model: str = "tacotron2"
|
||||
|
|
|
@ -1,12 +1,120 @@
|
|||
from dataclasses import dataclass
|
||||
from typing import List
|
||||
|
||||
from .shared_configs import BaseTTSConfig, GSTConfig
|
||||
from TTS.tts.configs.shared_configs import BaseTTSConfig, GSTConfig
|
||||
|
||||
|
||||
@dataclass
|
||||
class TacotronConfig(BaseTTSConfig):
|
||||
"""Defines parameters for Tacotron based models."""
|
||||
"""Defines parameters for Tacotron based models.
|
||||
|
||||
Example:
|
||||
|
||||
>>> from TTS.tts.configs import TacotronConfig
|
||||
>>> config = TacotronConfig()
|
||||
|
||||
Args:
|
||||
model (str):
|
||||
Model name used to select the right model class to initialize. Defaults to `Tacotron`.
|
||||
use_gst (bool):
|
||||
enable / disable the use of Global Style Token modules. Defaults to False.
|
||||
gst (GSTConfig):
|
||||
Instance of `GSTConfig` class.
|
||||
gst_style_input (str):
|
||||
Path to the wav file used at inference to set the speech style through GST. If `GST` is enabled and
|
||||
this is not defined, the model uses a zero vector as an input. Defaults to None.
|
||||
r (int):
|
||||
Number of output frames that the decoder computes per iteration. Larger values make training and inference
|
||||
faster but reduces the quality of the output frames. This needs to be tuned considering your own needs.
|
||||
Defaults to 1.
|
||||
gradual_training (List[List]):
|
||||
Parameters for the gradual training schedule. It is in the form `[[a, b, c], [d ,e ,f] ..]` where `a` is
|
||||
the step number to start using the rest of the values, `b` is the `r` value and `c` is the batch size.
|
||||
If set to None, no gradual training is used. Defaults to None.
|
||||
memory_size (int):
|
||||
Defines the number of previous frames used by the Prenet. If set to < 0, then it uses only the last frame.
|
||||
Defaults to -1.
|
||||
prenet_type (str):
|
||||
`original` or `bn`. `original` sets the default Prenet and `bn` uses Batch Normalization version of the
|
||||
Prenet. Defaults to `original`.
|
||||
prenet_dropout (bool):
|
||||
enables / disables the use of dropout in the Prenet. Defaults to True.
|
||||
prenet_dropout_at_inference (bool):
|
||||
enable / disable the use of dropout in the Prenet at the inference time. Defaults to False.
|
||||
stopnet (bool):
|
||||
enable /disable the Stopnet that predicts the end of the decoder sequence. Defaults to True.
|
||||
stopnet_pos_weight (float):
|
||||
Weight that is applied to over-weight positive instances in the Stopnet loss. Use larger values with
|
||||
datasets with longer sentences. Defaults to 10.
|
||||
separate_stopnet (bool):
|
||||
Use a distinct Stopnet which is trained separately from the rest of the model. Defaults to True.
|
||||
attention_type (str):
|
||||
attention type. Check ```TTS.tts.layers.attentions.init_attn```. Defaults to 'original'.
|
||||
attention_heads (int):
|
||||
Number of attention heads for GMM attention. Defaults to 5.
|
||||
windowing (bool):
|
||||
It is especially useful at inference to keep attention alignment diagonal. Defaults to False.
|
||||
use_forward_attn (bool):
|
||||
It is only valid if ```attn_type``` is ```original```. Defaults to False.
|
||||
forward_attn_mask (bool):
|
||||
enable/disable extra masking over forward attention. It is useful at inference to prevent
|
||||
possible attention failures. Defaults to False.
|
||||
transition_agent (bool):
|
||||
enable/disable transition agent in forward attention. Defaults to False.
|
||||
location_attn (bool):
|
||||
enable/disable location sensitive attention as in the original Tacotron2 paper.
|
||||
It is only valid if ```attn_type``` is ```original```. Defaults to True.
|
||||
bidirectional_decoder (bool):
|
||||
enable/disable bidirectional decoding. Defaults to False.
|
||||
double_decoder_consistency (bool):
|
||||
enable/disable double decoder consistency. Defaults to False.
|
||||
ddc_r (int):
|
||||
reduction rate used by the coarse decoder when `double_decoder_consistency` is in use. Set this
|
||||
as a multiple of the `r` value. Defaults to 6.
|
||||
use_speaker_embedding (bool):
|
||||
enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
|
||||
in the multi-speaker mode. Defaults to False.
|
||||
use_external_speaker_embedding_file (bool):
|
||||
enable /disable using external speaker embeddings in place of the learned embeddings. Defaults to False.
|
||||
external_speaker_embedding_file (str):
|
||||
Path to the file including pre-computed speaker embeddings. Defaults to None.
|
||||
noam_schedule (bool):
|
||||
enable / disable the use of Noam LR scheduler. Defaults to False.
|
||||
warmup_steps (int):
|
||||
Number of warm-up steps for the Noam scheduler. Defaults to 4000.
|
||||
lr (float):
|
||||
Initial learning rate. Defaults to `1e-4`.
|
||||
wd (float):
|
||||
Weight decay coefficient. Defaults to `1e-6`.
|
||||
grad_clip (float):
|
||||
Gradient clipping threshold. Defaults to `5`.
|
||||
seq_len_norm (bool):
|
||||
enable / disable the sequence length normalization in the loss functions. If set True, loss of a sample
|
||||
is divided by the sequence length. Defaults to False.
|
||||
loss_masking (bool):
|
||||
enable / disable masking the paddings of the samples in loss computation. Defaults to True.
|
||||
decoder_loss_alpha (float):
|
||||
Weight for the decoder loss of the Tacotron model. If set less than or equal to zero, it disables the
|
||||
corresponding loss function. Defaults to 0.25
|
||||
postnet_loss_alpha (float):
|
||||
Weight for the postnet loss of the Tacotron model. If set less than or equal to zero, it disables the
|
||||
corresponding loss function. Defaults to 0.25
|
||||
postnet_diff_spec_alpha (float):
|
||||
Weight for the postnet differential loss of the Tacotron model. If set less than or equal to zero, it disables the
|
||||
corresponding loss function. Defaults to 0.25
|
||||
decoder_diff_spec_alpha (float):
|
||||
Weight for the decoder differential loss of the Tacotron model. If set less than or equal to zero, it disables the
|
||||
corresponding loss function. Defaults to 0.25
|
||||
decoder_ssim_alpha (float):
|
||||
Weight for the decoder SSIM loss of the Tacotron model. If set less than or equal to zero, it disables the
|
||||
corresponding loss function. Defaults to 0.25
|
||||
postnet_ssim_alpha (float):
|
||||
Weight for the postnet SSIM loss of the Tacotron model. If set less than or equal to zero, it disables the
|
||||
corresponding loss function. Defaults to 0.25
|
||||
ga_alpha (float):
|
||||
Weight for the guided attention loss. If set less than or equal to zero, it disables the corresponding loss
|
||||
function. Defaults to 5.
|
||||
"""
|
||||
|
||||
model: str = "tacotron"
|
||||
use_gst: bool = False
|
||||
|
|
|
@ -52,19 +52,19 @@ def load_meta_data(datasets, eval_split=True):
|
|||
print(f" | > Found {len(meta_data_train)} files in {Path(root_path).resolve()}")
|
||||
# load evaluation split if set
|
||||
if eval_split:
|
||||
if meta_file_val is None:
|
||||
meta_data_eval, meta_data_train = split_dataset(meta_data_train)
|
||||
else:
|
||||
if meta_file_val:
|
||||
meta_data_eval = preprocessor(root_path, meta_file_val)
|
||||
else:
|
||||
meta_data_eval, meta_data_train = split_dataset(meta_data_train)
|
||||
meta_data_eval_all += meta_data_eval
|
||||
meta_data_train_all += meta_data_train
|
||||
# load attention masks for duration predictor training
|
||||
if dataset.meta_file_attn_mask is not None:
|
||||
if dataset.meta_file_attn_mask:
|
||||
meta_data = dict(load_attention_mask_meta_data(dataset["meta_file_attn_mask"]))
|
||||
for idx, ins in enumerate(meta_data_train_all):
|
||||
attn_file = meta_data[ins[1]].strip()
|
||||
meta_data_train_all[idx].append(attn_file)
|
||||
if meta_data_eval_all is not None:
|
||||
if meta_data_eval_all:
|
||||
for idx, ins in enumerate(meta_data_eval_all):
|
||||
attn_file = meta_data[ins[1]].strip()
|
||||
meta_data_eval_all[idx].append(attn_file)
|
||||
|
|
|
@ -5,7 +5,62 @@ from .shared_configs import BaseGANVocoderConfig
|
|||
|
||||
@dataclass
|
||||
class FullbandMelganConfig(BaseGANVocoderConfig):
|
||||
"""Defines parameters for FullbandMelGAN vocoder."""
|
||||
"""Defines parameters for FullBand MelGAN vocoder.
|
||||
|
||||
Example:
|
||||
|
||||
>>> from TTS.vocoder.configs import FullbandMelganConfig
|
||||
>>> config = FullbandMelganConfig()
|
||||
|
||||
Args:
|
||||
model (str):
|
||||
Model name used for selecting the right model at initialization. Defaults to `melgan`.
|
||||
discriminator_model (str): One of the discriminators from `TTS.vocoder.models.*_discriminator`. Defaults to
|
||||
`melgan_multiscale_discriminator`.
|
||||
discriminator_model_params (dict): The discriminator model parameters. Defaults to
|
||||
`{"base_channels": 16, "max_channels": 1024, "downsample_factors": [4, 4, 4, 4]}`
|
||||
generator_model (str): One of the generators from `TTS.vocoder.models.*`. Every other non-GAN vocoder model is
|
||||
considered as a generator too. Defaults to `melgan_generator`.
|
||||
batch_size (int):
|
||||
Batch size used at training. Larger values use more memory. Defaults to 16.
|
||||
seq_len (int):
|
||||
Audio segment length used at training. Larger values use more memory. Defaults to 8192.
|
||||
pad_short (int):
|
||||
Additional padding applied to the audio samples shorter than `seq_len`. Defaults to 0.
|
||||
use_noise_augment (bool):
|
||||
enable / disable random noise added to the input waveform. The noise is added after computing the
|
||||
features. Defaults to True.
|
||||
use_cache (bool):
|
||||
enable / disable in memory caching of the computed features. It can cause OOM error if the system RAM is
|
||||
not large enough. Defaults to True.
|
||||
use_stft_loss (bool):
|
||||
enable / disable use of STFT loss originally used by ParallelWaveGAN model. Defaults to True.
|
||||
use_subband_stft (bool):
|
||||
enable / disable use of subband loss computation originally used by MultiBandMelgan model. Defaults to True.
|
||||
use_mse_gan_loss (bool):
|
||||
enable / disable using Mean Squared Error GAN loss. Defaults to True.
|
||||
use_hinge_gan_loss (bool):
|
||||
enable / disable using Hinge GAN loss. You should choose either Hinge or MSE loss for training GAN models.
|
||||
Defaults to False.
|
||||
use_feat_match_loss (bool):
|
||||
enable / disable using Feature Matching loss originally used by MelGAN model. Defaults to True.
|
||||
use_l1_spec_loss (bool):
|
||||
enable / disable using L1 spectrogram loss originally used by HifiGAN model. Defaults to False.
|
||||
stft_loss_params (dict): STFT loss parameters. Default to
|
||||
`{"n_ffts": [1024, 2048, 512], "hop_lengths": [120, 240, 50], "win_lengths": [600, 1200, 240]}`
|
||||
stft_loss_weight (float): STFT loss weight that multiplies the computed loss before summing up the total
|
||||
model loss. Defaults to 0.5.
|
||||
subband_stft_loss_weight (float):
|
||||
Subband STFT loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0.
|
||||
mse_G_loss_weight (float):
|
||||
MSE generator loss weight that multiplies the computed loss before summing up the total loss. Defaults to 2.5.
|
||||
hinge_G_loss_weight (float):
|
||||
Hinge generator loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0.
|
||||
feat_match_loss_weight (float):
|
||||
Feature matching loss weight that multiplies the computed loss before summing up the total loss. Defaults to 108.
|
||||
l1_spec_loss_weight (float):
|
||||
L1 spectrogram loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0.
|
||||
"""
|
||||
|
||||
model: str = "melgan"
|
||||
|
||||
|
@ -48,4 +103,4 @@ class FullbandMelganConfig(BaseGANVocoderConfig):
|
|||
mse_G_loss_weight: float = 2.5
|
||||
hinge_G_loss_weight: float = 0
|
||||
feat_match_loss_weight: float = 108
|
||||
l1_spec_loss_weight: float = 0
|
||||
l1_spec_loss_weight: float = 0.0
|
||||
|
|
|
@ -1,11 +1,94 @@
|
|||
from dataclasses import dataclass, field
|
||||
|
||||
from .shared_configs import BaseGANVocoderConfig
|
||||
from TTS.vocoder.configs.shared_configs import BaseGANVocoderConfig
|
||||
|
||||
|
||||
@dataclass
|
||||
class HifiganConfig(BaseGANVocoderConfig):
|
||||
"""Defines parameters for HifiGAN vocoder."""
|
||||
"""Defines parameters for HifiGAN vocoder.
|
||||
|
||||
Example:
|
||||
|
||||
>>> from TTS.vocoder.configs import HifiganConfig
|
||||
>>> config = HifiganConfig()
|
||||
|
||||
Args:
|
||||
model (str):
|
||||
Model name used for selecting the right model at initialization. Defaults to `hifigan`.
|
||||
discriminator_model (str): One of the discriminators from `TTS.vocoder.models.*_discriminator`. Defaults to
|
||||
`hifigan_discriminator`.
|
||||
generator_model (str): One of the generators from `TTS.vocoder.models.*`. Every other non-GAN vocoder model is
|
||||
considered as a generator too. Defaults to `hifigan_generator`.
|
||||
generator_model_params (dict): Parameters of the generator model. Defaults to
|
||||
`
|
||||
{
|
||||
"use_mel": True,
|
||||
"sample_rate": 22050,
|
||||
"n_fft": 1024,
|
||||
"hop_length": 256,
|
||||
"win_length": 1024,
|
||||
"n_mels": 80,
|
||||
"mel_fmin": 0.0,
|
||||
"mel_fmax": None,
|
||||
}
|
||||
`
|
||||
batch_size (int):
|
||||
Batch size used at training. Larger values use more memory. Defaults to 16.
|
||||
seq_len (int):
|
||||
Audio segment length used at training. Larger values use more memory. Defaults to 8192.
|
||||
pad_short (int):
|
||||
Additional padding applied to the audio samples shorter than `seq_len`. Defaults to 0.
|
||||
use_noise_augment (bool):
|
||||
enable / disable random noise added to the input waveform. The noise is added after computing the
|
||||
features. Defaults to True.
|
||||
use_cache (bool):
|
||||
enable / disable in memory caching of the computed features. It can cause OOM error if the system RAM is
|
||||
not large enough. Defaults to True.
|
||||
use_stft_loss (bool):
|
||||
enable / disable use of STFT loss originally used by ParallelWaveGAN model. Defaults to True.
|
||||
use_subband_stft (bool):
|
||||
enable / disable use of subband loss computation originally used by MultiBandMelgan model. Defaults to True.
|
||||
use_mse_gan_loss (bool):
|
||||
enable / disable using Mean Squared Error GAN loss. Defaults to True.
|
||||
use_hinge_gan_loss (bool):
|
||||
enable / disable using Hinge GAN loss. You should choose either Hinge or MSE loss for training GAN models.
|
||||
Defaults to False.
|
||||
use_feat_match_loss (bool):
|
||||
enable / disable using Feature Matching loss originally used by MelGAN model. Defaults to True.
|
||||
use_l1_spec_loss (bool):
|
||||
enable / disable using L1 spectrogram loss originally used by HifiGAN model. Defaults to False.
|
||||
stft_loss_params (dict):
|
||||
STFT loss parameters. Default to
|
||||
`{
|
||||
"n_ffts": [1024, 2048, 512],
|
||||
"hop_lengths": [120, 240, 50],
|
||||
"win_lengths": [600, 1200, 240]
|
||||
}`
|
||||
l1_spec_loss_params (dict):
|
||||
L1 spectrogram loss parameters. Default to
|
||||
`{
|
||||
"use_mel": True,
|
||||
"sample_rate": 22050,
|
||||
"n_fft": 1024,
|
||||
"hop_length": 256,
|
||||
"win_length": 1024,
|
||||
"n_mels": 80,
|
||||
"mel_fmin": 0.0,
|
||||
"mel_fmax": None,
|
||||
}`
|
||||
stft_loss_weight (float): STFT loss weight that multiplies the computed loss before summing up the total
|
||||
model loss. Defaults to 0.5.
|
||||
subband_stft_loss_weight (float):
|
||||
Subband STFT loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0.
|
||||
mse_G_loss_weight (float):
|
||||
MSE generator loss weight that multiplies the computed loss before summing up the total loss. Defaults to 2.5.
|
||||
hinge_G_loss_weight (float):
|
||||
Hinge generator loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0.
|
||||
feat_match_loss_weight (float):
|
||||
Feature matching loss weight that multiplies the computed loss before summing up the total loss. Defaults to 108.
|
||||
l1_spec_loss_weight (float):
|
||||
L1 spectrogram loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0.
|
||||
"""
|
||||
|
||||
model: str = "hifigan"
|
||||
# model specific params
|
||||
|
|
|
@ -1,11 +1,66 @@
|
|||
from dataclasses import dataclass, field
|
||||
|
||||
from .shared_configs import BaseGANVocoderConfig
|
||||
from TTS.vocoder.configs.shared_configs import BaseGANVocoderConfig
|
||||
|
||||
|
||||
@dataclass
|
||||
class MelganConfig(BaseGANVocoderConfig):
|
||||
"""Defines parameters for MelGAN vocoder."""
|
||||
"""Defines parameters for MelGAN vocoder.
|
||||
|
||||
Example:
|
||||
|
||||
>>> from TTS.vocoder.configs import MelganConfig
|
||||
>>> config = MelganConfig()
|
||||
|
||||
Args:
|
||||
model (str):
|
||||
Model name used for selecting the right model at initialization. Defaults to `melgan`.
|
||||
discriminator_model (str): One of the discriminators from `TTS.vocoder.models.*_discriminator`. Defaults to
|
||||
`melgan_multiscale_discriminator`.
|
||||
discriminator_model_params (dict): The discriminator model parameters. Defaults to
|
||||
`{"base_channels": 16, "max_channels": 1024, "downsample_factors": [4, 4, 4, 4]}`
|
||||
generator_model (str): One of the generators from `TTS.vocoder.models.*`. Every other non-GAN vocoder model is
|
||||
considered as a generator too. Defaults to `melgan_generator`.
|
||||
batch_size (int):
|
||||
Batch size used at training. Larger values use more memory. Defaults to 16.
|
||||
seq_len (int):
|
||||
Audio segment length used at training. Larger values use more memory. Defaults to 8192.
|
||||
pad_short (int):
|
||||
Additional padding applied to the audio samples shorter than `seq_len`. Defaults to 0.
|
||||
use_noise_augment (bool):
|
||||
enable / disable random noise added to the input waveform. The noise is added after computing the
|
||||
features. Defaults to True.
|
||||
use_cache (bool):
|
||||
enable / disable in memory caching of the computed features. It can cause OOM error if the system RAM is
|
||||
not large enough. Defaults to True.
|
||||
use_stft_loss (bool):
|
||||
enable / disable use of STFT loss originally used by ParallelWaveGAN model. Defaults to True.
|
||||
use_subband_stft (bool):
|
||||
enable / disable use of subband loss computation originally used by MultiBandMelgan model. Defaults to True.
|
||||
use_mse_gan_loss (bool):
|
||||
enable / disable using Mean Squared Error GAN loss. Defaults to True.
|
||||
use_hinge_gan_loss (bool):
|
||||
enable / disable using Hinge GAN loss. You should choose either Hinge or MSE loss for training GAN models.
|
||||
Defaults to False.
|
||||
use_feat_match_loss (bool):
|
||||
enable / disable using Feature Matching loss originally used by MelGAN model. Defaults to True.
|
||||
use_l1_spec_loss (bool):
|
||||
enable / disable using L1 spectrogram loss originally used by HifiGAN model. Defaults to False.
|
||||
stft_loss_params (dict): STFT loss parameters. Default to
|
||||
`{"n_ffts": [1024, 2048, 512], "hop_lengths": [120, 240, 50], "win_lengths": [600, 1200, 240]}`
|
||||
stft_loss_weight (float): STFT loss weight that multiplies the computed loss before summing up the total
|
||||
model loss. Defaults to 0.5.
|
||||
subband_stft_loss_weight (float):
|
||||
Subband STFT loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0.
|
||||
mse_G_loss_weight (float):
|
||||
MSE generator loss weight that multiplies the computed loss before summing up the total loss. Defaults to 2.5.
|
||||
hinge_G_loss_weight (float):
|
||||
Hinge generator loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0.
|
||||
feat_match_loss_weight (float):
|
||||
Feature matching loss weight that multiplies the computed loss before summing up the total loss. Defaults to 108.
|
||||
l1_spec_loss_weight (float):
|
||||
L1 spectrogram loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0.
|
||||
"""
|
||||
|
||||
model: str = "melgan"
|
||||
|
||||
|
|
|
@ -1,11 +1,95 @@
|
|||
from dataclasses import dataclass, field
|
||||
|
||||
from .shared_configs import BaseGANVocoderConfig
|
||||
from TTS.vocoder.configs.shared_configs import BaseGANVocoderConfig
|
||||
|
||||
|
||||
@dataclass
|
||||
class MultibandMelganConfig(BaseGANVocoderConfig):
|
||||
"""Defines parameters for MultiBandMelGAN vocoder."""
|
||||
"""Defines parameters for MultiBandMelGAN vocoder.
|
||||
|
||||
Example:
|
||||
|
||||
>>> from TTS.vocoder.configs import MultibandMelganConfig
|
||||
>>> config = MultibandMelganConfig()
|
||||
|
||||
Args:
|
||||
model (str):
|
||||
Model name used for selecting the right model at initialization. Defaults to `multiband_melgan`.
|
||||
discriminator_model (str): One of the discriminators from `TTS.vocoder.models.*_discriminator`. Defaults to
|
||||
`melgan_multiscale_discriminator`.
|
||||
discriminator_model_params (dict): The discriminator model parameters. Defaults to
|
||||
`{
|
||||
"base_channels": 16,
|
||||
"max_channels": 512,
|
||||
"downsample_factors": [4, 4, 4]
|
||||
}`
|
||||
generator_model (str): One of the generators from `TTS.vocoder.models.*`. Every other non-GAN vocoder model is
|
||||
considered as a generator too. Defaults to `melgan_generator`.
|
||||
generator_model_params (dict):
|
||||
The generator model parameters. Defaults to `{"upsample_factors": [8, 4, 2], "num_res_blocks": 4}`.
|
||||
use_pqmf (bool):
|
||||
enable / disable PQMF modulation for multi-band training. Defaults to True.
|
||||
lr_gen (float):
|
||||
Initial learning rate for the generator model. Defaults to 0.0001.
|
||||
lr_disc (float):
|
||||
Initial learning rate for the discriminator model. Defaults to 0.0001.
|
||||
optimizer (torch.optim.Optimizer):
|
||||
Optimizer used for the training. Defaults to `AdamW`.
|
||||
optimizer_params (dict):
|
||||
Optimizer kwargs. Defaults to `{"betas": [0.8, 0.99], "weight_decay": 0.0}`
|
||||
lr_scheduler_gen (torch.optim.Scheduler):
|
||||
Learning rate scheduler for the generator. Defaults to `MultiStepLR`.
|
||||
lr_scheduler_gen_params (dict):
|
||||
Parameters for the generator learning rate scheduler. Defaults to
|
||||
`{"gamma": 0.5, "milestones": [100000, 200000, 300000, 400000, 500000, 600000]}`.
|
||||
lr_scheduler_disc (torch.optim.Scheduler):
|
||||
Learning rate scheduler for the discriminator. Defaults to `MultiStepLR`.
|
||||
lr_scheduler_disc_params (dict):
|
||||
Parameters for the discriminator learning rate scheduler. Defaults to
|
||||
`{"gamma": 0.5, "milestones": [100000, 200000, 300000, 400000, 500000, 600000]}`.
|
||||
batch_size (int):
|
||||
Batch size used at training. Larger values use more memory. Defaults to 16.
|
||||
seq_len (int):
|
||||
Audio segment length used at training. Larger values use more memory. Defaults to 8192.
|
||||
pad_short (int):
|
||||
Additional padding applied to the audio samples shorter than `seq_len`. Defaults to 0.
|
||||
use_noise_augment (bool):
|
||||
enable / disable random noise added to the input waveform. The noise is added after computing the
|
||||
features. Defaults to True.
|
||||
use_cache (bool):
|
||||
enable / disable in memory caching of the computed features. It can cause OOM error if the system RAM is
|
||||
not large enough. Defaults to True.
|
||||
steps_to_start_discriminator (int):
|
||||
Number of steps required to start training the discriminator. Defaults to 0.
|
||||
use_stft_loss (bool):
|
||||
enable / disable use of STFT loss originally used by ParallelWaveGAN model. Defaults to True.
|
||||
use_subband_stft (bool):
|
||||
enable / disable use of subband loss computation originally used by MultiBandMelgan model. Defaults to True.
|
||||
use_mse_gan_loss (bool):
|
||||
enable / disable using Mean Squared Error GAN loss. Defaults to True.
|
||||
use_hinge_gan_loss (bool):
|
||||
enable / disable using Hinge GAN loss. You should choose either Hinge or MSE loss for training GAN models.
|
||||
Defaults to False.
|
||||
use_feat_match_loss (bool):
|
||||
enable / disable using Feature Matching loss originally used by MelGAN model. Defaults to True.
|
||||
use_l1_spec_loss (bool):
|
||||
enable / disable using L1 spectrogram loss originally used by HifiGAN model. Defaults to False.
|
||||
stft_loss_params (dict): STFT loss parameters. Default to
|
||||
`{"n_ffts": [1024, 2048, 512], "hop_lengths": [120, 240, 50], "win_lengths": [600, 1200, 240]}`
|
||||
stft_loss_weight (float): STFT loss weight that multiplies the computed loss before summing up the total
|
||||
model loss. Defaults to 0.5.
|
||||
subband_stft_loss_weight (float):
|
||||
Subband STFT loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0.
|
||||
mse_G_loss_weight (float):
|
||||
MSE generator loss weight that multiplies the computed loss before summing up the total loss. Defaults to 2.5.
|
||||
hinge_G_loss_weight (float):
|
||||
Hinge generator loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0.
|
||||
feat_match_loss_weight (float):
|
||||
Feature matching loss weight that multiplies the computed loss before summing up the total loss. Defaults to 108.
|
||||
l1_spec_loss_weight (float):
|
||||
L1 spectrogram loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0.
|
||||
"""
|
||||
|
||||
|
||||
model: str = "multiband_melgan"
|
||||
|
||||
|
@ -58,8 +142,4 @@ class MultibandMelganConfig(BaseGANVocoderConfig):
|
|||
mse_G_loss_weight: float = 2.5
|
||||
hinge_G_loss_weight: float = 0
|
||||
feat_match_loss_weight: float = 108
|
||||
l1_spec_loss_weight: float = 0
|
||||
|
||||
# optimizer parameters
|
||||
lr: float = 1e-4
|
||||
wd: float = 1e-6
|
||||
l1_spec_loss_weight: float = 0
|
|
@ -5,7 +5,77 @@ from .shared_configs import BaseGANVocoderConfig
|
|||
|
||||
@dataclass
|
||||
class ParallelWaveganConfig(BaseGANVocoderConfig):
|
||||
"""Defines parameters for ParallelWavegan vocoder."""
|
||||
"""Defines parameters for ParallelWavegan vocoder.
|
||||
|
||||
Args:
|
||||
model (str):
|
||||
Model name used for selecting the right configuration at initialization. Defaults to `parallel_wavegan`.
|
||||
discriminator_model (str): One of the discriminators from `TTS.vocoder.models.*_discriminator`. Defaults to
|
||||
`parallel_wavegan_discriminator`.
|
||||
discriminator_model_params (dict): The discriminator model kwargs. Defaults to
|
||||
`{"num_layers": 10}`
|
||||
generator_model (str): One of the generators from `TTS.vocoder.models.*`. Every other non-GAN vocoder model is
|
||||
considered as a generator too. Defaults to `parallel_wavegan_generator`.
|
||||
generator_model_params (dict):
|
||||
The generator model kwargs. Defaults to `{"upsample_factors": [4, 4, 4, 4], "stacks": 3, "num_res_blocks": 30}`.
|
||||
batch_size (int):
|
||||
Batch size used at training. Larger values use more memory. Defaults to 16.
|
||||
seq_len (int):
|
||||
Audio segment length used at training. Larger values use more memory. Defaults to 8192.
|
||||
pad_short (int):
|
||||
Additional padding applied to the audio samples shorter than `seq_len`. Defaults to 0.
|
||||
use_noise_augment (bool):
|
||||
enable / disable random noise added to the input waveform. The noise is added after computing the
|
||||
features. Defaults to True.
|
||||
use_cache (bool):
|
||||
enable / disable in memory caching of the computed features. It can cause OOM error if the system RAM is
|
||||
not large enough. Defaults to True.
|
||||
steps_to_start_discriminator (int):
|
||||
Number of steps required to start training the discriminator. Defaults to 0.
|
||||
use_stft_loss (bool):
|
||||
enable / disable use of STFT loss originally used by ParallelWaveGAN model. Defaults to True.
|
||||
use_subband_stft (bool):
|
||||
enable / disable use of subband loss computation originally used by MultiBandMelgan model. Defaults to True.
|
||||
use_mse_gan_loss (bool):
|
||||
enable / disable using Mean Squared Error GAN loss. Defaults to True.
|
||||
use_hinge_gan_loss (bool):
|
||||
enable / disable using Hinge GAN loss. You should choose either Hinge or MSE loss for training GAN models.
|
||||
Defaults to False.
|
||||
use_feat_match_loss (bool):
|
||||
enable / disable using Feature Matching loss originally used by MelGAN model. Defaults to True.
|
||||
use_l1_spec_loss (bool):
|
||||
enable / disable using L1 spectrogram loss originally used by HifiGAN model. Defaults to False.
|
||||
stft_loss_params (dict): STFT loss parameters. Default to
|
||||
`{"n_ffts": [1024, 2048, 512], "hop_lengths": [120, 240, 50], "win_lengths": [600, 1200, 240]}`
|
||||
stft_loss_weight (float): STFT loss weight that multiplies the computed loss before summing up the total
|
||||
model loss. Defaults to 0.5.
|
||||
subband_stft_loss_weight (float):
|
||||
Subband STFT loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0.
|
||||
mse_G_loss_weight (float):
|
||||
MSE generator loss weight that multiplies the computed loss before summing up the total loss. Defaults to 2.5.
|
||||
hinge_G_loss_weight (float):
|
||||
Hinge generator loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0.
|
||||
feat_match_loss_weight (float):
|
||||
Feature matching loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0.
|
||||
l1_spec_loss_weight (float):
|
||||
L1 spectrogram loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0.
|
||||
lr_gen (float):
|
||||
Generator model initial learning rate. Defaults to 0.0002.
|
||||
lr_disc (float):
|
||||
Discriminator model initial learning rate. Defaults to 0.0002.
|
||||
optimizer (torch.optim.Optimizer):
|
||||
Optimizer used for the training. Defaults to `AdamW`.
|
||||
optimizer_params (dict):
|
||||
Optimizer kwargs. Defaults to `{"betas": [0.8, 0.99], "weight_decay": 0.0}`
|
||||
lr_scheduler_gen (torch.optim.Scheduler):
|
||||
Learning rate scheduler for the generator. Defaults to `ExponentialLR`.
|
||||
lr_scheduler_gen_params (dict):
|
||||
Parameters for the generator learning rate scheduler. Defaults to `{"gamma": 0.999, "last_epoch": -1}`.
|
||||
lr_scheduler_disc (torch.optim.Scheduler):
|
||||
Learning rate scheduler for the discriminator. Defaults to `ExponentialLR`.
|
||||
lr_scheduler_disc_params (dict):
|
||||
Parameters for the discriminator learning rate scheduler. Defaults to `{"gamma": 0.999, "last_epoch": -1}`.
|
||||
"""
|
||||
|
||||
model: str = "parallel_wavegan"
|
||||
|
||||
|
|
|
@ -7,7 +7,34 @@ from TTS.config import BaseAudioConfig, BaseTrainingConfig
|
|||
|
||||
@dataclass
|
||||
class BaseVocoderConfig(BaseTrainingConfig):
|
||||
"""Shared parameters among all the vocoder models."""
|
||||
"""Shared parameters among all the vocoder models.
|
||||
Args:
|
||||
audio (BaseAudioConfig):
|
||||
Audio processor config instance. Defaults to `BaseAudioConfig()`.
|
||||
use_noise_augment (bool):
|
||||
Augment the input audio with random noise. Defaults to False.
|
||||
eval_split_size (int):
|
||||
Number of instances used for evaluation. Defaults to 10.
|
||||
data_path (str):
|
||||
Root path of the training data. All the audio files found recursively from this root path are used for
|
||||
training. Defaults to MISSING.
|
||||
feature_path (str):
|
||||
Root path to the precomputed feature files. Defaults to None.
|
||||
seq_len (int):
|
||||
Length of the waveform segments used for training. Defaults to MISSING.
|
||||
pad_short (int):
|
||||
Extra padding for the waveforms shorter than `seq_len`. Defaults to 0.
|
||||
conv_pad (int):
|
||||
Extra padding for the feature frames against convolution of the edge frames.
|
||||
Defaults to 0.
|
||||
use_cache (bool):
|
||||
enable / disable in memory caching of the computed features. If the RAM is not enough, it may cause OOM.
|
||||
Defaults to False.
|
||||
epochs (int):
|
||||
Number of training epochs. Defaults to 10000.
|
||||
wd (float):
|
||||
Weight decay.
|
||||
"""
|
||||
|
||||
audio: BaseAudioConfig = field(default_factory=BaseAudioConfig)
|
||||
# dataloading
|
||||
|
@ -19,7 +46,6 @@ class BaseVocoderConfig(BaseTrainingConfig):
|
|||
seq_len: int = MISSING # signal length used in training.
|
||||
pad_short: int = 0 # additional padding for short wavs
|
||||
conv_pad: int = 0 # additional padding against convolutions applied to spectrograms
|
||||
use_noise_augment: bool = False # add noise to the audio signal for augmentation
|
||||
use_cache: bool = False # use in memory cache to keep the computed features. This might cause OOM.
|
||||
# OPTIMIZER
|
||||
epochs: int = 10000 # total number of epochs to train.
|
||||
|
@ -28,7 +54,78 @@ class BaseVocoderConfig(BaseTrainingConfig):
|
|||
|
||||
@dataclass
|
||||
class BaseGANVocoderConfig(BaseVocoderConfig):
|
||||
"""Common config interface for all the GAN based vocoder models."""
|
||||
"""Base config class used among all the GAN based vocoders.
|
||||
Args:
|
||||
use_stft_loss (bool):
|
||||
enable / disable the use of STFT loss. Defaults to True.
|
||||
use_subband_stft_loss (bool):
|
||||
enable / disable the use of Subband STFT loss. Defaults to True.
|
||||
use_mse_gan_loss (bool):
|
||||
enable / disable the use of Mean Squared Error based GAN loss. Defaults to True.
|
||||
use_hinge_gan_loss (bool):
|
||||
enable / disable the use of Hinge GAN loss. Defaults to True.
|
||||
use_feat_match_loss (bool):
|
||||
enable / disable feature matching loss. Defaults to True.
|
||||
use_l1_spec_loss (bool):
|
||||
enable / disable L1 spectrogram loss. Defaults to True.
|
||||
stft_loss_weight (float):
|
||||
Loss weight that multiplies the computed loss value. Defaults to 0.
|
||||
subband_stft_loss_weight (float):
|
||||
Loss weight that multiplies the computed loss value. Defaults to 0.
|
||||
mse_G_loss_weight (float):
|
||||
Loss weight that multiplies the computed loss value. Defaults to 1.
|
||||
hinge_G_loss_weight (float):
|
||||
Loss weight that multiplies the computed loss value. Defaults to 0.
|
||||
feat_match_loss_weight (float):
|
||||
Loss weight that multiplies the computed loss value. Defaults to 100.
|
||||
l1_spec_loss_weight (float):
|
||||
Loss weight that multiplies the computed loss value. Defaults to 45.
|
||||
stft_loss_params (dict):
|
||||
Parameters for the STFT loss. Defaults to `{"n_ffts": [1024, 2048, 512], "hop_lengths": [120, 240, 50], "win_lengths": [600, 1200, 240]}`.
|
||||
l1_spec_loss_params (dict):
|
||||
Parameters for the L1 spectrogram loss. Defaults to
|
||||
`{
|
||||
"use_mel": True,
|
||||
"sample_rate": 22050,
|
||||
"n_fft": 1024,
|
||||
"hop_length": 256,
|
||||
"win_length": 1024,
|
||||
"n_mels": 80,
|
||||
"mel_fmin": 0.0,
|
||||
"mel_fmax": None,
|
||||
}`
|
||||
target_loss (str):
|
||||
Target loss name that defines the quality of the model. Defaults to `avg_G_loss`.
|
||||
gen_clip_grad (float):
|
||||
Gradient clipping threshold for the generator model. Any value less than 0 disables clipping.
|
||||
Defaults to -1.
|
||||
disc_clip_grad (float):
|
||||
Gradient clipping threshold for the discriminator model. Any value less than 0 disables clipping.
|
||||
Defaults to -1.
|
||||
lr_gen (float):
|
||||
Generator model initial learning rate. Defaults to 0.0002.
|
||||
lr_disc (float):
|
||||
Discriminator model initial learning rate. Defaults to 0.0002.
|
||||
optimizer (torch.optim.Optimizer):
|
||||
Optimizer used for the training. Defaults to `AdamW`.
|
||||
optimizer_params (dict):
|
||||
Optimizer kwargs. Defaults to `{"betas": [0.8, 0.99], "weight_decay": 0.0}`
|
||||
lr_scheduler_gen (torch.optim.Scheduler):
|
||||
Learning rate scheduler for the generator. Defaults to `ExponentialLR`.
|
||||
lr_scheduler_gen_params (dict):
|
||||
Parameters for the generator learning rate scheduler. Defaults to `{"gamma": 0.999, "last_epoch": -1}`.
|
||||
lr_scheduler_disc (torch.optim.Scheduler):
|
||||
Learning rate scheduler for the discriminator. Defaults to `ExponentialLR`.
|
||||
lr_scheduler_disc_params (dict):
|
||||
Parameters for the discriminator learning rate scheduler. Defaults to `{"gamma": 0.999, "last_epoch": -1}`.
|
||||
use_pqmf (bool):
|
||||
enable / disable PQMF for subband approximation at training. Defaults to False.
|
||||
steps_to_start_discriminator (int):
|
||||
Number of steps required to start training the discriminator. Defaults to 0.
|
||||
diff_samples_for_G_and_D (bool):
|
||||
enable / disable use of different training samples for the generator and the discriminator iterations.
|
||||
Enabling it results in slower iterations but faster convergence in some cases. Defaults to False.
|
||||
"""
|
||||
|
||||
# LOSS PARAMETERS
|
||||
use_stft_loss: bool = True
|
||||
|
@ -43,7 +140,7 @@ class BaseGANVocoderConfig(BaseVocoderConfig):
|
|||
subband_stft_loss_weight: float = 0
|
||||
mse_G_loss_weight: float = 1
|
||||
hinge_G_loss_weight: float = 0
|
||||
feat_match_loss_weight: float = 10
|
||||
feat_match_loss_weight: float = 100
|
||||
l1_spec_loss_weight: float = 45
|
||||
|
||||
stft_loss_params: dict = field(
|
||||
|
|
|
@ -1,12 +1,71 @@
|
|||
from dataclasses import dataclass, field
|
||||
|
||||
from .shared_configs import BaseVocoderConfig
|
||||
from TTS.vocoder.configs.shared_configs import BaseVocoderConfig
|
||||
|
||||
|
||||
@dataclass
|
||||
class WavegradConfig(BaseVocoderConfig):
|
||||
"""Defines parameters for Wavernn vocoder."""
|
||||
"""Defines parameters for WaveGrad vocoder.
|
||||
Example:
|
||||
|
||||
>>> from TTS.vocoder.configs import WavegradConfig
|
||||
>>> config = WavegradConfig()
|
||||
|
||||
Args:
|
||||
model (str):
|
||||
Model name used for selecting the right model at initialization. Defaults to `wavegrad`.
|
||||
generator_model (str): One of the generators from `TTS.vocoder.models.*`. Every other non-GAN vocoder model is
|
||||
considered as a generator too. Defaults to `wavegrad`.
|
||||
model_params (dict):
|
||||
WaveGrad kwargs. Defaults to
|
||||
`
|
||||
{
|
||||
"use_weight_norm": True,
|
||||
"y_conv_channels": 32,
|
||||
"x_conv_channels": 768,
|
||||
"ublock_out_channels": [512, 512, 256, 128, 128],
|
||||
"dblock_out_channels": [128, 128, 256, 512],
|
||||
"upsample_factors": [4, 4, 4, 2, 2],
|
||||
"upsample_dilations": [[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 4, 8], [1, 2, 4, 8], [1, 2, 4, 8]],
|
||||
}
|
||||
`
|
||||
target_loss (str):
|
||||
Target loss name that defines the quality of the model. Defaults to `avg_wavegrad_loss`.
|
||||
epochs (int):
|
||||
Number of epochs to train the model. Defaults to 10000.
|
||||
batch_size (int):
|
||||
Batch size used at training. Larger values use more memory. Defaults to 96.
|
||||
seq_len (int):
|
||||
Audio segment length used at training. Larger values use more memory. Defaults to 6144.
|
||||
use_cache (bool):
|
||||
enable / disable in memory caching of the computed features. It can cause OOM error if the system RAM is
|
||||
not large enough. Defaults to True.
|
||||
mixed_precision (bool):
|
||||
enable / disable mixed precision training. Default is True.
|
||||
eval_split_size (int):
|
||||
Number of samples used for evaluation. Defaults to 50.
|
||||
train_noise_schedule (dict):
|
||||
Training noise schedule. Defaults to
|
||||
`{"min_val": 1e-6, "max_val": 1e-2, "num_steps": 1000}`
|
||||
test_noise_schedule (dict):
|
||||
Inference noise schedule. For a better performance, you may need to use `bin/tune_wavegrad.py` to find a
|
||||
better schedule. Defaults to
|
||||
`
|
||||
{
|
||||
"min_val": 1e-6,
|
||||
"max_val": 1e-2,
|
||||
"num_steps": 50,
|
||||
}
|
||||
`
|
||||
grad_clip (float):
|
||||
Gradient clipping threshold. If <= 0.0, no clipping is applied. Defaults to 1.0
|
||||
lr (float):
|
||||
Initial learning rate. Defaults to 1e-4.
|
||||
lr_scheduler (str):
|
||||
One of the learning rate schedulers from `torch.optim.lr_scheduler.*`. Defaults to `MultiStepLR`.
|
||||
lr_scheduler_params (dict):
|
||||
kwargs for the scheduler. Defaults to `{"gamma": 0.5, "milestones": [100000, 200000, 300000, 400000, 500000, 600000]}`
|
||||
"""
|
||||
model: str = "wavegrad"
|
||||
# Model specific params
|
||||
generator_model: str = "wavegrad"
|
||||
|
@ -28,7 +87,6 @@ class WavegradConfig(BaseVocoderConfig):
|
|||
batch_size: int = 96
|
||||
seq_len: int = 6144
|
||||
use_cache: bool = True
|
||||
steps_to_start_discriminator: int = 200000
|
||||
mixed_precision: bool = True
|
||||
eval_split_size: int = 50
|
||||
|
||||
|
|
|
@ -1,11 +1,77 @@
|
|||
from dataclasses import dataclass, field
|
||||
|
||||
from .shared_configs import BaseVocoderConfig
|
||||
from TTS.vocoder.configs.shared_configs import BaseVocoderConfig
|
||||
|
||||
|
||||
@dataclass
|
||||
class WavernnConfig(BaseVocoderConfig):
|
||||
"""Defines parameters for Wavernn vocoder."""
|
||||
"""Defines parameters for Wavernn vocoder.
|
||||
Example:
|
||||
|
||||
>>> from TTS.vocoder.configs import WavernnConfig
|
||||
>>> config = WavernnConfig()
|
||||
|
||||
Args:
|
||||
model (str):
|
||||
Model name used for selecting the right model at initialization. Defaults to `wavernn`.
|
||||
mode (str):
|
||||
Output mode of the WaveRNN vocoder. `mold` for Mixture of Logistic Distribution, `gauss` for a single
|
||||
Gaussian Distribution and `bits` for quantized bits as the model's output.
|
||||
mulaw (bool):
|
||||
enable / disable the use of Mulaw quantization for training. Only applicable if `mode == 'bits'`. Defaults
|
||||
to `True`.
|
||||
generator_model (str):
|
||||
One of the generators from `TTS.vocoder.models.*`. Every other non-GAN vocoder model is
|
||||
considered as a generator too. Defaults to `WaveRNN`.
|
||||
wavernn_model_params (dict):
|
||||
kwargs for the WaveRNN model. Defaults to
|
||||
`{
|
||||
"rnn_dims": 512,
|
||||
"fc_dims": 512,
|
||||
"compute_dims": 128,
|
||||
"res_out_dims": 128,
|
||||
"num_res_blocks": 10,
|
||||
"use_aux_net": True,
|
||||
"use_upsample_net": True,
|
||||
"upsample_factors": [4, 8, 8]
|
||||
}`
|
||||
batched (bool):
|
||||
enable / disable the batched inference. It speeds up the inference by splitting the input into segments and
|
||||
processing the segments in a batch. Then it merges the outputs with a certain overlap and smoothing. If
|
||||
you set it False, without CUDA, it is too slow to be practical. Defaults to True.
|
||||
target_samples (int):
|
||||
Size of the segments in batched mode. Defaults to 11000.
|
||||
overlap_samples (int):
|
||||
Size of the overlap between consecutive segments. Defaults to 550.
|
||||
batch_size (int):
|
||||
Batch size used at training. Larger values use more memory. Defaults to 256.
|
||||
seq_len (int):
|
||||
Audio segment length used at training. Larger values use more memory. Defaults to 1280.
|
||||
padding (int):
|
||||
Padding applied to the input feature frames against the convolution layers of the feature network.
|
||||
Defaults to 2.
|
||||
use_noise_augment (bool):
|
||||
enable / disable random noise added to the input waveform. The noise is added after computing the
|
||||
features. Defaults to False.
|
||||
use_cache (bool):
|
||||
enable / disable in memory caching of the computed features. It can cause OOM error if the system RAM is
|
||||
not large enough. Defaults to True.
|
||||
mixed_precision (bool):
|
||||
enable / disable mixed precision training. Default is True.
|
||||
eval_split_size (int):
|
||||
Number of samples used for evaluation. Defaults to 50.
|
||||
test_every_epochs (int):
|
||||
Number of epochs waited to run the next evaluation. Since inference takes some time, it is better to
|
||||
wait some number of epochs not to waste training time. Defaults to 10.
|
||||
grad_clip (float):
|
||||
Gradient clipping threshold. If <= 0.0, no clipping is applied. Defaults to 4.0
|
||||
lr (float):
|
||||
Initial learning rate. Defaults to 1e-4.
|
||||
lr_scheduler (str):
|
||||
One of the learning rate schedulers from `torch.optim.lr_scheduler.*`. Defaults to `MultiStepLR`.
|
||||
lr_scheduler_params (dict):
|
||||
kwargs for the scheduler. Defaults to `{"gamma": 0.5, "milestones": [200000, 400000, 600000]}`
|
||||
"""
|
||||
|
||||
model: str = "wavernn"
|
||||
|
||||
|
@ -38,7 +104,6 @@ class WavernnConfig(BaseVocoderConfig):
|
|||
padding: int = 2
|
||||
use_noise_augment: bool = False
|
||||
use_cache: bool = True
|
||||
steps_to_start_discriminator: int = 200000
|
||||
mixed_precision: bool = True
|
||||
eval_split_size: int = 50
|
||||
test_every_epochs: int = 10 # number of epochs to wait until the next test run (synthesizing a full audio clip).
|
||||
|
|
Loading…
Reference in New Issue