mirror of https://github.com/coqui-ai/TTS.git
add docstrings with default value fixes
This commit is contained in:
parent
7e02cff924
commit
8b1014d188
|
@ -13,7 +13,7 @@ class BaseAudioConfig(Coqpit):
|
||||||
Number of STFT frequency levels, a.k.a. the size of the linear spectrogram frame. Defaults to 1024.
|
Number of STFT frequency levels, a.k.a. the size of the linear spectrogram frame. Defaults to 1024.
|
||||||
win_length (int):
|
win_length (int):
|
||||||
Each frame of audio is windowed by window of length ```win_length``` and then padded with zeros to match
|
Each frame of audio is windowed by window of length ```win_length``` and then padded with zeros to match
|
||||||
```fft_size```. Defaults to 256.
|
```fft_size```. Defaults to 1024.
|
||||||
hop_length (int):
|
hop_length (int):
|
||||||
Number of audio samples between adjacent STFT columns. Defaults to 1024.
|
Number of audio samples between adjacent STFT columns. Defaults to 1024.
|
||||||
frame_shift_ms (int):
|
frame_shift_ms (int):
|
||||||
|
@ -21,7 +21,7 @@ class BaseAudioConfig(Coqpit):
|
||||||
frame_length_ms (int):
|
frame_length_ms (int):
|
||||||
Set ```win_length``` based on milliseconds and sampling rate.
|
Set ```win_length``` based on milliseconds and sampling rate.
|
||||||
stft_pad_mode (str):
|
stft_pad_mode (str):
|
||||||
Padding method used in STFT. 'reflect' or 'center'.
|
Padding method used in STFT. 'reflect' or 'center'. Defaults to 'reflect'.
|
||||||
sample_rate (int):
|
sample_rate (int):
|
||||||
Audio sampling rate. Defaults to 22050.
|
Audio sampling rate. Defaults to 22050.
|
||||||
resample (bool):
|
resample (bool):
|
||||||
|
@ -135,11 +135,27 @@ class BaseAudioConfig(Coqpit):
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class BaseDatasetConfig(Coqpit):
|
class BaseDatasetConfig(Coqpit):
|
||||||
name: str = None
|
"""Base config for TTS datasets.
|
||||||
path: str = None
|
|
||||||
meta_file_train: Union[str, List] = None # TODO: don't take ignored speakers for multi-speaker datasets over this. This is Union for SC-Glow compat.
|
Args:
|
||||||
meta_file_val: str = None
|
name (str):
|
||||||
meta_file_attn_mask: str = None
|
Dataset name that defines the preprocessor in use. Defaults to None.
|
||||||
|
path (str):
|
||||||
|
Root path to the dataset files. Defaults to None.
|
||||||
|
meta_file_train (Union[str, List]):
|
||||||
|
Name of the dataset meta file. Or a list of speakers to be ignored at training for multi-speaker datasets.
|
||||||
|
Defaults to None.
|
||||||
|
meta_file_val (str):
|
||||||
|
Name of the dataset meta file that defines the instances used at validation.
|
||||||
|
meta_file_attn_mask (str):
|
||||||
|
Path to the file that lists the attention mask files used with models that require attention masks to
|
||||||
|
train the duration predictor.
|
||||||
|
"""
|
||||||
|
name: str = ''
|
||||||
|
path: str = ''
|
||||||
|
meta_file_train: Union[str, List] = '' # TODO: don't take ignored speakers for multi-speaker datasets over this. This is Union for SC-Glow compat.
|
||||||
|
meta_file_val: str = ''
|
||||||
|
meta_file_attn_mask: str = ''
|
||||||
|
|
||||||
def check_values(
|
def check_values(
|
||||||
self,
|
self,
|
||||||
|
@ -161,12 +177,8 @@ class BaseTrainingConfig(Coqpit):
|
||||||
Args:
|
Args:
|
||||||
batch_size (int):
|
batch_size (int):
|
||||||
Training batch size.
|
Training batch size.
|
||||||
batch_group_size (int):
|
|
||||||
Number of batches to shuffle after bucketing.
|
|
||||||
eval_batch_size (int):
|
eval_batch_size (int):
|
||||||
Validation batch size.
|
Validation batch size.
|
||||||
loss_masking (bool):
|
|
||||||
Enable / Disable masking padding segments of sequences.
|
|
||||||
mixed_precision (bool):
|
mixed_precision (bool):
|
||||||
Enable / Disable mixed precision training. It reduces the VRAM use and allows larger batch sizes, however
|
Enable / Disable mixed precision training. It reduces the VRAM use and allows larger batch sizes, however
|
||||||
it may also cause numerical instability in some cases.
|
it may also cause numerical instability in some cases.
|
||||||
|
@ -195,34 +207,13 @@ class BaseTrainingConfig(Coqpit):
|
||||||
keep_after (int):
|
keep_after (int):
|
||||||
Number of steps to wait before saving all the best models. In use if ```keep_all_best == True```. Defaults
|
Number of steps to wait before saving all the best models. In use if ```keep_all_best == True```. Defaults
|
||||||
to 10000.
|
to 10000.
|
||||||
text_cleaner (str):
|
|
||||||
Text cleaner to be used at model training. It is set to be one of the cleaners in
|
|
||||||
```TTS.tts.utils.text.cleaners```.
|
|
||||||
enable_eos_bos_chars (bool):
|
|
||||||
Enable / Disable using special characters indicating end-of-sentence and begining-of-sentence.
|
|
||||||
num_loader_workers (int):
|
num_loader_workers (int):
|
||||||
Number of workers for training time dataloader.
|
Number of workers for training time dataloader.
|
||||||
num_val_loader_workers (int):
|
num_val_loader_workers (int):
|
||||||
Number of workers for evaluation time dataloader.
|
Number of workers for evaluation time dataloader.
|
||||||
min_seq_len (int):
|
|
||||||
Minimum sequence length to be used at training.
|
|
||||||
max_seq_len (int):
|
|
||||||
Maximum sequence length to be used at training. VRAM use at training depends on this parameter. Consider to
|
|
||||||
decrease it if you get OOM errors.
|
|
||||||
compute_f0 (bool):
|
|
||||||
Return F0 frames from the dataloader. Defaults to ```False```.
|
|
||||||
compute_input_seq_cache (bool):
|
|
||||||
Enable / Disable computing and caching phonemes sequences from character sequences at the begining of the
|
|
||||||
training. It allows faster data loading times and more precise max-min sequence prunning. Defaults
|
|
||||||
to ```False```.
|
|
||||||
output_path (str):
|
output_path (str):
|
||||||
Path for training output folder. Any non-existent part of the given path is created automatically.
|
Path for training output folder. Any non-existent part of the given path is created automatically.
|
||||||
All training outputs are saved there.
|
All training outputs are saved there.
|
||||||
phoneme_cache_path (str):
|
|
||||||
Path to a folder to save the computed phoneme sequences.
|
|
||||||
datasets (List[BaseDatasetConfig]):
|
|
||||||
ist of DatasetConfig.
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
model: str = None
|
model: str = None
|
||||||
|
|
|
@ -1,11 +1,69 @@
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
|
|
||||||
from .shared_configs import BaseTTSConfig
|
from TTS.tts.configs.shared_configs import BaseTTSConfig
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class AlignTTSConfig(BaseTTSConfig):
|
class AlignTTSConfig(BaseTTSConfig):
|
||||||
"""Defines parameters for AlignTTS model."""
|
"""Defines parameters for AlignTTS model.
|
||||||
|
Example:
|
||||||
|
|
||||||
|
>>> from TTS.tts.configs import AlignTTSConfig
|
||||||
|
>>> config = AlignTTSConfig()
|
||||||
|
|
||||||
|
Args:
|
||||||
|
model (str):
|
||||||
|
Model name used for selecting the right model at initialization. Defaults to `align_tts`.
|
||||||
|
positional_encoding (bool):
|
||||||
|
enable / disable positional encoding applied to the encoder output. Defaults to True.
|
||||||
|
hidden_channels (int):
|
||||||
|
Base number of hidden channels. Defines all the layers except the ones defined by the specific encoder or decoder
|
||||||
|
parameters. Defaults to 256.
|
||||||
|
hidden_channels_dp (int):
|
||||||
|
Number of hidden channels of the duration predictor's layers. Defaults to 256.
|
||||||
|
encoder_type (str):
|
||||||
|
Type of the encoder used by the model. Look at `TTS.tts.layers.feed_forward.encoder` for more details.
|
||||||
|
Defaults to `fftransformer`.
|
||||||
|
encoder_params (dict):
|
||||||
|
Parameters used to define the encoder network. Look at `TTS.tts.layers.feed_forward.encoder` for more details.
|
||||||
|
Defaults to `{"hidden_channels_ffn": 1024, "num_heads": 2, "num_layers": 6, "dropout_p": 0.1}`.
|
||||||
|
decoder_type (str):
|
||||||
|
Type of the decoder used by the model. Look at `TTS.tts.layers.feed_forward.decoder` for more details.
|
||||||
|
Defaults to `fftransformer`.
|
||||||
|
decoder_params (dict):
|
||||||
|
Parameters used to define the decoder network. Look at `TTS.tts.layers.feed_forward.decoder` for more details.
|
||||||
|
Defaults to `{"hidden_channels_ffn": 1024, "num_heads": 2, "num_layers": 6, "dropout_p": 0.1}`.
|
||||||
|
phase_start_steps (List[int]):
|
||||||
|
A list of number of steps required to start the next training phase. AlignTTS has 4 different training
|
||||||
|
phases. Thus you need to define 4 different values to enable phase based training. If None, it
|
||||||
|
trains the whole model together. Defaults to None.
|
||||||
|
ssim_alpha (float):
|
||||||
|
Weight for the SSIM loss. If set <= 0, disables the SSIM loss. Defaults to 1.0.
|
||||||
|
duration_loss_alpha (float):
|
||||||
|
Weight for the duration predictor's loss. Defaults to 1.0.
|
||||||
|
mdn_alpha (float):
|
||||||
|
Weight for the MDN loss. Defaults to 1.0.
|
||||||
|
spec_loss_alpha (float):
|
||||||
|
Weight for the MSE spectrogram loss. If set <= 0, disables the MSE loss. Defaults to 1.0.
|
||||||
|
use_speaker_embedding (bool):
|
||||||
|
enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
|
||||||
|
in the multi-speaker mode. Defaults to False.
|
||||||
|
use_external_speaker_embedding_file (bool):
|
||||||
|
enable / disable using external speaker embeddings in place of the learned embeddings. Defaults to False.
|
||||||
|
external_speaker_embedding_file (str):
|
||||||
|
Path to the file including pre-computed speaker embeddings. Defaults to None.
|
||||||
|
noam_schedule (bool):
|
||||||
|
enable / disable the use of Noam LR scheduler. Defaults to False.
|
||||||
|
warmup_steps (int):
|
||||||
|
Number of warm-up steps for the Noam scheduler. Defaults to 4000.
|
||||||
|
lr (float):
|
||||||
|
Initial learning rate. Defaults to `1e-3`.
|
||||||
|
wd (float):
|
||||||
|
Weight decay coefficient. Defaults to `1e-7`.
|
||||||
|
min_seq_len (int):
|
||||||
|
Minimum input sequence length to be used at training.
|
||||||
|
max_seq_len (int):
|
||||||
|
Maximum input sequence length to be used at training. Larger values result in more VRAM usage."""
|
||||||
|
|
||||||
model: str = "align_tts"
|
model: str = "align_tts"
|
||||||
# model specific params
|
# model specific params
|
||||||
|
|
|
@ -1,11 +1,64 @@
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
|
|
||||||
from .shared_configs import BaseTTSConfig
|
from TTS.tts.configs.shared_configs import BaseTTSConfig
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class GlowTTSConfig(BaseTTSConfig):
|
class GlowTTSConfig(BaseTTSConfig):
|
||||||
"""Defines parameters for GlowTTS model."""
|
"""Defines parameters for GlowTTS model.
|
||||||
|
|
||||||
|
Example:
|
||||||
|
|
||||||
|
>>> from TTS.tts.configs import GlowTTSConfig
|
||||||
|
>>> config = GlowTTSConfig()
|
||||||
|
|
||||||
|
Args:
|
||||||
|
model (str):
|
||||||
|
Model name used for selecting the right model at initialization. Defaults to `glow_tts`.
|
||||||
|
encoder_type (str):
|
||||||
|
Type of the encoder used by the model. Look at `TTS.tts.layers.glow_tts.encoder` for more details.
|
||||||
|
Defaults to `rel_pos_transformers`.
|
||||||
|
encoder_params (dict):
|
||||||
|
Parameters used to define the encoder network. Look at `TTS.tts.layers.glow_tts.encoder` for more details.
|
||||||
|
Defaults to `{"kernel_size": 3, "dropout_p": 0.1, "num_layers": 6, "num_heads": 2, "hidden_channels_ffn": 768}`
|
||||||
|
use_encoder_prenet (bool):
|
||||||
|
enable / disable the use of a prenet for the encoder. Defaults to True.
|
||||||
|
hidden_channels_encoder (int):
|
||||||
|
Number of base hidden channels used by the encoder network. It defines the input and the output channel sizes,
|
||||||
|
and for some encoder types internal hidden channels sizes too. Defaults to 192.
|
||||||
|
hidden_channels_decoder (int):
|
||||||
|
Number of base hidden channels used by the decoder WaveNet network. Defaults to 192 as in the original work.
|
||||||
|
hidden_channels_duration_predictor (int):
|
||||||
|
Number of layer channels of the duration predictor network. Defaults to 256 as in the original work.
|
||||||
|
data_dep_init_steps (int):
|
||||||
|
Number of steps used for computing normalization parameters at the beginning of the training. GlowTTS uses
|
||||||
|
Activation Normalization that pre-computes normalization stats at the beginning and use the same values
|
||||||
|
for the rest. Defaults to 10.
|
||||||
|
style_wav_for_test (str):
|
||||||
|
Path to the wav file used for changing the style of the speech. Defaults to None.
|
||||||
|
inference_noise_scale (float):
|
||||||
|
Variance used for sampling the random noise added to the decoder's input at inference. Defaults to 0.0.
|
||||||
|
use_speaker_embedding (bool):
|
||||||
|
enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
|
||||||
|
in the multi-speaker mode. Defaults to False.
|
||||||
|
use_external_speaker_embedding_file (bool):
|
||||||
|
enable / disable using external speaker embeddings in place of the learned embeddings. Defaults to False.
|
||||||
|
external_speaker_embedding_file (str):
|
||||||
|
Path to the file including pre-computed speaker embeddings. Defaults to None.
|
||||||
|
noam_schedule (bool):
|
||||||
|
enable / disable the use of Noam LR scheduler. Defaults to False.
|
||||||
|
warmup_steps (int):
|
||||||
|
Number of warm-up steps for the Noam scheduler. Defaults to 4000.
|
||||||
|
lr (float):
|
||||||
|
Initial learning rate. Defaults to `1e-3`.
|
||||||
|
wd (float):
|
||||||
|
Weight decay coefficient. Defaults to `1e-7`.
|
||||||
|
min_seq_len (int):
|
||||||
|
Minimum input sequence length to be used at training.
|
||||||
|
max_seq_len (int):
|
||||||
|
Maximum input sequence length to be used at training. Larger values result in more VRAM usage.
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
model: str = "glow_tts"
|
model: str = "glow_tts"
|
||||||
|
|
||||||
|
@ -47,4 +100,4 @@ class GlowTTSConfig(BaseTTSConfig):
|
||||||
# overrides
|
# overrides
|
||||||
min_seq_len: int = 3
|
min_seq_len: int = 3
|
||||||
max_seq_len: int = 500
|
max_seq_len: int = 500
|
||||||
r: int = 1
|
r: int = 1 # DO NOT CHANGE - TODO: make this immutable once coqpit implements it.
|
||||||
|
|
|
@ -8,8 +8,20 @@ from TTS.config import BaseAudioConfig, BaseDatasetConfig, BaseTrainingConfig
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class GSTConfig(Coqpit):
|
class GSTConfig(Coqpit):
|
||||||
"""Defines Global Style Toke module"""
|
"""Defines the Global Style Token Module
|
||||||
|
|
||||||
|
Args:
|
||||||
|
gst_style_input_wav (str):
|
||||||
|
Path to the wav file used to define the style of the output speech at inference. Defaults to None.
|
||||||
|
gst_style_input_weights (dict):
|
||||||
|
Defines the weights for each style token used at inference. Defaults to None.
|
||||||
|
gst_embedding_dim (int):
|
||||||
|
Defines the size of the GST embedding vector dimensions. Defaults to 256.
|
||||||
|
gst_num_heads (int):
|
||||||
|
Number of attention heads used by the multi-head attention. Defaults to 4.
|
||||||
|
gst_num_style_tokens (int):
|
||||||
|
Number of style token vectors. Defaults to 10.
|
||||||
|
"""
|
||||||
gst_style_input_wav: str = None
|
gst_style_input_wav: str = None
|
||||||
gst_style_input_weights: dict = None
|
gst_style_input_weights: dict = None
|
||||||
gst_embedding_dim: int = 256
|
gst_embedding_dim: int = 256
|
||||||
|
@ -33,7 +45,26 @@ class GSTConfig(Coqpit):
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class CharactersConfig(Coqpit):
|
class CharactersConfig(Coqpit):
|
||||||
"""Defines character or phoneme set used by the model"""
|
"""Defines character or phoneme set used by the model
|
||||||
|
|
||||||
|
Args:
|
||||||
|
pad (str):
|
||||||
|
characters in place of empty padding. Defaults to None.
|
||||||
|
eos (str):
|
||||||
|
characters showing the end of a sentence. Defaults to None.
|
||||||
|
bos (str):
|
||||||
|
characters showing the beginning of a sentence. Defaults to None.
|
||||||
|
characters (str):
|
||||||
|
character set used by the model. Characters not in this list are ignored when converting input text to
|
||||||
|
a list of sequence IDs. Defaults to None.
|
||||||
|
punctuations (str):
|
||||||
|
characters considered as punctuation when parsing the input sentence. Defaults to None.
|
||||||
|
phonemes (str):
|
||||||
|
characters considered as phonemes when parsing the input sentence. Defaults to None.
|
||||||
|
unique (bool):
|
||||||
|
remove any duplicate characters in the character lists. It is a bandaid for compatibility with the old
|
||||||
|
models trained with character lists with duplicates.
|
||||||
|
"""
|
||||||
|
|
||||||
pad: str = None
|
pad: str = None
|
||||||
eos: str = None
|
eos: str = None
|
||||||
|
@ -58,7 +89,48 @@ class CharactersConfig(Coqpit):
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class BaseTTSConfig(BaseTrainingConfig):
|
class BaseTTSConfig(BaseTrainingConfig):
|
||||||
"""Shared parameters among all the tts models."""
|
"""Shared parameters among all the tts models.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
audio (BaseAudioConfig):
|
||||||
|
Audio processor config object instance.
|
||||||
|
use_phonemes (bool):
|
||||||
|
enable / disable phoneme use.
|
||||||
|
compute_input_seq_cache (bool):
|
||||||
|
enable / disable precomputation of the phoneme sequences. At the expense of some delay at the beginning of
|
||||||
|
the training, it allows faster data loader time and precise limitation with `max_seq_len` and
|
||||||
|
`min_seq_len`.
|
||||||
|
text_cleaner (str):
|
||||||
|
Name of the text cleaner used for cleaning and formatting transcripts.
|
||||||
|
enable_eos_bos_chars (bool):
|
||||||
|
enable / disable the use of eos and bos characters.
|
||||||
|
test_sentences_file (str):
|
||||||
|
Path to a txt file that has sentences used at test time. The file must have a sentence per line.
|
||||||
|
phoneme_cache_path (str):
|
||||||
|
Path to the output folder caching the computed phonemes for each sample.
|
||||||
|
characters (CharactersConfig):
|
||||||
|
Instance of a CharactersConfig class.
|
||||||
|
batch_group_size (int):
|
||||||
|
Size of the batch groups used for bucketing. By default, the dataloader orders samples by the sequence
|
||||||
|
length for a more efficient and stable training. If `batch_group_size > 1` then it performs bucketing to
|
||||||
|
prevent using the same batches for each epoch.
|
||||||
|
loss_masking (bool):
|
||||||
|
enable / disable masking loss values against padded segments of samples in a batch.
|
||||||
|
min_seq_len (int):
|
||||||
|
Minimum input sequence length to be used at training.
|
||||||
|
max_seq_len (int):
|
||||||
|
Maximum input sequence length to be used at training. Larger values result in more VRAM usage.
|
||||||
|
compute_f0 (bool):
|
||||||
|
(Not in use yet).
|
||||||
|
use_noise_augment (bool):
|
||||||
|
Augment the input audio with random noise.
|
||||||
|
add_blank (bool):
|
||||||
|
Add blank characters between each other two characters. It improves performance for some models at expense
|
||||||
|
of slower run-time due to the longer input sequence.
|
||||||
|
datasets (List[BaseDatasetConfig]):
|
||||||
|
List of datasets used for training. If multiple datasets are provided, they are merged and used together
|
||||||
|
for training.
|
||||||
|
"""
|
||||||
|
|
||||||
audio: BaseAudioConfig = field(default_factory=BaseAudioConfig)
|
audio: BaseAudioConfig = field(default_factory=BaseAudioConfig)
|
||||||
# phoneme settings
|
# phoneme settings
|
||||||
|
|
|
@ -1,11 +1,74 @@
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
|
|
||||||
from .shared_configs import BaseTTSConfig
|
from TTS.tts.configs.shared_configs import BaseTTSConfig
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class SpeedySpeechConfig(BaseTTSConfig):
|
class SpeedySpeechConfig(BaseTTSConfig):
|
||||||
"""Defines parameters for Speedy Speech (feed-forward encoder-decoder) based models."""
|
"""Defines parameters for Speedy Speech (feed-forward encoder-decoder) based models.
|
||||||
|
|
||||||
|
Example:
|
||||||
|
|
||||||
|
>>> from TTS.tts.configs import SpeedySpeechConfig
|
||||||
|
>>> config = SpeedySpeechConfig()
|
||||||
|
|
||||||
|
Args:
|
||||||
|
model (str):
|
||||||
|
Model name used for selecting the right model at initialization. Defaults to `speedy_speech`.
|
||||||
|
positional_encoding (bool):
|
||||||
|
enable / disable positional encoding applied to the encoder output. Defaults to True.
|
||||||
|
hidden_channels (int):
|
||||||
|
Base number of hidden channels. Defines all the layers except the ones defined by the specific encoder or decoder
|
||||||
|
parameters. Defaults to 128.
|
||||||
|
encoder_type (str):
|
||||||
|
Type of the encoder used by the model. Look at `TTS.tts.layers.feed_forward.encoder` for more details.
|
||||||
|
Defaults to `residual_conv_bn`.
|
||||||
|
encoder_params (dict):
|
||||||
|
Parameters used to define the encoder network. Look at `TTS.tts.layers.feed_forward.encoder` for more details.
|
||||||
|
Defaults to `{"kernel_size": 4, "dilations": [1, 2, 4, 1, 2, 4, 1, 2, 4, 1, 2, 4, 1], "num_conv_blocks": 2, "num_res_blocks": 13}`
|
||||||
|
decoder_type (str):
|
||||||
|
Type of the decoder used by the model. Look at `TTS.tts.layers.feed_forward.decoder` for more details.
|
||||||
|
Defaults to `residual_conv_bn`.
|
||||||
|
decoder_params (dict):
|
||||||
|
Parameters used to define the decoder network. Look at `TTS.tts.layers.feed_forward.decoder` for more details.
|
||||||
|
Defaults to `{"kernel_size": 4, "dilations": [1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1], "num_conv_blocks": 2, "num_res_blocks": 17}`
|
||||||
|
hidden_channels_encoder (int):
|
||||||
|
Number of base hidden channels used by the encoder network. It defines the input and the output channel sizes,
|
||||||
|
and for some encoder types internal hidden channels sizes too. Defaults to 192.
|
||||||
|
hidden_channels_decoder (int):
|
||||||
|
Number of base hidden channels used by the decoder WaveNet network. Defaults to 192 as in the original work.
|
||||||
|
hidden_channels_duration_predictor (int):
|
||||||
|
Number of layer channels of the duration predictor network. Defaults to 256 as in the original work.
|
||||||
|
data_dep_init_steps (int):
|
||||||
|
Number of steps used for computing normalization parameters at the beginning of the training. GlowTTS uses
|
||||||
|
Activation Normalization that pre-computes normalization stats at the beginning and use the same values
|
||||||
|
for the rest. Defaults to 10.
|
||||||
|
use_speaker_embedding (bool):
|
||||||
|
enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
|
||||||
|
in the multi-speaker mode. Defaults to False.
|
||||||
|
use_external_speaker_embedding_file (bool):
|
||||||
|
enable / disable using external speaker embeddings in place of the learned embeddings. Defaults to False.
|
||||||
|
external_speaker_embedding_file (str):
|
||||||
|
Path to the file including pre-computed speaker embeddings. Defaults to None.
|
||||||
|
noam_schedule (bool):
|
||||||
|
enable / disable the use of Noam LR scheduler. Defaults to False.
|
||||||
|
warmup_steps (int):
|
||||||
|
Number of warm-up steps for the Noam scheduler. Defaults to 4000.
|
||||||
|
lr (float):
|
||||||
|
Initial learning rate. Defaults to `1e-3`.
|
||||||
|
wd (float):
|
||||||
|
Weight decay coefficient. Defaults to `1e-7`.
|
||||||
|
ssim_alpha (float):
|
||||||
|
Weight for the SSIM loss. If set <= 0, disables the SSIM loss. Defaults to 1.0.
|
||||||
|
huber_alpha (float):
|
||||||
|
Weight for the duration predictor's loss. Defaults to 1.0.
|
||||||
|
l1_alpha (float):
|
||||||
|
Weight for the L1 spectrogram loss. If set <= 0, disables the L1 loss. Defaults to 1.0.
|
||||||
|
min_seq_len (int):
|
||||||
|
Minimum input sequence length to be used at training.
|
||||||
|
max_seq_len (int):
|
||||||
|
Maximum input sequence length to be used at training. Larger values result in more VRAM usage.
|
||||||
|
"""
|
||||||
|
|
||||||
model: str = "speedy_speech"
|
model: str = "speedy_speech"
|
||||||
# model specific params
|
# model specific params
|
||||||
|
@ -50,4 +113,4 @@ class SpeedySpeechConfig(BaseTTSConfig):
|
||||||
# overrides
|
# overrides
|
||||||
min_seq_len: int = 13
|
min_seq_len: int = 13
|
||||||
max_seq_len: int = 200
|
max_seq_len: int = 200
|
||||||
r: int = 1
|
r: int = 1 #DO NOT CHANGE
|
||||||
|
|
|
@ -5,6 +5,114 @@ from TTS.tts.configs.tacotron_config import TacotronConfig
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class Tacotron2Config(TacotronConfig):
|
class Tacotron2Config(TacotronConfig):
|
||||||
"""Defines parameters for Tacotron2 based models."""
|
"""Defines parameters for Tacotron2 based models.
|
||||||
|
|
||||||
|
Example:
|
||||||
|
|
||||||
|
>>> from TTS.tts.configs import Tacotron2Config
|
||||||
|
>>> config = Tacotron2Config()
|
||||||
|
|
||||||
|
Args:
|
||||||
|
model (str):
|
||||||
|
Model name used to select the right model class to initialize. Defaults to `tacotron2`.
|
||||||
|
use_gst (bool):
|
||||||
|
enable / disable the use of Global Style Token modules. Defaults to False.
|
||||||
|
gst (GSTConfig):
|
||||||
|
Instance of `GSTConfig` class.
|
||||||
|
gst_style_input (str):
|
||||||
|
Path to the wav file used at inference to set the speech style through GST. If `GST` is enabled and
|
||||||
|
this is not defined, the model uses a zero vector as an input. Defaults to None.
|
||||||
|
r (int):
|
||||||
|
Number of output frames that the decoder computes per iteration. Larger values make training and inference
|
||||||
|
faster but reduces the quality of the output frames. This needs to be tuned considering your own needs.
|
||||||
|
Defaults to 1.
|
||||||
|
gradual_training (List[List]):
|
||||||
|
Parameters for the gradual training schedule. It is in the form `[[a, b, c], [d ,e ,f] ..]` where `a` is
|
||||||
|
the step number to start using the rest of the values, `b` is the `r` value and `c` is the batch size.
|
||||||
|
If set to None, no gradual training is used. Defaults to None.
|
||||||
|
memory_size (int):
|
||||||
|
Defines the number of previous frames used by the Prenet. If set to < 0, then it uses only the last frame.
|
||||||
|
Defaults to -1.
|
||||||
|
prenet_type (str):
|
||||||
|
`original` or `bn`. `original` sets the default Prenet and `bn` uses Batch Normalization version of the
|
||||||
|
Prenet. Defaults to `original`.
|
||||||
|
prenet_dropout (bool):
|
||||||
|
enables / disables the use of dropout in the Prenet. Defaults to True.
|
||||||
|
prenet_dropout_at_inference (bool):
|
||||||
|
enable / disable the use of dropout in the Prenet at the inference time. Defaults to False.
|
||||||
|
stopnet (bool):
|
||||||
|
enable / disable the Stopnet that predicts the end of the decoder sequence. Defaults to True.
|
||||||
|
stopnet_pos_weight (float):
|
||||||
|
Weight that is applied to over-weight positive instances in the Stopnet loss. Use larger values with
|
||||||
|
datasets with longer sentences. Defaults to 10.
|
||||||
|
separate_stopnet (bool):
|
||||||
|
Use a distinct Stopnet which is trained separately from the rest of the model. Defaults to True.
|
||||||
|
attention_type (str):
|
||||||
|
attention type. Check ```TTS.tts.layers.attentions.init_attn```. Defaults to 'original'.
|
||||||
|
attention_heads (int):
|
||||||
|
Number of attention heads for GMM attention. Defaults to 5.
|
||||||
|
windowing (bool):
|
||||||
|
It is especially useful at inference to keep attention alignment diagonal. Defaults to False.
|
||||||
|
use_forward_attn (bool):
|
||||||
|
It is only valid if ```attn_type``` is ```original```. Defaults to False.
|
||||||
|
forward_attn_mask (bool):
|
||||||
|
enable/disable extra masking over forward attention. It is useful at inference to prevent
|
||||||
|
possible attention failures. Defaults to False.
|
||||||
|
transition_agent (bool):
|
||||||
|
enable/disable transition agent in forward attention. Defaults to False.
|
||||||
|
location_attn (bool):
|
||||||
|
enable/disable location sensitive attention as in the original Tacotron2 paper.
|
||||||
|
It is only valid if ```attn_type``` is ```original```. Defaults to True.
|
||||||
|
bidirectional_decoder (bool):
|
||||||
|
enable/disable bidirectional decoding. Defaults to False.
|
||||||
|
double_decoder_consistency (bool):
|
||||||
|
enable/disable double decoder consistency. Defaults to False.
|
||||||
|
ddc_r (int):
|
||||||
|
reduction rate used by the coarse decoder when `double_decoder_consistency` is in use. Set this
|
||||||
|
as a multiple of the `r` value. Defaults to 6.
|
||||||
|
use_speaker_embedding (bool):
|
||||||
|
enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
|
||||||
|
in the multi-speaker mode. Defaults to False.
|
||||||
|
use_external_speaker_embedding_file (bool):
|
||||||
|
enable / disable using external speaker embeddings in place of the learned embeddings. Defaults to False.
|
||||||
|
external_speaker_embedding_file (str):
|
||||||
|
Path to the file including pre-computed speaker embeddings. Defaults to None.
|
||||||
|
noam_schedule (bool):
|
||||||
|
enable / disable the use of Noam LR scheduler. Defaults to False.
|
||||||
|
warmup_steps (int):
|
||||||
|
Number of warm-up steps for the Noam scheduler. Defaults to 4000.
|
||||||
|
lr (float):
|
||||||
|
Initial learning rate. Defaults to `1e-4`.
|
||||||
|
wd (float):
|
||||||
|
Weight decay coefficient. Defaults to `1e-6`.
|
||||||
|
grad_clip (float):
|
||||||
|
Gradient clipping threshold. Defaults to `5`.
|
||||||
|
seq_len_norm (bool):
|
||||||
|
enable / disable the sequence length normalization in the loss functions. If set True, loss of a sample
|
||||||
|
is divided by the sequence length. Defaults to False.
|
||||||
|
loss_masking (bool):
|
||||||
|
enable / disable masking the paddings of the samples in loss computation. Defaults to True.
|
||||||
|
decoder_loss_alpha (float):
|
||||||
|
Weight for the decoder loss of the Tacotron model. If set less than or equal to zero, it disables the
|
||||||
|
corresponding loss function. Defaults to 0.25
|
||||||
|
postnet_loss_alpha (float):
|
||||||
|
Weight for the postnet loss of the Tacotron model. If set less than or equal to zero, it disables the
|
||||||
|
corresponding loss function. Defaults to 0.25
|
||||||
|
postnet_diff_spec_alpha (float):
|
||||||
|
Weight for the postnet differential loss of the Tacotron model. If set less than or equal to zero, it disables the
|
||||||
|
corresponding loss function. Defaults to 0.25
|
||||||
|
decoder_diff_spec_alpha (float):
|
||||||
|
Weight for the decoder differential loss of the Tacotron model. If set less than or equal to zero, it disables the
|
||||||
|
corresponding loss function. Defaults to 0.25
|
||||||
|
decoder_ssim_alpha (float):
|
||||||
|
Weight for the decoder SSIM loss of the Tacotron model. If set less than or equal to zero, it disables the
|
||||||
|
corresponding loss function. Defaults to 0.25
|
||||||
|
postnet_ssim_alpha (float):
|
||||||
|
Weight for the postnet SSIM loss of the Tacotron model. If set less than or equal to zero, it disables the
|
||||||
|
corresponding loss function. Defaults to 0.25
|
||||||
|
ga_alpha (float):
|
||||||
|
Weight for the guided attention loss. If set less than or equal to zero, it disables the corresponding loss
|
||||||
|
function. Defaults to 5.
|
||||||
|
"""
|
||||||
|
|
||||||
model: str = "tacotron2"
|
model: str = "tacotron2"
|
||||||
|
|
|
@ -1,12 +1,120 @@
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
from .shared_configs import BaseTTSConfig, GSTConfig
|
from TTS.tts.configs.shared_configs import BaseTTSConfig, GSTConfig
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class TacotronConfig(BaseTTSConfig):
|
class TacotronConfig(BaseTTSConfig):
|
||||||
"""Defines parameters for Tacotron based models."""
|
"""Defines parameters for Tacotron based models.
|
||||||
|
|
||||||
|
Example:
|
||||||
|
|
||||||
|
>>> from TTS.tts.configs import TacotronConfig
|
||||||
|
>>> config = TacotronConfig()
|
||||||
|
|
||||||
|
Args:
|
||||||
|
model (str):
|
||||||
|
Model name used to select the right model class to initialize. Defaults to `tacotron`.
|
||||||
|
use_gst (bool):
|
||||||
|
enable / disable the use of Global Style Token modules. Defaults to False.
|
||||||
|
gst (GSTConfig):
|
||||||
|
Instance of `GSTConfig` class.
|
||||||
|
gst_style_input (str):
|
||||||
|
Path to the wav file used at inference to set the speech style through GST. If `GST` is enabled and
|
||||||
|
this is not defined, the model uses a zero vector as an input. Defaults to None.
|
||||||
|
r (int):
|
||||||
|
Number of output frames that the decoder computes per iteration. Larger values make training and inference
|
||||||
|
faster but reduces the quality of the output frames. This needs to be tuned considering your own needs.
|
||||||
|
Defaults to 1.
|
||||||
|
gradual_training (List[List]):
|
||||||
|
Parameters for the gradual training schedule. It is in the form `[[a, b, c], [d ,e ,f] ..]` where `a` is
|
||||||
|
the step number to start using the rest of the values, `b` is the `r` value and `c` is the batch size.
|
||||||
|
If set to None, no gradual training is used. Defaults to None.
|
||||||
|
memory_size (int):
|
||||||
|
Defines the number of previous frames used by the Prenet. If set to < 0, then it uses only the last frame.
|
||||||
|
Defaults to -1.
|
||||||
|
prenet_type (str):
|
||||||
|
`original` or `bn`. `original` sets the default Prenet and `bn` uses Batch Normalization version of the
|
||||||
|
Prenet. Defaults to `original`.
|
||||||
|
prenet_dropout (bool):
|
||||||
|
enables / disables the use of dropout in the Prenet. Defaults to True.
|
||||||
|
prenet_dropout_at_inference (bool):
|
||||||
|
enable / disable the use of dropout in the Prenet at the inference time. Defaults to False.
|
||||||
|
stopnet (bool):
|
||||||
|
enable /disable the Stopnet that predicts the end of the decoder sequence. Defaults to True.
|
||||||
|
stopnet_pos_weight (float):
|
||||||
|
Weight that is applied to over-weight positive instances in the Stopnet loss. Use larger values with
|
||||||
|
datasets with longer sentences. Defaults to 10.
|
||||||
|
separate_stopnet (bool):
|
||||||
|
Use a distinct Stopnet which is trained separately from the rest of the model. Defaults to True.
|
||||||
|
attention_type (str):
|
||||||
|
attention type. Check ```TTS.tts.layers.attentions.init_attn```. Defaults to 'original'.
|
||||||
|
attention_heads (int):
|
||||||
|
Number of attention heads for GMM attention. Defaults to 5.
|
||||||
|
windowing (bool):
|
||||||
|
It is especially useful at inference to keep attention alignment diagonal. Defaults to False.
|
||||||
|
use_forward_attn (bool):
|
||||||
|
It is only valid if ```attn_type``` is ```original```. Defaults to False.
|
||||||
|
forward_attn_mask (bool):
|
||||||
|
enable/disable extra masking over forward attention. It is useful at inference to prevent
|
||||||
|
possible attention failures. Defaults to False.
|
||||||
|
transition_agent (bool):
|
||||||
|
enable/disable transition agent in forward attention. Defaults to False.
|
||||||
|
location_attn (bool):
|
||||||
|
enable/disable location sensitive attention as in the original Tacotron2 paper.
|
||||||
|
It is only valid if ```attn_type``` is ```original```. Defaults to True.
|
||||||
|
bidirectional_decoder (bool):
|
||||||
|
enable/disable bidirectional decoding. Defaults to False.
|
||||||
|
double_decoder_consistency (bool):
|
||||||
|
enable/disable double decoder consistency. Defaults to False.
|
||||||
|
ddc_r (int):
|
||||||
|
reduction rate used by the coarse decoder when `double_decoder_consistency` is in use. Set this
|
||||||
|
as a multiple of the `r` value. Defaults to 6.
|
||||||
|
use_speaker_embedding (bool):
|
||||||
|
enable / disable using speaker embeddings for multi-speaker models. If set True, the model is
|
||||||
|
in the multi-speaker mode. Defaults to False.
|
||||||
|
use_external_speaker_embedding_file (bool):
|
||||||
|
enable /disable using external speaker embeddings in place of the learned embeddings. Defaults to False.
|
||||||
|
external_speaker_embedding_file (str):
|
||||||
|
Path to the file including pre-computed speaker embeddings. Defaults to None.
|
||||||
|
noam_schedule (bool):
|
||||||
|
enable / disable the use of Noam LR scheduler. Defaults to False.
|
||||||
|
warmup_steps (int):
|
||||||
|
Number of warm-up steps for the Noam scheduler. Defaults to 4000.
|
||||||
|
lr (float):
|
||||||
|
Initial learning rate. Defaults to `1e-4`.
|
||||||
|
wd (float):
|
||||||
|
Weight decay coefficient. Defaults to `1e-6`.
|
||||||
|
grad_clip (float):
|
||||||
|
Gradient clipping threshold. Defaults to `5`.
|
||||||
|
seq_len_norm (bool):
|
||||||
|
enable / disable the sequence length normalization in the loss functions. If set True, loss of a sample
|
||||||
|
is divided by the sequence length. Defaults to False.
|
||||||
|
loss_masking (bool):
|
||||||
|
enable / disable masking the paddings of the samples in loss computation. Defaults to True.
|
||||||
|
decoder_loss_alpha (float):
|
||||||
|
Weight for the decoder loss of the Tacotron model. If set less than or equal to zero, it disables the
|
||||||
|
corresponding loss function. Defaults to 0.25
|
||||||
|
postnet_loss_alpha (float):
|
||||||
|
Weight for the postnet loss of the Tacotron model. If set less than or equal to zero, it disables the
|
||||||
|
corresponding loss function. Defaults to 0.25
|
||||||
|
postnet_diff_spec_alpha (float):
|
||||||
|
Weight for the postnet differential loss of the Tacotron model. If set less than or equal to zero, it disables the
|
||||||
|
corresponding loss function. Defaults to 0.25
|
||||||
|
decoder_diff_spec_alpha (float):
|
||||||
|
Weight for the decoder differential loss of the Tacotron model. If set less than or equal to zero, it disables the
|
||||||
|
corresponding loss function. Defaults to 0.25
|
||||||
|
decoder_ssim_alpha (float):
|
||||||
|
Weight for the decoder SSIM loss of the Tacotron model. If set less than or equal to zero, it disables the
|
||||||
|
corresponding loss function. Defaults to 0.25
|
||||||
|
postnet_ssim_alpha (float):
|
||||||
|
Weight for the postnet SSIM loss of the Tacotron model. If set less than or equal to zero, it disables the
|
||||||
|
corresponding loss function. Defaults to 0.25
|
||||||
|
ga_alpha (float):
|
||||||
|
Weight for the guided attention loss. If set less than or equal to zero, it disables the corresponding loss
|
||||||
|
function. Defaults to 5.
|
||||||
|
"""
|
||||||
|
|
||||||
model: str = "tacotron"
|
model: str = "tacotron"
|
||||||
use_gst: bool = False
|
use_gst: bool = False
|
||||||
|
|
|
@ -52,19 +52,19 @@ def load_meta_data(datasets, eval_split=True):
|
||||||
print(f" | > Found {len(meta_data_train)} files in {Path(root_path).resolve()}")
|
print(f" | > Found {len(meta_data_train)} files in {Path(root_path).resolve()}")
|
||||||
# load evaluation split if set
|
# load evaluation split if set
|
||||||
if eval_split:
|
if eval_split:
|
||||||
if meta_file_val is None:
|
if meta_file_val:
|
||||||
meta_data_eval, meta_data_train = split_dataset(meta_data_train)
|
|
||||||
else:
|
|
||||||
meta_data_eval = preprocessor(root_path, meta_file_val)
|
meta_data_eval = preprocessor(root_path, meta_file_val)
|
||||||
|
else:
|
||||||
|
meta_data_eval, meta_data_train = split_dataset(meta_data_train)
|
||||||
meta_data_eval_all += meta_data_eval
|
meta_data_eval_all += meta_data_eval
|
||||||
meta_data_train_all += meta_data_train
|
meta_data_train_all += meta_data_train
|
||||||
# load attention masks for duration predictor training
|
# load attention masks for duration predictor training
|
||||||
if dataset.meta_file_attn_mask is not None:
|
if dataset.meta_file_attn_mask:
|
||||||
meta_data = dict(load_attention_mask_meta_data(dataset["meta_file_attn_mask"]))
|
meta_data = dict(load_attention_mask_meta_data(dataset["meta_file_attn_mask"]))
|
||||||
for idx, ins in enumerate(meta_data_train_all):
|
for idx, ins in enumerate(meta_data_train_all):
|
||||||
attn_file = meta_data[ins[1]].strip()
|
attn_file = meta_data[ins[1]].strip()
|
||||||
meta_data_train_all[idx].append(attn_file)
|
meta_data_train_all[idx].append(attn_file)
|
||||||
if meta_data_eval_all is not None:
|
if meta_data_eval_all:
|
||||||
for idx, ins in enumerate(meta_data_eval_all):
|
for idx, ins in enumerate(meta_data_eval_all):
|
||||||
attn_file = meta_data[ins[1]].strip()
|
attn_file = meta_data[ins[1]].strip()
|
||||||
meta_data_eval_all[idx].append(attn_file)
|
meta_data_eval_all[idx].append(attn_file)
|
||||||
|
|
|
@ -5,7 +5,62 @@ from .shared_configs import BaseGANVocoderConfig
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class FullbandMelganConfig(BaseGANVocoderConfig):
|
class FullbandMelganConfig(BaseGANVocoderConfig):
|
||||||
"""Defines parameters for FullbandMelGAN vocoder."""
|
"""Defines parameters for FullBand MelGAN vocoder.
|
||||||
|
|
||||||
|
Example:
|
||||||
|
|
||||||
|
>>> from TTS.vocoder.configs import FullbandMelganConfig
|
||||||
|
>>> config = FullbandMelganConfig()
|
||||||
|
|
||||||
|
Args:
|
||||||
|
model (str):
|
||||||
|
Model name used for selecting the right model at initialization. Defaults to `melgan`.
|
||||||
|
discriminator_model (str): One of the discriminators from `TTS.vocoder.models.*_discriminator`. Defaults to
|
||||||
|
`melgan_multiscale_discriminator`.
|
||||||
|
discriminator_model_params (dict): The discriminator model parameters. Defaults to
|
||||||
|
`{"base_channels": 16, "max_channels": 1024, "downsample_factors": [4, 4, 4, 4]}`
|
||||||
|
generator_model (str): One of the generators from `TTS.vocoder.models.*`. Every other non-GAN vocoder model is
|
||||||
|
considered as a generator too. Defaults to `melgan_generator`.
|
||||||
|
batch_size (int):
|
||||||
|
Batch size used at training. Larger values use more memory. Defaults to 16.
|
||||||
|
seq_len (int):
|
||||||
|
Audio segment length used at training. Larger values use more memory. Defaults to 8192.
|
||||||
|
pad_short (int):
|
||||||
|
Additional padding applied to the audio samples shorter than `seq_len`. Defaults to 0.
|
||||||
|
use_noise_augment (bool):
|
||||||
|
enable / disable random noise added to the input waveform. The noise is added after computing the
|
||||||
|
features. Defaults to True.
|
||||||
|
use_cache (bool):
|
||||||
|
enable / disable in memory caching of the computed features. It can cause OOM error if the system RAM is
|
||||||
|
not large enough. Defaults to True.
|
||||||
|
use_stft_loss (bool):
|
||||||
|
enable / disable use of STFT loss originally used by ParallelWaveGAN model. Defaults to True.
|
||||||
|
use_subband_stft (bool):
|
||||||
|
enable / disable use of subband loss computation originally used by MultiBandMelgan model. Defaults to True.
|
||||||
|
use_mse_gan_loss (bool):
|
||||||
|
enable / disable using Mean Squared Error GAN loss. Defaults to True.
|
||||||
|
use_hinge_gan_loss (bool):
|
||||||
|
enable / disable using Hinge GAN loss. You should choose either Hinge or MSE loss for training GAN models.
|
||||||
|
Defaults to False.
|
||||||
|
use_feat_match_loss (bool):
|
||||||
|
enable / disable using Feature Matching loss originally used by MelGAN model. Defaults to True.
|
||||||
|
use_l1_spec_loss (bool):
|
||||||
|
enable / disable using L1 spectrogram loss originally used by HifiGAN model. Defaults to False.
|
||||||
|
stft_loss_params (dict): STFT loss parameters. Default to
|
||||||
|
`{"n_ffts": [1024, 2048, 512], "hop_lengths": [120, 240, 50], "win_lengths": [600, 1200, 240]}`
|
||||||
|
stft_loss_weight (float): STFT loss weight that multiplies the computed loss before summing up the total
|
||||||
|
model loss. Defaults to 0.5.
|
||||||
|
subband_stft_loss_weight (float):
|
||||||
|
Subband STFT loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0.
|
||||||
|
mse_G_loss_weight (float):
|
||||||
|
MSE generator loss weight that multiplies the computed loss before summing up the total loss. Defaults to 2.5.
|
||||||
|
hinge_G_loss_weight (float):
|
||||||
|
Hinge generator loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0.
|
||||||
|
feat_match_loss_weight (float):
|
||||||
|
Feature matching loss weight that multiplies the computed loss before summing up the total loss. Defaults to 108.
|
||||||
|
l1_spec_loss_weight (float):
|
||||||
|
L1 spectrogram loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0.
|
||||||
|
"""
|
||||||
|
|
||||||
model: str = "melgan"
|
model: str = "melgan"
|
||||||
|
|
||||||
|
@ -48,4 +103,4 @@ class FullbandMelganConfig(BaseGANVocoderConfig):
|
||||||
mse_G_loss_weight: float = 2.5
|
mse_G_loss_weight: float = 2.5
|
||||||
hinge_G_loss_weight: float = 0
|
hinge_G_loss_weight: float = 0
|
||||||
feat_match_loss_weight: float = 108
|
feat_match_loss_weight: float = 108
|
||||||
l1_spec_loss_weight: float = 0
|
l1_spec_loss_weight: float = 0.0
|
||||||
|
|
|
@ -1,11 +1,94 @@
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
|
|
||||||
from .shared_configs import BaseGANVocoderConfig
|
from TTS.vocoder.configs.shared_configs import BaseGANVocoderConfig
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class HifiganConfig(BaseGANVocoderConfig):
|
class HifiganConfig(BaseGANVocoderConfig):
|
||||||
"""Defines parameters for HifiGAN vocoder."""
|
"""Defines parameters for HifiGAN vocoder.
|
||||||
|
|
||||||
|
Example:
|
||||||
|
|
||||||
|
>>> from TTS.vocoder.configs import HifiganConfig
|
||||||
|
>>> config = HifiganConfig()
|
||||||
|
|
||||||
|
Args:
|
||||||
|
model (str):
|
||||||
|
Model name used for selecting the right model at initialization. Defaults to `hifigan`.
|
||||||
|
discriminator_model (str): One of the discriminators from `TTS.vocoder.models.*_discriminator`. Defaults to
|
||||||
|
`hifigan_discriminator`.
|
||||||
|
generator_model (str): One of the generators from `TTS.vocoder.models.*`. Every other non-GAN vocoder model is
|
||||||
|
considered as a generator too. Defaults to `hifigan_generator`.
|
||||||
|
generator_model_params (dict): Parameters of the generator model. Defaults to
|
||||||
|
`
|
||||||
|
{
|
||||||
|
"use_mel": True,
|
||||||
|
"sample_rate": 22050,
|
||||||
|
"n_fft": 1024,
|
||||||
|
"hop_length": 256,
|
||||||
|
"win_length": 1024,
|
||||||
|
"n_mels": 80,
|
||||||
|
"mel_fmin": 0.0,
|
||||||
|
"mel_fmax": None,
|
||||||
|
}
|
||||||
|
`
|
||||||
|
batch_size (int):
|
||||||
|
Batch size used at training. Larger values use more memory. Defaults to 16.
|
||||||
|
seq_len (int):
|
||||||
|
Audio segment length used at training. Larger values use more memory. Defaults to 8192.
|
||||||
|
pad_short (int):
|
||||||
|
Additional padding applied to the audio samples shorter than `seq_len`. Defaults to 0.
|
||||||
|
use_noise_augment (bool):
|
||||||
|
enable / disable random noise added to the input waveform. The noise is added after computing the
|
||||||
|
features. Defaults to True.
|
||||||
|
use_cache (bool):
|
||||||
|
enable / disable in memory caching of the computed features. It can cause OOM error if the system RAM is
|
||||||
|
not large enough. Defaults to True.
|
||||||
|
use_stft_loss (bool):
|
||||||
|
enable / disable use of STFT loss originally used by ParallelWaveGAN model. Defaults to True.
|
||||||
|
use_subband_stft (bool):
|
||||||
|
enable / disable use of subband loss computation originally used by MultiBandMelgan model. Defaults to True.
|
||||||
|
use_mse_gan_loss (bool):
|
||||||
|
enable / disable using Mean Squared Error GAN loss. Defaults to True.
|
||||||
|
use_hinge_gan_loss (bool):
|
||||||
|
enable / disable using Hinge GAN loss. You should choose either Hinge or MSE loss for training GAN models.
|
||||||
|
Defaults to False.
|
||||||
|
use_feat_match_loss (bool):
|
||||||
|
enable / disable using Feature Matching loss originally used by MelGAN model. Defaults to True.
|
||||||
|
use_l1_spec_loss (bool):
|
||||||
|
enable / disable using L1 spectrogram loss originally used by HifiGAN model. Defaults to False.
|
||||||
|
stft_loss_params (dict):
|
||||||
|
STFT loss parameters. Default to
|
||||||
|
`{
|
||||||
|
"n_ffts": [1024, 2048, 512],
|
||||||
|
"hop_lengths": [120, 240, 50],
|
||||||
|
"win_lengths": [600, 1200, 240]
|
||||||
|
}`
|
||||||
|
l1_spec_loss_params (dict):
|
||||||
|
L1 spectrogram loss parameters. Default to
|
||||||
|
`{
|
||||||
|
"use_mel": True,
|
||||||
|
"sample_rate": 22050,
|
||||||
|
"n_fft": 1024,
|
||||||
|
"hop_length": 256,
|
||||||
|
"win_length": 1024,
|
||||||
|
"n_mels": 80,
|
||||||
|
"mel_fmin": 0.0,
|
||||||
|
"mel_fmax": None,
|
||||||
|
}`
|
||||||
|
stft_loss_weight (float): STFT loss weight that multiplies the computed loss before summing up the total
|
||||||
|
model loss. Defaults to 0.5.
|
||||||
|
subband_stft_loss_weight (float):
|
||||||
|
Subband STFT loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0.
|
||||||
|
mse_G_loss_weight (float):
|
||||||
|
MSE generator loss weight that multiplies the computed loss before summing up the total loss. Defaults to 2.5.
|
||||||
|
hinge_G_loss_weight (float):
|
||||||
|
Hinge generator loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0.
|
||||||
|
feat_match_loss_weight (float):
|
||||||
|
Feature matching loss weight that multiplies the computed loss before summing up the total loss. Defaults to 108.
|
||||||
|
l1_spec_loss_weight (float):
|
||||||
|
L1 spectrogram loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0.
|
||||||
|
"""
|
||||||
|
|
||||||
model: str = "hifigan"
|
model: str = "hifigan"
|
||||||
# model specific params
|
# model specific params
|
||||||
|
|
|
@ -1,11 +1,66 @@
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
|
|
||||||
from .shared_configs import BaseGANVocoderConfig
|
from TTS.vocoder.configs.shared_configs import BaseGANVocoderConfig
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class MelganConfig(BaseGANVocoderConfig):
|
class MelganConfig(BaseGANVocoderConfig):
|
||||||
"""Defines parameters for MelGAN vocoder."""
|
"""Defines parameters for MelGAN vocoder.
|
||||||
|
|
||||||
|
Example:
|
||||||
|
|
||||||
|
>>> from TTS.vocoder.configs import MelganConfig
|
||||||
|
>>> config = MelganConfig()
|
||||||
|
|
||||||
|
Args:
|
||||||
|
model (str):
|
||||||
|
Model name used for selecting the right model at initialization. Defaults to `melgan`.
|
||||||
|
discriminator_model (str): One of the discriminators from `TTS.vocoder.models.*_discriminator`. Defaults to
|
||||||
|
`melgan_multiscale_discriminator`.
|
||||||
|
discriminator_model_params (dict): The discriminator model parameters. Defaults to
|
||||||
|
`{"base_channels": 16, "max_channels": 1024, "downsample_factors": [4, 4, 4, 4]}`
|
||||||
|
generator_model (str): One of the generators from `TTS.vocoder.models.*`. Every other non-GAN vocoder model is
|
||||||
|
considered as a generator too. Defaults to `melgan_generator`.
|
||||||
|
batch_size (int):
|
||||||
|
Batch size used at training. Larger values use more memory. Defaults to 16.
|
||||||
|
seq_len (int):
|
||||||
|
Audio segment length used at training. Larger values use more memory. Defaults to 8192.
|
||||||
|
pad_short (int):
|
||||||
|
Additional padding applied to the audio samples shorter than `seq_len`. Defaults to 0.
|
||||||
|
use_noise_augment (bool):
|
||||||
|
enable / disable random noise added to the input waveform. The noise is added after computing the
|
||||||
|
features. Defaults to True.
|
||||||
|
use_cache (bool):
|
||||||
|
enable / disable in memory caching of the computed features. It can cause OOM error if the system RAM is
|
||||||
|
not large enough. Defaults to True.
|
||||||
|
use_stft_loss (bool):
|
||||||
|
enable / disable use of STFT loss originally used by ParallelWaveGAN model. Defaults to True.
|
||||||
|
use_subband_stft (bool):
|
||||||
|
enable / disable use of subband loss computation originally used by MultiBandMelgan model. Defaults to True.
|
||||||
|
use_mse_gan_loss (bool):
|
||||||
|
enable / disable using Mean Squared Error GAN loss. Defaults to True.
|
||||||
|
use_hinge_gan_loss (bool):
|
||||||
|
enable / disable using Hinge GAN loss. You should choose either Hinge or MSE loss for training GAN models.
|
||||||
|
Defaults to False.
|
||||||
|
use_feat_match_loss (bool):
|
||||||
|
enable / disable using Feature Matching loss originally used by MelGAN model. Defaults to True.
|
||||||
|
use_l1_spec_loss (bool):
|
||||||
|
enable / disable using L1 spectrogram loss originally used by HifiGAN model. Defaults to False.
|
||||||
|
stft_loss_params (dict): STFT loss parameters. Default to
|
||||||
|
`{"n_ffts": [1024, 2048, 512], "hop_lengths": [120, 240, 50], "win_lengths": [600, 1200, 240]}`
|
||||||
|
stft_loss_weight (float): STFT loss weight that multiplies the computed loss before summing up the total
|
||||||
|
model loss. Defaults to 0.5.
|
||||||
|
subband_stft_loss_weight (float):
|
||||||
|
Subband STFT loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0.
|
||||||
|
mse_G_loss_weight (float):
|
||||||
|
MSE generator loss weight that multiplies the computed loss before summing up the total loss. Defaults to 2.5.
|
||||||
|
hinge_G_loss_weight (float):
|
||||||
|
Hinge generator loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0.
|
||||||
|
feat_match_loss_weight (float):
|
||||||
|
Feature matching loss weight that multiplies the computed loss before summing up the total loss. Defaults to 108.
|
||||||
|
l1_spec_loss_weight (float):
|
||||||
|
L1 spectrogram loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0.
|
||||||
|
"""
|
||||||
|
|
||||||
model: str = "melgan"
|
model: str = "melgan"
|
||||||
|
|
||||||
|
|
|
@ -1,11 +1,95 @@
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
|
|
||||||
from .shared_configs import BaseGANVocoderConfig
|
from TTS.vocoder.configs.shared_configs import BaseGANVocoderConfig
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class MultibandMelganConfig(BaseGANVocoderConfig):
|
class MultibandMelganConfig(BaseGANVocoderConfig):
|
||||||
"""Defines parameters for MultiBandMelGAN vocoder."""
|
"""Defines parameters for MultiBandMelGAN vocoder.
|
||||||
|
|
||||||
|
Example:
|
||||||
|
|
||||||
|
>>> from TTS.vocoder.configs import MultibandMelganConfig
|
||||||
|
>>> config = MultibandMelganConfig()
|
||||||
|
|
||||||
|
Args:
|
||||||
|
model (str):
|
||||||
|
Model name used for selecting the right model at initialization. Defaults to `multiband_melgan`.
|
||||||
|
discriminator_model (str): One of the discriminators from `TTS.vocoder.models.*_discriminator`. Defaults to
|
||||||
|
`melgan_multiscale_discriminator`.
|
||||||
|
discriminator_model_params (dict): The discriminator model parameters. Defaults to
|
||||||
|
`{
|
||||||
|
"base_channels": 16,
|
||||||
|
"max_channels": 512,
|
||||||
|
"downsample_factors": [4, 4, 4]
|
||||||
|
}`
|
||||||
|
generator_model (str): One of the generators from `TTS.vocoder.models.*`. Every other non-GAN vocoder model is
|
||||||
|
considered as a generator too. Defaults to `melgan_generator`.
|
||||||
|
generator_model_param (dict):
|
||||||
|
The generator model parameters. Defaults to `{"upsample_factors": [8, 4, 2], "num_res_blocks": 4}`.
|
||||||
|
use_pqmf (bool):
|
||||||
|
enable / disable PQMF modulation for multi-band training. Defaults to True.
|
||||||
|
lr_gen (float):
|
||||||
|
Initial learning rate for the generator model. Defaults to 0.0001.
|
||||||
|
lr_disc (float):
|
||||||
|
Initial learning rate for the discriminator model. Defaults to 0.0001.
|
||||||
|
optimizer (torch.optim.Optimizer):
|
||||||
|
Optimizer used for the training. Defaults to `AdamW`.
|
||||||
|
optimizer_params (dict):
|
||||||
|
Optimizer kwargs. Defaults to `{"betas": [0.8, 0.99], "weight_decay": 0.0}`
|
||||||
|
lr_scheduler_gen (torch.optim.Scheduler):
|
||||||
|
Learning rate scheduler for the generator. Defaults to `MultiStepLR`.
|
||||||
|
lr_scheduler_gen_params (dict):
|
||||||
|
Parameters for the generator learning rate scheduler. Defaults to
|
||||||
|
`{"gamma": 0.5, "milestones": [100000, 200000, 300000, 400000, 500000, 600000]}`.
|
||||||
|
lr_scheduler_disc (torch.optim.Scheduler):
|
||||||
|
Learning rate scheduler for the discriminator. Defaults to `MultiStepLR`.
|
||||||
|
lr_scheduler_disc_params (dict):
|
||||||
|
Parameters for the discriminator learning rate scheduler. Defaults to
|
||||||
|
`{"gamma": 0.5, "milestones": [100000, 200000, 300000, 400000, 500000, 600000]}`.
|
||||||
|
batch_size (int):
|
||||||
|
Batch size used at training. Larger values use more memory. Defaults to 16.
|
||||||
|
seq_len (int):
|
||||||
|
Audio segment length used at training. Larger values use more memory. Defaults to 8192.
|
||||||
|
pad_short (int):
|
||||||
|
Additional padding applied to the audio samples shorter than `seq_len`. Defaults to 0.
|
||||||
|
use_noise_augment (bool):
|
||||||
|
enable / disable random noise added to the input waveform. The noise is added after computing the
|
||||||
|
features. Defaults to True.
|
||||||
|
use_cache (bool):
|
||||||
|
enable / disable in memory caching of the computed features. It can cause OOM error if the system RAM is
|
||||||
|
not large enough. Defaults to True.
|
||||||
|
steps_to_start_discriminator (int):
|
||||||
|
Number of steps required to start training the discriminator. Defaults to 0.
|
||||||
|
use_stft_loss (bool):
|
||||||
|
enable / disable use of STFT loss originally used by ParallelWaveGAN model. Defaults to True.
|
||||||
|
use_subband_stft (bool):
|
||||||
|
enable / disable use of subband loss computation originally used by MultiBandMelgan model. Defaults to True.
|
||||||
|
use_mse_gan_loss (bool):
|
||||||
|
enable / disable using Mean Squared Error GAN loss. Defaults to True.
|
||||||
|
use_hinge_gan_loss (bool):
|
||||||
|
enable / disable using Hinge GAN loss. You should choose either Hinge or MSE loss for training GAN models.
|
||||||
|
Defaults to False.
|
||||||
|
use_feat_match_loss (bool):
|
||||||
|
enable / disable using Feature Matching loss originally used by MelGAN model. Defaults to True.
|
||||||
|
use_l1_spec_loss (bool):
|
||||||
|
enable / disable using L1 spectrogram loss originally used by HifiGAN model. Defaults to False.
|
||||||
|
stft_loss_params (dict): STFT loss parameters. Default to
|
||||||
|
`{"n_ffts": [1024, 2048, 512], "hop_lengths": [120, 240, 50], "win_lengths": [600, 1200, 240]}`
|
||||||
|
stft_loss_weight (float): STFT loss weight that multiplies the computed loss before summing up the total
|
||||||
|
model loss. Defaults to 0.5.
|
||||||
|
subband_stft_loss_weight (float):
|
||||||
|
Subband STFT loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0.
|
||||||
|
mse_G_loss_weight (float):
|
||||||
|
MSE generator loss weight that multiplies the computed loss before summing up the total loss. Defaults to 2.5.
|
||||||
|
hinge_G_loss_weight (float):
|
||||||
|
Hinge generator loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0.
|
||||||
|
feat_match_loss_weight (float):
|
||||||
|
Feature matching loss weight that multiplies the computed loss before summing up the total loss. Defaults to 108.
|
||||||
|
l1_spec_loss_weight (float):
|
||||||
|
L1 spectrogram loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0.
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
model: str = "multiband_melgan"
|
model: str = "multiband_melgan"
|
||||||
|
|
||||||
|
@ -59,7 +143,3 @@ class MultibandMelganConfig(BaseGANVocoderConfig):
|
||||||
hinge_G_loss_weight: float = 0
|
hinge_G_loss_weight: float = 0
|
||||||
feat_match_loss_weight: float = 108
|
feat_match_loss_weight: float = 108
|
||||||
l1_spec_loss_weight: float = 0
|
l1_spec_loss_weight: float = 0
|
||||||
|
|
||||||
# optimizer parameters
|
|
||||||
lr: float = 1e-4
|
|
||||||
wd: float = 1e-6
|
|
||||||
|
|
|
@ -5,7 +5,77 @@ from .shared_configs import BaseGANVocoderConfig
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class ParallelWaveganConfig(BaseGANVocoderConfig):
|
class ParallelWaveganConfig(BaseGANVocoderConfig):
|
||||||
"""Defines parameters for ParallelWavegan vocoder."""
|
"""Defines parameters for ParallelWavegan vocoder.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
model (str):
|
||||||
|
Model name used for selecting the right configuration at initialization. Defaults to `parallel_wavegan`.
|
||||||
|
discriminator_model (str): One of the discriminators from `TTS.vocoder.models.*_discriminator`. Defaults to
|
||||||
|
`parallel_wavegan_discriminator`.
|
||||||
|
discriminator_model_params (dict): The discriminator model kwargs. Defaults to
|
||||||
|
`{"num_layers": 10}`
|
||||||
|
generator_model (str): One of the generators from `TTS.vocoder.models.*`. Every other non-GAN vocoder model is
|
||||||
|
considered as a generator too. Defaults to `parallel_wavegan_generator`.
|
||||||
|
generator_model_param (dict):
|
||||||
|
The generator model kwargs. Defaults to `{"upsample_factors": [4, 4, 4, 4], "stacks": 3, "num_res_blocks": 30}`.
|
||||||
|
batch_size (int):
|
||||||
|
Batch size used at training. Larger values use more memory. Defaults to 16.
|
||||||
|
seq_len (int):
|
||||||
|
Audio segment length used at training. Larger values use more memory. Defaults to 8192.
|
||||||
|
pad_short (int):
|
||||||
|
Additional padding applied to the audio samples shorter than `seq_len`. Defaults to 0.
|
||||||
|
use_noise_augment (bool):
|
||||||
|
enable / disable random noise added to the input waveform. The noise is added after computing the
|
||||||
|
features. Defaults to True.
|
||||||
|
use_cache (bool):
|
||||||
|
enable / disable in memory caching of the computed features. It can cause OOM error if the system RAM is
|
||||||
|
not large enough. Defaults to True.
|
||||||
|
steps_to_start_discriminator (int):
|
||||||
|
Number of steps required to start training the discriminator. Defaults to 0.
|
||||||
|
use_stft_loss (bool):
|
||||||
|
enable / disable use of STFT loss originally used by ParallelWaveGAN model. Defaults to True.
|
||||||
|
use_subband_stft (bool):
|
||||||
|
enable / disable use of subband loss computation originally used by MultiBandMelgan model. Defaults to True.
|
||||||
|
use_mse_gan_loss (bool):
|
||||||
|
enable / disable using Mean Squared Error GAN loss. Defaults to True.
|
||||||
|
use_hinge_gan_loss (bool):
|
||||||
|
enable / disable using Hinge GAN loss. You should choose either Hinge or MSE loss for training GAN models.
|
||||||
|
Defaults to False.
|
||||||
|
use_feat_match_loss (bool):
|
||||||
|
enable / disable using Feature Matching loss originally used by MelGAN model. Defaults to True.
|
||||||
|
use_l1_spec_loss (bool):
|
||||||
|
enable / disable using L1 spectrogram loss originally used by HifiGAN model. Defaults to False.
|
||||||
|
stft_loss_params (dict): STFT loss parameters. Default to
|
||||||
|
`{"n_ffts": [1024, 2048, 512], "hop_lengths": [120, 240, 50], "win_lengths": [600, 1200, 240]}`
|
||||||
|
stft_loss_weight (float): STFT loss weight that multiplies the computed loss before summing up the total
|
||||||
|
model loss. Defaults to 0.5.
|
||||||
|
subband_stft_loss_weight (float):
|
||||||
|
Subband STFT loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0.
|
||||||
|
mse_G_loss_weight (float):
|
||||||
|
MSE generator loss weight that multiplies the computed loss before summing up the total loss. Defaults to 2.5.
|
||||||
|
hinge_G_loss_weight (float):
|
||||||
|
Hinge generator loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0.
|
||||||
|
feat_match_loss_weight (float):
|
||||||
|
Feature matching loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0.
|
||||||
|
l1_spec_loss_weight (float):
|
||||||
|
L1 spectrogram loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0.
|
||||||
|
lr_gen (float):
|
||||||
|
Generator model initial learning rate. Defaults to 0.0002.
|
||||||
|
lr_disc (float):
|
||||||
|
Discriminator model initial learning rate. Defaults to 0.0002.
|
||||||
|
optimizer (torch.optim.Optimizer):
|
||||||
|
Optimizer used for the training. Defaults to `AdamW`.
|
||||||
|
optimizer_params (dict):
|
||||||
|
Optimizer kwargs. Defaults to `{"betas": [0.8, 0.99], "weight_decay": 0.0}`
|
||||||
|
lr_scheduler_gen (torch.optim.Scheduler):
|
||||||
|
Learning rate scheduler for the generator. Defaults to `ExponentialLR`.
|
||||||
|
lr_scheduler_gen_params (dict):
|
||||||
|
Parameters for the generator learning rate scheduler. Defaults to `{"gamma": 0.999, "last_epoch": -1}`.
|
||||||
|
lr_scheduler_disc (torch.optim.Scheduler):
|
||||||
|
Learning rate scheduler for the discriminator. Defaults to `ExponentialLR`.
|
||||||
|
lr_scheduler_disc_params (dict):
|
||||||
|
Parameters for the discriminator learning rate scheduler. Defaults to `{"gamma": 0.999, "last_epoch": -1}`.
|
||||||
|
"""
|
||||||
|
|
||||||
model: str = "parallel_wavegan"
|
model: str = "parallel_wavegan"
|
||||||
|
|
||||||
|
|
|
@ -7,7 +7,34 @@ from TTS.config import BaseAudioConfig, BaseTrainingConfig
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class BaseVocoderConfig(BaseTrainingConfig):
|
class BaseVocoderConfig(BaseTrainingConfig):
|
||||||
"""Shared parameters among all the vocoder models."""
|
"""Shared parameters among all the vocoder models.
|
||||||
|
Args:
|
||||||
|
audio (BaseAudioConfig):
|
||||||
|
Audio processor config instance. Defaults to `BaseAudioConfig()`.
|
||||||
|
use_noise_augment (bool):
|
||||||
|
Augment the input audio with random noise. Defaults to False.
|
||||||
|
eval_split_size (int):
|
||||||
|
Number of instances used for evaluation. Defaults to 10.
|
||||||
|
data_path (str):
|
||||||
|
Root path of the training data. All the audio files found recursively from this root path are used for
|
||||||
|
training. Defaults to MISSING.
|
||||||
|
feature_path (str):
|
||||||
|
Root path to the precomputed feature files. Defaults to None.
|
||||||
|
seq_len (int):
|
||||||
|
Length of the waveform segments used for training. Defaults to MISSING.
|
||||||
|
pad_short (int):
|
||||||
|
Extra padding for the waveforms shorter than `seq_len`. Defaults to 0.
|
||||||
|
conv_pad (int):
|
||||||
|
Extra padding for the feature frames against convolution of the edge frames.
|
||||||
|
Defaults to 0.
|
||||||
|
use_cache (bool):
|
||||||
|
enable / disable in memory caching of the computed features. If the RAM is not enough, it may cause OOM.
|
||||||
|
Defaults to False.
|
||||||
|
epochs (int):
|
||||||
|
Number of training epochs. Defaults to 10000.
|
||||||
|
wd (float):
|
||||||
|
Weight decay.
|
||||||
|
"""
|
||||||
|
|
||||||
audio: BaseAudioConfig = field(default_factory=BaseAudioConfig)
|
audio: BaseAudioConfig = field(default_factory=BaseAudioConfig)
|
||||||
# dataloading
|
# dataloading
|
||||||
|
@ -19,7 +46,6 @@ class BaseVocoderConfig(BaseTrainingConfig):
|
||||||
seq_len: int = MISSING # signal length used in training.
|
seq_len: int = MISSING # signal length used in training.
|
||||||
pad_short: int = 0 # additional padding for short wavs
|
pad_short: int = 0 # additional padding for short wavs
|
||||||
conv_pad: int = 0 # additional padding against convolutions applied to spectrograms
|
conv_pad: int = 0 # additional padding against convolutions applied to spectrograms
|
||||||
use_noise_augment: bool = False # add noise to the audio signal for augmentation
|
|
||||||
use_cache: bool = False # use in memory cache to keep the computed features. This might cause OOM.
|
use_cache: bool = False # use in memory cache to keep the computed features. This might cause OOM.
|
||||||
# OPTIMIZER
|
# OPTIMIZER
|
||||||
epochs: int = 10000 # total number of epochs to train.
|
epochs: int = 10000 # total number of epochs to train.
|
||||||
|
@ -28,7 +54,78 @@ class BaseVocoderConfig(BaseTrainingConfig):
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class BaseGANVocoderConfig(BaseVocoderConfig):
|
class BaseGANVocoderConfig(BaseVocoderConfig):
|
||||||
"""Common config interface for all the GAN based vocoder models."""
|
"""Base config class used among all the GAN based vocoders.
|
||||||
|
Args:
|
||||||
|
use_stft_loss (bool):
|
||||||
|
enable / disable the use of STFT loss. Defaults to True.
|
||||||
|
use_subband_stft_loss (bool):
|
||||||
|
enable / disable the use of Subband STFT loss. Defaults to True.
|
||||||
|
use_mse_gan_loss (bool):
|
||||||
|
enable / disable the use of Mean Squared Error based GAN loss. Defaults to True.
|
||||||
|
use_hinge_gan_loss (bool):
|
||||||
|
enable / disable the use of Hinge GAN loss. Defaults to True.
|
||||||
|
use_feat_match_loss (bool):
|
||||||
|
enable / disable feature matching loss. Defaults to True.
|
||||||
|
use_l1_spec_loss (bool):
|
||||||
|
enable / disable L1 spectrogram loss. Defaults to True.
|
||||||
|
stft_loss_weight (float):
|
||||||
|
Loss weight that multiplies the computed loss value. Defaults to 0.
|
||||||
|
subband_stft_loss_weight (float):
|
||||||
|
Loss weight that multiplies the computed loss value. Defaults to 0.
|
||||||
|
mse_G_loss_weight (float):
|
||||||
|
Loss weight that multiplies the computed loss value. Defaults to 1.
|
||||||
|
hinge_G_loss_weight (float):
|
||||||
|
Loss weight that multiplies the computed loss value. Defaults to 0.
|
||||||
|
feat_match_loss_weight (float):
|
||||||
|
Loss weight that multiplies the computed loss value. Defaults to 100.
|
||||||
|
l1_spec_loss_weight (float):
|
||||||
|
Loss weight that multiplies the computed loss value. Defaults to 45.
|
||||||
|
stft_loss_params (dict):
|
||||||
|
Parameters for the STFT loss. Defaults to `{"n_ffts": [1024, 2048, 512], "hop_lengths": [120, 240, 50], "win_lengths": [600, 1200, 240]}`.
|
||||||
|
l1_spec_loss_params (dict):
|
||||||
|
Parameters for the L1 spectrogram loss. Defaults to
|
||||||
|
`{
|
||||||
|
"use_mel": True,
|
||||||
|
"sample_rate": 22050,
|
||||||
|
"n_fft": 1024,
|
||||||
|
"hop_length": 256,
|
||||||
|
"win_length": 1024,
|
||||||
|
"n_mels": 80,
|
||||||
|
"mel_fmin": 0.0,
|
||||||
|
"mel_fmax": None,
|
||||||
|
}`
|
||||||
|
target_loss (str):
|
||||||
|
Target loss name that defines the quality of the model. Defaults to `avg_G_loss`.
|
||||||
|
gen_clip_grad (float):
|
||||||
|
Gradient clipping threshold for the generator model. Any value less than 0 disables clipping.
|
||||||
|
Defaults to -1.
|
||||||
|
disc_clip_grad (float):
|
||||||
|
Gradient clipping threshold for the discriminator model. Any value less than 0 disables clipping.
|
||||||
|
Defaults to -1.
|
||||||
|
lr_gen (float):
|
||||||
|
Generator model initial learning rate. Defaults to 0.0002.
|
||||||
|
lr_disc (float):
|
||||||
|
Discriminator model initial learning rate. Defaults to 0.0002.
|
||||||
|
optimizer (torch.optim.Optimizer):
|
||||||
|
Optimizer used for the training. Defaults to `AdamW`.
|
||||||
|
optimizer_params (dict):
|
||||||
|
Optimizer kwargs. Defaults to `{"betas": [0.8, 0.99], "weight_decay": 0.0}`
|
||||||
|
lr_scheduler_gen (torch.optim.Scheduler):
|
||||||
|
Learning rate scheduler for the generator. Defaults to `ExponentialLR`.
|
||||||
|
lr_scheduler_gen_params (dict):
|
||||||
|
Parameters for the generator learning rate scheduler. Defaults to `{"gamma": 0.999, "last_epoch": -1}`.
|
||||||
|
lr_scheduler_disc (torch.optim.Scheduler):
|
||||||
|
Learning rate scheduler for the discriminator. Defaults to `ExponentialLR`.
|
||||||
|
lr_scheduler_disc_params (dict):
|
||||||
|
Parameters for the discriminator learning rate scheduler. Defaults to `{"gamma": 0.999, "last_epoch": -1}`.
|
||||||
|
use_pqmf (bool):
|
||||||
|
enable / disable PQMF for subband approximation at training. Defaults to False.
|
||||||
|
steps_to_start_discriminator (int):
|
||||||
|
Number of steps required to start training the discriminator. Defaults to 0.
|
||||||
|
diff_samples_for_G_and_D (bool):
|
||||||
|
enable / disable use of different training samples for the generator and the discriminator iterations.
|
||||||
|
Enabling it results in slower iterations but faster convergence in some cases. Defaults to False.
|
||||||
|
"""
|
||||||
|
|
||||||
# LOSS PARAMETERS
|
# LOSS PARAMETERS
|
||||||
use_stft_loss: bool = True
|
use_stft_loss: bool = True
|
||||||
|
@ -43,7 +140,7 @@ class BaseGANVocoderConfig(BaseVocoderConfig):
|
||||||
subband_stft_loss_weight: float = 0
|
subband_stft_loss_weight: float = 0
|
||||||
mse_G_loss_weight: float = 1
|
mse_G_loss_weight: float = 1
|
||||||
hinge_G_loss_weight: float = 0
|
hinge_G_loss_weight: float = 0
|
||||||
feat_match_loss_weight: float = 10
|
feat_match_loss_weight: float = 100
|
||||||
l1_spec_loss_weight: float = 45
|
l1_spec_loss_weight: float = 45
|
||||||
|
|
||||||
stft_loss_params: dict = field(
|
stft_loss_params: dict = field(
|
||||||
|
|
|
@ -1,12 +1,71 @@
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
|
|
||||||
from .shared_configs import BaseVocoderConfig
|
from TTS.vocoder.configs.shared_configs import BaseVocoderConfig
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class WavegradConfig(BaseVocoderConfig):
|
class WavegradConfig(BaseVocoderConfig):
|
||||||
"""Defines parameters for Wavernn vocoder."""
|
"""Defines parameters for WaveGrad vocoder.
|
||||||
|
Example:
|
||||||
|
|
||||||
|
>>> from TTS.vocoder.configs import WavegradConfig
|
||||||
|
>>> config = WavegradConfig()
|
||||||
|
|
||||||
|
Args:
|
||||||
|
model (str):
|
||||||
|
Model name used for selecting the right model at initialization. Defaults to `wavegrad`.
|
||||||
|
generator_model (str): One of the generators from `TTS.vocoder.models.*`. Every other non-GAN vocoder model is
|
||||||
|
considered as a generator too. Defaults to `wavegrad`.
|
||||||
|
model_params (dict):
|
||||||
|
WaveGrad kwargs. Defaults to
|
||||||
|
`
|
||||||
|
{
|
||||||
|
"use_weight_norm": True,
|
||||||
|
"y_conv_channels": 32,
|
||||||
|
"x_conv_channels": 768,
|
||||||
|
"ublock_out_channels": [512, 512, 256, 128, 128],
|
||||||
|
"dblock_out_channels": [128, 128, 256, 512],
|
||||||
|
"upsample_factors": [4, 4, 4, 2, 2],
|
||||||
|
"upsample_dilations": [[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 4, 8], [1, 2, 4, 8], [1, 2, 4, 8]],
|
||||||
|
}
|
||||||
|
`
|
||||||
|
target_loss (str):
|
||||||
|
Target loss name that defines the quality of the model. Defaults to `avg_wavegrad_loss`.
|
||||||
|
epochs (int):
|
||||||
|
Number of epochs to train the model. Defaults to 10000.
|
||||||
|
batch_size (int):
|
||||||
|
Batch size used at training. Larger values use more memory. Defaults to 96.
|
||||||
|
seq_len (int):
|
||||||
|
Audio segment length used at training. Larger values use more memory. Defaults to 6144.
|
||||||
|
use_cache (bool):
|
||||||
|
enable / disable in memory caching of the computed features. It can cause OOM error if the system RAM is
|
||||||
|
not large enough. Defaults to True.
|
||||||
|
mixed_precision (bool):
|
||||||
|
enable / disable mixed precision training. Default is True.
|
||||||
|
eval_split_size (int):
|
||||||
|
Number of samples used for evaluation. Defaults to 50.
|
||||||
|
train_noise_schedule (dict):
|
||||||
|
Training noise schedule. Defaults to
|
||||||
|
`{"min_val": 1e-6, "max_val": 1e-2, "num_steps": 1000}`
|
||||||
|
test_noise_schedule (dict):
|
||||||
|
Inference noise schedule. For a better performance, you may need to use `bin/tune_wavegrad.py` to find a
|
||||||
|
better schedule. Defaults to
|
||||||
|
`
|
||||||
|
{
|
||||||
|
"min_val": 1e-6,
|
||||||
|
"max_val": 1e-2,
|
||||||
|
"num_steps": 50,
|
||||||
|
}
|
||||||
|
`
|
||||||
|
grad_clip (float):
|
||||||
|
Gradient clipping threshold. If <= 0.0, no clipping is applied. Defaults to 1.0
|
||||||
|
lr (float):
|
||||||
|
Initial learning rate. Defaults to 1e-4.
|
||||||
|
lr_scheduler (str):
|
||||||
|
One of the learning rate schedulers from `torch.optim.lr_scheduler.*`. Defaults to `MultiStepLR`.
|
||||||
|
lr_scheduler_params (dict):
|
||||||
|
kwargs for the scheduler. Defaults to `{"gamma": 0.5, "milestones": [100000, 200000, 300000, 400000, 500000, 600000]}`
|
||||||
|
"""
|
||||||
model: str = "wavegrad"
|
model: str = "wavegrad"
|
||||||
# Model specific params
|
# Model specific params
|
||||||
generator_model: str = "wavegrad"
|
generator_model: str = "wavegrad"
|
||||||
|
@ -28,7 +87,6 @@ class WavegradConfig(BaseVocoderConfig):
|
||||||
batch_size: int = 96
|
batch_size: int = 96
|
||||||
seq_len: int = 6144
|
seq_len: int = 6144
|
||||||
use_cache: bool = True
|
use_cache: bool = True
|
||||||
steps_to_start_discriminator: int = 200000
|
|
||||||
mixed_precision: bool = True
|
mixed_precision: bool = True
|
||||||
eval_split_size: int = 50
|
eval_split_size: int = 50
|
||||||
|
|
||||||
|
|
|
@ -1,11 +1,77 @@
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
|
|
||||||
from .shared_configs import BaseVocoderConfig
|
from TTS.vocoder.configs.shared_configs import BaseVocoderConfig
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class WavernnConfig(BaseVocoderConfig):
|
class WavernnConfig(BaseVocoderConfig):
|
||||||
"""Defines parameters for Wavernn vocoder."""
|
"""Defines parameters for Wavernn vocoder.
|
||||||
|
Example:
|
||||||
|
|
||||||
|
>>> from TTS.vocoder.configs import WavernnConfig
|
||||||
|
>>> config = WavernnConfig()
|
||||||
|
|
||||||
|
Args:
|
||||||
|
model (str):
|
||||||
|
Model name used for selecting the right model at initialization. Defaults to `wavernn`.
|
||||||
|
mode (str):
|
||||||
|
Output mode of the WaveRNN vocoder. `mold` for Mixture of Logistic Distribution, `gauss` for a single
|
||||||
|
Gaussian Distribution and `bits` for quantized bits as the model's output.
|
||||||
|
mulaw (bool):
|
||||||
|
enable / disable the use of Mulaw quantization for training. Only applicable if `mode == 'bits'`. Defaults
|
||||||
|
to `True`.
|
||||||
|
generator_model (str):
|
||||||
|
One of the generators from `TTS.vocoder.models.*`. Every other non-GAN vocoder model is
|
||||||
|
considered as a generator too. Defaults to `WaveRNN`.
|
||||||
|
wavernn_model_params (dict):
|
||||||
|
kwargs for the WaveRNN model. Defaults to
|
||||||
|
`{
|
||||||
|
"rnn_dims": 512,
|
||||||
|
"fc_dims": 512,
|
||||||
|
"compute_dims": 128,
|
||||||
|
"res_out_dims": 128,
|
||||||
|
"num_res_blocks": 10,
|
||||||
|
"use_aux_net": True,
|
||||||
|
"use_upsample_net": True,
|
||||||
|
"upsample_factors": [4, 8, 8]
|
||||||
|
}`
|
||||||
|
batched (bool):
|
||||||
|
enable / disable the batched inference. It speeds up the inference by splitting the input into segments and
|
||||||
|
processing the segments in a batch. Then it merges the outputs with a certain overlap and smoothing. If
|
||||||
|
you set it False, without CUDA, it is too slow to be practical. Defaults to True.
|
||||||
|
target_samples (int):
|
||||||
|
Size of the segments in batched mode. Defaults to 11000.
|
||||||
|
overlap_samples (int):
|
||||||
|
Size of the overlap between consecutive segments. Defaults to 550.
|
||||||
|
batch_size (int):
|
||||||
|
Batch size used at training. Larger values use more memory. Defaults to 256.
|
||||||
|
seq_len (int):
|
||||||
|
Audio segment length used at training. Larger values use more memory. Defaults to 1280.
|
||||||
|
padding (int):
|
||||||
|
Padding applied to the input feature frames against the convolution layers of the feature network.
|
||||||
|
Defaults to 2.
|
||||||
|
use_noise_augment (bool):
|
||||||
|
enable / disable random noise added to the input waveform. The noise is added after computing the
|
||||||
|
features. Defaults to False.
|
||||||
|
use_cache (bool):
|
||||||
|
enable / disable in memory caching of the computed features. It can cause OOM error if the system RAM is
|
||||||
|
not large enough. Defaults to True.
|
||||||
|
mixed_precision (bool):
|
||||||
|
enable / disable mixed precision training. Default is True.
|
||||||
|
eval_split_size (int):
|
||||||
|
Number of samples used for evaluation. Defaults to 50.
|
||||||
|
test_every_epochs (int):
|
||||||
|
Number of epochs waited to run the next evaluation. Since inference takes some time, it is better to
|
||||||
|
wait some number of epochs not to waste training time. Defaults to 10.
|
||||||
|
grad_clip (float):
|
||||||
|
Gradient clipping threshold. If <= 0.0, no clipping is applied. Defaults to 4.0
|
||||||
|
lr (float):
|
||||||
|
Initial learning rate. Defaults to 1e-4.
|
||||||
|
lr_scheduler (str):
|
||||||
|
One of the learning rate schedulers from `torch.optim.lr_scheduler.*`. Defaults to `MultiStepLR`.
|
||||||
|
lr_scheduler_params (dict):
|
||||||
|
kwargs for the scheduler. Defaults to `{"gamma": 0.5, "milestones": [200000, 400000, 600000]}`
|
||||||
|
"""
|
||||||
|
|
||||||
model: str = "wavernn"
|
model: str = "wavernn"
|
||||||
|
|
||||||
|
@ -38,7 +104,6 @@ class WavernnConfig(BaseVocoderConfig):
|
||||||
padding: int = 2
|
padding: int = 2
|
||||||
use_noise_augment: bool = False
|
use_noise_augment: bool = False
|
||||||
use_cache: bool = True
|
use_cache: bool = True
|
||||||
steps_to_start_discriminator: int = 200000
|
|
||||||
mixed_precision: bool = True
|
mixed_precision: bool = True
|
||||||
eval_split_size: int = 50
|
eval_split_size: int = 50
|
||||||
test_every_epochs: int = 10 # number of epochs to wait until the next test run (synthesizing a full audio clip).
|
test_every_epochs: int = 10 # number of epochs to wait until the next test run (synthesizing a full audio clip).
|
||||||
|
|
Loading…
Reference in New Issue