mirror of https://github.com/coqui-ai/TTS.git
Update vocoder model configs
This commit is contained in:
parent
f8a3460818
commit
70d968b169
|
@ -14,7 +14,7 @@ class FullbandMelganConfig(BaseGANVocoderConfig):
|
|||
|
||||
Args:
|
||||
model (str):
|
||||
Model name used for selecting the right model at initialization. Defaults to `melgan`.
|
||||
Model name used for selecting the right model at initialization. Defaults to `fullband_melgan`.
|
||||
discriminator_model (str): One of the discriminators from `TTS.vocoder.models.*_discriminator`. Defaults to
|
||||
`melgan_multiscale_discriminator`.
|
||||
discriminator_model_params (dict): The discriminator model parameters. Defaults to
|
||||
|
@ -62,7 +62,7 @@ class FullbandMelganConfig(BaseGANVocoderConfig):
|
|||
L1 spectrogram loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0.
|
||||
"""
|
||||
|
||||
model: str = "melgan"
|
||||
model: str = "fullband_melgan"
|
||||
|
||||
# Model specific params
|
||||
discriminator_model: str = "melgan_multiscale_discriminator"
|
||||
|
|
|
@ -14,7 +14,7 @@ class MultibandMelganConfig(BaseGANVocoderConfig):
|
|||
|
||||
Args:
|
||||
model (str):
|
||||
Model name used for selecting the right model at initialization. Defaults to `melgan`.
|
||||
Model name used for selecting the right model at initialization. Defaults to `multiband_melgan`.
|
||||
discriminator_model (str): One of the discriminators from `TTS.vocoder.models.*_discriminator`. Defaults to
|
||||
`melgan_multiscale_discriminator`.
|
||||
discriminator_model_params (dict): The discriminator model parameters. Defaults to
|
||||
|
|
|
@ -9,7 +9,7 @@ class ParallelWaveganConfig(BaseGANVocoderConfig):
|
|||
|
||||
Args:
|
||||
model (str):
|
||||
Model name used for selecting the right configuration at initialization. Defaults to `parallel_wavegan`.
|
||||
Model name used for selecting the right configuration at initialization. Defaults to `gan`.
|
||||
discriminator_model (str): One of the discriminators from `TTS.vocoder.models.*_discriminator`. Defaults to
|
||||
`parallel_wavegan_discriminator`.
|
||||
discriminator_model_params (dict): The discriminator model kwargs. Defaults to
|
||||
|
|
|
@ -34,6 +34,10 @@ class BaseVocoderConfig(BaseTrainingConfig):
|
|||
Number of training epochs. Defaults to 10000.
|
||||
wd (float):
|
||||
Weight decay.
|
||||
optimizer (torch.optim.Optimizer):
|
||||
Optimizer used for the training. Defaults to `AdamW`.
|
||||
optimizer_params (dict):
|
||||
Optimizer kwargs. Defaults to `{"betas": [0.8, 0.99], "weight_decay": 0.0}`
|
||||
"""
|
||||
|
||||
audio: BaseAudioConfig = field(default_factory=BaseAudioConfig)
|
||||
|
@ -50,6 +54,8 @@ class BaseVocoderConfig(BaseTrainingConfig):
|
|||
# OPTIMIZER
|
||||
epochs: int = 10000 # total number of epochs to train.
|
||||
wd: float = 0.0 # Weight decay weight.
|
||||
optimizer: str = "AdamW"
|
||||
optimizer_params: dict = field(default_factory=lambda: {"betas": [0.8, 0.99], "weight_decay": 0.0})
|
||||
|
||||
|
||||
@dataclass
|
||||
|
@ -96,20 +102,13 @@ class BaseGANVocoderConfig(BaseVocoderConfig):
|
|||
}`
|
||||
target_loss (str):
|
||||
Target loss name that defines the quality of the model. Defaults to `avg_G_loss`.
|
||||
gen_clip_grad (float):
|
||||
Gradient clipping threshold for the generator model. Any value less than 0 disables clipping.
|
||||
Defaults to -1.
|
||||
disc_clip_grad (float):
|
||||
Gradient clipping threshold for the discriminator model. Any value less than 0 disables clipping.
|
||||
Defaults to -1.
|
||||
grad_clip (list):
|
||||
A list of gradient clipping thresholds for each optimizer. Any value less than 0 disables clipping.
|
||||
Defaults to [5, 5].
|
||||
lr_gen (float):
|
||||
Generator model initial learning rate. Defaults to 0.0002.
|
||||
lr_disc (float):
|
||||
Discriminator model initial learning rate. Defaults to 0.0002.
|
||||
optimizer (torch.optim.Optimizer):
|
||||
Optimizer used for the training. Defaults to `AdamW`.
|
||||
optimizer_params (dict):
|
||||
Optimizer kwargs. Defaults to `{"betas": [0.8, 0.99], "weight_decay": 0.0}`
|
||||
lr_scheduler_gen (torch.optim.Scheduler):
|
||||
Learning rate scheduler for the generator. Defaults to `ExponentialLR`.
|
||||
lr_scheduler_gen_params (dict):
|
||||
|
@ -127,6 +126,8 @@ class BaseGANVocoderConfig(BaseVocoderConfig):
|
|||
Enabling it results in slower iterations but faster convergence in some cases. Defaults to False.
|
||||
"""
|
||||
|
||||
model: str = "gan"
|
||||
|
||||
# LOSS PARAMETERS
|
||||
use_stft_loss: bool = True
|
||||
use_subband_stft_loss: bool = True
|
||||
|
@ -164,15 +165,12 @@ class BaseGANVocoderConfig(BaseVocoderConfig):
|
|||
}
|
||||
)
|
||||
|
||||
target_loss: str = "avg_G_loss" # loss value to pick the best model to save after each epoch
|
||||
target_loss: str = "loss_0" # loss value to pick the best model to save after each epoch
|
||||
|
||||
# optimizer
|
||||
gen_clip_grad: float = -1 # Generator gradient clipping threshold. Apply gradient clipping if > 0
|
||||
disc_clip_grad: float = -1 # Discriminator gradient clipping threshold.
|
||||
grad_clip: float = field(default_factory=lambda: [5, 5])
|
||||
lr_gen: float = 0.0002 # Initial learning rate.
|
||||
lr_disc: float = 0.0002 # Initial learning rate.
|
||||
optimizer: str = "AdamW"
|
||||
optimizer_params: dict = field(default_factory=lambda: {"betas": [0.8, 0.99], "weight_decay": 0.0})
|
||||
lr_scheduler_gen: str = "ExponentialLR" # one of the schedulers from https://pytorch.org/docs/stable/optim.html
|
||||
lr_scheduler_gen_params: dict = field(default_factory=lambda: {"gamma": 0.999, "last_epoch": -1})
|
||||
lr_scheduler_disc: str = "ExponentialLR" # one of the schedulers from https://pytorch.org/docs/stable/optim.html
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
from dataclasses import dataclass, field
|
||||
|
||||
from TTS.vocoder.configs.shared_configs import BaseVocoderConfig
|
||||
from TTS.vocoder.models.wavegrad import WavegradArgs
|
||||
|
||||
|
||||
@dataclass
|
||||
|
@ -16,19 +17,7 @@ class WavegradConfig(BaseVocoderConfig):
|
|||
Model name used for selecting the right model at initialization. Defaults to `wavegrad`.
|
||||
generator_model (str): One of the generators from `TTS.vocoder.models.*`. Every other non-GAN vocoder model is
|
||||
considered as a generator too. Defaults to `wavegrad`.
|
||||
model_params (dict):
|
||||
WaveGrad kwargs. Defaults to
|
||||
`
|
||||
{
|
||||
"use_weight_norm": True,
|
||||
"y_conv_channels": 32,
|
||||
"x_conv_channels": 768,
|
||||
"ublock_out_channels": [512, 512, 256, 128, 128],
|
||||
"dblock_out_channels": [128, 128, 256, 512],
|
||||
"upsample_factors": [4, 4, 4, 2, 2],
|
||||
"upsample_dilations": [[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 4, 8], [1, 2, 4, 8], [1, 2, 4, 8]],
|
||||
}
|
||||
`
|
||||
model_params (WavegradArgs): Model parameters. Check `WavegradArgs` for default values.
|
||||
target_loss (str):
|
||||
Target loss name that defines the quality of the model. Defaults to `avg_wavegrad_loss`.
|
||||
epochs (int):
|
||||
|
@ -70,18 +59,8 @@ class WavegradConfig(BaseVocoderConfig):
|
|||
model: str = "wavegrad"
|
||||
# Model specific params
|
||||
generator_model: str = "wavegrad"
|
||||
model_params: dict = field(
|
||||
default_factory=lambda: {
|
||||
"use_weight_norm": True,
|
||||
"y_conv_channels": 32,
|
||||
"x_conv_channels": 768,
|
||||
"ublock_out_channels": [512, 512, 256, 128, 128],
|
||||
"dblock_out_channels": [128, 128, 256, 512],
|
||||
"upsample_factors": [4, 4, 4, 2, 2],
|
||||
"upsample_dilations": [[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 4, 8], [1, 2, 4, 8], [1, 2, 4, 8]],
|
||||
}
|
||||
)
|
||||
target_loss: str = "avg_wavegrad_loss" # loss value to pick the best model to save after each epoch
|
||||
model_params: WavegradArgs = field(default_factory=WavegradArgs)
|
||||
target_loss: str = "loss" # loss value to pick the best model to save after each epoch
|
||||
|
||||
# Training - overrides
|
||||
epochs: int = 10000
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
from dataclasses import dataclass, field
|
||||
|
||||
from TTS.vocoder.configs.shared_configs import BaseVocoderConfig
|
||||
from TTS.vocoder.models.wavernn import WavernnArgs
|
||||
|
||||
|
||||
@dataclass
|
||||
|
@ -47,9 +48,7 @@ class WavernnConfig(BaseVocoderConfig):
|
|||
Batch size used at training. Larger values use more memory. Defaults to 256.
|
||||
seq_len (int):
|
||||
Audio segment length used at training. Larger values use more memory. Defaults to 1280.
|
||||
padding (int):
|
||||
Padding applied to the input feature frames against the convolution layers of the feature network.
|
||||
Defaults to 2.
|
||||
|
||||
use_noise_augment (bool):
|
||||
enable / disable random noise added to the input waveform. The noise is added after computing the
|
||||
features. Defaults to True.
|
||||
|
@ -60,7 +59,7 @@ class WavernnConfig(BaseVocoderConfig):
|
|||
enable / disable mixed precision training. Default is True.
|
||||
eval_split_size (int):
|
||||
Number of samples used for evaluation. Defaults to 50.
|
||||
test_every_epoch (int):
|
||||
num_epochs_before_test (int):
|
||||
Number of epochs to wait before running the next evaluation. Since inference takes some time, it is better to
|
||||
wait some number of epochs not to waste training time. Defaults to 10.
|
||||
grad_clip (float):
|
||||
|
@ -76,21 +75,8 @@ class WavernnConfig(BaseVocoderConfig):
|
|||
model: str = "wavernn"
|
||||
|
||||
# Model specific params
|
||||
mode: str = "mold" # mold [string], gauss [string], bits [int]
|
||||
mulaw: bool = True # apply mulaw if mode is bits
|
||||
generator_model: str = "WaveRNN"
|
||||
wavernn_model_params: dict = field(
|
||||
default_factory=lambda: {
|
||||
"rnn_dims": 512,
|
||||
"fc_dims": 512,
|
||||
"compute_dims": 128,
|
||||
"res_out_dims": 128,
|
||||
"num_res_blocks": 10,
|
||||
"use_aux_net": True,
|
||||
"use_upsample_net": True,
|
||||
"upsample_factors": [4, 8, 8], # this needs to correctly factorise hop_length
|
||||
}
|
||||
)
|
||||
model_params: WavernnArgs = field(default_factory=WavernnArgs)
|
||||
target_loss: str = "loss"
|
||||
|
||||
# Inference
|
||||
batched: bool = True
|
||||
|
@ -101,12 +87,13 @@ class WavernnConfig(BaseVocoderConfig):
|
|||
epochs: int = 10000
|
||||
batch_size: int = 256
|
||||
seq_len: int = 1280
|
||||
padding: int = 2
|
||||
use_noise_augment: bool = False
|
||||
use_cache: bool = True
|
||||
mixed_precision: bool = True
|
||||
eval_split_size: int = 50
|
||||
test_every_epochs: int = 10 # number of epochs to wait until the next test run (synthesizing a full audio clip).
|
||||
num_epochs_before_test: int = (
|
||||
10 # number of epochs to wait until the next test run (synthesizing a full audio clip).
|
||||
)
|
||||
|
||||
# optimizer overrides
|
||||
grad_clip: float = 4.0
|
||||
|
|
Loading…
Reference in New Issue