Update vocoder model configs

This commit is contained in:
Eren Gölge 2021-06-18 13:29:35 +02:00
parent f8a3460818
commit 70d968b169
6 changed files with 29 additions and 65 deletions

View File

@@ -14,7 +14,7 @@ class FullbandMelganConfig(BaseGANVocoderConfig):
Args:
model (str):
Model name used for selecting the right model at initialization. Defaults to `melgan`.
Model name used for selecting the right model at initialization. Defaults to `fullband_melgan`.
discriminator_model (str): One of the discriminators from `TTS.vocoder.models.*_discriminator`. Defaults to
`melgan_multiscale_discriminator`.
discriminator_model_params (dict): The discriminator model parameters. Defaults to
@@ -62,7 +62,7 @@ class FullbandMelganConfig(BaseGANVocoderConfig):
L1 spectrogram loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0.
"""
model: str = "melgan"
model: str = "fullband_melgan"
# Model specific params
discriminator_model: str = "melgan_multiscale_discriminator"
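A minimal sketch of what the rename means for config consumers (the import path is assumed from the repo layout); the `model` field now names the concrete architecture, so config-based model selection can tell the MelGAN variants apart:

from TTS.vocoder.configs import FullbandMelganConfig

config = FullbandMelganConfig()
assert config.model == "fullband_melgan"  # was "melgan" before this commit
assert config.discriminator_model == "melgan_multiscale_discriminator"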

View File

@@ -14,7 +14,7 @@ class MultibandMelganConfig(BaseGANVocoderConfig):
Args:
model (str):
Model name used for selecting the right model at initialization. Defaults to `melgan`.
Model name used for selecting the right model at initialization. Defaults to `multiband_melgan`.
discriminator_model (str): One of the discriminators from `TTS.vocoder.models.*_discriminator`. Defaults to
`melgan_multiscale_discriminator`.
discriminator_model_params (dict): The discriminator model parameters. Defaults to

View File

@@ -9,7 +9,7 @@ class ParallelWaveganConfig(BaseGANVocoderConfig):
Args:
model (str):
Model name used for selecting the right configuration at initialization. Defaults to `parallel_wavegan`.
Model name used for selecting the right configuration at initialization. Defaults to `gan`.
discriminator_model (str): One of the discriminators from `TTS.vocoder.models.*_discriminator`. Defaults to
`parallel_wavegan_discriminator`.
discriminator_model_params (dict): The discriminator model kwargs. Defaults to
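For Parallel WaveGAN the `model` value moves in the opposite direction: it now names the generic `gan` trainer model, while the generator/discriminator fields keep selecting the Parallel WaveGAN networks. A hedged sketch (import path assumed from the repo layout, default values taken from the docstring above):

from TTS.vocoder.configs import ParallelWaveganConfig

config = ParallelWaveganConfig()
assert config.model == "gan"  # was "parallel_wavegan" before this commit
assert config.discriminator_model == "parallel_wavegan_discriminator"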

View File

@@ -34,6 +34,10 @@ class BaseVocoderConfig(BaseTrainingConfig):
Number of training epochs. Defaults to 10000.
wd (float):
Weight decay.
optimizer (torch.optim.Optimizer):
Optimizer used for the training. Defaults to `AdamW`.
optimizer_params (dict):
Optimizer kwargs. Defaults to `{"betas": [0.8, 0.99], "weight_decay": 0.0}`
"""
audio: BaseAudioConfig = field(default_factory=BaseAudioConfig)
@@ -50,6 +54,8 @@ class BaseVocoderConfig(BaseTrainingConfig):
# OPTIMIZER
epochs: int = 10000 # total number of epochs to train.
wd: float = 0.0 # Weight decay coefficient.
optimizer: str = "AdamW"
optimizer_params: dict = field(default_factory=lambda: {"betas": [0.8, 0.99], "weight_decay": 0.0})
@dataclass
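The new `optimizer` / `optimizer_params` fields store the optimizer by name plus its kwargs. A rough sketch of how a trainer could resolve them; `build_optimizer` is a hypothetical helper, only the two fields and their defaults come from the diff above:

import torch

def build_optimizer(model: torch.nn.Module, optimizer_name: str, optimizer_params: dict, lr: float):
    # Look the class up by name in torch.optim, e.g. "AdamW" -> torch.optim.AdamW.
    optimizer_class = getattr(torch.optim, optimizer_name)
    return optimizer_class(model.parameters(), lr=lr, **optimizer_params)

# Using the defaults added in this commit; lr is illustrative
# (the GAN config below defines lr_gen / lr_disc = 0.0002).
model = torch.nn.Linear(80, 80)
optimizer = build_optimizer(model, "AdamW", {"betas": [0.8, 0.99], "weight_decay": 0.0}, lr=0.0002)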
@@ -96,20 +102,13 @@ class BaseGANVocoderConfig(BaseVocoderConfig):
}`
target_loss (str):
Target loss name that defines the quality of the model. Defaults to `avg_G_loss`.
gen_clip_grad (float):
Gradient clipping threshold for the generator model. Any value less than 0 disables clipping.
Defaults to -1.
disc_clip_grad (float):
Gradient clipping threshold for the discriminator model. Any value less than 0 disables clipping.
Defaults to -1.
grad_clip (list):
A list of gradient clipping thresholds for each optimizer. Any value less than 0 disables clipping.
Defaults to [5, 5].
lr_gen (float):
Generator model initial learning rate. Defaults to 0.0002.
lr_disc (float):
Discriminator model initial learning rate. Defaults to 0.0002.
optimizer (torch.optim.Optimizer):
Optimizer used for the training. Defaults to `AdamW`.
optimizer_params (dict):
Optimizer kwargs. Defaults to `{"betas": [0.8, 0.99], "weight_decay": 0.0}`
lr_scheduler_gen (torch.optim.Scheduler):
Learning rate scheduler for the generator. Defaults to `ExponentialLR`.
lr_scheduler_gen_params (dict):
@@ -127,6 +126,8 @@ class BaseGANVocoderConfig(BaseVocoderConfig):
Enabling it results in slower iterations but faster convergence in some cases. Defaults to False.
"""
model: str = "gan"
# LOSS PARAMETERS
use_stft_loss: bool = True
use_subband_stft_loss: bool = True
@@ -164,15 +165,12 @@ class BaseGANVocoderConfig(BaseVocoderConfig):
}
)
target_loss: str = "avg_G_loss" # loss value to pick the best model to save after each epoch
target_loss: str = "loss_0" # loss value to pick the best model to save after each epoch
# optimizer
gen_clip_grad: float = -1 # Generator gradient clipping threshold. Apply gradient clipping if > 0
disc_clip_grad: float = -1 # Discriminator gradient clipping threshold.
grad_clip: float = field(default_factory=lambda: [5, 5])
lr_gen: float = 0.0002 # Initial learning rate.
lr_disc: float = 0.0002 # Initial learning rate.
optimizer: str = "AdamW"
optimizer_params: dict = field(default_factory=lambda: {"betas": [0.8, 0.99], "weight_decay": 0.0})
lr_scheduler_gen: str = "ExponentialLR" # one of the schedulers from https://pytorch.org/docs/stable/optim.html
lr_scheduler_gen_params: dict = field(default_factory=lambda: {"gamma": 0.999, "last_epoch": -1})
lr_scheduler_disc: str = "ExponentialLR" # one of the schedulers from https://pytorch.org/docs/stable/optim.html
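The two scalar thresholds are replaced by a single `grad_clip` list with one threshold per optimizer. A hypothetical sketch of how a GAN trainer might apply it; only the `[5, 5]` default and the "values < 0 disable clipping" rule come from the config above:

import torch

generator = torch.nn.Linear(80, 80)
discriminator = torch.nn.Linear(80, 1)

# Dummy forward/backward pass so both modules have gradients to clip.
loss = generator(torch.randn(4, 80)).sum() + discriminator(torch.randn(4, 80)).sum()
loss.backward()

grad_clip = [5, 5]  # [generator_threshold, discriminator_threshold]
for threshold, module in zip(grad_clip, (generator, discriminator)):
    if threshold > 0:  # values < 0 (the old -1 default) disable clipping
        torch.nn.utils.clip_grad_norm_(module.parameters(), threshold)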

View File

@@ -1,6 +1,7 @@
from dataclasses import dataclass, field
from TTS.vocoder.configs.shared_configs import BaseVocoderConfig
from TTS.vocoder.models.wavegrad import WavegradArgs
@dataclass
@@ -16,19 +17,7 @@ class WavegradConfig(BaseVocoderConfig):
Model name used for selecting the right model at initialization. Defaults to `wavegrad`.
generator_model (str): One of the generators from TTS.vocoder.models.*`. Every other non-GAN vocoder model is
considered as a generator too. Defaults to `wavegrad`.
model_params (dict):
WaveGrad kwargs. Defaults to
`
{
"use_weight_norm": True,
"y_conv_channels": 32,
"x_conv_channels": 768,
"ublock_out_channels": [512, 512, 256, 128, 128],
"dblock_out_channels": [128, 128, 256, 512],
"upsample_factors": [4, 4, 4, 2, 2],
"upsample_dilations": [[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 4, 8], [1, 2, 4, 8], [1, 2, 4, 8]],
}
`
model_params (WavegradArgs): Model parameters. Check `WavegradArgs` for default values.
target_loss (str):
Target loss name that defines the quality of the model. Defaults to `avg_wavegrad_loss`.
epochs (int):
@@ -70,18 +59,8 @@ class WavegradConfig(BaseVocoderConfig):
model: str = "wavegrad"
# Model specific params
generator_model: str = "wavegrad"
model_params: dict = field(
default_factory=lambda: {
"use_weight_norm": True,
"y_conv_channels": 32,
"x_conv_channels": 768,
"ublock_out_channels": [512, 512, 256, 128, 128],
"dblock_out_channels": [128, 128, 256, 512],
"upsample_factors": [4, 4, 4, 2, 2],
"upsample_dilations": [[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 4, 8], [1, 2, 4, 8], [1, 2, 4, 8]],
}
)
target_loss: str = "avg_wavegrad_loss" # loss value to pick the best model to save after each epoch
model_params: WavegradArgs = field(default_factory=WavegradArgs)
target_loss: str = "loss" # loss value to pick the best model to save after each epoch
# Training - overrides
epochs: int = 10000
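The untyped `model_params` dict becomes a `WavegradArgs` dataclass. A minimal sketch, with the field name taken from the old dict defaults above and import paths assumed from the repo layout:

from TTS.vocoder.configs import WavegradConfig
from TTS.vocoder.models.wavegrad import WavegradArgs

# Defaults now live on the dataclass instead of an inline dict.
config = WavegradConfig()
print(config.model_params.use_weight_norm)

# Overriding one architecture knob keeps the remaining WavegradArgs defaults intact.
config = WavegradConfig(model_params=WavegradArgs(use_weight_norm=False))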

View File

@@ -1,6 +1,7 @@
from dataclasses import dataclass, field
from TTS.vocoder.configs.shared_configs import BaseVocoderConfig
from TTS.vocoder.models.wavernn import WavernnArgs
@dataclass
@@ -47,9 +48,7 @@ class WavernnConfig(BaseVocoderConfig):
Batch size used at training. Larger values use more memory. Defaults to 256.
seq_len (int):
Audio segment length used at training. Larger values use more memory. Defaults to 1280.
padding (int):
Padding applied to the input feature frames against the convolution layers of the feature network.
Defaults to 2.
use_noise_augment (bool):
enable / disable random noise added to the input waveform. The noise is added after computing the
features. Defaults to True.
@@ -60,7 +59,7 @@ class WavernnConfig(BaseVocoderConfig):
enable / disable mixed precision training. Default is True.
eval_split_size (int):
Number of samples used for evaluation. Defaults to 50.
test_every_epoch (int):
num_epochs_before_test (int):
Number of epochs to wait before running the next evaluation. Since inference takes some time, it is better to
wait some number of epochs so as not to waste training time. Defaults to 10.
grad_clip (float):
@@ -76,21 +75,8 @@ class WavernnConfig(BaseVocoderConfig):
model: str = "wavernn"
# Model specific params
mode: str = "mold" # mold [string], gauss [string], bits [int]
mulaw: bool = True # apply mulaw if mode is bits
generator_model: str = "WaveRNN"
wavernn_model_params: dict = field(
default_factory=lambda: {
"rnn_dims": 512,
"fc_dims": 512,
"compute_dims": 128,
"res_out_dims": 128,
"num_res_blocks": 10,
"use_aux_net": True,
"use_upsample_net": True,
"upsample_factors": [4, 8, 8], # this needs to correctly factorise hop_length
}
)
model_params: WavernnArgs = field(default_factory=WavernnArgs)
target_loss: str = "loss"
# Inference
batched: bool = True
@@ -101,12 +87,13 @@ class WavernnConfig(BaseVocoderConfig):
epochs: int = 10000
batch_size: int = 256
seq_len: int = 1280
padding: int = 2
use_noise_augment: bool = False
use_cache: bool = True
mixed_precision: bool = True
eval_split_size: int = 50
test_every_epochs: int = 10 # number of epochs to wait until the next test run (synthesizing a full audio clip).
num_epochs_before_test: int = (
10 # number of epochs to wait until the next test run (synthesizing a full audio clip).
)
# optimizer overrides
grad_clip: float = 4.0
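WaveRNN gets the same treatment: the model hyperparameters move into `WavernnArgs` and `test_every_epochs` is renamed to `num_epochs_before_test`. A minimal sketch; the `WavernnArgs` field names are taken from the old dict defaults above and assumed to carry over, import paths assumed from the repo layout:

from TTS.vocoder.configs import WavernnConfig
from TTS.vocoder.models.wavernn import WavernnArgs

config = WavernnConfig(
    model_params=WavernnArgs(rnn_dims=256, upsample_factors=[4, 8, 8]),  # upsample_factors must factorise hop_length
    num_epochs_before_test=5,  # renamed from `test_every_epochs`
)
print(config.model_params.rnn_dims, config.num_epochs_before_test)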