From 70d968b169c17d3245c5e9fc7449c201fd13f637 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Fri, 18 Jun 2021 13:29:35 +0200
Subject: [PATCH] Update vocoder model configs

---
 TTS/vocoder/configs/fullband_melgan_config.py |  4 +--
 .../configs/multiband_melgan_config.py        |  2 +-
 .../configs/parallel_wavegan_config.py        |  2 +-
 TTS/vocoder/configs/shared_configs.py         | 28 +++++++++---------
 TTS/vocoder/configs/wavegrad_config.py        | 29 +++----------------
 TTS/vocoder/configs/wavernn_config.py         | 29 +++++--------------
 6 files changed, 29 insertions(+), 65 deletions(-)

diff --git a/TTS/vocoder/configs/fullband_melgan_config.py b/TTS/vocoder/configs/fullband_melgan_config.py
index 53444214..2ab83aac 100644
--- a/TTS/vocoder/configs/fullband_melgan_config.py
+++ b/TTS/vocoder/configs/fullband_melgan_config.py
@@ -14,7 +14,7 @@ class FullbandMelganConfig(BaseGANVocoderConfig):
     Args:
         model (str):
-            Model name used for selecting the right model at initialization. Defaults to `melgan`.
+            Model name used for selecting the right model at initialization. Defaults to `fullband_melgan`.
         discriminator_model (str):
             One of the discriminators from `TTS.vocoder.models.*_discriminator`. Defaults to
                 `melgan_multiscale_discriminator`.
         discriminator_model_params (dict): The discriminator model parameters. Defaults to
@@ -62,7 +62,7 @@ class FullbandMelganConfig(BaseGANVocoderConfig):
             L1 spectrogram loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0.
     """
 
-    model: str = "melgan"
+    model: str = "fullband_melgan"
 
     # Model specific params
     discriminator_model: str = "melgan_multiscale_discriminator"
diff --git a/TTS/vocoder/configs/multiband_melgan_config.py b/TTS/vocoder/configs/multiband_melgan_config.py
index 81fd7904..76311353 100644
--- a/TTS/vocoder/configs/multiband_melgan_config.py
+++ b/TTS/vocoder/configs/multiband_melgan_config.py
@@ -14,7 +14,7 @@ class MultibandMelganConfig(BaseGANVocoderConfig):
     Args:
         model (str):
-            Model name used for selecting the right model at initialization. Defaults to `melgan`.
+            Model name used for selecting the right model at initialization. Defaults to `multiband_melgan`.
         discriminator_model (str):
             One of the discriminators from `TTS.vocoder.models.*_discriminator`. Defaults to
                 `melgan_multiscale_discriminator`.
         discriminator_model_params (dict): The discriminator model parameters. Defaults to
diff --git a/TTS/vocoder/configs/parallel_wavegan_config.py b/TTS/vocoder/configs/parallel_wavegan_config.py
index d132d2e1..a89b1f3f 100644
--- a/TTS/vocoder/configs/parallel_wavegan_config.py
+++ b/TTS/vocoder/configs/parallel_wavegan_config.py
@@ -9,7 +9,7 @@ class ParallelWaveganConfig(BaseGANVocoderConfig):
     Args:
         model (str):
-            Model name used for selecting the right configuration at initialization. Defaults to `parallel_wavegan`.
+            Model name used for selecting the right configuration at initialization. Defaults to `gan`.
         discriminator_model (str):
             One of the discriminators from `TTS.vocoder.models.*_discriminator`. Defaults to
                 `parallel_wavegan_discriminator`.
         discriminator_model_params (dict): The discriminator model kwargs. Defaults to
diff --git a/TTS/vocoder/configs/shared_configs.py b/TTS/vocoder/configs/shared_configs.py
index 664032d2..6891ce6c 100644
--- a/TTS/vocoder/configs/shared_configs.py
+++ b/TTS/vocoder/configs/shared_configs.py
@@ -34,6 +34,10 @@ class BaseVocoderConfig(BaseTrainingConfig):
             Number of training epochs to train. Defaults to 10000.
         wd (float):
             Weight decay.
+        optimizer (torch.optim.Optimizer):
+            Optimizer used for the training. Defaults to `AdamW`.
+        optimizer_params (dict):
+            Optimizer kwargs. Defaults to `{"betas": [0.8, 0.99], "weight_decay": 0.0}`
     """
 
     audio: BaseAudioConfig = field(default_factory=BaseAudioConfig)
@@ -50,6 +54,8 @@ class BaseVocoderConfig(BaseTrainingConfig):
     # OPTIMIZER
     epochs: int = 10000  # total number of epochs to train.
     wd: float = 0.0  # Weight decay weight.
+    optimizer: str = "AdamW"
+    optimizer_params: dict = field(default_factory=lambda: {"betas": [0.8, 0.99], "weight_decay": 0.0})
 
 
 @dataclass
@@ -96,20 +102,13 @@ class BaseGANVocoderConfig(BaseVocoderConfig):
             }`
         target_loss (str):
             Target loss name that defines the quality of the model. Defaults to `avg_G_loss`.
-        gen_clip_grad (float):
-            Gradient clipping threshold for the generator model. Any value less than 0 disables clipping.
-            Defaults to -1.
-        disc_clip_grad (float):
-            Gradient clipping threshold for the discriminator model. Any value less than 0 disables clipping.
-            Defaults to -1.
+        grad_clip (list):
+            A list of gradient clipping thresholds for each optimizer. Any value less than 0 disables clipping.
+            Defaults to [5, 5].
         lr_gen (float):
             Generator model initial learning rate. Defaults to 0.0002.
         lr_disc (float):
             Discriminator model initial learning rate. Defaults to 0.0002.
-        optimizer (torch.optim.Optimizer):
-            Optimizer used for the training. Defaults to `AdamW`.
-        optimizer_params (dict):
-            Optimizer kwargs. Defaults to `{"betas": [0.8, 0.99], "weight_decay": 0.0}`
         lr_scheduler_gen (torch.optim.Scheduler):
             Learning rate scheduler for the generator. Defaults to `ExponentialLR`.
         lr_scheduler_gen_params (dict):
@@ -127,6 +126,8 @@ class BaseGANVocoderConfig(BaseVocoderConfig):
             Enabling it results in slower iterations but faster convergence in some cases. Defaults to False.
     """
 
+    model: str = "gan"
+
     # LOSS PARAMETERS
     use_stft_loss: bool = True
     use_subband_stft_loss: bool = True
@@ -164,15 +165,12 @@ class BaseGANVocoderConfig(BaseVocoderConfig):
         }
     )
 
-    target_loss: str = "avg_G_loss"  # loss value to pick the best model to save after each epoch
+    target_loss: str = "loss_0"  # loss value to pick the best model to save after each epoch
 
     # optimizer
-    gen_clip_grad: float = -1  # Generator gradient clipping threshold. Apply gradient clipping if > 0
-    disc_clip_grad: float = -1  # Discriminator gradient clipping threshold.
+    grad_clip: list = field(default_factory=lambda: [5, 5])
     lr_gen: float = 0.0002  # Initial learning rate.
     lr_disc: float = 0.0002  # Initial learning rate.
-    optimizer: str = "AdamW"
-    optimizer_params: dict = field(default_factory=lambda: {"betas": [0.8, 0.99], "weight_decay": 0.0})
     lr_scheduler_gen: str = "ExponentialLR"  # one of the schedulers from https://pytorch.org/docs/stable/optim.html
     lr_scheduler_gen_params: dict = field(default_factory=lambda: {"gamma": 0.999, "last_epoch": -1})
     lr_scheduler_disc: str = "ExponentialLR"  # one of the schedulers from https://pytorch.org/docs/stable/optim.html
diff --git a/TTS/vocoder/configs/wavegrad_config.py b/TTS/vocoder/configs/wavegrad_config.py
index 271422ee..c39813ae 100644
--- a/TTS/vocoder/configs/wavegrad_config.py
+++ b/TTS/vocoder/configs/wavegrad_config.py
@@ -1,6 +1,7 @@
 from dataclasses import dataclass, field
 
 from TTS.vocoder.configs.shared_configs import BaseVocoderConfig
+from TTS.vocoder.models.wavegrad import WavegradArgs
 
 
 @dataclass
@@ -16,19 +17,7 @@ class WavegradConfig(BaseVocoderConfig):
             Model name used for selecting the right model at initialization. Defaults to `wavegrad`.
         generator_model (str):
             One of the generators from `TTS.vocoder.models.*`. Every other non-GAN vocoder model is considered as a generator too. Defaults to `wavegrad`.
-        model_params (dict):
-            WaveGrad kwargs. Defaults to
-            `
-            {
-                "use_weight_norm": True,
-                "y_conv_channels": 32,
-                "x_conv_channels": 768,
-                "ublock_out_channels": [512, 512, 256, 128, 128],
-                "dblock_out_channels": [128, 128, 256, 512],
-                "upsample_factors": [4, 4, 4, 2, 2],
-                "upsample_dilations": [[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 4, 8], [1, 2, 4, 8], [1, 2, 4, 8]],
-            }
-            `
+        model_params (WavegradArgs): Model parameters. Check `WavegradArgs` for default values.
         target_loss (str):
             Target loss name that defines the quality of the model. Defaults to `avg_wavegrad_loss`.
         epochs (int):
@@ -70,18 +59,8 @@ class WavegradConfig(BaseVocoderConfig):
     model: str = "wavegrad"
 
     # Model specific params
     generator_model: str = "wavegrad"
-    model_params: dict = field(
-        default_factory=lambda: {
-            "use_weight_norm": True,
-            "y_conv_channels": 32,
-            "x_conv_channels": 768,
-            "ublock_out_channels": [512, 512, 256, 128, 128],
-            "dblock_out_channels": [128, 128, 256, 512],
-            "upsample_factors": [4, 4, 4, 2, 2],
-            "upsample_dilations": [[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 4, 8], [1, 2, 4, 8], [1, 2, 4, 8]],
-        }
-    )
-    target_loss: str = "avg_wavegrad_loss"  # loss value to pick the best model to save after each epoch
+    model_params: WavegradArgs = field(default_factory=WavegradArgs)
+    target_loss: str = "loss"  # loss value to pick the best model to save after each epoch
 
     # Training - overrides
     epochs: int = 10000
diff --git a/TTS/vocoder/configs/wavernn_config.py b/TTS/vocoder/configs/wavernn_config.py
index 95a3cfc4..0afa1f43 100644
--- a/TTS/vocoder/configs/wavernn_config.py
+++ b/TTS/vocoder/configs/wavernn_config.py
@@ -1,6 +1,7 @@
 from dataclasses import dataclass, field
 
 from TTS.vocoder.configs.shared_configs import BaseVocoderConfig
+from TTS.vocoder.models.wavernn import WavernnArgs
 
 
 @dataclass
@@ -47,9 +48,7 @@ class WavernnConfig(BaseVocoderConfig):
             Batch size used at training. Larger values use more memory. Defaults to 256.
         seq_len (int):
             Audio segment length used at training. Larger values use more memory. Defaults to 1280.
-        padding (int):
-            Padding applied to the input feature frames against the convolution layers of the feature network.
-            Defaults to 2.
+
         use_noise_augment (bool):
             enable / disable random noise added to the input waveform. The noise is added after computing the features. Defaults to True.
@@ -60,7 +59,7 @@ class WavernnConfig(BaseVocoderConfig):
             enable / disable mixed precision training. Default is True.
         eval_split_size (int):
             Number of samples used for evaluation. Defaults to 50.
-        test_every_epoch (int):
+        num_epochs_before_test (int):
             Number of epochs waited to run the next evaluation. Since inference takes some time, it is better to wait
             some number of epochs not to waste training time. Defaults to 10.
         grad_clip (float):
@@ -76,21 +75,8 @@ class WavernnConfig(BaseVocoderConfig):
     model: str = "wavernn"
 
     # Model specific params
-    mode: str = "mold"  # mold [string], gauss [string], bits [int]
-    mulaw: bool = True  # apply mulaw if mode is bits
-    generator_model: str = "WaveRNN"
-    wavernn_model_params: dict = field(
-        default_factory=lambda: {
-            "rnn_dims": 512,
-            "fc_dims": 512,
-            "compute_dims": 128,
-            "res_out_dims": 128,
-            "num_res_blocks": 10,
-            "use_aux_net": True,
-            "use_upsample_net": True,
-            "upsample_factors": [4, 8, 8],  # this needs to correctly factorise hop_length
-        }
-    )
+    model_params: WavernnArgs = field(default_factory=WavernnArgs)
+    target_loss: str = "loss"
 
     # Inference
     batched: bool = True
@@ -101,12 +87,13 @@ class WavernnConfig(BaseVocoderConfig):
     epochs: int = 10000
     batch_size: int = 256
     seq_len: int = 1280
-    padding: int = 2
     use_noise_augment: bool = False
     use_cache: bool = True
     mixed_precision: bool = True
     eval_split_size: int = 50
-    test_every_epochs: int = 10  # number of epochs to wait until the next test run (synthesizing a full audio clip).
+    num_epochs_before_test: int = (
+        10  # number of epochs to wait until the next test run (synthesizing a full audio clip).
+    )
 
     # optimizer overrides
     grad_clip: float = 4.0
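
For reference, a minimal usage sketch (not part of the patch) of how the updated configs could be instantiated after this change. The field names and defaults follow the diff above; the override values and printed attributes are illustrative assumptions only.

# Sketch assuming a TTS checkout that includes this patch; values are illustrative.
from TTS.vocoder.configs.fullband_melgan_config import FullbandMelganConfig
from TTS.vocoder.configs.wavegrad_config import WavegradConfig
from TTS.vocoder.models.wavegrad import WavegradArgs

# GAN vocoders: optimizer settings now live on BaseVocoderConfig, and the
# per-optimizer `grad_clip` list replaces `gen_clip_grad` / `disc_clip_grad`.
gan_config = FullbandMelganConfig(
    optimizer="AdamW",
    optimizer_params={"betas": [0.8, 0.99], "weight_decay": 0.0},
    grad_clip=[5, 5],  # [generator, discriminator]
    lr_gen=0.0002,
    lr_disc=0.0002,
)
print(gan_config.model)  # "fullband_melgan"

# Non-GAN vocoders: model hyperparameters are now typed dataclasses
# (`WavegradArgs` / `WavernnArgs`) instead of free-form dicts.
wavegrad_config = WavegradConfig(model_params=WavegradArgs())
print(wavegrad_config.target_loss)  # "loss"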