Update vocoder model configs

2021-06-18 13:29:35 +02:00 · 2021-06-18 13:29:35 +02:00 · 70d968b169
parent f8a3460818
commit 70d968b169
6 changed files with 29 additions and 65 deletions
--- a/TTS/vocoder/configs/fullband_melgan_config.py
+++ b/TTS/vocoder/configs/fullband_melgan_config.py
@ -14,7 +14,7 @@ class FullbandMelganConfig(BaseGANVocoderConfig):

    Args:
        model (str):
-            Model name used for selecting the right model at initialization. Defaults to `melgan`.
+            Model name used for selecting the right model at initialization. Defaults to `fullband_melgan`.
        discriminator_model (str): One of the discriminators from `TTS.vocoder.models.*_discriminator`. Defaults to
            'melgan_multiscale_discriminator`.
        discriminator_model_params (dict): The discriminator model parameters. Defaults to
@ -62,7 +62,7 @@ class FullbandMelganConfig(BaseGANVocoderConfig):
            L1 spectrogram loss weight that multiplies the computed loss before summing up the total loss. Defaults to 0.
    """

-    model: str = "melgan"
+    model: str = "fullband_melgan"

    # Model specific params
    discriminator_model: str = "melgan_multiscale_discriminator"
--- a/TTS/vocoder/configs/multiband_melgan_config.py
+++ b/TTS/vocoder/configs/multiband_melgan_config.py
@ -14,7 +14,7 @@ class MultibandMelganConfig(BaseGANVocoderConfig):

    Args:
        model (str):
-            Model name used for selecting the right model at initialization. Defaults to `melgan`.
+            Model name used for selecting the right model at initialization. Defaults to `multiband_melgan`.
        discriminator_model (str): One of the discriminators from `TTS.vocoder.models.*_discriminator`. Defaults to
            'melgan_multiscale_discriminator`.
        discriminator_model_params (dict): The discriminator model parameters. Defaults to
--- a/TTS/vocoder/configs/parallel_wavegan_config.py
+++ b/TTS/vocoder/configs/parallel_wavegan_config.py
@ -9,7 +9,7 @@ class ParallelWaveganConfig(BaseGANVocoderConfig):

    Args:
        model (str):
-            Model name used for selecting the right configuration at initialization. Defaults to `parallel_wavegan`.
+            Model name used for selecting the right configuration at initialization. Defaults to `gan`.
        discriminator_model (str): One of the discriminators from `TTS.vocoder.models.*_discriminator`. Defaults to
            'parallel_wavegan_discriminator`.
        discriminator_model_params (dict): The discriminator model kwargs. Defaults to
--- a/TTS/vocoder/configs/shared_configs.py
+++ b/TTS/vocoder/configs/shared_configs.py
@ -34,6 +34,10 @@ class BaseVocoderConfig(BaseTrainingConfig):
            Number of training epochs to. Defaults to 10000.
        wd (float):
            Weight decay.
+        optimizer (str):
+            Name of the `torch.optim` optimizer used for the training. Defaults to `AdamW`.
+        optimizer_params (dict):
+            Optimizer kwargs. Defaults to `{"betas": [0.8, 0.99], "weight_decay": 0.0}`
    """

    audio: BaseAudioConfig = field(default_factory=BaseAudioConfig)
@ -50,6 +54,8 @@ class BaseVocoderConfig(BaseTrainingConfig):
    # OPTIMIZER
    epochs: int = 10000  # total number of epochs to train.
    wd: float = 0.0  # Weight decay weight.
+    optimizer: str = "AdamW"
+    optimizer_params: dict = field(default_factory=lambda: {"betas": [0.8, 0.99], "weight_decay": 0.0})


@dataclass
@ -96,20 +102,13 @@ class BaseGANVocoderConfig(BaseVocoderConfig):
            }`
        target_loss (str):
            Target loss name that defines the quality of the model. Defaults to `avg_G_loss`.
-        gen_clip_grad (float):
-            Gradient clipping threshold for the generator model. Any value less than 0 disables clipping.
-            Defaults to -1.
-        disc_clip_grad (float):
-            Gradient clipping threshold for the discriminator model. Any value less than 0 disables clipping.
-            Defaults to -1.
+        grad_clip (list):
+            A list of gradient clipping thresholds for each optimizer. Any value less than 0 disables clipping.
+            Defaults to [5, 5].
        lr_gen (float):
            Generator model initial learning rate. Defaults to 0.0002.
        lr_disc (float):
            Discriminator model initial learning rate. Defaults to 0.0002.
-        optimizer (torch.optim.Optimizer):
-            Optimizer used for the training. Defaults to `AdamW`.
-        optimizer_params (dict):
-            Optimizer kwargs. Defaults to `{"betas": [0.8, 0.99], "weight_decay": 0.0}`
        lr_scheduler_gen (torch.optim.Scheduler):
            Learning rate scheduler for the generator. Defaults to `ExponentialLR`.
        lr_scheduler_gen_params (dict):
@ -127,6 +126,8 @@ class BaseGANVocoderConfig(BaseVocoderConfig):
            Enabling it results in slower iterations but faster convergance in some cases. Defaults to False.
    """

+    model: str = "gan"
+
    # LOSS PARAMETERS
    use_stft_loss: bool = True
    use_subband_stft_loss: bool = True
@ -164,15 +165,12 @@ class BaseGANVocoderConfig(BaseVocoderConfig):
        }
    )

-    target_loss: str = "avg_G_loss"  # loss value to pick the best model to save after each epoch
+    target_loss: str = "loss_0"  # loss value to pick the best model to save after each epoch

    # optimizer
-    gen_clip_grad: float = -1  # Generator gradient clipping threshold. Apply gradient clipping if > 0
-    disc_clip_grad: float = -1  # Discriminator gradient clipping threshold.
+    grad_clip: float = field(default_factory=lambda: [5, 5])
    lr_gen: float = 0.0002  # Initial learning rate.
    lr_disc: float = 0.0002  # Initial learning rate.
-    optimizer: str = "AdamW"
-    optimizer_params: dict = field(default_factory=lambda: {"betas": [0.8, 0.99], "weight_decay": 0.0})
    lr_scheduler_gen: str = "ExponentialLR"  # one of the schedulers from https:#pytorch.org/docs/stable/optim.html
    lr_scheduler_gen_params: dict = field(default_factory=lambda: {"gamma": 0.999, "last_epoch": -1})
    lr_scheduler_disc: str = "ExponentialLR"  # one of the schedulers from https:#pytorch.org/docs/stable/optim.html
--- a/TTS/vocoder/configs/wavegrad_config.py
+++ b/TTS/vocoder/configs/wavegrad_config.py
@ -1,6 +1,7 @@
 from dataclasses import dataclass, field

 from TTS.vocoder.configs.shared_configs import BaseVocoderConfig
+from TTS.vocoder.models.wavegrad import WavegradArgs


@dataclass
@ -16,19 +17,7 @@ class WavegradConfig(BaseVocoderConfig):
            Model name used for selecting the right model at initialization. Defaults to `wavegrad`.
        generator_model (str): One of the generators from TTS.vocoder.models.*`. Every other non-GAN vocoder model is
            considered as a generator too. Defaults to `wavegrad`.
-        model_params (dict):
-            WaveGrad kwargs. Defaults to
-            `
-            {
-                "use_weight_norm": True,
-                "y_conv_channels": 32,
-                "x_conv_channels": 768,
-                "ublock_out_channels": [512, 512, 256, 128, 128],
-                "dblock_out_channels": [128, 128, 256, 512],
-                "upsample_factors": [4, 4, 4, 2, 2],
-                "upsample_dilations": [[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 4, 8], [1, 2, 4, 8], [1, 2, 4, 8]],
-            }
-            `
+        model_params (WavegradArgs): Model parameters. Check `WavegradArgs` for default values.
        target_loss (str):
            Target loss name that defines the quality of the model. Defaults to `avg_wavegrad_loss`.
        epochs (int):
@ -70,18 +59,8 @@ class WavegradConfig(BaseVocoderConfig):
    model: str = "wavegrad"
    # Model specific params
    generator_model: str = "wavegrad"
-    model_params: dict = field(
-        default_factory=lambda: {
-            "use_weight_norm": True,
-            "y_conv_channels": 32,
-            "x_conv_channels": 768,
-            "ublock_out_channels": [512, 512, 256, 128, 128],
-            "dblock_out_channels": [128, 128, 256, 512],
-            "upsample_factors": [4, 4, 4, 2, 2],
-            "upsample_dilations": [[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 4, 8], [1, 2, 4, 8], [1, 2, 4, 8]],
-        }
-    )
-    target_loss: str = "avg_wavegrad_loss"  # loss value to pick the best model to save after each epoch
+    model_params: WavegradArgs = field(default_factory=WavegradArgs)
+    target_loss: str = "loss"  # loss value to pick the best model to save after each epoch

    # Training - overrides
    epochs: int = 10000
--- a/TTS/vocoder/configs/wavernn_config.py
+++ b/TTS/vocoder/configs/wavernn_config.py
@ -1,6 +1,7 @@
 from dataclasses import dataclass, field

 from TTS.vocoder.configs.shared_configs import BaseVocoderConfig
+from TTS.vocoder.models.wavernn import WavernnArgs


@dataclass
@ -47,9 +48,7 @@ class WavernnConfig(BaseVocoderConfig):
            Batch size used at training. Larger values use more memory. Defaults to 256.
        seq_len (int):
            Audio segment length used at training. Larger values use more memory. Defaults to 1280.
-        padding (int):
-            Padding applied to the input feature frames against the convolution layers of the feature network.
-            Defaults to 2.
+
        use_noise_augment (bool):
            enable / disable random noise added to the input waveform. The noise is added after computing the
            features. Defaults to True.
@ -60,7 +59,7 @@ class WavernnConfig(BaseVocoderConfig):
            enable / disable mixed precision training. Default is True.
        eval_split_size (int):
            Number of samples used for evalutaion. Defaults to 50.
-        test_every_epoch (int):
+        num_epochs_before_test (int):
            Number of epochs waited to run the next evalution. Since inference takes some time, it is better to
            wait some number of epochs not ot waste training time. Defaults to 10.
        grad_clip (float):
@ -76,21 +75,8 @@ class WavernnConfig(BaseVocoderConfig):
    model: str = "wavernn"

    # Model specific params
-    mode: str = "mold"  # mold [string], gauss [string], bits [int]
-    mulaw: bool = True  # apply mulaw if mode is bits
-    generator_model: str = "WaveRNN"
-    wavernn_model_params: dict = field(
-        default_factory=lambda: {
-            "rnn_dims": 512,
-            "fc_dims": 512,
-            "compute_dims": 128,
-            "res_out_dims": 128,
-            "num_res_blocks": 10,
-            "use_aux_net": True,
-            "use_upsample_net": True,
-            "upsample_factors": [4, 8, 8],  # this needs to correctly factorise hop_length
-        }
-    )
+    model_params: WavernnArgs = field(default_factory=WavernnArgs)
+    target_loss: str = "loss"

    # Inference
    batched: bool = True
@ -101,12 +87,13 @@ class WavernnConfig(BaseVocoderConfig):
    epochs: int = 10000
    batch_size: int = 256
    seq_len: int = 1280
-    padding: int = 2
    use_noise_augment: bool = False
    use_cache: bool = True
    mixed_precision: bool = True
    eval_split_size: int = 50
-    test_every_epochs: int = 10  # number of epochs to wait until the next test run (synthesizing a full audio clip).
+    num_epochs_before_test: int = (
+        10  # number of epochs to wait until the next test run (synthesizing a full audio clip).
+    )

    # optimizer overrides
    grad_clip: float = 4.0