Fix vits args types

This commit is contained in:
Eren Gölge 2021-08-30 23:24:20 +00:00
parent b910a6ddce
commit 2b7e55f01f
1 changed file with 7 additions and 7 deletions

View File

@ -119,7 +119,7 @@ class VitsArgs(Coqpit):
upsample_kernel_sizes_decoder (List[int]): upsample_kernel_sizes_decoder (List[int]):
Kernel sizes for each upsampling layer of the decoder network. Defaults to `[16, 16, 4, 4]`. Kernel sizes for each upsampling layer of the decoder network. Defaults to `[16, 16, 4, 4]`.
use_sdp (int): use_sdp (bool):
Use Stochastic Duration Predictor. Defaults to True. Use Stochastic Duration Predictor. Defaults to True.
noise_scale (float): noise_scale (float):
@ -128,7 +128,7 @@ class VitsArgs(Coqpit):
inference_noise_scale (float): inference_noise_scale (float):
Noise scale used for the sample noise tensor in inference. Defaults to 0.667. Noise scale used for the sample noise tensor in inference. Defaults to 0.667.
length_scale (int): length_scale (float):
Scale factor for the predicted duration values. Smaller values result in faster speech. Defaults to 1. Scale factor for the predicted duration values. Smaller values result in faster speech. Defaults to 1.
noise_scale_dp (float): noise_scale_dp (float):
@ -176,24 +176,24 @@ class VitsArgs(Coqpit):
num_heads_text_encoder: int = 2 num_heads_text_encoder: int = 2
num_layers_text_encoder: int = 6 num_layers_text_encoder: int = 6
kernel_size_text_encoder: int = 3 kernel_size_text_encoder: int = 3
dropout_p_text_encoder: int = 0.1 dropout_p_text_encoder: float = 0.1
dropout_p_duration_predictor: int = 0.5 dropout_p_duration_predictor: float = 0.5
kernel_size_posterior_encoder: int = 5 kernel_size_posterior_encoder: int = 5
dilation_rate_posterior_encoder: int = 1 dilation_rate_posterior_encoder: int = 1
num_layers_posterior_encoder: int = 16 num_layers_posterior_encoder: int = 16
kernel_size_flow: int = 5 kernel_size_flow: int = 5
dilation_rate_flow: int = 1 dilation_rate_flow: int = 1
num_layers_flow: int = 4 num_layers_flow: int = 4
resblock_type_decoder: int = "1" resblock_type_decoder: str = "1"
resblock_kernel_sizes_decoder: List[int] = field(default_factory=lambda: [3, 7, 11]) resblock_kernel_sizes_decoder: List[int] = field(default_factory=lambda: [3, 7, 11])
resblock_dilation_sizes_decoder: List[List[int]] = field(default_factory=lambda: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]) resblock_dilation_sizes_decoder: List[List[int]] = field(default_factory=lambda: [[1, 3, 5], [1, 3, 5], [1, 3, 5]])
upsample_rates_decoder: List[int] = field(default_factory=lambda: [8, 8, 2, 2]) upsample_rates_decoder: List[int] = field(default_factory=lambda: [8, 8, 2, 2])
upsample_initial_channel_decoder: int = 512 upsample_initial_channel_decoder: int = 512
upsample_kernel_sizes_decoder: List[int] = field(default_factory=lambda: [16, 16, 4, 4]) upsample_kernel_sizes_decoder: List[int] = field(default_factory=lambda: [16, 16, 4, 4])
use_sdp: int = True use_sdp: bool = True
noise_scale: float = 1.0 noise_scale: float = 1.0
inference_noise_scale: float = 0.667 inference_noise_scale: float = 0.667
length_scale: int = 1 length_scale: float = 1
noise_scale_dp: float = 1.0 noise_scale_dp: float = 1.0
inference_noise_scale_dp: float = 1.0 inference_noise_scale_dp: float = 1.0
max_inference_len: int = None max_inference_len: int = None