mirror of https://github.com/coqui-ai/TTS.git
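Make duration predictor dropout configurable (new `dropout_p_duration_predictor` argument, default 0.1; the value was previously hard-coded to 0.5)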
This commit is contained in:
parent
922142428c
commit
f7a72552f1
|
@ -79,6 +79,9 @@ class VitsArgs(Coqpit):
|
||||||
dropout_p_text_encoder (float):
|
dropout_p_text_encoder (float):
|
||||||
Dropout rate of the text encoder. Defaults to 0.1.
|
Dropout rate of the text encoder. Defaults to 0.1.
|
||||||
|
|
||||||
|
dropout_p_duration_predictor (float):
|
||||||
|
Dropout rate of the duration predictor. Defaults to 0.1.
|
||||||
|
|
||||||
kernel_size_posterior_encoder (int):
|
kernel_size_posterior_encoder (int):
|
||||||
Kernel size of the posterior encoder's WaveNet layers. Defaults to 5.
|
Kernel size of the posterior encoder's WaveNet layers. Defaults to 5.
|
||||||
|
|
||||||
|
@ -174,6 +177,7 @@ class VitsArgs(Coqpit):
|
||||||
num_layers_text_encoder: int = 6
|
num_layers_text_encoder: int = 6
|
||||||
kernel_size_text_encoder: int = 3
|
kernel_size_text_encoder: int = 3
|
||||||
dropout_p_text_encoder: int = 0.1
|
dropout_p_text_encoder: int = 0.1
|
||||||
|
dropout_p_duration_predictor: int = 0.1
|
||||||
kernel_size_posterior_encoder: int = 5
|
kernel_size_posterior_encoder: int = 5
|
||||||
dilation_rate_posterior_encoder: int = 1
|
dilation_rate_posterior_encoder: int = 1
|
||||||
num_layers_posterior_encoder: int = 16
|
num_layers_posterior_encoder: int = 16
|
||||||
|
@ -300,11 +304,11 @@ class Vits(BaseTTS):
|
||||||
|
|
||||||
if args.use_sdp:
|
if args.use_sdp:
|
||||||
self.duration_predictor = StochasticDurationPredictor(
|
self.duration_predictor = StochasticDurationPredictor(
|
||||||
args.hidden_channels, 192, 3, 0.5, 4, cond_channels=self.embedded_speaker_dim
|
args.hidden_channels, 192, 3, args.dropout_p_duration_predictor, 4, cond_channels=self.embedded_speaker_dim
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
self.duration_predictor = DurationPredictor(
|
self.duration_predictor = DurationPredictor(
|
||||||
args.hidden_channels, 256, 3, 0.5, cond_channels=self.embedded_speaker_dim
|
args.hidden_channels, 256, 3, args.dropout_p_duration_predictor, cond_channels=self.embedded_speaker_dim
|
||||||
)
|
)
|
||||||
|
|
||||||
self.waveform_decoder = HifiganGenerator(
|
self.waveform_decoder = HifiganGenerator(
|
||||||
|
|
Loading…
Reference in New Issue