Add `sort_by_audio_len` option

This commit is contained in:
Eren Gölge 2021-09-03 13:20:26 +00:00
parent 59d52a4cd8
commit 6e9d4062f2
2 changed files with 11 additions and 4 deletions

View File

@ -141,11 +141,14 @@ class BaseTTSConfig(BaseTrainingConfig):
loss_masking (bool):
enable / disable masking loss values against padded segments of samples in a batch.
sort_by_audio_len (bool):
If true, the dataloader sorts the data by audio length; otherwise it sorts by the input text length. Defaults to `True`.
min_seq_len (int):
Minimum input sequence length to be used at training.
Minimum sequence length to be used at training.
max_seq_len (int):
Maximum input sequence length to be used at training. Larger values result in more VRAM usage.
Maximum sequence length to be used at training. Larger values result in more VRAM usage.
compute_f0 (bool):
(Not in use yet).
@ -198,6 +201,7 @@ class BaseTTSConfig(BaseTrainingConfig):
batch_group_size: int = 0
loss_masking: bool = None
# dataloading
sort_by_audio_len: bool = True
min_seq_len: int = 1
max_seq_len: int = float("inf")
compute_f0: bool = False

View File

@ -67,11 +67,14 @@ class VitsConfig(BaseTTSConfig):
compute_linear_spec (bool):
If true, the linear spectrogram is computed and returned alongside the mel output. Do not change. Defaults to `True`.
sort_by_audio_len (bool):
If true, the dataloader sorts the data by audio length; otherwise it sorts by the input text length. Defaults to `True`.
min_seq_len (int):
Minimum text length to be considered for training. Defaults to `13`.
Minimum sequence length to be considered for training. Defaults to `0`.
max_seq_len (int):
Maximum text length to be considered for training. Defaults to `500`.
Maximum sequence length to be considered for training. Defaults to `500000`.
r (int):
Number of spectrogram frames to be generated at a time. Do not change. Defaults to `1`.