Add `sort_by_audio_len` option

2021-09-03 13:20:26 +00:00 · 2021-09-03 13:20:26 +00:00 · 6e9d4062f2
parent 59d52a4cd8
commit 6e9d4062f2
2 changed files with 11 additions and 4 deletions
--- a/TTS/tts/configs/shared_configs.py
+++ b/TTS/tts/configs/shared_configs.py
@ -141,11 +141,14 @@ class BaseTTSConfig(BaseTrainingConfig):
        loss_masking (bool):
            enable / disable masking loss values against padded segments of samples in a batch.

+        sort_by_audio_len (bool):
+            If true, the dataloader sorts the data by audio length; otherwise it sorts by the input text length. Defaults to `True`.
+
        min_seq_len (int):
-            Minimum input sequence length to be used at training.
+            Minimum sequence length to be used at training.

        max_seq_len (int):
-            Maximum input sequence length to be used at training. Larger values result in more VRAM usage.
+            Maximum sequence length to be used at training. Larger values result in more VRAM usage.

        compute_f0 (int):
            (Not in use yet).
@ -198,6 +201,7 @@ class BaseTTSConfig(BaseTrainingConfig):
    batch_group_size: int = 0
    loss_masking: bool = None
    # dataloading
+    sort_by_audio_len: bool = True
    min_seq_len: int = 1
    max_seq_len: int = float("inf")
    compute_f0: bool = False
--- a/TTS/tts/configs/vits_config.py
+++ b/TTS/tts/configs/vits_config.py
@ -67,11 +67,14 @@ class VitsConfig(BaseTTSConfig):
        compute_linear_spec (bool):
            If true, the linear spectrogram is computed and returned alongside the mel output. Do not change. Defaults to `True`.

+        sort_by_audio_len (bool):
+            If true, the dataloader sorts the data by audio length; otherwise it sorts by the input text length. Defaults to `True`.
+
        min_seq_len (int):
-            Minimum text length to be considered for training. Defaults to `13`.
+            Minimum sequence length to be considered for training. Defaults to `0`.

        max_seq_len (int):
-            Maximum text length to be considered for training. Defaults to `500`.
+            Maximum sequence length to be considered for training. Defaults to `500000`.

        r (int):
            Number of spectrogram frames to be generated at a time. Do not change. Defaults to `1`.