From 0213e1cbf424f1b5ed1299a47c9d24062e1410df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Wed, 12 May 2021 00:56:25 +0200 Subject: [PATCH] update configs for tts models so that field types match the expected values --- TTS/config/shared_configs.py | 4 ++-- TTS/tts/configs/tacotron_config.py | 2 +- tests/inputs/test_tacotron2_config.json | 2 +- tests/inputs/test_tacotron_config.json | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/TTS/config/shared_configs.py b/TTS/config/shared_configs.py index b10cc9bf..153b3279 100644 --- a/TTS/config/shared_configs.py +++ b/TTS/config/shared_configs.py @@ -1,5 +1,5 @@ from dataclasses import asdict, dataclass - +from typing import List, Union from coqpit import MISSING, Coqpit, check_argument @@ -137,7 +137,7 @@ class BaseAudioConfig(Coqpit): class BaseDatasetConfig(Coqpit): name: str = None path: str = None - meta_file_train: str = None + meta_file_train: Union[str, List] = None # TODO: don't take ignored speakers for multi-speaker datasets over this. This is Union for SC-Glow compat. meta_file_val: str = None meta_file_attn_mask: str = None diff --git a/TTS/tts/configs/tacotron_config.py b/TTS/tts/configs/tacotron_config.py index 6f08e89f..5c509927 100644 --- a/TTS/tts/configs/tacotron_config.py +++ b/TTS/tts/configs/tacotron_config.py @@ -14,7 +14,7 @@ class TacotronConfig(BaseTTSConfig): gst_style_input: str = None # model specific params r: int = 2 - gradual_training: List = None + gradual_training: List[List] = None memory_size: int = -1 prenet_type: str = "original" prenet_dropout: bool = True diff --git a/tests/inputs/test_tacotron2_config.json b/tests/inputs/test_tacotron2_config.json index 779f925d..2bf1f840 100644 --- a/tests/inputs/test_tacotron2_config.json +++ b/tests/inputs/test_tacotron2_config.json @@ -64,7 +64,7 @@ "batch_size": 1, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. 
"eval_batch_size":1, "r": 7, // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled. - "gradual_training": [[0, 7, 4]], //set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled. For Tacotron, you might need to reduce the 'batch_size' as you proceeed. + "gradual_training": [[0, 7, 4], [1, 5, 2]], //set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled. For Tacotron, you might need to reduce the 'batch_size' as you proceeed. "loss_masking": true, // enable / disable loss masking against the sequence padding. "ga_alpha": 10.0, // weight for guided attention loss. If > 0, guided attention is enabled. "mixed_precision": false, diff --git a/tests/inputs/test_tacotron_config.json b/tests/inputs/test_tacotron_config.json index a2fdd690..12da4762 100644 --- a/tests/inputs/test_tacotron_config.json +++ b/tests/inputs/test_tacotron_config.json @@ -64,7 +64,7 @@ "batch_size": 1, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. "eval_batch_size":1, "r": 7, // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled. - "gradual_training": [[0, 7, 4]], //set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled. For Tacotron, you might need to reduce the 'batch_size' as you proceeed. + "gradual_training": [[0, 7, 4], [1, 5, 2]], //set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled. For Tacotron, you might need to reduce the 'batch_size' as you proceeed. "loss_masking": true, // enable / disable loss masking against the sequence padding. "ga_alpha": 10.0, // weight for guided attention loss. If > 0, guided attention is enabled. "mixed_precision": false,