mirror of https://github.com/coqui-ai/TTS.git
Add `compute_linear_spec=False` to `BaseTTSConfig`
This commit is contained in:
parent
960a35a121
commit
bd4e29b4dd
|
@ -13,12 +13,16 @@ class GSTConfig(Coqpit):
|
||||||
Args:
|
Args:
|
||||||
gst_style_input_wav (str):
|
gst_style_input_wav (str):
|
||||||
Path to the wav file used to define the style of the output speech at inference. Defaults to None.
|
Path to the wav file used to define the style of the output speech at inference. Defaults to None.
|
||||||
|
|
||||||
gst_style_input_weights (dict):
|
gst_style_input_weights (dict):
|
||||||
Defines the weights for each style token used at inference. Defaults to None.
|
Defines the weights for each style token used at inference. Defaults to None.
|
||||||
|
|
||||||
gst_embedding_dim (int):
|
gst_embedding_dim (int):
|
||||||
Defines the size of the GST embedding vector dimensions. Defaults to 256.
|
Defines the size of the GST embedding vector dimensions. Defaults to 256.
|
||||||
|
|
||||||
gst_num_heads (int):
|
gst_num_heads (int):
|
||||||
Number of attention heads used by the multi-head attention. Defaults to 4.
|
Number of attention heads used by the multi-head attention. Defaults to 4.
|
||||||
|
|
||||||
gst_num_style_tokens (int):
|
gst_num_style_tokens (int):
|
||||||
Number of style token vectors. Defaults to 10.
|
Number of style token vectors. Defaults to 10.
|
||||||
"""
|
"""
|
||||||
|
@ -51,17 +55,23 @@ class CharactersConfig(Coqpit):
|
||||||
Args:
|
Args:
|
||||||
pad (str):
|
pad (str):
|
||||||
characters in place of empty padding. Defaults to None.
|
characters in place of empty padding. Defaults to None.
|
||||||
|
|
||||||
eos (str):
|
eos (str):
|
||||||
characters showing the end of a sentence. Defaults to None.
|
characters showing the end of a sentence. Defaults to None.
|
||||||
|
|
||||||
bos (str):
|
bos (str):
|
||||||
characters showing the beginning of a sentence. Defaults to None.
|
characters showing the beginning of a sentence. Defaults to None.
|
||||||
|
|
||||||
characters (str):
|
characters (str):
|
||||||
character set used by the model. Characters not in this list are ignored when converting input text to
|
character set used by the model. Characters not in this list are ignored when converting input text to
|
||||||
a list of sequence IDs. Defaults to None.
|
a list of sequence IDs. Defaults to None.
|
||||||
|
|
||||||
punctuations (str):
|
punctuations (str):
|
||||||
characters considered as punctuation when parsing the input sentence. Defaults to None.
|
characters considered as punctuation when parsing the input sentence. Defaults to None.
|
||||||
|
|
||||||
phonemes (str):
|
phonemes (str):
|
||||||
characters considered as phonemes when parsing the input. Defaults to None.
|
characters considered as phonemes when parsing the input. Defaults to None.
|
||||||
|
|
||||||
unique (bool):
|
unique (bool):
|
||||||
remove any duplicate characters in the character lists. It is a bandaid for compatibility with the old
|
remove any duplicate characters in the character lists. It is a bandaid for compatibility with the old
|
||||||
models trained with character lists with duplicates.
|
models trained with character lists with duplicates.
|
||||||
|
@ -95,54 +105,78 @@ class BaseTTSConfig(BaseTrainingConfig):
|
||||||
Args:
|
Args:
|
||||||
audio (BaseAudioConfig):
|
audio (BaseAudioConfig):
|
||||||
Audio processor config object instance.
|
Audio processor config object instance.
|
||||||
|
|
||||||
use_phonemes (bool):
|
use_phonemes (bool):
|
||||||
enable / disable phoneme use.
|
enable / disable phoneme use.
|
||||||
|
|
||||||
use_espeak_phonemes (bool):
|
use_espeak_phonemes (bool):
|
||||||
enable / disable eSpeak-compatible phonemes (only if use_phonemes = `True`).
|
enable / disable eSpeak-compatible phonemes (only if use_phonemes = `True`).
|
||||||
|
|
||||||
compute_input_seq_cache (bool):
|
compute_input_seq_cache (bool):
|
||||||
enable / disable precomputation of the phoneme sequences. At the expense of some delay at the beginning of
|
enable / disable precomputation of the phoneme sequences. At the expense of some delay at the beginning of
|
||||||
the training, it allows faster data loader time and precise limitation with `max_seq_len` and
|
the training, it allows faster data loader time and precise limitation with `max_seq_len` and
|
||||||
`min_seq_len`.
|
`min_seq_len`.
|
||||||
|
|
||||||
text_cleaner (str):
|
text_cleaner (str):
|
||||||
Name of the text cleaner used for cleaning and formatting transcripts.
|
Name of the text cleaner used for cleaning and formatting transcripts.
|
||||||
|
|
||||||
enable_eos_bos_chars (bool):
|
enable_eos_bos_chars (bool):
|
||||||
enable / disable the use of eos and bos characters.
|
enable / disable the use of eos and bos characters.
|
||||||
|
|
||||||
test_sentences_file (str):
|
test_sentences_file (str):
|
||||||
Path to a txt file that has sentences used at test time. The file must have a sentence per line.
|
Path to a txt file that has sentences used at test time. The file must have a sentence per line.
|
||||||
|
|
||||||
phoneme_cache_path (str):
|
phoneme_cache_path (str):
|
||||||
Path to the output folder caching the computed phonemes for each sample.
|
Path to the output folder caching the computed phonemes for each sample.
|
||||||
|
|
||||||
characters (CharactersConfig):
|
characters (CharactersConfig):
|
||||||
Instance of a CharactersConfig class.
|
Instance of a CharactersConfig class.
|
||||||
|
|
||||||
batch_group_size (int):
|
batch_group_size (int):
|
||||||
Size of the batch groups used for bucketing. By default, the dataloader orders samples by the sequence
|
Size of the batch groups used for bucketing. By default, the dataloader orders samples by the sequence
|
||||||
length for a more efficient and stable training. If `batch_group_size > 1` then it performs bucketing to
|
length for a more efficient and stable training. If `batch_group_size > 1` then it performs bucketing to
|
||||||
prevent using the same batches for each epoch.
|
prevent using the same batches for each epoch.
|
||||||
|
|
||||||
loss_masking (bool):
|
loss_masking (bool):
|
||||||
enable / disable masking loss values against padded segments of samples in a batch.
|
enable / disable masking loss values against padded segments of samples in a batch.
|
||||||
|
|
||||||
min_seq_len (int):
|
min_seq_len (int):
|
||||||
Minimum input sequence length to be used at training.
|
Minimum input sequence length to be used at training.
|
||||||
|
|
||||||
max_seq_len (int):
|
max_seq_len (int):
|
||||||
Maximum input sequence length to be used at training. Larger values result in more VRAM usage.
|
Maximum input sequence length to be used at training. Larger values result in more VRAM usage.
|
||||||
|
|
||||||
compute_f0 (bool):
|
compute_f0 (bool):
|
||||||
(Not in use yet).
|
(Not in use yet).
|
||||||
|
|
||||||
|
compute_linear_spec (bool):
|
||||||
|
If True, the data loader computes and returns linear spectrograms alongside the other data. Defaults to False.
|
||||||
|
|
||||||
use_noise_augment (bool):
|
use_noise_augment (bool):
|
||||||
Augment the input audio with random noise.
|
Augment the input audio with random noise.
|
||||||
|
|
||||||
add_blank (bool):
|
add_blank (bool):
|
||||||
Add blank characters between every two characters. It improves performance for some models at the expense
|
Add blank characters between every two characters. It improves performance for some models at the expense
|
||||||
of slower run-time due to the longer input sequence.
|
of slower run-time due to the longer input sequence.
|
||||||
|
|
||||||
datasets (List[BaseDatasetConfig]):
|
datasets (List[BaseDatasetConfig]):
|
||||||
List of datasets used for training. If multiple datasets are provided, they are merged and used together
|
List of datasets used for training. If multiple datasets are provided, they are merged and used together
|
||||||
for training.
|
for training.
|
||||||
|
|
||||||
optimizer (str):
|
optimizer (str):
|
||||||
Optimizer used for the training. Set one from `torch.optim.Optimizer` or `TTS.utils.training`.
|
Optimizer used for the training. Set one from `torch.optim.Optimizer` or `TTS.utils.training`.
|
||||||
Defaults to ``.
|
Defaults to ``.
|
||||||
|
|
||||||
optimizer_params (dict):
|
optimizer_params (dict):
|
||||||
Optimizer kwargs. Defaults to `{"betas": [0.8, 0.99], "weight_decay": 0.0}`
|
Optimizer kwargs. Defaults to `{"betas": [0.8, 0.99], "weight_decay": 0.0}`
|
||||||
|
|
||||||
lr_scheduler (str):
|
lr_scheduler (str):
|
||||||
Learning rate scheduler for the training. Use one from `torch.optim.Scheduler` schedulers or
|
Learning rate scheduler for the training. Use one from `torch.optim.Scheduler` schedulers or
|
||||||
`TTS.utils.training`. Defaults to ``.
|
`TTS.utils.training`. Defaults to ``.
|
||||||
|
|
||||||
lr_scheduler_params (dict):
|
lr_scheduler_params (dict):
|
||||||
Parameters for the generator learning rate scheduler. Defaults to `{"warmup": 4000}`.
|
Parameters for the generator learning rate scheduler. Defaults to `{"warmup": 4000}`.
|
||||||
|
|
||||||
test_sentences (List[str]):
|
test_sentences (List[str]):
|
||||||
List of sentences to be used at testing. Defaults to '[]'
|
List of sentences to be used at testing. Defaults to '[]'
|
||||||
"""
|
"""
|
||||||
|
@ -166,6 +200,7 @@ class BaseTTSConfig(BaseTrainingConfig):
|
||||||
min_seq_len: int = 1
|
min_seq_len: int = 1
|
||||||
max_seq_len: int = float("inf")
|
max_seq_len: int = float("inf")
|
||||||
compute_f0: bool = False
|
compute_f0: bool = False
|
||||||
|
compute_linear_spec: bool = False
|
||||||
use_noise_augment: bool = False
|
use_noise_augment: bool = False
|
||||||
add_blank: bool = False
|
add_blank: bool = False
|
||||||
# dataset
|
# dataset
|
||||||
|
|
Loading…
Reference in New Issue