mirror of https://github.com/coqui-ai/TTS.git
docstrings for tacotron models
This commit is contained in:
parent
5a45af48f1
commit
f288e9a260
|
@ -8,6 +8,45 @@ from TTS.tts.models.tacotron_abstract import TacotronAbstract
|
||||||
|
|
||||||
|
|
||||||
class Tacotron(TacotronAbstract):
|
class Tacotron(TacotronAbstract):
|
||||||
|
"""Tacotron as in https://arxiv.org/abs/1703.10135
|
||||||
|
|
||||||
|
It's an autoregressive encoder-attention-decoder-postnet architecture.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
num_chars (int): number of input characters to define the size of embedding layer.
|
||||||
|
num_speakers (int): number of speakers in the dataset. >1 enables multi-speaker training and model learns speaker embeddings.
|
||||||
|
r (int): initial model reduction rate.
|
||||||
|
postnet_output_dim (int, optional): postnet output channels. Defaults to 80.
|
||||||
|
decoder_output_dim (int, optional): decoder output channels. Defaults to 80.
|
||||||
|
attn_type (str, optional): attention type. Check ```TTS.tts.layers.common_layers.init_attn```. Defaults to 'original'.
|
||||||
|
attn_win (bool, optional): enable/disable attention windowing.
|
||||||
|
It is especially useful at inference to keep attention alignment diagonal. Defaults to False.
|
||||||
|
attn_norm (str, optional): Attention normalization method. "sigmoid" or "softmax". Defaults to "softmax".
|
||||||
|
prenet_type (str, optional): prenet type for the decoder. Defaults to "original".
|
||||||
|
prenet_dropout (bool, optional): enable/disable prenet dropout. Defaults to True.
|
||||||
|
forward_attn (bool, optional): enable/disable forward attention.
|
||||||
|
It is only valid if ```attn_type``` is ```original```. Defaults to False.
|
||||||
|
trans_agent (bool, optional): enable/disable transition agent in forward attention. Defaults to False.
|
||||||
|
forward_attn_mask (bool, optional): enable/disable extra masking over forward attention. Defaults to False.
|
||||||
|
location_attn (bool, optional): enable/disable location sensitive attention.
|
||||||
|
It is only valid if ```attn_type``` is ```original```. Defaults to True.
|
||||||
|
attn_K (int, optional): Number of attention heads for GMM attention. Defaults to 5.
|
||||||
|
separate_stopnet (bool, optional): enable/disable separate stopnet training without gradient
|
||||||
|
flow from stopnet to the rest of the model. Defaults to True.
|
||||||
|
bidirectional_decoder (bool, optional): enable/disable bidirectional decoding. Defaults to False.
|
||||||
|
double_decoder_consistency (bool, optional): enable/disable double decoder consistency. Defaults to False.
|
||||||
|
ddc_r (int, optional): reduction rate for the coarse decoder of double decoder consistency. Defaults to None.
|
||||||
|
encoder_in_features (int, optional): input channels for the encoder. Defaults to 512.
|
||||||
|
decoder_in_features (int, optional): input channels for the decoder. Defaults to 512.
|
||||||
|
speaker_embedding_dim (int, optional): external speaker conditioning vector channels. Defaults to None.
|
||||||
|
gst (bool, optional): enable/disable global style token learning. Defaults to False.
|
||||||
|
gst_embedding_dim (int, optional): size of channels for GST vectors. Defaults to 512.
|
||||||
|
gst_num_heads (int, optional): number of attention heads for GST. Defaults to 4.
|
||||||
|
gst_style_tokens (int, optional): number of GST tokens. Defaults to 10.
|
||||||
|
gst_use_speaker_embedding (bool, optional): enable/disable inputting speaker embedding to GST. Defaults to False.
|
||||||
|
memory_size (int, optional): size of the history queue fed to the prenet. Model feeds the last ```memory_size```
|
||||||
|
output frames to the prenet.
|
||||||
|
"""
|
||||||
def __init__(self,
|
def __init__(self,
|
||||||
num_chars,
|
num_chars,
|
||||||
num_speakers,
|
num_speakers,
|
||||||
|
|
|
@ -7,6 +7,43 @@ from TTS.tts.models.tacotron_abstract import TacotronAbstract
|
||||||
|
|
||||||
# TODO: match function arguments with tacotron
|
# TODO: match function arguments with tacotron
|
||||||
class Tacotron2(TacotronAbstract):
|
class Tacotron2(TacotronAbstract):
|
||||||
|
"""Tacotron2 as in https://arxiv.org/abs/1712.05884
|
||||||
|
|
||||||
|
It's an autoregressive encoder-attention-decoder-postnet architecture.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
num_chars (int): number of input characters to define the size of embedding layer.
|
||||||
|
num_speakers (int): number of speakers in the dataset. >1 enables multi-speaker training and model learns speaker embeddings.
|
||||||
|
r (int): initial model reduction rate.
|
||||||
|
postnet_output_dim (int, optional): postnet output channels. Defaults to 80.
|
||||||
|
decoder_output_dim (int, optional): decoder output channels. Defaults to 80.
|
||||||
|
attn_type (str, optional): attention type. Check ```TTS.tts.layers.common_layers.init_attn```. Defaults to 'original'.
|
||||||
|
attn_win (bool, optional): enable/disable attention windowing.
|
||||||
|
It is especially useful at inference to keep attention alignment diagonal. Defaults to False.
|
||||||
|
attn_norm (str, optional): Attention normalization method. "sigmoid" or "softmax". Defaults to "softmax".
|
||||||
|
prenet_type (str, optional): prenet type for the decoder. Defaults to "original".
|
||||||
|
prenet_dropout (bool, optional): enable/disable prenet dropout. Defaults to True.
|
||||||
|
forward_attn (bool, optional): enable/disable forward attention.
|
||||||
|
It is only valid if ```attn_type``` is ```original```. Defaults to False.
|
||||||
|
trans_agent (bool, optional): enable/disable transition agent in forward attention. Defaults to False.
|
||||||
|
forward_attn_mask (bool, optional): enable/disable extra masking over forward attention. Defaults to False.
|
||||||
|
location_attn (bool, optional): enable/disable location sensitive attention.
|
||||||
|
It is only valid if ```attn_type``` is ```original```. Defaults to True.
|
||||||
|
attn_K (int, optional): Number of attention heads for GMM attention. Defaults to 5.
|
||||||
|
separate_stopnet (bool, optional): enable/disable separate stopnet training without gradient
|
||||||
|
flow from stopnet to the rest of the model. Defaults to True.
|
||||||
|
bidirectional_decoder (bool, optional): enable/disable bidirectional decoding. Defaults to False.
|
||||||
|
double_decoder_consistency (bool, optional): enable/disable double decoder consistency. Defaults to False.
|
||||||
|
ddc_r (int, optional): reduction rate for the coarse decoder of double decoder consistency. Defaults to None.
|
||||||
|
encoder_in_features (int, optional): input channels for the encoder. Defaults to 512.
|
||||||
|
decoder_in_features (int, optional): input channels for the decoder. Defaults to 512.
|
||||||
|
speaker_embedding_dim (int, optional): external speaker conditioning vector channels. Defaults to None.
|
||||||
|
gst (bool, optional): enable/disable global style token learning. Defaults to False.
|
||||||
|
gst_embedding_dim (int, optional): size of channels for GST vectors. Defaults to 512.
|
||||||
|
gst_num_heads (int, optional): number of attention heads for GST. Defaults to 4.
|
||||||
|
gst_style_tokens (int, optional): number of GST tokens. Defaults to 10.
|
||||||
|
gst_use_speaker_embedding (bool, optional): enable/disable inputting speaker embedding to GST. Defaults to False.
|
||||||
|
"""
|
||||||
def __init__(self,
|
def __init__(self,
|
||||||
num_chars,
|
num_chars,
|
||||||
num_speakers,
|
num_speakers,
|
||||||
|
|
Loading…
Reference in New Issue