From f288e9a260249c5cb50f1805dc90137ff9422918 Mon Sep 17 00:00:00 2001 From: erogol Date: Wed, 6 Jan 2021 13:12:33 +0100 Subject: [PATCH] docstrings for tacotron models --- TTS/tts/models/tacotron.py | 39 +++++++++++++++++++++++++++++++++++++ TTS/tts/models/tacotron2.py | 37 +++++++++++++++++++++++++++++++++++ 2 files changed, 76 insertions(+) diff --git a/TTS/tts/models/tacotron.py b/TTS/tts/models/tacotron.py index 9ade8592..73434aa5 100644 --- a/TTS/tts/models/tacotron.py +++ b/TTS/tts/models/tacotron.py @@ -8,6 +8,45 @@ from TTS.tts.models.tacotron_abstract import TacotronAbstract class Tacotron(TacotronAbstract): + """Tacotron as in https://arxiv.org/abs/1703.10135 + + It's an autoregressive encoder-attention-decoder-postnet architecture. + + Args: + num_chars (int): number of input characters to define the size of embedding layer. + num_speakers (int): number of speakers in the dataset. >1 enables multi-speaker training and model learns speaker embeddings. + r (int): initial model reduction rate. + postnet_output_dim (int, optional): postnet output channels. Defaults to 80. + decoder_output_dim (int, optional): decoder output channels. Defaults to 80. + attn_type (str, optional): attention type. Check ```TTS.tts.layers.common_layers.init_attn```. Defaults to 'original'. + attn_win (bool, optional): enable/disable attention windowing. + It is especially useful at inference to keep attention alignment diagonal. Defaults to False. + attn_norm (str, optional): Attention normalization method. "sigmoid" or "softmax". Defaults to "softmax". + prenet_type (str, optional): prenet type for the decoder. Defaults to "original". + prenet_dropout (bool, optional): prenet dropout rate. Defaults to True. + forward_attn (bool, optional): enable/disable forward attention. + It is only valid if ```attn_type``` is ```original```. Defaults to False. + trans_agent (bool, optional): enable/disable transition agent in forward attention. Defaults to False.
+ forward_attn_mask (bool, optional): enable/disable extra masking over forward attention. Defaults to False. + location_attn (bool, optional): enable/disable location sensitive attention. + It is only valid if ```attn_type``` is ```original```. Defaults to True. + attn_K (int, optional): Number of attention heads for GMM attention. Defaults to 5. + separate_stopnet (bool, optional): enable/disable separate stopnet training, preventing gradient + flow from stopnet to the rest of the model. Defaults to True. + bidirectional_decoder (bool, optional): enable/disable bidirectional decoding. Defaults to False. + double_decoder_consistency (bool, optional): enable/disable double decoder consistency. Defaults to False. + ddc_r (int, optional): reduction rate for the coarse decoder of double decoder consistency. Defaults to None. + encoder_in_features (int, optional): input channels for the encoder. Defaults to 512. + decoder_in_features (int, optional): input channels for the decoder. Defaults to 512. + speaker_embedding_dim (int, optional): external speaker conditioning vector channels. Defaults to None. + gst (bool, optional): enable/disable global style token learning. Defaults to False. + gst_embedding_dim (int, optional): size of channels for GST vectors. Defaults to 512. + gst_num_heads (int, optional): number of attention heads for GST. Defaults to 4. + gst_style_tokens (int, optional): number of GST tokens. Defaults to 10. + gst_use_speaker_embedding (bool, optional): enable/disable inputting speaker embedding to GST. Defaults to False. + memory_size (int, optional): size of the history queue fed to the prenet. Model feeds the last ```memory_size``` + output frames to the prenet.
+ """ def __init__(self, num_chars, num_speakers, diff --git a/TTS/tts/models/tacotron2.py b/TTS/tts/models/tacotron2.py index ab4d9056..317bdbc8 100644 --- a/TTS/tts/models/tacotron2.py +++ b/TTS/tts/models/tacotron2.py @@ -7,6 +7,43 @@ from TTS.tts.models.tacotron_abstract import TacotronAbstract # TODO: match function arguments with tacotron class Tacotron2(TacotronAbstract): + """Tacotron2 as in https://arxiv.org/abs/1712.05884 + + It's an autoregressive encoder-attention-decoder-postnet architecture. + + Args: + num_chars (int): number of input characters to define the size of embedding layer. + num_speakers (int): number of speakers in the dataset. >1 enables multi-speaker training and model learns speaker embeddings. + r (int): initial model reduction rate. + postnet_output_dim (int, optional): postnet output channels. Defaults to 80. + decoder_output_dim (int, optional): decoder output channels. Defaults to 80. + attn_type (str, optional): attention type. Check ```TTS.tts.layers.common_layers.init_attn```. Defaults to 'original'. + attn_win (bool, optional): enable/disable attention windowing. + It especially useful at inference to keep attention alignment diagonal. Defaults to False. + attn_norm (str, optional): Attention normalization method. "sigmoid" or "softmax". Defaults to "softmax". + prenet_type (str, optional): prenet type for the decoder. Defaults to "original". + prenet_dropout (bool, optional): prenet dropout rate. Defaults to True. + forward_attn (bool, optional): enable/disable forward attention. + It is only valid if ```attn_type``` is ```original```. Defaults to False. + trans_agent (bool, optional): enable/disable transition agent in forward attention. Defaults to False. + forward_attn_mask (bool, optional): enable/disable extra masking over forward attention. Defaults to False. + location_attn (bool, optional): enable/disable location sensitive attention. + It is only valid if ```attn_type``` is ```original```. Defaults to True. 
+ attn_K (int, optional): Number of attention heads for GMM attention. Defaults to 5. + separate_stopnet (bool, optional): enable/disable separate stopnet training, preventing gradient + flow from stopnet to the rest of the model. Defaults to True. + bidirectional_decoder (bool, optional): enable/disable bidirectional decoding. Defaults to False. + double_decoder_consistency (bool, optional): enable/disable double decoder consistency. Defaults to False. + ddc_r (int, optional): reduction rate for the coarse decoder of double decoder consistency. Defaults to None. + encoder_in_features (int, optional): input channels for the encoder. Defaults to 512. + decoder_in_features (int, optional): input channels for the decoder. Defaults to 512. + speaker_embedding_dim (int, optional): external speaker conditioning vector channels. Defaults to None. + gst (bool, optional): enable/disable global style token learning. Defaults to False. + gst_embedding_dim (int, optional): size of channels for GST vectors. Defaults to 512. + gst_num_heads (int, optional): number of attention heads for GST. Defaults to 4. + gst_style_tokens (int, optional): number of GST tokens. Defaults to 10. + gst_use_speaker_embedding (bool, optional): enable/disable inputting speaker embedding to GST. Defaults to False. + """ def __init__(self, num_chars, num_speakers,