docstrings for tacotron models

2021-01-06 13:12:33 +01:00 · 2021-01-06 13:12:33 +01:00 · f288e9a260
parent 5a45af48f1
commit f288e9a260
2 changed files with 76 additions and 0 deletions
--- a/TTS/tts/models/tacotron.py
+++ b/TTS/tts/models/tacotron.py
@ -8,6 +8,45 @@ from TTS.tts.models.tacotron_abstract import TacotronAbstract


 class Tacotron(TacotronAbstract):
+    """Tacotron as in https://arxiv.org/abs/1703.10135
+
+    It's an autoregressive encoder-attention-decoder-postnet architecture.
+
+    Args:
+        num_chars (int): number of input characters to define the size of embedding layer.
+        num_speakers (int): number of speakers in the dataset. >1 enables multi-speaker training and model learns speaker embeddings.
+        r (int): initial model reduction rate.
+        postnet_output_dim (int, optional): postnet output channels. Defaults to 80.
+        decoder_output_dim (int, optional): decoder output channels. Defaults to 80.
+        attn_type (str, optional): attention type. Check ```TTS.tts.layers.common_layers.init_attn```. Defaults to 'original'.
+        attn_win (bool, optional): enable/disable attention windowing.
+            It is especially useful at inference to keep attention alignment diagonal. Defaults to False.
+        attn_norm (str, optional): Attention normalization method. "sigmoid" or "softmax". Defaults to "softmax".
+        prenet_type (str, optional): prenet type for the decoder. Defaults to "original".
+        prenet_dropout (bool, optional): enable/disable prenet dropout. Defaults to True.
+        forward_attn (bool, optional): enable/disable forward attention.
+            It is only valid if ```attn_type``` is ```original```.  Defaults to False.
+        trans_agent (bool, optional): enable/disable transition agent in forward attention. Defaults to False.
+        forward_attn_mask (bool, optional): enable/disable extra masking over forward attention. Defaults to False.
+        location_attn (bool, optional): enable/disable location sensitive attention.
+            It is only valid if ```attn_type``` is ```original```. Defaults to True.
+        attn_K (int, optional): Number of attention heads for GMM attention. Defaults to 5.
+        separate_stopnet (bool, optional): enable/disable separate stopnet training, with no gradient
+            flow from the stopnet to the rest of the model. Defaults to True.
+        bidirectional_decoder (bool, optional): enable/disable bidirectional decoding. Defaults to False.
+        double_decoder_consistency (bool, optional): enable/disable double decoder consistency. Defaults to False.
+        ddc_r (int, optional): reduction rate for the coarse decoder of double decoder consistency. Defaults to None.
+        encoder_in_features (int, optional): input channels for the encoder. Defaults to 512.
+        decoder_in_features (int, optional): input channels for the decoder. Defaults to 512.
+        speaker_embedding_dim (int, optional): external speaker conditioning vector channels. Defaults to None.
+        gst (bool, optional): enable/disable global style token learning. Defaults to False.
+        gst_embedding_dim (int, optional): size of channels for GST vectors. Defaults to 512.
+        gst_num_heads (int, optional): number of attention heads for GST. Defaults to 4.
+        gst_style_tokens (int, optional): number of GST tokens. Defaults to 10.
+        gst_use_speaker_embedding (bool, optional): enable/disable inputting speaker embedding to GST. Defaults to False.
+        memory_size (int, optional): size of the history queue fed to the prenet. Model feeds the last ```memory_size```
+            output frames to the prenet.
+    """
    def __init__(self,
                 num_chars,
                 num_speakers,
--- a/TTS/tts/models/tacotron2.py
+++ b/TTS/tts/models/tacotron2.py
@ -7,6 +7,43 @@ from TTS.tts.models.tacotron_abstract import TacotronAbstract

 # TODO: match function arguments with tacotron
 class Tacotron2(TacotronAbstract):
+    """Tacotron2 as in https://arxiv.org/abs/1712.05884
+
+    It's an autoregressive encoder-attention-decoder-postnet architecture.
+
+    Args:
+        num_chars (int): number of input characters to define the size of embedding layer.
+        num_speakers (int): number of speakers in the dataset. >1 enables multi-speaker training and model learns speaker embeddings.
+        r (int): initial model reduction rate.
+        postnet_output_dim (int, optional): postnet output channels. Defaults to 80.
+        decoder_output_dim (int, optional): decoder output channels. Defaults to 80.
+        attn_type (str, optional): attention type. Check ```TTS.tts.layers.common_layers.init_attn```. Defaults to 'original'.
+        attn_win (bool, optional): enable/disable attention windowing.
+            It is especially useful at inference to keep attention alignment diagonal. Defaults to False.
+        attn_norm (str, optional): Attention normalization method. "sigmoid" or "softmax". Defaults to "softmax".
+        prenet_type (str, optional): prenet type for the decoder. Defaults to "original".
+        prenet_dropout (bool, optional): enable/disable prenet dropout. Defaults to True.
+        forward_attn (bool, optional): enable/disable forward attention.
+            It is only valid if ```attn_type``` is ```original```.  Defaults to False.
+        trans_agent (bool, optional): enable/disable transition agent in forward attention. Defaults to False.
+        forward_attn_mask (bool, optional): enable/disable extra masking over forward attention. Defaults to False.
+        location_attn (bool, optional): enable/disable location sensitive attention.
+            It is only valid if ```attn_type``` is ```original```. Defaults to True.
+        attn_K (int, optional): Number of attention heads for GMM attention. Defaults to 5.
+        separate_stopnet (bool, optional): enable/disable separate stopnet training, with no gradient
+            flow from the stopnet to the rest of the model. Defaults to True.
+        bidirectional_decoder (bool, optional): enable/disable bidirectional decoding. Defaults to False.
+        double_decoder_consistency (bool, optional): enable/disable double decoder consistency. Defaults to False.
+        ddc_r (int, optional): reduction rate for the coarse decoder of double decoder consistency. Defaults to None.
+        encoder_in_features (int, optional): input channels for the encoder. Defaults to 512.
+        decoder_in_features (int, optional): input channels for the decoder. Defaults to 512.
+        speaker_embedding_dim (int, optional): external speaker conditioning vector channels. Defaults to None.
+        gst (bool, optional): enable/disable global style token learning. Defaults to False.
+        gst_embedding_dim (int, optional): size of channels for GST vectors. Defaults to 512.
+        gst_num_heads (int, optional): number of attention heads for GST. Defaults to 4.
+        gst_style_tokens (int, optional): number of GST tokens. Defaults to 10.
+        gst_use_speaker_embedding (bool, optional): enable/disable inputting speaker embedding to GST. Defaults to False.
+    """
    def __init__(self,
                 num_chars,
                 num_speakers,