From a6259041d3a0d7511df568474780ffb465462640 Mon Sep 17 00:00:00 2001
From: erogol
Date: Thu, 7 Jan 2021 14:35:22 +0100
Subject: [PATCH] docstring for speedyspeech

---
 TTS/tts/layers/speedy_speech/decoder.py       |   4 +-
 .../speedy_speech/duration_predictor.py       |  22 +++-
 TTS/tts/layers/speedy_speech/encoder.py       | 109 +++++++++---------
 TTS/tts/models/speedy_speech.py               |  56 ++++++++-
 4 files changed, 127 insertions(+), 64 deletions(-)

diff --git a/TTS/tts/layers/speedy_speech/decoder.py b/TTS/tts/layers/speedy_speech/decoder.py
index bf23f603..9bbb047b 100644
--- a/TTS/tts/layers/speedy_speech/decoder.py
+++ b/TTS/tts/layers/speedy_speech/decoder.py
@@ -1,7 +1,7 @@
 from torch import nn
 from TTS.tts.layers.generic.res_conv_bn import ConvBNBlock, ResidualConvBNBlock
 from TTS.tts.layers.generic.wavenet import WNBlocks
-from TTS.tts.layers.glow_tts.transformer import Transformer
+from TTS.tts.layers.glow_tts.transformer import RelativePositionTransformer


 class Decoder(nn.Module):
@@ -67,7 +67,7 @@ class Decoder(nn.Module):
         self.out_channels = out_channels

         if decoder_type == 'transformer':
-            self.decoder = Transformer(self.hidden_channels, **decoder_params)
+            self.decoder = RelativePositionTransformer(self.hidden_channels, **decoder_params)
         elif decoder_type == 'residual_conv_bn':
             self.decoder = ResidualConvBNBlock(self.hidden_channels, **decoder_params)
diff --git a/TTS/tts/layers/speedy_speech/duration_predictor.py b/TTS/tts/layers/speedy_speech/duration_predictor.py
index 9f83d94d..153a6a49 100644
--- a/TTS/tts/layers/speedy_speech/duration_predictor.py
+++ b/TTS/tts/layers/speedy_speech/duration_predictor.py
@@ -4,8 +4,20 @@ from TTS.tts.layers.generic.res_conv_bn import ConvBN


 class DurationPredictor(nn.Module):
-    """Predicts phoneme log durations based on the encoder outputs"""
+    """Speedy Speech duration predictor model.
+    Predicts phoneme durations from encoder outputs.
+
+    Note:
+        Outputs are interpreted as log(durations).
+        Apply the exponential to recover the actual durations.
+
+        conv_BN_4x1 -> conv_BN_3x1 -> conv_BN_1x1 -> conv_1x1
+
+    Args:
+        hidden_channels (int): number of channels in the inner layers.
+    """
     def __init__(self, hidden_channels):
+        super().__init__()

         self.layers = nn.ModuleList([
@@ -16,10 +28,10 @@ class DurationPredictor(nn.Module):
         ])

     def forward(self, x, x_mask):
-        """Outputs interpreted as log(durations)
-        To get actual durations, do exp transformation
-        :param x:
-        :return:
+        """
+        Shapes:
+            x: [B, C, T]
+            x_mask: [B, 1, T]
         """
         o = x
         for layer in self.layers:
diff --git a/TTS/tts/layers/speedy_speech/encoder.py b/TTS/tts/layers/speedy_speech/encoder.py
index e0a0ce8f..9fd97514 100644
--- a/TTS/tts/layers/speedy_speech/encoder.py
+++ b/TTS/tts/layers/speedy_speech/encoder.py
@@ -3,7 +3,7 @@ import torch
 from torch import nn
 from torch.nn import functional as F

-from TTS.tts.layers.glow_tts.transformer import Transformer
+from TTS.tts.layers.glow_tts.transformer import RelativePositionTransformer
 from TTS.tts.layers.glow_tts.glow import ConvLayerNorm
 from TTS.tts.layers.generic.res_conv_bn import ResidualConvBNBlock

@@ -12,42 +12,36 @@ class PositionalEncoding(nn.Module):
     """Sinusoidal positional encoding for non-recurrent neural networks.
Implementation based on "Attention Is All You Need" Args: + channels (int): embedding size dropout (float): dropout parameter - dim (int): embedding size """ - def __init__(self, dim, dropout=0.0, max_len=5000): + def __init__(self, channels, dropout=0.0, max_len=5000): super().__init__() - if dim % 2 != 0: + if channels % 2 != 0: raise ValueError("Cannot use sin/cos positional encoding with " - "odd dim (got dim={:d})".format(dim)) - pe = torch.zeros(max_len, dim) + "odd channels (got channels={:d})".format(channels)) + pe = torch.zeros(max_len, channels) position = torch.arange(0, max_len).unsqueeze(1) - div_term = torch.exp((torch.arange(0, dim, 2, dtype=torch.float) * - -(math.log(10000.0) / dim))) + div_term = torch.exp((torch.arange(0, channels, 2, dtype=torch.float) * + -(math.log(10000.0) / channels))) pe[:, 0::2] = torch.sin(position.float() * div_term) pe[:, 1::2] = torch.cos(position.float() * div_term) pe = pe.unsqueeze(0).transpose(1, 2) self.register_buffer('pe', pe) if dropout > 0: self.dropout = nn.Dropout(p=dropout) - self.dim = dim + self.channels = channels def forward(self, x, mask=None, first_idx=None, last_idx=None): - """Embed inputs. - Args: - x (FloatTensor): Sequence of word vectors - ``(seq_len, batch_size, self.dim)`` - mask (FloatTensor): Sequence mask. - first_idx (int or NoneType): starting index for taking a - certain part of the embeddings. - last_idx (int or NoneType): ending index for taking a - certain part of the embeddings. - + """ Shapes: - x: B x C x T + x: [B, C, T] + mask: [B, 1, T] + first_idx: int + last_idx: int """ - x = x * math.sqrt(self.dim) + x = x * math.sqrt(self.channels) if first_idx is None: if self.pe.size(2) < x.size(2): raise RuntimeError( @@ -67,6 +61,38 @@ class PositionalEncoding(nn.Module): class Encoder(nn.Module): # pylint: disable=dangerous-default-value + """Speedy-Speech encoder using Transformers or Residual BN Convs internally. + + Args: + num_chars (int): number of characters. + out_channels (int): number of output channels. + in_hidden_channels (int): input and hidden channels. Model keeps the input channels for the intermediate layers. + encoder_type (str): encoder layer types. 'transformers' or 'residual_conv_bn'. Default 'residual_conv_bn'. + encoder_params (dict): model parameters for specified encoder type. + c_in_channels (int): number of channels for conditional input. + + Note: + Default encoder_params... + + for 'transformer' + encoder_params={ + 'hidden_channels_ffn': 128, + 'num_heads': 2, + "kernel_size": 3, + "dropout_p": 0.1, + "num_layers": 6, + "rel_attn_window_size": 4, + "input_length": None + }, + + for 'residual_conv_bn' + encoder_params = { + "kernel_size": 4, + "dilations": 4 * [1, 2, 4] + [1], + "num_conv_blocks": 2, + "num_res_blocks": 13 + } + """ def __init__( self, in_hidden_channels, @@ -79,41 +105,6 @@ class Encoder(nn.Module): "num_res_blocks": 13 }, c_in_channels=0): - """Speedy-Speech encoder using Transformers or Residual BN Convs internally. - - Args: - num_chars (int): number of characters. - out_channels (int): number of output channels. - in_hidden_channels (int): input and hidden channels. Model keeps the input channels for the intermediate layers. - encoder_type (str): encoder layer types. 'transformers' or 'residual_conv_bn'. Default 'residual_conv_bn'. - encoder_params (dict): model parameters for specified encoder type. - c_in_channels (int): number of channels for conditional input. - - Note: - Default encoder_params... 
-
-            for 'transformer'
-                encoder_params={
-                    'hidden_channels_ffn': 128,
-                    'num_heads': 2,
-                    "kernel_size": 3,
-                    "dropout_p": 0.1,
-                    "num_layers": 6,
-                    "rel_attn_window_size": 4,
-                    "input_length": None
-                },
-
-            for 'residual_conv_bn'
-                encoder_params = {
-                    "kernel_size": 4,
-                    "dilations": 4 * [1, 2, 4] + [1],
-                    "num_conv_blocks": 2,
-                    "num_res_blocks": 13
-                }
-
-        Shapes:
-            - input: (B, C, T)
-        """
         super().__init__()
         self.out_channels = out_channels
         self.in_channels = in_hidden_channels
@@ -148,6 +139,12 @@ class Encoder(nn.Module):
         self.post_conv2 = nn.Conv1d(self.hidden_channels, self.out_channels, 1)

     def forward(self, x, x_mask, g=None): # pylint: disable=unused-argument
+        """
+        Shapes:
+            x: [B, C, T]
+            x_mask: [B, 1, T]
+            g: [B, C, 1]
+        """
         # TODO: implement multi-speaker
         if self.encoder_type == 'transformer':
             o = self.pre(x, x_mask)
diff --git a/TTS/tts/models/speedy_speech.py b/TTS/tts/models/speedy_speech.py
index 319d6064..11793fff 100644
--- a/TTS/tts/models/speedy_speech.py
+++ b/TTS/tts/models/speedy_speech.py
@@ -8,7 +8,33 @@ from TTS.tts.layers.glow_tts.monotonic_align import generate_path


 class SpeedySpeech(nn.Module):
-    # pylint: disable=dangerous-default-value
+    """Speedy Speech model
+    https://arxiv.org/abs/2008.03802
+
+    Encoder -> DurationPredictor -> Decoder
+
+    This model achieves reasonable performance with only
+    ~3M model parameters and convolutional layers.
+
+    This model requires precomputed phoneme durations to train the duration predictor. At inference
+    it only uses the duration predictor to compute durations and expands the encoder outputs accordingly.
+
+    Args:
+        num_chars (int): number of unique input characters.
+        out_channels (int): number of output tensor channels. It is equal to the expected spectrogram size.
+        hidden_channels (int): number of channels in all the model layers.
+        positional_encoding (bool, optional): enable/disable positional encoding on encoder outputs. Defaults to True.
+        length_scale (int, optional): coefficient to set the speech speed. <1 faster, >1 slower. Defaults to 1.
+        encoder_type (str, optional): set the encoder type. Defaults to 'residual_conv_bn'.
+        encoder_params (dict, optional): set encoder parameters depending on 'encoder_type'. Defaults to { "kernel_size": 4, "dilations": 4 * [1, 2, 4] + [1], "num_conv_blocks": 2, "num_res_blocks": 13 }.
+        decoder_type (str, optional): decoder type. Defaults to 'residual_conv_bn'.
+        decoder_params (dict, optional): set decoder parameters depending on 'decoder_type'. Defaults to { "kernel_size": 4, "dilations": 4 * [1, 2, 4, 8] + [1], "num_conv_blocks": 2, "num_res_blocks": 17 }.
+        num_speakers (int, optional): number of speakers for multi-speaker training. Defaults to 0.
+        external_c (bool, optional): enable external speaker embeddings. Defaults to False.
+        c_in_channels (int, optional): number of channels in speaker embedding vectors. Defaults to 0.
+ """ +# pylint: disable=dangerous-default-value + def __init__( self, num_chars, @@ -33,6 +59,7 @@ class SpeedySpeech(nn.Module): num_speakers=0, external_c=False, c_in_channels=0): + super().__init__() self.length_scale = float(length_scale) if isinstance(length_scale, int) else length_scale self.emb = nn.Embedding(num_chars, hidden_channels) @@ -54,6 +81,19 @@ class SpeedySpeech(nn.Module): @staticmethod def expand_encoder_outputs(en, dr, x_mask, y_mask): + """Generate attention alignment map from durations and + expand encoder outputs + + Example: + encoder output: [a,b,c,d] + durations: [1, 3, 2, 1] + + expanded: [a, b, b, b, c, c, d] + attention map: [[0, 0, 0, 0, 0, 0, 1], + [0, 0, 0, 0, 1, 1, 0], + [0, 1, 1, 1, 0, 0, 0], + [1, 0, 0, 0, 0, 0, 0]] + """ attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(y_mask, 2) attn = generate_path(dr, attn_mask.squeeze(1)).to(en.dtype) o_en_ex = torch.matmul( @@ -121,12 +161,27 @@ class SpeedySpeech(nn.Module): return o_de, attn.transpose(1, 2) def forward(self, x, x_lengths, y_lengths, dr, g=None): # pylint: disable=unused-argument + """ + Shapes: + x: [B, T_max] + x_lengths: [B] + y_lengths: [B] + dr: [B, T_max] + g: [B, C] + """ + breakpoint() o_en, o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g) o_dr_log = self.duration_predictor(o_en_dp.detach(), x_mask) o_de, attn= self._forward_decoder(o_en, o_en_dp, dr, x_mask, y_lengths, g=g) return o_de, o_dr_log.squeeze(1), attn def inference(self, x, x_lengths, g=None): # pylint: disable=unused-argument + """ + Shapes: + x: [B, T_max] + x_lengths: [B] + g: [B, C] + """ # pad input to prevent dropping the last word x = torch.nn.functional.pad(x, pad=(0, 5), mode='constant', value=0) o_en, o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g)