diff --git a/layers/attention.py b/layers/attention.py
index 51d3542a..8a79c6c5 100644
--- a/layers/attention.py
+++ b/layers/attention.py
@@ -22,11 +22,9 @@ class BahdanauAttention(nn.Module):
         # (batch, 1, dim)
         processed_query = self.query_layer(query)
         processed_annots = self.annot_layer(annots)
-
         # (batch, max_time, 1)
         alignment = self.v(nn.functional.tanh(
             processed_query + processed_annots))
-
         # (batch, max_time)
         return alignment.squeeze(-1)

@@ -54,31 +52,23 @@ class AttentionRNN(nn.Module):
     def forward(self, memory, context, rnn_state, annotations,
                 mask=None, annotations_lengths=None):
-
         if annotations_lengths is not None and mask is None:
             mask = get_mask_from_lengths(annotations, annotations_lengths)
-
         # Concat input query and previous context
         rnn_input = torch.cat((memory, context), -1)
-
         #rnn_input = rnn_input.unsqueeze(1)
-
         # Feed it to RNN
         # s_i = f(y_{i-1}, c_{i}, s_{i-1})
         rnn_output = self.rnn_cell(rnn_input, rnn_state)
-
         # Alignment
         # (batch, max_time)
         # e_{ij} = a(s_{i-1}, h_j)
         alignment = self.alignment_model(annotations, rnn_output)
-
         # TODO: needs recheck.
         if mask is not None:
             mask = mask.view(memory.size(0), -1)
             alignment.data.masked_fill_(mask, self.score_mask_value)
-
         # Normalize context weight
         alignment = F.softmax(alignment, dim=-1)
-
         # Attention context vector
         # (batch, 1, dim)
         # c_i = \sum_{j=1}^{T_x} \alpha_{ij} h_j
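For reference, below is a minimal, self-contained sketch of the additive (Bahdanau) attention step that BahdanauAttention and AttentionRNN implement together: e_{ij} = v^T tanh(W_q s_{i-1} + W_a h_j), alpha_i = softmax(e_i), c_i = \sum_j alpha_{ij} h_j. The class name, dimensions, and the merged score-plus-context forward are illustrative assumptions for this note, not the repo's actual API.

    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    class AdditiveAttentionSketch(nn.Module):
        def __init__(self, query_dim, annot_dim, attn_dim):
            super().__init__()
            self.query_layer = nn.Linear(query_dim, attn_dim)   # W_q
            self.annot_layer = nn.Linear(annot_dim, attn_dim)   # W_a
            self.v = nn.Linear(attn_dim, 1, bias=False)         # v

        def forward(self, query, annots, mask=None):
            # query: (batch, query_dim); annots: (batch, max_time, annot_dim)
            processed_query = self.query_layer(query).unsqueeze(1)  # (batch, 1, attn_dim)
            processed_annots = self.annot_layer(annots)             # (batch, max_time, attn_dim)
            # e_{ij}: (batch, max_time)
            alignment = self.v(torch.tanh(
                processed_query + processed_annots)).squeeze(-1)
            if mask is not None:
                # mask: (batch, max_time), True at padded positions
                alignment = alignment.masked_fill(mask, -float("inf"))
            weights = F.softmax(alignment, dim=-1)                  # alpha_i
            # c_i = sum_j alpha_{ij} h_j: (batch, annot_dim)
            context = torch.bmm(weights.unsqueeze(1), annots).squeeze(1)
            return context, weights

    attn = AdditiveAttentionSketch(query_dim=256, annot_dim=256, attn_dim=128)
    s = torch.randn(4, 256)        # decoder state s_{i-1}
    h = torch.randn(4, 50, 256)    # encoder annotations h_1..h_T
    context, weights = attn(s, h)  # (4, 256), (4, 50)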
diff --git a/layers/tacotron.py b/layers/tacotron.py
index 2e945844..77291a9d 100644
--- a/layers/tacotron.py
+++ b/layers/tacotron.py
@@ -102,22 +102,18 @@ class CBHG(nn.Module):
         super(CBHG, self).__init__()
         self.in_features = in_features
         self.relu = nn.ReLU()
-
         # list of conv1d banks with filter sizes k=1...K
         # TODO: try dilated layers instead
         self.conv1d_banks = nn.ModuleList(
             [BatchNormConv1d(in_features, in_features, kernel_size=k, stride=1,
                              padding=k // 2, activation=self.relu)
              for k in range(1, K + 1)])
-
         # max pooling of conv bank
         # TODO: try average pooling OR larger kernel size
         self.max_pool1d = nn.MaxPool1d(kernel_size=2, stride=1, padding=1)
-
         out_features = [K * in_features] + projections[:-1]
         activations = [self.relu] * (len(projections) - 1)
         activations += [None]
-
         # setup conv1d projection layers
         layer_set = []
         for (in_size, out_size, ac) in zip(out_features, projections, activations):
@@ -125,12 +121,10 @@ class CBHG(nn.Module):
                                    padding=1, activation=ac)
             layer_set.append(layer)
         self.conv1d_projections = nn.ModuleList(layer_set)
-
         # setup Highway layers
         self.pre_highway = nn.Linear(projections[-1], in_features, bias=False)
         self.highways = nn.ModuleList(
             [Highway(in_features, in_features) for _ in range(num_highways)])
-
         # bi-directional GRU layer
         self.gru = nn.GRU(
             in_features, in_features, 1, batch_first=True, bidirectional=True)
@@ -138,14 +132,11 @@ class CBHG(nn.Module):
     def forward(self, inputs):
         # (B, T_in, in_features)
         x = inputs
-
         # Needed to perform conv1d on time-axis
         # (B, in_features, T_in)
         if x.size(-1) == self.in_features:
             x = x.transpose(1, 2)
-
         T = x.size(-1)
-
         # (B, in_features*K, T_in)
         # Concat conv1d bank outputs
         outs = []
@@ -153,29 +144,22 @@
             out = conv1d(x)
             out = out[:, :, :T]
             outs.append(out)
-
         x = torch.cat(outs, dim=1)
         assert x.size(1) == self.in_features * len(self.conv1d_banks)
-
         x = self.max_pool1d(x)[:, :, :T]
-
         for conv1d in self.conv1d_projections:
             x = conv1d(x)
-
         # (B, T_in, in_features)
         # Back to the original shape
         x = x.transpose(1, 2)
-
         if x.size(-1) != self.in_features:
             x = self.pre_highway(x)
-
         # Residual connection
         # TODO: try residual scaling as in Deep Voice 3
         # TODO: try plain residual layers
         x += inputs
         for highway in self.highways:
             x = highway(x)
-
         # (B, T_in, in_features*2)
         # TODO: replace GRU with convolution as in Deep Voice 3
         self.gru.flatten_parameters()
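One more sketch: a shape walk-through of the conv bank and max-pool trimming in CBHG.forward above, with plain nn.Conv1d standing in for the repo's BatchNormConv1d wrapper. This assumes the wrapper passes kernel_size/stride/padding straight through to nn.Conv1d; the dims B, T, C, K are arbitrary illustration values.

    import torch
    import torch.nn as nn

    B, T, C, K = 2, 7, 16, 4
    x = torch.randn(B, T, C).transpose(1, 2)  # (B, C, T): conv1d runs on the time axis

    banks = nn.ModuleList(
        [nn.Conv1d(C, C, kernel_size=k, stride=1, padding=k // 2)
         for k in range(1, K + 1)])
    # With padding k // 2, even kernel sizes emit T + 1 time steps, so each
    # bank output is trimmed back to T before concatenating along channels.
    outs = [conv(x)[:, :, :T] for conv in banks]
    y = torch.cat(outs, dim=1)
    assert y.shape == (B, C * K, T)   # the same invariant forward asserts

    # MaxPool1d(kernel_size=2, stride=1, padding=1) also lengthens time by one,
    # hence the matching [:, :, :T] trim after pooling.
    y = nn.MaxPool1d(kernel_size=2, stride=1, padding=1)(y)[:, :, :T]
    assert y.shape == (B, C * K, T)

The trims keep every intermediate tensor at the original length T, which is what lets the final x += inputs residual connection line up element-for-element with the input sequence.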