diff --git a/layers/attention.py b/layers/attention.py
index 51d3542a..8a79c6c5 100644
--- a/layers/attention.py
+++ b/layers/attention.py
@@ -22,11 +22,9 @@ class BahdanauAttention(nn.Module):
         # (batch, 1, dim)
         processed_query = self.query_layer(query)
         processed_annots = self.annot_layer(annots)
-
         # (batch, max_time, 1)
         alignment = self.v(nn.functional.tanh(
             processed_query + processed_annots))
-
         # (batch, max_time)
         return alignment.squeeze(-1)

@@ -54,31 +52,23 @@ class AttentionRNN(nn.Module):
     def forward(self, memory, context, rnn_state, annotations,
                 mask=None, annotations_lengths=None):
-
         if annotations_lengths is not None and mask is None:
             mask = get_mask_from_lengths(annotations, annotations_lengths)
-
         # Concat input query and previous context
         rnn_input = torch.cat((memory, context), -1)
-
         #rnn_input = rnn_input.unsqueeze(1)
-
         # Feed it to RNN
         # s_i = f(y_{i-1}, c_{i}, s_{i-1})
         rnn_output = self.rnn_cell(rnn_input, rnn_state)
-
         # Alignment
         # (batch, max_time)
         # e_{ij} = a(s_{i-1}, h_j)
         alignment = self.alignment_model(annotations, rnn_output)
-
         # TODO: needs recheck.
         if mask is not None:
             mask = mask.view(memory.size(0), -1)
             alignment.data.masked_fill_(mask, self.score_mask_value)
-
         # Normalize context weight
         alignment = F.softmax(alignment, dim=-1)
-
         # Attention context vector
         # (batch, 1, dim)
         # c_i = \sum_{j=1}^{T_x} \alpha_{ij} h_j
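For reference, below is a minimal, self-contained sketch of the additive (Bahdanau) attention step that BahdanauAttention and AttentionRNN implement together: e_{ij} = v^T tanh(W_q s_{i-1} + W_a h_j), alpha_i = softmax(e_i), c_i = \sum_j alpha_{ij} h_j. The class name, dimensions, and the merged score-plus-context forward are illustrative assumptions for this note, not the repo's actual API.

    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    class AdditiveAttentionSketch(nn.Module):
        def __init__(self, query_dim, annot_dim, attn_dim):
            super().__init__()
            self.query_layer = nn.Linear(query_dim, attn_dim)   # W_q
            self.annot_layer = nn.Linear(annot_dim, attn_dim)   # W_a
            self.v = nn.Linear(attn_dim, 1, bias=False)         # v

        def forward(self, query, annots, mask=None):
            # query: (batch, query_dim); annots: (batch, max_time, annot_dim)
            processed_query = self.query_layer(query).unsqueeze(1)  # (batch, 1, attn_dim)
            processed_annots = self.annot_layer(annots)             # (batch, max_time, attn_dim)
            # e_{ij}: (batch, max_time)
            alignment = self.v(torch.tanh(
                processed_query + processed_annots)).squeeze(-1)
            if mask is not None:
                # mask: (batch, max_time), True at padded positions
                alignment = alignment.masked_fill(mask, -float("inf"))
            weights = F.softmax(alignment, dim=-1)                  # alpha_i
            # c_i = sum_j alpha_{ij} h_j: (batch, annot_dim)
            context = torch.bmm(weights.unsqueeze(1), annots).squeeze(1)
            return context, weights

    attn = AdditiveAttentionSketch(query_dim=256, annot_dim=256, attn_dim=128)
    s = torch.randn(4, 256)        # decoder state s_{i-1}
    h = torch.randn(4, 50, 256)    # encoder annotations h_1..h_T
    context, weights = attn(s, h)  # (4, 256), (4, 50)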
diff --git a/layers/tacotron.py b/layers/tacotron.py
index 2e945844..77291a9d 100644
--- a/layers/tacotron.py
+++ b/layers/tacotron.py
@@ -102,22 +102,18 @@ class CBHG(nn.Module):
         super(CBHG, self).__init__()
         self.in_features = in_features
         self.relu = nn.ReLU()
-
         # list of conv1d banks with filter sizes k=1...K
         # TODO: try dilated layers instead
         self.conv1d_banks = nn.ModuleList(
             [BatchNormConv1d(in_features, in_features, kernel_size=k, stride=1,
                              padding=k // 2, activation=self.relu)
              for k in range(1, K + 1)])
-
         # max pooling of conv bank
         # TODO: try average pooling OR larger kernel size
         self.max_pool1d = nn.MaxPool1d(kernel_size=2, stride=1, padding=1)
-
         out_features = [K * in_features] + projections[:-1]
         activations = [self.relu] * (len(projections) - 1)
         activations += [None]
-
         # setup conv1d projection layers
         layer_set = []
         for (in_size, out_size, ac) in zip(out_features, projections, activations):
@@ -125,12 +121,10 @@ class CBHG(nn.Module):
                                    padding=1, activation=ac)
             layer_set.append(layer)
         self.conv1d_projections = nn.ModuleList(layer_set)
-
         # setup Highway layers
         self.pre_highway = nn.Linear(projections[-1], in_features, bias=False)
         self.highways = nn.ModuleList(
             [Highway(in_features, in_features) for _ in range(num_highways)])
-
         # bi-directional GRU layer
         self.gru = nn.GRU(
             in_features, in_features, 1, batch_first=True, bidirectional=True)
@@ -138,14 +132,11 @@ class CBHG(nn.Module):
     def forward(self, inputs):
         # (B, T_in, in_features)
         x = inputs
-
         # Needed to perform conv1d on time-axis
         # (B, in_features, T_in)
         if x.size(-1) == self.in_features:
             x = x.transpose(1, 2)
-
         T = x.size(-1)
-
         # (B, in_features*K, T_in)
         # Concat conv1d bank outputs
         outs = []
@@ -153,29 +144,22 @@
             out = conv1d(x)
             out = out[:, :, :T]
             outs.append(out)
-
         x = torch.cat(outs, dim=1)
         assert x.size(1) == self.in_features * len(self.conv1d_banks)
-
         x = self.max_pool1d(x)[:, :, :T]
-
         for conv1d in self.conv1d_projections:
             x = conv1d(x)
-
         # (B, T_in, in_features)
         # Back to the original shape
         x = x.transpose(1, 2)
-
         if x.size(-1) != self.in_features:
             x = self.pre_highway(x)
-
         # Residual connection
         # TODO: try residual scaling as in Deep Voice 3
         # TODO: try plain residual layers
         x += inputs
         for highway in self.highways:
             x = highway(x)
-
         # (B, T_in, in_features*2)
         # TODO: replace GRU with convolution as in Deep Voice 3
         self.gru.flatten_parameters()
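One more sketch: a shape walk-through of the conv bank and max-pool trimming in CBHG.forward above, with plain nn.Conv1d standing in for the repo's BatchNormConv1d wrapper. This assumes the wrapper passes kernel_size/stride/padding straight through to nn.Conv1d; the dims B, T, C, K are arbitrary illustration values.

    import torch
    import torch.nn as nn

    B, T, C, K = 2, 7, 16, 4
    x = torch.randn(B, T, C).transpose(1, 2)  # (B, C, T): conv1d runs on the time axis

    banks = nn.ModuleList(
        [nn.Conv1d(C, C, kernel_size=k, stride=1, padding=k // 2)
         for k in range(1, K + 1)])
    # With padding k // 2, even kernel sizes emit T + 1 time steps, so each
    # bank output is trimmed back to T before concatenating along channels.
    outs = [conv(x)[:, :, :T] for conv in banks]
    y = torch.cat(outs, dim=1)
    assert y.shape == (B, C * K, T)   # the same invariant forward asserts

    # MaxPool1d(kernel_size=2, stride=1, padding=1) also lengthens time by one,
    # hence the matching [:, :, :T] trim after pooling.
    y = nn.MaxPool1d(kernel_size=2, stride=1, padding=1)(y)[:, :, :T]
    assert y.shape == (B, C * K, T)

The trims keep every intermediate tensor at the original length T, which is what lets the final x += inputs residual connection line up element-for-element with the input sequence.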