diff --git a/TTS/tts/layers/generic/res_conv_bn.py b/TTS/tts/layers/generic/res_conv_bn.py index daffd7e2..b21c0824 100644 --- a/TTS/tts/layers/generic/res_conv_bn.py +++ b/TTS/tts/layers/generic/res_conv_bn.py @@ -27,8 +27,8 @@ class ConvBN(nn.Module): def forward(self, x): o = self.conv1d(x) o = self.pad(o) - o = self.norm(o) o = nn.functional.relu(o) + o = self.norm(o) return o diff --git a/TTS/tts/layers/speedy_speech/decoder.py b/TTS/tts/layers/speedy_speech/decoder.py index 9ef2215d..81c2d86d 100644 --- a/TTS/tts/layers/speedy_speech/decoder.py +++ b/TTS/tts/layers/speedy_speech/decoder.py @@ -25,7 +25,7 @@ class Decoder(nn.Module): self.post_conv = nn.Conv1d(hidden_channels, hidden_channels, 1) self.post_net = nn.Sequential( - ConvBNBlock(hidden_channels, 4, 1, num_conv_blocks=2), + ConvBNBlock(hidden_channels, residual_conv_bn_params['kernel_size'], 1, num_conv_blocks=2), nn.Conv1d(hidden_channels, out_channels, 1), ) @@ -33,4 +33,4 @@ class Decoder(nn.Module): # TODO: implement multi-speaker o = self.decoder(x, x_mask) o = self.post_conv(o) + x - return self.post_net(o) + return self.post_net(o) * x_mask diff --git a/TTS/tts/layers/speedy_speech/encoder.py b/TTS/tts/layers/speedy_speech/encoder.py index d88db8b7..755c8521 100644 --- a/TTS/tts/layers/speedy_speech/encoder.py +++ b/TTS/tts/layers/speedy_speech/encoder.py @@ -154,8 +154,8 @@ class Encoder(nn.Module): o = self.pre(x) * x_mask o = self.encoder(o, x_mask) o = self.post_conv(o + x) - o = self.post_bn(o) o = F.relu(o) + o = self.post_bn(o) o = self.post_conv2(o) # [B, C, T] return o * x_mask