# mirror of https://github.com/coqui-ai/TTS.git
import torch
from torch import nn
class TimeDepthSeparableConv(nn.Module):
    """Time-depth separable convolution as in https://arxiv.org/pdf/1904.02619.pdf

    A pointwise (1x1) convolution expands the channels, a GLU gate halves them
    back, a depthwise convolution mixes information along time, and a final
    pointwise convolution projects to the output width. It shows competitive
    results with less computation and memory footprint.

    NOTE(review): ``forward`` adds the input as a residual to the output, so
    callers are expected to use ``in_channels == out_channels`` — confirm with
    call sites.

    Args:
        in_channels (int): number of input channels.
        hid_channels (int): hidden width used by the depthwise stage.
        out_channels (int): number of output channels.
        kernel_size (int): depthwise kernel size (odd values keep the time
            dimension unchanged via the ``(k - 1) // 2`` padding below).
        bias (bool): whether the convolutions learn an additive bias.
    """

    def __init__(self, in_channels, hid_channels, out_channels, kernel_size, bias=True):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.hid_channels = hid_channels
        self.kernel_size = kernel_size

        # Pointwise expansion to twice the hidden width so the GLU gate can
        # halve it back to ``hid_channels``.
        self.time_conv = nn.Conv1d(in_channels, 2 * hid_channels, kernel_size=1, stride=1, padding=0, bias=bias)
        self.norm1 = nn.BatchNorm1d(2 * hid_channels)
        # Depthwise convolution: one filter per channel (groups == channels),
        # "same" padding along time.
        self.depth_conv = nn.Conv1d(
            hid_channels,
            hid_channels,
            kernel_size,
            stride=1,
            padding=(kernel_size - 1) // 2,
            groups=hid_channels,
            bias=bias,
        )
        self.norm2 = nn.BatchNorm1d(hid_channels)
        # Pointwise projection to the requested output width.
        self.time_conv2 = nn.Conv1d(hid_channels, out_channels, kernel_size=1, stride=1, padding=0, bias=bias)
        self.norm3 = nn.BatchNorm1d(out_channels)

    def forward(self, x):
        """Apply the block to ``x`` of shape ``(batch, in_channels, time)``."""
        residual = x
        # expand -> normalize -> GLU gate (halves channels to hid_channels)
        gated = nn.functional.glu(self.norm1(self.time_conv(x)), dim=1)
        # depthwise mixing along time, then a sigmoid self-gate
        hidden = self.norm2(self.depth_conv(gated))
        hidden = hidden * torch.sigmoid(hidden)
        # project to out_channels and add the skip connection
        out = self.norm3(self.time_conv2(hidden))
        return residual + out
class TimeDepthSeparableConvBlock(nn.Module):
    """A stack of ``TimeDepthSeparableConv`` layers applied with a time mask.

    The first layer maps ``in_channels`` to the hidden width (or straight to
    ``out_channels`` when there is a single layer); intermediate layers stay at
    ``hid_channels``; the last layer projects to ``out_channels``.

    Args:
        in_channels (int): number of input channels.
        hid_channels (int): hidden width used by the intermediate layers.
        out_channels (int): number of output channels.
        num_layers (int): number of stacked layers, at least 1.
        kernel_size (int): depthwise kernel size, must be odd.
        bias (bool): whether the convolutions learn an additive bias.
    """

    def __init__(self, in_channels, hid_channels, out_channels, num_layers, kernel_size, bias=True):
        super().__init__()
        # Odd kernel required so the depthwise "same" padding preserves length.
        assert (kernel_size - 1) % 2 == 0
        # Fix: the original guard was ``num_layers > 1`` even though the first
        # layer explicitly handles the single-layer case below
        # (``out_channels if num_layers == 1``) — that branch was unreachable.
        # Relaxed to accept num_layers == 1 (backward compatible).
        assert num_layers >= 1

        self.layers = nn.ModuleList()
        layer = TimeDepthSeparableConv(
            in_channels, hid_channels, out_channels if num_layers == 1 else hid_channels, kernel_size, bias
        )
        self.layers.append(layer)
        for idx in range(num_layers - 1):
            layer = TimeDepthSeparableConv(
                hid_channels,
                hid_channels,
                # the last layer of the stack projects to the output width
                out_channels if (idx + 1) == (num_layers - 1) else hid_channels,
                kernel_size,
                bias,
            )
            self.layers.append(layer)

    def forward(self, x, mask):
        """Run the stack over ``x`` of shape ``(batch, in_channels, time)``.

        ``mask`` is multiplied into the input of every layer (re-zeroing padded
        time steps before each layer); note it is NOT applied to the final
        output.
        """
        for layer in self.layers:
            x = layer(x * mask)
        return x