implement residual BN convolution and add it as an alternative encoder for glow-tts. also move generic layers to layers/generic

erogol 2020-12-12 18:04:32 +01:00
parent 973754d893
commit 7b20d8cbd3
11 changed files with 98 additions and 14 deletions

View File

@@ -1,7 +1,7 @@
 {
     "model": "glow_tts",
-    "run_name": "glow-tts-tdsep-conv",
-    "run_description": "glow-tts model training with time-depth separable conv encoder.",
+    "run_name": "glow-tts-residual_bn_conv",
+    "run_description": "glow-tts model training with residual BN conv.",

     // AUDIO PARAMETERS
     "audio":{
@@ -28,7 +28,7 @@
     "num_mels": 80,       // size of the mel spec frame.
     "mel_fmin": 50.0,     // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
     "mel_fmax": 7600.0,   // maximum freq level for mel-spec. Tune for dataset!!
     "spec_gain": 1.0,     // scaler value applied after log transform of spectrogram.

     // Normalization parameters
     "signal_norm": true,  // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params.
@@ -62,13 +62,15 @@
     "reinit_layers": [],  // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers.

     // MODEL PARAMETERS
-    "use_mas": false,     // use Monotonic Alignment Search if true. Otherwise use pre-computed attention alignments.
+    // "use_mas": false,  // use Monotonic Alignment Search if true. Otherwise use pre-computed attention alignments.

     // TRAINING
     "batch_size": 32,     // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
     "eval_batch_size": 16,
     "r": 1,               // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled.
     "loss_masking": true, // enable / disable loss masking against the sequence padding.
+    "mixed_precision": true,
+    "data_dep_init_iter": 10,

     // VALIDATION
     "run_eval": true,
@@ -84,7 +86,7 @@
     "warmup_steps": 4000,  // Noam decay steps to increase the learning rate from 0 to "lr"
     "seq_len_norm": false, // Normalize each sample loss with its length to alleviate imbalanced datasets. Use it if your dataset is small or has skewed distribution of sequence lengths.
-    "encoder_type": "time-depth-separable",
+    "encoder_type": "residual_conv_bn",

     // TENSORBOARD and LOGGING
     "print_step": 25,      // Number of steps to log training on console.
@@ -93,7 +95,6 @@
     "save_step": 5000,     // Number of training steps expected to save training stats and checkpoints.
     "checkpoint": true,    // If true, it saves checkpoints per "save_step"
     "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
-    "apex_amp_level": null,

     // DATA LOADING
     "text_cleaner": "phoneme_cleaners",
@@ -104,6 +105,7 @@
     "min_seq_len": 3,    // DATASET-RELATED: minimum text length to use in training
     "max_seq_len": 500,  // DATASET-RELATED: maximum text length
     "compute_f0": false, // compute f0 values in data-loader
+    "compute_input_seq_cache": true,

     // PATHS
     "output_path": "/home/erogol/Models/LJSpeech/",
@@ -115,6 +117,7 @@
     // MULTI-SPEAKER and GST
     "use_speaker_embedding": false,  // use speaker embedding to enable multi-speaker learning.
+    "use_external_speaker_embedding_file": false,
     "style_wav_for_test": null,      // path to style wav file to be used in TacotronGST inference.
     "use_gst": false,                // TACOTRON ONLY: use global style tokens

View File

View File

@@ -0,0 +1,66 @@
import torch
from torch import nn

from .normalization import TemporalBatchNorm1d


class ZeroTemporalPad(nn.ZeroPad2d):
    """Pad sequences to equal length in the temporal dimension."""
    def __init__(self, kernel_size, dilation):
        total_pad = (dilation * (kernel_size - 1))
        begin = total_pad // 2
        end = total_pad - begin
        super(ZeroTemporalPad, self).__init__((0, 0, begin, end))


class ConvBN(nn.Module):
    """conv1d -> zero-pad -> batch-norm -> relu, preserving the time resolution."""
    def __init__(self, channels, kernel_size, dilation):
        super().__init__()
        padding = (dilation * (kernel_size - 1))
        pad_s = padding // 2
        pad_e = padding - pad_s
        self.conv1d = nn.Conv1d(channels, channels, kernel_size, dilation=dilation)
        self.pad = nn.ZeroPad2d((pad_s, pad_e, 0, 0))  # uneven left and right padding
        self.norm = nn.BatchNorm1d(channels)

    def forward(self, x):
        o = self.conv1d(x)
        o = self.pad(o)
        o = self.norm(o)
        o = nn.functional.relu(o)
        return o


class ConvBNBlock(nn.Module):
    """Implements conv -> BN -> ReLU n times."""
    def __init__(self, channels, kernel_size, dilation, num_conv_blocks=2):
        super().__init__()
        self.conv_bn_blocks = nn.Sequential(*[
            ConvBN(channels, kernel_size, dilation)
            for _ in range(num_conv_blocks)
        ])

    def forward(self, x):
        """
        Shapes:
            x: (B, D, T)
        """
        return self.conv_bn_blocks(x)


class ResidualConvBNBlock(nn.Module):
    """A stack of ConvBNBlocks with a residual connection around each block."""
    def __init__(self, channels, kernel_size, dilations, num_res_blocks=13, num_conv_blocks=2):
        super().__init__()
        assert len(dilations) == num_res_blocks
        self.res_blocks = nn.ModuleList()
        for dilation in dilations:
            block = ConvBNBlock(channels, kernel_size, dilation, num_conv_blocks)
            self.res_blocks.append(block)

    def forward(self, x, x_mask=None):
        o = x
        for block in self.res_blocks:
            res = o
            o = block(o * x_mask if x_mask is not None else o)
            o = o + res
        return o
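A quick smoke test for the new block, as a sketch: the import path matches the one the encoder uses below, while the channel count 192 is a stand-in for the encoder's hidden_channels, not a value from this commit:

    import torch
    from TTS.tts.layers.generic.res_conv_bn import ResidualConvBNBlock

    batch_size, channels, time = 2, 192, 57
    x = torch.rand(batch_size, channels, time)
    x_mask = torch.ones(batch_size, 1, time)  # no padded frames in this toy batch
    block = ResidualConvBNBlock(channels,
                                kernel_size=4,
                                dilations=4 * [1, 2, 4] + [1],
                                num_res_blocks=13,
                                num_conv_blocks=2)
    o = block(x, x_mask)
    assert o.shape == x.shape  # padding keeps the time resolution

Note that the mask is applied to the input of each residual block but not to the sum, so masked positions can re-enter through the residual path; downstream code is expected to re-apply the mask.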

View File

@@ -2,7 +2,7 @@ import torch
 from torch import nn
 from TTS.tts.layers.glow_tts.glow import InvConvNear, CouplingBlock
-from TTS.tts.layers.glow_tts.normalization import ActNorm
+from TTS.tts.layers.generic.normalization import ActNorm


 def squeeze(x, x_mask=None, num_sqz=2):

View File

@@ -1,7 +1,7 @@
 import torch
 from torch import nn
-from .normalization import LayerNorm
+from ..generic.normalization import LayerNorm


 class DurationPredictor(nn.Module):

View File

@@ -3,11 +3,12 @@ import torch
 from torch import nn
 from TTS.tts.layers.glow_tts.transformer import Transformer
-from TTS.tts.layers.glow_tts.gated_conv import GatedConvBlock
+from TTS.tts.layers.generic.gated_conv import GatedConvBlock
 from TTS.tts.utils.generic_utils import sequence_mask
 from TTS.tts.layers.glow_tts.glow import ConvLayerNorm
 from TTS.tts.layers.glow_tts.duration_predictor import DurationPredictor
-from TTS.tts.layers.glow_tts.time_depth_sep_conv import TimeDepthSeparableConvBlock
+from TTS.tts.layers.generic.time_depth_sep_conv import TimeDepthSeparableConvBlock
+from TTS.tts.layers.generic.res_conv_bn import ResidualConvBNBlock


 class Encoder(nn.Module):

@@ -84,12 +85,26 @@ class Encoder(nn.Module):
                 dropout_p=dropout_p,
                 rel_attn_window_size=rel_attn_window_size,
                 input_length=input_length)
-        elif encoder_type.lower() == 'gatedconv':
+        elif encoder_type.lower() == 'gated_conv':
             self.encoder = GatedConvBlock(hidden_channels,
                                           kernel_size=5,
                                           dropout_p=dropout_p,
                                           num_layers=3 + num_layers)
-        elif encoder_type.lower() == 'time-depth-separable':
+        elif encoder_type.lower() == 'residual_conv_bn':
+            if use_prenet:
+                self.pre = nn.Sequential(
+                    nn.Conv1d(hidden_channels, hidden_channels, 1),
+                    nn.ReLU())
+            dilations = 4 * [1, 2, 4] + [1]
+            num_conv_blocks = 2
+            num_res_blocks = 13  # total 2 * 13 blocks
+            self.encoder = ResidualConvBNBlock(hidden_channels,
+                                               kernel_size=4,
+                                               dilations=dilations,
+                                               num_res_blocks=num_res_blocks,
+                                               num_conv_blocks=num_conv_blocks)
+        elif encoder_type.lower() == 'time_depth_separable':
             # optional convolutional prenet
             if use_prenet:
                 self.pre = ConvLayerNorm(hidden_channels,
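For context on the defaults above: each ConvBN layer with kernel size 4 and dilation d widens the receptive field by 3d frames, and every residual block stacks two such layers. With dilations 4 * [1, 2, 4] + [1] the dilation sum is 29, so the full encoder sees roughly 1 + 2 * 3 * 29 = 175 input frames around each output frame (my arithmetic, not a figure from the commit).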

View File

@@ -2,7 +2,7 @@ import torch
 from torch import nn
 from torch.nn import functional as F
-from .normalization import LayerNorm
+from ..generic.normalization import LayerNorm


 class ConvLayerNorm(nn.Module):

View File

@@ -141,7 +141,7 @@ class GlowTts(nn.Module):
         o_mean, o_log_scale, o_dur_log, x_mask = self.encoder(x,
                                                               x_lengths,
                                                               g=g)
-        # format feature vectors and feature vector lenghts
+        # drop residual frames wrt num_sqz and set y_lengths.
         y, y_lengths, y_max_length, attn = self.preprocess(
             y, y_lengths, y_max_length, None)
         # create masks
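The rewritten comment describes what preprocess does. A rough sketch of that logic, under assumed names (num_sqz is the decoder's squeeze factor; the real attribute and variable names in the repo may differ):

    # trim trailing mel frames so the length is divisible by num_sqz,
    # then shrink y_lengths to the same multiple
    y_max_length = (y_max_length // num_sqz) * num_sqz
    y = y[:, :, :y_max_length]
    y_lengths = (y_lengths // num_sqz) * num_sqz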