mirror of https://github.com/coqui-ai/TTS.git
implement residual BN convolution and add it as an alternative encoder for glow-tts. also move generic layers to layers/generic
This commit is contained in:
parent 973754d893 · commit 7b20d8cbd3
glow-TTS training config (JSON):

@@ -1,7 +1,7 @@
 {
     "model": "glow_tts",
-    "run_name": "glow-tts-tdsep-conv",
-    "run_description": "glow-tts model training with time-depth separable conv encoder.",
+    "run_name": "glow-tts-residual_bn_conv",
+    "run_description": "glow-tts model training with residual BN conv.",

     // AUDIO PARAMETERS
     "audio":{
@@ -28,7 +28,7 @@
     "num_mels": 80,      // size of the mel spec frame.
     "mel_fmin": 50.0,    // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
     "mel_fmax": 7600.0,  // maximum freq level for mel-spec. Tune for dataset!!
-    "spec_gain": 1.0,    // scaler value applied after log transform of spectrogram.
+    "spec_gain": 1.0,    // scaler value applied after log transform of spectrogram.

     // Normalization parameters
     "signal_norm": true, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params.
@@ -62,13 +62,15 @@
     "reinit_layers": [],    // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers.

     // MODEL PARAMETERS
-    "use_mas": false,       // use Monotonic Alignment Search if true. Otherwise use pre-computed attention alignments.
+    // "use_mas": false,    // use Monotonic Alignment Search if true. Otherwise use pre-computed attention alignments.

     // TRAINING
     "batch_size": 32,       // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
     "eval_batch_size": 16,
     "r": 1,                 // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled.
     "loss_masking": true,   // enable / disable loss masking against the sequence padding.
+    "mixed_precision": true,
+    "data_dep_init_iter": 10,

     // VALIDATION
     "run_eval": true,
@@ -84,7 +86,7 @@
     "warmup_steps": 4000,   // Noam decay steps to increase the learning rate from 0 to "lr"
     "seq_len_norm": false,  // Normalize each sample loss with its length to alleviate imbalanced datasets. Use it if your dataset is small or has skewed distribution of sequence lengths.

-    "encoder_type": "time-depth-separable",
+    "encoder_type": "residual_conv_bn",

     // TENSORBOARD and LOGGING
     "print_step": 25,       // Number of steps to log training on console.
@@ -93,7 +95,6 @@
     "save_step": 5000,      // Number of training steps expected to save training stats and checkpoints.
     "checkpoint": true,     // If true, it saves checkpoints per "save_step"
     "tb_model_param_stats": false,  // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
-    "apex_amp_level": null,

     // DATA LOADING
     "text_cleaner": "phoneme_cleaners",
@@ -104,6 +105,7 @@
     "min_seq_len": 3,       // DATASET-RELATED: minimum text length to use in training
     "max_seq_len": 500,     // DATASET-RELATED: maximum text length
    "compute_f0": false,    // compute f0 values in data-loader
+    "compute_input_seq_cache": true,

     // PATHS
     "output_path": "/home/erogol/Models/LJSpeech/",
@@ -115,6 +117,7 @@

     // MULTI-SPEAKER and GST
     "use_speaker_embedding": false,     // use speaker embedding to enable multi-speaker learning.
+    "use_external_speaker_embedding_file": false,
     "style_wav_for_test": null,         // path to style wav file to be used in TacotronGST inference.
     "use_gst": false,                   // TACOTRON ONLY: use global style tokens
TTS/tts/layers/generic/res_conv_bn.py (new file):

@@ -0,0 +1,66 @@
+import torch
+from torch import nn
+from .normalization import TemporalBatchNorm1d
+
+
+class ZeroTemporalPad(nn.ZeroPad2d):
+    """Pad sequences to equal length in the temporal dimension."""
+    def __init__(self, kernel_size, dilation):
+        total_pad = (dilation * (kernel_size - 1))
+        begin = total_pad // 2
+        end = total_pad - begin
+        super().__init__((0, 0, begin, end))
+
+
+class ConvBN(nn.Module):
+    def __init__(self, channels, kernel_size, dilation):
+        super().__init__()
+        padding = (dilation * (kernel_size - 1))
+        pad_s = padding // 2
+        pad_e = padding - pad_s
+        self.conv1d = nn.Conv1d(channels, channels, kernel_size, dilation=dilation)
+        self.pad = nn.ZeroPad2d((pad_s, pad_e, 0, 0))  # uneven left and right padding
+        self.norm = nn.BatchNorm1d(channels)
+
+    def forward(self, x):
+        o = self.conv1d(x)
+        o = self.pad(o)
+        o = self.norm(o)
+        o = nn.functional.relu(o)
+        return o
+
+
+class ConvBNBlock(nn.Module):
+    """Stacks ConvBN (conv -> pad -> batch norm -> ReLU) num_conv_blocks times."""
+
+    def __init__(self, channels, kernel_size, dilation, num_conv_blocks=2):
+        super().__init__()
+        self.conv_bn_blocks = nn.Sequential(*[
+            ConvBN(channels, kernel_size, dilation)
+            for _ in range(num_conv_blocks)
+        ])
+
+    def forward(self, x):
+        """
+        Shapes:
+            x: (B, D, T)
+        """
+        return self.conv_bn_blocks(x)
+
+
+class ResidualConvBNBlock(nn.Module):
+    def __init__(self, channels, kernel_size, dilations, num_res_blocks=13, num_conv_blocks=2):
+        super().__init__()
+        assert len(dilations) == num_res_blocks
+        self.res_blocks = nn.ModuleList()
+        for dilation in dilations:
+            block = ConvBNBlock(channels, kernel_size, dilation, num_conv_blocks)
+            self.res_blocks.append(block)
+
+    def forward(self, x, x_mask=None):
+        o = x
+        for block in self.res_blocks:
+            res = o
+            o = block(o * x_mask if x_mask is not None else o)
+            o = o + res
+        return o
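As a quick sanity check of ConvBN's padding arithmetic: the convolution runs without padding, so it shortens the sequence by dilation * (kernel_size - 1) frames, and the ZeroPad2d that follows adds exactly that many frames back, so the temporal length is preserved even with an even kernel size. A minimal sketch (channel count and shapes are illustrative, not from the commit):

    import torch

    # ConvBN as defined above: conv (no padding) -> zero pad (uneven if needed) -> BN -> ReLU.
    layer = ConvBN(channels=192, kernel_size=4, dilation=2)
    x = torch.randn(8, 192, 100)   # (B, D, T)
    o = layer(x)
    # conv shortens T by dilation * (kernel_size - 1) = 6; the pad restores it.
    print(o.shape)                 # torch.Size([8, 192, 100])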
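And a usage sketch of the full residual stack with the settings the encoder passes below (a hidden size of 192 is an assumption; the mask broadcasts over channels):

    import torch

    # 13 residual blocks with the dilation schedule used by the encoder below.
    dilations = 4 * [1, 2, 4] + [1]
    stack = ResidualConvBNBlock(192, kernel_size=4, dilations=dilations,
                                num_res_blocks=13, num_conv_blocks=2)
    x = torch.randn(2, 192, 50)    # (B, D, T)
    x_mask = torch.ones(2, 1, 50)  # 1 = real frame, 0 = padding
    print(stack(x, x_mask).shape)  # torch.Size([2, 192, 50])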
TTS/tts/layers/glow_tts/decoder.py:

@@ -2,7 +2,7 @@ import torch
 from torch import nn

 from TTS.tts.layers.glow_tts.glow import InvConvNear, CouplingBlock
-from TTS.tts.layers.glow_tts.normalization import ActNorm
+from TTS.tts.layers.generic.normalization import ActNorm


 def squeeze(x, x_mask=None, num_sqz=2):
TTS/tts/layers/glow_tts/duration_predictor.py:

@@ -1,7 +1,7 @@
 import torch
 from torch import nn

-from .normalization import LayerNorm
+from ..generic.normalization import LayerNorm


 class DurationPredictor(nn.Module):
TTS/tts/layers/glow_tts/encoder.py:

@@ -3,11 +3,12 @@ import torch
 from torch import nn

 from TTS.tts.layers.glow_tts.transformer import Transformer
-from TTS.tts.layers.glow_tts.gated_conv import GatedConvBlock
+from TTS.tts.layers.generic.gated_conv import GatedConvBlock
 from TTS.tts.utils.generic_utils import sequence_mask
 from TTS.tts.layers.glow_tts.glow import ConvLayerNorm
 from TTS.tts.layers.glow_tts.duration_predictor import DurationPredictor
-from TTS.tts.layers.glow_tts.time_depth_sep_conv import TimeDepthSeparableConvBlock
+from TTS.tts.layers.generic.time_depth_sep_conv import TimeDepthSeparableConvBlock
+from TTS.tts.layers.generic.res_conv_bn import ResidualConvBNBlock


 class Encoder(nn.Module):
TTS/tts/layers/glow_tts/encoder.py (continued):

@@ -84,12 +85,26 @@ class Encoder(nn.Module):
                 dropout_p=dropout_p,
                 rel_attn_window_size=rel_attn_window_size,
                 input_length=input_length)
-        elif encoder_type.lower() == 'gatedconv':
+        elif encoder_type.lower() == 'gated_conv':
             self.encoder = GatedConvBlock(hidden_channels,
                                           kernel_size=5,
                                           dropout_p=dropout_p,
                                           num_layers=3 + num_layers)
-        elif encoder_type.lower() == 'time-depth-separable':
+        elif encoder_type.lower() == 'residual_conv_bn':
+            if use_prenet:
+                self.pre = nn.Sequential(
+                    nn.Conv1d(hidden_channels, hidden_channels, 1),
+                    nn.ReLU()
+                )
+            dilations = 4 * [1, 2, 4] + [1]
+            num_conv_blocks = 2
+            num_res_blocks = 13  # 13 residual blocks, each with 2 conv layers
+            self.encoder = ResidualConvBNBlock(hidden_channels,
+                                               kernel_size=4,
+                                               dilations=dilations,
+                                               num_res_blocks=num_res_blocks,
+                                               num_conv_blocks=num_conv_blocks)
+        elif encoder_type.lower() == 'time_depth_separable':
             # optional convolutional prenet
             if use_prenet:
                 self.pre = ConvLayerNorm(hidden_channels,
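For reference, the dilation schedule gives the stack a sizeable temporal receptive field; a back-of-envelope computation, assuming the 26 convolutions simply compose (which they do here, since every ConvBN is length-preserving and applied sequentially):

    # Each conv widens the receptive field by dilation * (kernel_size - 1);
    # every residual block applies its dilation twice (num_conv_blocks=2).
    kernel_size = 4
    dilations = 4 * [1, 2, 4] + [1]   # 13 residual blocks
    receptive_field = 1 + sum(2 * d * (kernel_size - 1) for d in dilations)
    print(receptive_field)            # 175 frames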
TTS/tts/layers/glow_tts/glow.py:

@@ -2,7 +2,7 @@ import torch
 from torch import nn
 from torch.nn import functional as F

-from .normalization import LayerNorm
+from ..generic.normalization import LayerNorm


 class ConvLayerNorm(nn.Module):
|
@ -141,7 +141,7 @@ class GlowTts(nn.Module):
|
|||
o_mean, o_log_scale, o_dur_log, x_mask = self.encoder(x,
|
||||
x_lengths,
|
||||
g=g)
|
||||
# format feature vectors and feature vector lenghts
|
||||
# drop redisual frames wrt num_sqz and set y_lengths.
|
||||
y, y_lengths, y_max_length, attn = self.preprocess(
|
||||
y, y_lengths, y_max_length, None)
|
||||
# create masks
|
||||
|
|