diff --git a/TTS/bin/train_vocoder.py b/TTS/bin/train_vocoder.py index 68f6982a..dc081a5e 100644 --- a/TTS/bin/train_vocoder.py +++ b/TTS/bin/train_vocoder.py @@ -1,23 +1,19 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - import argparse import glob import os import sys import time import traceback -from inspect import signature import torch from torch.utils.data import DataLoader -from TTS.utils.generic_utils import (KeepAverage, count_parameters, - create_experiment_folder, - get_git_branch, - remove_experiment_folder, - set_init_dict) +from inspect import signature + from TTS.utils.audio import AudioProcessor +from TTS.utils.generic_utils import (KeepAverage, count_parameters, + create_experiment_folder, get_git_branch, + remove_experiment_folder, set_init_dict) from TTS.utils.io import copy_config_file, load_config from TTS.utils.radam import RAdam from TTS.utils.tensorboard_logger import TensorboardLogger @@ -27,11 +23,12 @@ from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data # from distribute import (DistributedSampler, apply_gradient_allreduce, # init_distributed, reduce_tensor) from TTS.vocoder.layers.losses import DiscriminatorLoss, GeneratorLoss +from TTS.vocoder.utils.io import save_checkpoint, save_best_model from TTS.vocoder.utils.console_logger import ConsoleLogger from TTS.vocoder.utils.generic_utils import (check_config, plot_results, setup_discriminator, setup_generator) -from TTS.vocoder.utils.io import save_best_model, save_checkpoint + use_cuda, num_gpus = setup_torch_training_env(True, True) @@ -127,6 +124,7 @@ def train(model_G, criterion_G, optimizer_G, model_D, criterion_D, optimizer_D, y_hat_vis = y_hat y_G_sub = model_G.pqmf_analysis(y_G) + scores_fake, feats_fake, feats_real = None, None, None if global_step > c.steps_to_start_discriminator: # run D with or without cond. features @@ -149,8 +147,6 @@ def train(model_G, criterion_G, optimizer_G, model_D, criterion_D, optimizer_D, _, feats_real = D_out_real else: scores_fake = D_out_fake - else: - scores_fake, feats_fake, feats_real = None, None, None # compute losses loss_G_dict = criterion_G(y_hat, y_G, scores_fake, feats_fake, @@ -331,6 +327,7 @@ def evaluate(model_G, criterion_G, model_D, criterion_D, ap, global_step, epoch) y_G_sub = model_G.pqmf_analysis(y_G) + scores_fake, feats_fake, feats_real = None, None, None if global_step > c.steps_to_start_discriminator: if len(signature(model_D.forward).parameters) == 2: @@ -352,8 +349,7 @@ def evaluate(model_G, criterion_G, model_D, criterion_D, ap, global_step, epoch) _, feats_real = D_out_real else: scores_fake = D_out_fake - else: - scores_fake, feats_fake, feats_real = None, None, None + feats_fake, feats_real = None, None # compute losses loss_G_dict = criterion_G(y_hat, y_G, scores_fake, feats_fake, diff --git a/vocoder/configs/parallel_wavegan_config.json b/vocoder/configs/parallel_wavegan_config.json deleted file mode 100644 index fcd765bd..00000000 --- a/vocoder/configs/parallel_wavegan_config.json +++ /dev/null @@ -1,143 +0,0 @@ -{ - "run_name": "pwgan", - "run_description": "parallel-wavegan training", - - // AUDIO PARAMETERS - "audio":{ - "fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame. - "win_length": 1024, // stft window length in ms. - "hop_length": 256, // stft window hop-lengh in ms. - "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used. - "frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used. - - // Audio processing parameters - "sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled. - "preemphasis": 0.0, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis. - "ref_level_db": 0, // reference level db, theoretically 20db is the sound of air. - - // Silence trimming - "do_trim_silence": true,// enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true) - "trim_db": 60, // threshold for timming silence. Set this according to your dataset. - - // MelSpectrogram parameters - "num_mels": 80, // size of the mel spec frame. - "mel_fmin": 50.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!! - "mel_fmax": 7600.0, // maximum freq level for mel-spec. Tune for dataset!! - "spec_gain": 1.0, // scaler value appplied after log transform of spectrogram. - - // Normalization parameters - "signal_norm": true, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params. - "min_level_db": -100, // lower bound for normalization - "symmetric_norm": true, // move normalization to range [-1, 1] - "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm] - "clip_norm": true, // clip normalized values into the range. - "stats_path": "/home/erogol/Data/LJSpeech-1.1/scale_stats.npy" // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored - }, - - // DISTRIBUTED TRAINING - // "distributed":{ - // "backend": "nccl", - // "url": "tcp:\/\/localhost:54321" - // }, - - // MODEL PARAMETERS - "use_pqmf": true, - - // LOSS PARAMETERS - "use_stft_loss": true, - "use_subband_stft_loss": false, // USE ONLY WITH MULTIBAND MODELS - "use_mse_gan_loss": true, - "use_hinge_gan_loss": false, - "use_feat_match_loss": false, // use only with melgan discriminators - - // loss weights - "stft_loss_weight": 0.5, - "subband_stft_loss_weight": 0.5, - "mse_G_loss_weight": 2.5, - "hinge_G_loss_weight": 2.5, - "feat_match_loss_weight": 25, - - // multiscale stft loss parameters - "stft_loss_params": { - "n_ffts": [1024, 2048, 512], - "hop_lengths": [120, 240, 50], - "win_lengths": [600, 1200, 240] - }, - - // subband multiscale stft loss parameters - "subband_stft_loss_params":{ - "n_ffts": [384, 683, 171], - "hop_lengths": [30, 60, 10], - "win_lengths": [150, 300, 60] - }, - - "target_loss": "avg_G_loss", // loss value to pick the best model to save after each epoch - - // DISCRIMINATOR - "discriminator_model": "parallel_wavegan_discriminator", - "discriminator_model_params":{ - "num_layers": 10 - }, - "steps_to_start_discriminator": 200000, // steps required to start GAN trainining.1 - - // GENERATOR - "generator_model": "parallel_wavegan_generator", - "generator_model_params": { - "upsample_factors":[4, 4, 4, 4], - "stacks": 3, - "num_res_blocks": 30 - }, - - // DATASET - "data_path": "/home/erogol/Data/LJSpeech-1.1/wavs/", - "feature_path": null, - "seq_len": 25600, - "pad_short": 2000, - "conv_pad": 0, - "use_noise_augment": false, - "use_cache": true, - - "reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers. - - // TRAINING - "batch_size": 6, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'. - - // VALIDATION - "run_eval": true, - "test_delay_epochs": 10, //Until attention is aligned, testing only wastes computation time. - "test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences. - - // OPTIMIZER - "epochs": 10000, // total number of epochs to train. - "wd": 0.0, // Weight decay weight. - "gen_clip_grad": -1, // Generator gradient clipping threshold. Apply gradient clipping if > 0 - "disc_clip_grad": -1, // Discriminator gradient clipping threshold. - "lr_scheduler_gen": "MultiStepLR", // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate - "lr_scheduler_gen_params": { - "gamma": 0.5, - "milestones": [100000, 200000, 300000, 400000, 500000, 600000] - }, - "lr_scheduler_disc": "MultiStepLR", // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate - "lr_scheduler_disc_params": { - "gamma": 0.5, - "milestones": [100000, 200000, 300000, 400000, 500000, 600000] - }, - "lr_gen": 1e-4, // Initial learning rate. If Noam decay is active, maximum learning rate. - "lr_disc": 1e-4, - - // TENSORBOARD and LOGGING - "print_step": 25, // Number of steps to log traning on console. - "print_eval": false, // If True, it prints loss values for each step in eval run. - "save_step": 25000, // Number of training steps expected to plot training stats on TB and save model checkpoints. - "checkpoint": true, // If true, it saves checkpoints per "save_step" - "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. - - // DATA LOADING - "num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values. - "num_val_loader_workers": 4, // number of evaluation data loader processes. - "eval_split_size": 10, - - // PATHS - "output_path": "/home/erogol/Models/LJSpeech/" -} - diff --git a/vocoder/layers/parallel_wavegan.py b/vocoder/layers/parallel_wavegan.py deleted file mode 100644 index 35a56e8d..00000000 --- a/vocoder/layers/parallel_wavegan.py +++ /dev/null @@ -1,75 +0,0 @@ -import torch -from torch.nn import functional as F - - -class ResidualBlock(torch.nn.Module): - """Residual block module in WaveNet.""" - - def __init__(self, - kernel_size=3, - res_channels=64, - gate_channels=128, - skip_channels=64, - aux_channels=80, - dropout=0.0, - dilation=1, - bias=True, - use_causal_conv=False - ): - super(ResidualBlock, self).__init__() - self.dropout = dropout - # no future time stamps available - if use_causal_conv: - padding = (kernel_size - 1) * dilation - else: - assert (kernel_size - 1) % 2 == 0, "Not support even number kernel size." - padding = (kernel_size - 1) // 2 * dilation - self.use_causal_conv = use_causal_conv - - # dilation conv - self.conv = torch.nn.Conv1d(res_channels, gate_channels, kernel_size, - padding=padding, dilation=dilation, bias=bias) - - # local conditioning - if aux_channels > 0: - self.conv1x1_aux = torch.nn.Conv1d(aux_channels, gate_channels, 1, bias=False) - else: - self.conv1x1_aux = None - - # conv output is split into two groups - gate_out_channels = gate_channels // 2 - self.conv1x1_out = torch.nn.Conv1d(gate_out_channels, res_channels, 1, bias=bias) - self.conv1x1_skip = torch.nn.Conv1d(gate_out_channels, skip_channels, 1, bias=bias) - - def forward(self, x, c): - """ - x: B x D_res x T - c: B x D_aux x T - """ - residual = x - x = F.dropout(x, p=self.dropout, training=self.training) - x = self.conv(x) - - # remove future time steps if use_causal_conv conv - x = x[:, :, :residual.size(-1)] if self.use_causal_conv else x - - # split into two part for gated activation - splitdim = 1 - xa, xb = x.split(x.size(splitdim) // 2, dim=splitdim) - - # local conditioning - if c is not None: - assert self.conv1x1_aux is not None - c = self.conv1x1_aux(c) - ca, cb = c.split(c.size(splitdim) // 2, dim=splitdim) - xa, xb = xa + ca, xb + cb - - x = torch.tanh(xa) * torch.sigmoid(xb) - - # for skip connection - s = self.conv1x1_skip(x) - - # for residual connection - x = (self.conv1x1_out(x) + residual) * (0.5 ** 2) - - return x, s diff --git a/vocoder/layers/upsample.py b/vocoder/layers/upsample.py deleted file mode 100644 index 1f70c9f6..00000000 --- a/vocoder/layers/upsample.py +++ /dev/null @@ -1,100 +0,0 @@ -import numpy as np -import torch -from torch.nn import functional as F - - -class Stretch2d(torch.nn.Module): - def __init__(self, x_scale, y_scale, mode="nearest"): - super(Stretch2d, self).__init__() - self.x_scale = x_scale - self.y_scale = y_scale - self.mode = mode - - def forward(self, x): - """ - x (Tensor): Input tensor (B, C, F, T). - Tensor: Interpolated tensor (B, C, F * y_scale, T * x_scale), - """ - return F.interpolate( - x, scale_factor=(self.y_scale, self.x_scale), mode=self.mode) - - -class UpsampleNetwork(torch.nn.Module): - def __init__(self, - upsample_factors, - nonlinear_activation=None, - nonlinear_activation_params={}, - interpolate_mode="nearest", - freq_axis_kernel_size=1, - use_causal_conv=False, - ): - super(UpsampleNetwork, self).__init__() - self.use_causal_conv = use_causal_conv - self.up_layers = torch.nn.ModuleList() - for scale in upsample_factors: - # interpolation layer - stretch = Stretch2d(scale, 1, interpolate_mode) - self.up_layers += [stretch] - - # conv layer - assert (freq_axis_kernel_size - 1) % 2 == 0, "Not support even number freq axis kernel size." - freq_axis_padding = (freq_axis_kernel_size - 1) // 2 - kernel_size = (freq_axis_kernel_size, scale * 2 + 1) - if use_causal_conv: - padding = (freq_axis_padding, scale * 2) - else: - padding = (freq_axis_padding, scale) - conv = torch.nn.Conv2d(1, 1, kernel_size=kernel_size, padding=padding, bias=False) - self.up_layers += [conv] - - # nonlinear - if nonlinear_activation is not None: - nonlinear = getattr(torch.nn, nonlinear_activation)(**nonlinear_activation_params) - self.up_layers += [nonlinear] - - def forward(self, c): - """ - c : (B, C, T_in). - Tensor: (B, C, T_upsample) - """ - c = c.unsqueeze(1) # (B, 1, C, T) - for f in self.up_layers: - c = f(c) - return c.squeeze(1) # (B, C, T') - - -class ConvUpsample(torch.nn.Module): - def __init__(self, - upsample_factors, - nonlinear_activation=None, - nonlinear_activation_params={}, - interpolate_mode="nearest", - freq_axis_kernel_size=1, - aux_channels=80, - aux_context_window=0, - use_causal_conv=False - ): - super(ConvUpsample, self).__init__() - self.aux_context_window = aux_context_window - self.use_causal_conv = use_causal_conv and aux_context_window > 0 - # To capture wide-context information in conditional features - kernel_size = aux_context_window + 1 if use_causal_conv else 2 * aux_context_window + 1 - # NOTE(kan-bayashi): Here do not use padding because the input is already padded - self.conv_in = torch.nn.Conv1d(aux_channels, aux_channels, kernel_size=kernel_size, bias=False) - self.upsample = UpsampleNetwork( - upsample_factors=upsample_factors, - nonlinear_activation=nonlinear_activation, - nonlinear_activation_params=nonlinear_activation_params, - interpolate_mode=interpolate_mode, - freq_axis_kernel_size=freq_axis_kernel_size, - use_causal_conv=use_causal_conv, - ) - - def forward(self, c): - """ - c : (B, C, T_in). - Tensor: (B, C, T_upsampled), - """ - c_ = self.conv_in(c) - c = c_[:, :, :-self.aux_context_window] if self.use_causal_conv else c_ - return self.upsample(c) diff --git a/vocoder/models/parallel_wavegan_discriminator.py b/vocoder/models/parallel_wavegan_discriminator.py deleted file mode 100644 index de03ccdb..00000000 --- a/vocoder/models/parallel_wavegan_discriminator.py +++ /dev/null @@ -1,192 +0,0 @@ -import math -import torch -from torch import nn -from torch.nn import functional as F - -from TTS.vocoder.layers.parallel_wavegan import ResidualBlock - - -class ParallelWaveganDiscriminator(nn.Module): - """PWGAN discriminator as in https://arxiv.org/abs/1910.11480. - It classifies each audio window real/fake and returns a sequence - of predictions. - It is a stack of convolutional blocks with dilation. - """ - - def __init__(self, - in_channels=1, - out_channels=1, - kernel_size=3, - num_layers=10, - conv_channels=64, - dilation_factor=1, - nonlinear_activation="LeakyReLU", - nonlinear_activation_params={"negative_slope": 0.2}, - bias=True, - ): - super(ParallelWaveganDiscriminator, self).__init__() - assert (kernel_size - 1) % 2 == 0, " [!] does not support even number kernel size." - assert dilation_factor > 0, " [!] dilation factor must be > 0." - self.conv_layers = nn.ModuleList() - conv_in_channels = in_channels - for i in range(num_layers - 1): - if i == 0: - dilation = 1 - else: - dilation = i if dilation_factor == 1 else dilation_factor ** i - conv_in_channels = conv_channels - padding = (kernel_size - 1) // 2 * dilation - conv_layer = [ - nn.Conv1d(conv_in_channels, conv_channels, - kernel_size=kernel_size, padding=padding, - dilation=dilation, bias=bias), - getattr(nn, nonlinear_activation)(inplace=True, **nonlinear_activation_params) - ] - self.conv_layers += conv_layer - padding = (kernel_size - 1) // 2 - last_conv_layer = nn.Conv1d( - conv_in_channels, out_channels, - kernel_size=kernel_size, padding=padding, bias=bias) - self.conv_layers += [last_conv_layer] - self.apply_weight_norm() - - def forward(self, x): - """ - x : (B, 1, T). - Returns: - Tensor: (B, 1, T) - """ - for f in self.conv_layers: - x = f(x) - return x - - def apply_weight_norm(self): - def _apply_weight_norm(m): - if isinstance(m, torch.nn.Conv1d) or isinstance(m, torch.nn.Conv2d): - torch.nn.utils.weight_norm(m) - self.apply(_apply_weight_norm) - - def remove_weight_norm(self): - def _remove_weight_norm(m): - try: - # print(f"Weight norm is removed from {m}.") - nn.utils.remove_weight_norm(m) - except ValueError: # this module didn't have weight norm - return - self.apply(_remove_weight_norm) - - -class ResidualParallelWaveganDiscriminator(nn.Module): - def __init__(self, - in_channels=1, - out_channels=1, - kernel_size=3, - num_layers=30, - stacks=3, - res_channels=64, - gate_channels=128, - skip_channels=64, - dropout=0.0, - bias=True, - nonlinear_activation="LeakyReLU", - nonlinear_activation_params={"negative_slope": 0.2}, - ): - super(ResidualParallelWaveganDiscriminator, self).__init__() - assert (kernel_size - 1) % 2 == 0, "Not support even number kernel size." - - self.in_channels = in_channels - self.out_channels = out_channels - self.num_layers = num_layers - self.stacks = stacks - self.kernel_size = kernel_size - self.res_factor = math.sqrt(1.0 / num_layers) - - # check the number of num_layers and stacks - assert num_layers % stacks == 0 - layers_per_stack = num_layers // stacks - - # define first convolution - self.first_conv = nn.Sequential( - nn.Conv1d(in_channels, - res_channels, - kernel_size=1, - padding=0, - dilation=1, - bias=True), - getattr(nn, nonlinear_activation)(inplace=True, - **nonlinear_activation_params), - ) - - # define residual blocks - self.conv_layers = nn.ModuleList() - for layer in range(num_layers): - dilation = 2 ** (layer % layers_per_stack) - conv = ResidualBlock( - kernel_size=kernel_size, - res_channels=res_channels, - gate_channels=gate_channels, - skip_channels=skip_channels, - aux_channels=-1, - dilation=dilation, - dropout=dropout, - bias=bias, - use_causal_conv=False, - ) - self.conv_layers += [conv] - - # define output layers - self.last_conv_layers = nn.ModuleList([ - getattr(nn, nonlinear_activation)(inplace=True, - **nonlinear_activation_params), - nn.Conv1d(skip_channels, - skip_channels, - kernel_size=1, - padding=0, - dilation=1, - bias=True), - getattr(nn, nonlinear_activation)(inplace=True, - **nonlinear_activation_params), - nn.Conv1d(skip_channels, - out_channels, - kernel_size=1, - padding=0, - dilation=1, - bias=True), - ]) - - # apply weight norm - self.apply_weight_norm() - - def forward(self, x): - """ - x: (B, 1, T). - """ - x = self.first_conv(x) - - skips = 0 - for f in self.conv_layers: - x, h = f(x, None) - skips += h - skips *= self.res_factor - - # apply final layers - x = skips - for f in self.last_conv_layers: - x = f(x) - return x - - def apply_weight_norm(self): - def _apply_weight_norm(m): - if isinstance(m, torch.nn.Conv1d) or isinstance(m, torch.nn.Conv2d): - torch.nn.utils.weight_norm(m) - self.apply(_apply_weight_norm) - - def remove_weight_norm(self): - def _remove_weight_norm(m): - try: - print(f"Weight norm is removed from {m}.") - nn.utils.remove_weight_norm(m) - except ValueError: # this module didn't have weight norm - return - - self.apply(_remove_weight_norm) diff --git a/vocoder/models/parallel_wavegan_generator.py b/vocoder/models/parallel_wavegan_generator.py deleted file mode 100644 index 56316a41..00000000 --- a/vocoder/models/parallel_wavegan_generator.py +++ /dev/null @@ -1,162 +0,0 @@ -import math -import numpy as np -import torch -from torch.nn.utils import weight_norm - -from TTS.vocoder.layers.parallel_wavegan import ResidualBlock -from TTS.vocoder.layers.upsample import ConvUpsample - - -class ParallelWaveganGenerator(torch.nn.Module): - """PWGAN generator as in https://arxiv.org/pdf/1910.11480.pdf. - It is similar to WaveNet with no causal convolution. - It is conditioned on an aux feature (spectrogram) to generate - an output waveform from an input noise. - """ - def __init__(self, - in_channels=1, - out_channels=1, - kernel_size=3, - num_res_blocks=30, - stacks=3, - res_channels=64, - gate_channels=128, - skip_channels=64, - aux_channels=80, - aux_context_window=2, - dropout=0.0, - bias=True, - use_weight_norm=True, - use_causal_conv=False, - upsample_conditional_features=True, - upsample_net="ConvInUpsampleNetwork", - upsample_factors=[4, 4, 4, 4], - inference_padding=2): - - super(ParallelWaveganGenerator, self).__init__() - self.in_channels = in_channels - self.out_channels = out_channels - self.aux_channels = aux_channels - self.num_res_blocks = num_res_blocks - self.stacks = stacks - self.kernel_size = kernel_size - self.upsample_factors = upsample_factors - self.upsample_scale = np.prod(upsample_factors) - self.inference_padding = inference_padding - - # check the number of layers and stacks - assert num_res_blocks % stacks == 0 - layers_per_stack = num_res_blocks // stacks - - # define first convolution - self.first_conv = torch.nn.Conv1d(in_channels, - res_channels, - kernel_size=1, - bias=True) - - # define conv + upsampling network - self.upsample_net = ConvUpsample(upsample_factors=upsample_factors) - - # define residual blocks - self.conv_layers = torch.nn.ModuleList() - for layer in range(num_res_blocks): - dilation = 2**(layer % layers_per_stack) - conv = ResidualBlock( - kernel_size=kernel_size, - res_channels=res_channels, - gate_channels=gate_channels, - skip_channels=skip_channels, - aux_channels=aux_channels, - dilation=dilation, - dropout=dropout, - bias=bias, - ) - self.conv_layers += [conv] - - # define output layers - self.last_conv_layers = torch.nn.ModuleList([ - torch.nn.ReLU(inplace=True), - torch.nn.Conv1d(skip_channels, - skip_channels, - kernel_size=1, - bias=True), - torch.nn.ReLU(inplace=True), - torch.nn.Conv1d(skip_channels, - out_channels, - kernel_size=1, - bias=True), - ]) - - # apply weight norm - if use_weight_norm: - self.apply_weight_norm() - - def forward(self, c): - """ - c: (B, C ,T'). - o: Output tensor (B, out_channels, T) - """ - # random noise - x = torch.randn([c.shape[0], 1, c.shape[2] * self.upsample_scale]) - x = x.to(self.first_conv.bias.device) - - # perform upsampling - if c is not None and self.upsample_net is not None: - c = self.upsample_net(c) - assert c.shape[-1] == x.shape[ - -1], f" [!] Upsampling scale does not match the expected output. {c.shape} vs {x.shape}" - - # encode to hidden representation - x = self.first_conv(x) - skips = 0 - for f in self.conv_layers: - x, h = f(x, c) - skips += h - skips *= math.sqrt(1.0 / len(self.conv_layers)) - - # apply final layers - x = skips - for f in self.last_conv_layers: - x = f(x) - - return x - - def inference(self, c): - c = c.to(self.first_conv.weight.device) - c = torch.nn.functional.pad( - c, (self.inference_padding, self.inference_padding), 'replicate') - return self.forward(c) - - def remove_weight_norm(self): - def _remove_weight_norm(m): - try: - # print(f"Weight norm is removed from {m}.") - torch.nn.utils.remove_weight_norm(m) - except ValueError: # this module didn't have weight norm - return - - self.apply(_remove_weight_norm) - - def apply_weight_norm(self): - def _apply_weight_norm(m): - if isinstance(m, torch.nn.Conv1d) or isinstance( - m, torch.nn.Conv2d): - torch.nn.utils.weight_norm(m) - # print(f"Weight norm is applied to {m}.") - - self.apply(_apply_weight_norm) - - @staticmethod - def _get_receptive_field_size(layers, - stacks, - kernel_size, - dilation=lambda x: 2**x): - assert layers % stacks == 0 - layers_per_cycle = layers // stacks - dilations = [dilation(i % layers_per_cycle) for i in range(layers)] - return (kernel_size - 1) * sum(dilations) + 1 - - @property - def receptive_field_size(self): - return self._get_receptive_field_size(self.layers, self.stacks, - self.kernel_size) diff --git a/vocoder/tests/test_parallel_wavegan_discriminator.py b/vocoder/tests/test_parallel_wavegan_discriminator.py deleted file mode 100644 index b496e216..00000000 --- a/vocoder/tests/test_parallel_wavegan_discriminator.py +++ /dev/null @@ -1,41 +0,0 @@ -import numpy as np -import torch - -from TTS.vocoder.models.parallel_wavegan_discriminator import ParallelWaveganDiscriminator, ResidualParallelWaveganDiscriminator - - -def test_pwgan_disciminator(): - model = ParallelWaveganDiscriminator( - in_channels=1, - out_channels=1, - kernel_size=3, - num_layers=10, - conv_channels=64, - dilation_factor=1, - nonlinear_activation="LeakyReLU", - nonlinear_activation_params={"negative_slope": 0.2}, - bias=True) - dummy_x = torch.rand((4, 1, 64 * 256)) - output = model(dummy_x) - assert np.all(output.shape == (4, 1, 64 * 256)) - model.remove_weight_norm() - - -def test_redisual_pwgan_disciminator(): - model = ResidualParallelWaveganDiscriminator( - in_channels=1, - out_channels=1, - kernel_size=3, - num_layers=30, - stacks=3, - res_channels=64, - gate_channels=128, - skip_channels=64, - dropout=0.0, - bias=True, - nonlinear_activation="LeakyReLU", - nonlinear_activation_params={"negative_slope": 0.2}) - dummy_x = torch.rand((4, 1, 64 * 256)) - output = model(dummy_x) - assert np.all(output.shape == (4, 1, 64 * 256)) - model.remove_weight_norm() diff --git a/vocoder/tests/test_parallel_wavegan_generator.py b/vocoder/tests/test_parallel_wavegan_generator.py deleted file mode 100644 index f904ed24..00000000 --- a/vocoder/tests/test_parallel_wavegan_generator.py +++ /dev/null @@ -1,30 +0,0 @@ -import numpy as np -import torch - -from TTS.vocoder.models.parallel_wavegan_generator import ParallelWaveganGenerator - - -def test_pwgan_generator(): - model = ParallelWaveganGenerator( - in_channels=1, - out_channels=1, - kernel_size=3, - num_res_blocks=30, - stacks=3, - res_channels=64, - gate_channels=128, - skip_channels=64, - aux_channels=80, - aux_context_window=2, - dropout=0.0, - bias=True, - use_weight_norm=True, - use_causal_conv=False, - upsample_conditional_features=True, - upsample_factors=[4, 4, 4, 4]) - dummy_c = torch.rand((4, 80, 64)) - output = model(dummy_c) - assert np.all(output.shape == (4, 1, 64 * 256)) - model.remove_weight_norm() - output = model.inference(dummy_c) - assert np.all(output.shape == (4, 1, (64 + 4) * 256)) diff --git a/vocoder/tests/test_tf_melgan_generator.py b/vocoder/tests/test_tf_melgan_generator.py deleted file mode 100644 index 40a167a2..00000000 --- a/vocoder/tests/test_tf_melgan_generator.py +++ /dev/null @@ -1,12 +0,0 @@ -import numpy as np -import tensorflow as tf - -from TTS.vocoder.tf.models.melgan_generator import MelganGenerator - -def test_melgan_generator(): - hop_length = 256 - model = MelganGenerator() - dummy_input = tf.random.uniform((4, 80, 64)) - output = model(dummy_input, training=False) - assert np.all(output.shape == (4, 1, 64 * hop_length)), output.shape - diff --git a/vocoder/tests/test_tf_pqmf.py b/vocoder/tests/test_tf_pqmf.py deleted file mode 100644 index 75f00d5f..00000000 --- a/vocoder/tests/test_tf_pqmf.py +++ /dev/null @@ -1,29 +0,0 @@ -import os -import tensorflow as tf - -import soundfile as sf -from librosa.core import load - -from TTS.tests import get_tests_path, get_tests_input_path -from TTS.vocoder.tf.layers.pqmf import PQMF - - -TESTS_PATH = get_tests_path() -WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav") - - -def test_pqmf(): - w, sr = load(WAV_FILE) - - layer = PQMF(N=4, taps=62, cutoff=0.15, beta=9.0) - w, sr = load(WAV_FILE) - w2 = tf.convert_to_tensor(w[None, None, :]) - b2 = layer.analysis(w2) - w2_ = layer.synthesis(b2) - w2_ = w2.numpy() - - print(w2_.max()) - print(w2_.min()) - print(w2_.mean()) - sf.write('tf_pqmf_output.wav', w2_.flatten(), sr) -