mass linter fix

erogol 2020-08-04 14:07:47 +02:00
parent f35504f187
commit e386caa071
62 changed files with 153 additions and 182 deletions

View File

@ -30,4 +30,3 @@ model = load_checkpoint(model, args.tf_model)
# create tflite model
tflite_model = convert_melgan_to_tflite(model, output_path=args.output_path)

View File

@ -114,4 +114,3 @@ assert compare_torch_tf(output_torch, output_tf) < 1e-5, compare_torch_tf(
save_checkpoint(model_tf, checkpoint['step'], checkpoint['epoch'],
args.output_path)
print(' > Model conversion is successfully completed :).')

View File

@ -92,7 +92,7 @@ var_map = [
# %%
# get tf_model graph
mel_pred = model_tf.build_inference()
model_tf.build_inference()
# get tf variables
tf_vars = model_tf.weights

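The hunk above drops an assignment whose result is never read, which is what pylint's `unused-variable` (W0612) check flags. A minimal sketch of the pattern, using hypothetical names:

```python
# Hypothetical names, illustrating pylint's unused-variable (W0612) fix.
def build_inference():
    """Stand-in for a graph-building step whose return value is not needed."""
    return "graph"

# Before: `mel_pred` is assigned but never read, so pylint flags it.
# mel_pred = build_inference()

# After: call purely for the side effect, leaving no dangling name.
build_inference()
```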
View File

@ -40,8 +40,6 @@ def tts(model, vocoder_model, text, CONFIG, use_cuda, ap, use_gl, speaker_id):
if __name__ == "__main__":
global symbols, phonemes
parser = argparse.ArgumentParser()
parser.add_argument('text', type=str, help='Text to generate speech.')
parser.add_argument('config_path',

View File

@ -9,6 +9,8 @@ import traceback
import torch
from torch.utils.data import DataLoader
from mozilla_voice_tts.generic_utils import count_parameters
from mozilla_voice_tts.speaker_encoder.dataset import MyDataset
from mozilla_voice_tts.speaker_encoder.generic_utils import save_best_model
from mozilla_voice_tts.speaker_encoder.loss import GE2ELoss
@ -16,10 +18,9 @@ from mozilla_voice_tts.speaker_encoder.model import SpeakerEncoder
from mozilla_voice_tts.speaker_encoder.visual import plot_embeddings
from mozilla_voice_tts.tts.datasets.preprocess import load_meta_data
from mozilla_voice_tts.tts.utils.audio import AudioProcessor
from mozilla_voice_tts.tts.utils.generic_utils import (create_experiment_folder,
get_git_branch,
remove_experiment_folder,
set_init_dict)
from mozilla_voice_tts.tts.utils.generic_utils import (
create_experiment_folder, get_git_branch, remove_experiment_folder,
set_init_dict)
from mozilla_voice_tts.tts.utils.io import copy_config_file, load_config
from mozilla_voice_tts.tts.utils.radam import RAdam
from mozilla_voice_tts.tts.utils.tensorboard_logger import TensorboardLogger
@ -182,8 +183,8 @@ def main(args): # pylint: disable=redefined-outer-name
meta_data_train, meta_data_eval = load_meta_data(c.datasets)
global_step = args.restore_step
train_loss, global_step = train(model, criterion, optimizer, scheduler, ap,
global_step)
_, global_step = train(model, criterion, optimizer, scheduler, ap,
global_step)
if __name__ == '__main__':

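Replacing `train_loss` with `_` above follows the usual convention for a deliberately unused unpacking target, which also satisfies pylint. A small sketch with a made-up return value:

```python
# Hypothetical stand-in mirroring the train() call above.
def train_epoch():
    return 0.123, 4200  # (loss, step), matching train()'s return shape

# Before: pylint flags `train_loss` as assigned-but-unused.
# train_loss, global_step = train_epoch()

# After: `_` documents that the loss is discarded on purpose.
_, global_step = train_epoch()
print(global_step)
```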
View File

@ -11,31 +11,40 @@ import traceback
import numpy as np
import torch
from torch.utils.data import DataLoader
from mozilla_voice_tts.tts.datasets.preprocess import load_meta_data
from mozilla_voice_tts.tts.datasets.TTSDataset import MyDataset
from mozilla_voice_tts.tts.layers.losses import TacotronLoss
from mozilla_voice_tts.tts.utils.distribute import (DistributedSampler,
apply_gradient_allreduce,
init_distributed, reduce_tensor)
apply_gradient_allreduce,
init_distributed,
reduce_tensor)
from mozilla_voice_tts.tts.utils.generic_utils import check_config, setup_model
from mozilla_voice_tts.tts.utils.io import save_best_model, save_checkpoint
from mozilla_voice_tts.tts.utils.measures import alignment_diagonal_score
from mozilla_voice_tts.tts.utils.speakers import (get_speakers, load_speaker_mapping,
save_speaker_mapping)
from mozilla_voice_tts.tts.utils.speakers import (get_speakers,
load_speaker_mapping,
save_speaker_mapping)
from mozilla_voice_tts.tts.utils.synthesis import synthesis
from mozilla_voice_tts.tts.utils.text.symbols import make_symbols, phonemes, symbols
from mozilla_voice_tts.tts.utils.text.symbols import (make_symbols, phonemes,
symbols)
from mozilla_voice_tts.tts.utils.visual import plot_alignment, plot_spectrogram
from mozilla_voice_tts.utils.audio import AudioProcessor
from mozilla_voice_tts.utils.console_logger import ConsoleLogger
from mozilla_voice_tts.utils.generic_utils import (KeepAverage, count_parameters,
create_experiment_folder, get_git_branch,
remove_experiment_folder, set_init_dict)
from mozilla_voice_tts.utils.generic_utils import (KeepAverage,
count_parameters,
create_experiment_folder,
get_git_branch,
remove_experiment_folder,
set_init_dict)
from mozilla_voice_tts.utils.io import copy_config_file, load_config
from mozilla_voice_tts.utils.radam import RAdam
from mozilla_voice_tts.utils.tensorboard_logger import TensorboardLogger
from mozilla_voice_tts.utils.training import (NoamLR, adam_weight_decay, check_update,
gradual_training_scheduler, set_weight_decay,
setup_torch_training_env)
from mozilla_voice_tts.utils.training import (NoamLR, adam_weight_decay,
check_update,
gradual_training_scheduler,
set_weight_decay,
setup_torch_training_env)
use_cuda, num_gpus = setup_torch_training_env(True, False)
@ -47,7 +56,7 @@ def setup_loader(ap, r, is_val=False, verbose=False):
dataset = MyDataset(
r,
c.text_cleaner,
compute_linear_spec=True if c.model.lower() == 'tacotron' else False,
compute_linear_spec=c.model.lower() == 'tacotron',
meta_data=meta_data_eval if is_val else meta_data_train,
ap=ap,
tp=c.characters if 'characters' in c.keys() else None,
@ -156,7 +165,7 @@ def train(model, criterion, optimizer, optimizer_st, scheduler,
decoder_backward_output = None
alignments_backward = None
# set the alignment lengths wrt reduction factor for guided attention
# set the [alignment] lengths wrt reduction factor for guided attention
if mel_lengths.max() % model.decoder.r != 0:
alignment_lengths = (mel_lengths + (model.decoder.r - (mel_lengths.max() % model.decoder.r))) // model.decoder.r
else:
@ -171,7 +180,7 @@ def train(model, criterion, optimizer, optimizer_st, scheduler,
# backward pass
if amp is not None:
with amp.scale_loss( loss_dict['loss'], optimizer) as scaled_loss:
with amp.scale_loss(loss_dict['loss'], optimizer) as scaled_loss:
scaled_loss.backward()
else:
loss_dict['loss'].backward()
@ -425,7 +434,7 @@ def evaluate(model, criterion, ap, global_step, epoch):
style_wav = c.get("style_wav_for_test")
for idx, test_sentence in enumerate(test_sentences):
try:
wav, alignment, decoder_output, postnet_output, stop_tokens, inputs = synthesis(
wav, alignment, decoder_output, postnet_output, stop_tokens, _ = synthesis(
model,
test_sentence,
c,
@ -448,7 +457,7 @@ def evaluate(model, criterion, ap, global_step, epoch):
postnet_output, ap, output_fig=False)
test_figures['{}-alignment'.format(idx)] = plot_alignment(
alignment, output_fig=False)
except:
except: #pylint: disable=bare-except
print(" !! Error creating Test Sentence -", idx)
traceback.print_exc()
tb_logger.tb_test_audios(global_step, test_audios,
@ -531,7 +540,7 @@ def main(args): # pylint: disable=redefined-outer-name
if c.reinit_layers:
raise RuntimeError
model.load_state_dict(checkpoint['model'])
except:
except KeyError:
print(" > Partial model initialization.")
model_dict = model.state_dict()
model_dict = set_init_dict(model_dict, checkpoint['model'], c)

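Two recurring fixes in this file are narrowing a bare `except:` to a concrete exception (or annotating an intentional broad catch with `#pylint: disable=bare-except`) and collapsing `True if cond else False` to plain `cond`. A hedged sketch of both, with invented names:

```python
# Hypothetical names; both patterns appear in the hunks above.

def load_state(checkpoint):
    try:
        return checkpoint['model']   # may raise KeyError
    except KeyError:                 # narrow catch instead of a bare `except:`
        return 'partial init'

# Simplifiable-if: the comparison already yields a bool, so the
# `True if ... else False` wrapper is redundant.
model_name = 'Tacotron'
compute_linear_spec = model_name.lower() == 'tacotron'

print(load_state({}), compute_linear_spec)
```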
View File

@ -8,23 +8,30 @@ from inspect import signature
import torch
from torch.utils.data import DataLoader
from mozilla_voice_tts.utils.audio import AudioProcessor
from mozilla_voice_tts.utils.console_logger import ConsoleLogger
from mozilla_voice_tts.utils.generic_utils import (KeepAverage, count_parameters,
create_experiment_folder, get_git_branch,
remove_experiment_folder, set_init_dict)
from mozilla_voice_tts.utils.generic_utils import (KeepAverage,
count_parameters,
create_experiment_folder,
get_git_branch,
remove_experiment_folder,
set_init_dict)
from mozilla_voice_tts.utils.io import copy_config_file, load_config
from mozilla_voice_tts.utils.radam import RAdam
from mozilla_voice_tts.utils.tensorboard_logger import TensorboardLogger
from mozilla_voice_tts.utils.training import setup_torch_training_env
from mozilla_voice_tts.vocoder.datasets.gan_dataset import GANDataset
from mozilla_voice_tts.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data
from mozilla_voice_tts.vocoder.datasets.preprocess import (load_wav_data,
load_wav_feat_data)
# from distribute import (DistributedSampler, apply_gradient_allreduce,
# init_distributed, reduce_tensor)
from mozilla_voice_tts.vocoder.layers.losses import DiscriminatorLoss, GeneratorLoss
from mozilla_voice_tts.vocoder.utils.generic_utils import (check_config, plot_results,
setup_discriminator,
setup_generator)
from mozilla_voice_tts.vocoder.layers.losses import (DiscriminatorLoss,
GeneratorLoss)
from mozilla_voice_tts.vocoder.utils.generic_utils import (check_config,
plot_results,
setup_discriminator,
setup_generator)
from mozilla_voice_tts.vocoder.utils.io import save_best_model, save_checkpoint
use_cuda, num_gpus = setup_torch_training_env(True, True)

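The import blocks above are rewrapped into the layout a tool like isort produces by default: standard library first, then third-party, then first-party, with long `from ... import` lists parenthesized and continuation lines aligned. A sketch of the convention using only stdlib and numpy so it stays runnable:

```python
# Standard library imports come first, as one group...
import os
import sys

# ...third-party packages next...
import numpy as np

# ...and long `from ... import` lists are wrapped in parentheses with the
# continuation lines aligned under the first imported name.
from collections import (OrderedDict, defaultdict,
                         namedtuple)

print(os.sep, sys.platform, np.zeros(1), OrderedDict, defaultdict, namedtuple)
```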
View File

@ -4,7 +4,6 @@ import time
import numpy as np
import torch
import yaml
import pysbd
from mozilla_voice_tts.utils.audio import AudioProcessor

View File

@ -10,7 +10,7 @@ Below is an example showing embedding results of various speakers. You can gener
Download a pretrained model from [Released Models](https://github.com/mozilla/TTS/wiki/Released-Models) page.
To run the code, you need to follow the same flow as in TTS.
To run the code, you need to follow the same flow as in mozilla_voice_tts.
- Define 'config.json' for your needs. Note that audio parameters should match your TTS model.
- Example training call ```python speaker_encoder/train.py --config_path speaker_encoder/config.json --data_path ~/Data/Libri-TTS/train-clean-360```


View File

@ -85,4 +85,3 @@ class SpeakerEncoder(nn.Module):
frames[cur_iter <= num_iters, :, :]
)
return embed / num_iters

View File

@ -1,6 +1,5 @@
import torch
from torch import nn
from torch.autograd import Variable
from torch.nn import functional as F
@ -52,6 +51,7 @@ class LinearBN(nn.Module):
class Prenet(nn.Module):
# pylint: disable=dangerous-default-value
def __init__(self,
in_features,
prenet_type="original",
@ -300,8 +300,8 @@ class OriginalAttention(nn.Module):
def apply_forward_attention(self, alignment):
# forward attention
fwd_shifted_alpha = F.pad(self.alpha[:, :-1].clone().to(alignment.device),
(1, 0, 0, 0))
fwd_shifted_alpha = F.pad(
self.alpha[:, :-1].clone().to(alignment.device), (1, 0, 0, 0))
# compute transition potentials
alpha = ((1 - self.u) * self.alpha
+ self.u * fwd_shifted_alpha
@ -309,7 +309,7 @@ class OriginalAttention(nn.Module):
# force incremental alignment
if not self.training and self.forward_attn_mask:
_, n = fwd_shifted_alpha.max(1)
val, n2 = alpha.max(1)
val, _ = alpha.max(1)
for b in range(alignment.shape[0]):
alpha[b, n[b] + 3:] = 0
alpha[b, :(

View File

@ -72,7 +72,7 @@ class ReferenceEncoder(nn.Module):
# x: 3D tensor [batch_size, post_conv_width,
# num_channels*post_conv_height]
self.recurrence.flatten_parameters()
memory, out = self.recurrence(x)
_, out = self.recurrence(x)
# out: 3D tensor [seq_len==1, batch_size, encoding_size=128]
return out.squeeze(0)

View File

@ -243,4 +243,3 @@ class TacotronLoss(torch.nn.Module):
return_dict['loss'] = loss
return return_dict

View File

@ -1,7 +1,7 @@
# coding: utf-8
import torch
from torch import nn
from .common_layers import Prenet, init_attn, Linear
from .common_layers import Prenet, init_attn
class BatchNormConv1d(nn.Module):
@ -46,9 +46,9 @@ class BatchNormConv1d(nn.Module):
# self.init_layers()
def init_layers(self):
if type(self.activation) == torch.nn.ReLU:
if isinstance(self.activation, torch.nn.ReLU):
w_gain = 'relu'
elif type(self.activation) == torch.nn.Tanh:
elif isinstance(self.activation, torch.nn.Tanh):
w_gain = 'tanh'
elif self.activation is None:
w_gain = 'linear'
@ -117,7 +117,7 @@ class CBHG(nn.Module):
- input: (B, C, T_in)
- output: (B, T_in, C*2)
"""
#pylint: disable=dangerous-default-value
def __init__(self,
in_features,
K=16,
@ -355,7 +355,6 @@ class Decoder(nn.Module):
Initialization of decoder states
"""
B = inputs.size(0)
T = inputs.size(1)
# go frame as zeros matrix
if self.use_memory_queue:
self.memory_input = torch.zeros(1, device=inputs.device).repeat(B, self.frame_channels * self.memory_size)
@ -496,7 +495,7 @@ class Decoder(nn.Module):
if t > inputs.shape[1] / 4 and (stop_token > 0.6
or attention[:, -1].item() > 0.6):
break
elif t > self.max_decoder_steps:
if t > self.max_decoder_steps:
print(" | > Decoder stopped with 'max_decoder_steps")
break
return self._parse_outputs(outputs, attentions, stop_tokens)

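Swapping `type(x) == T` for `isinstance(x, T)` above is pylint's `unidiomatic-typecheck` fix; `isinstance` is also more permissive, since it accepts subclasses where an exact type comparison does not. A minimal sketch:

```python
from torch import nn

activation = nn.ReLU()

# Before (flagged as unidiomatic-typecheck):
# if type(activation) == nn.ReLU:

# After: isinstance is the idiomatic check and handles subclasses too.
if isinstance(activation, nn.ReLU):
    w_gain = 'relu'
elif isinstance(activation, nn.Tanh):
    w_gain = 'tanh'
else:
    w_gain = 'linear'
print(w_gain)
```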
View File

@ -1,10 +1,11 @@
import torch
from torch.autograd import Variable
from torch import nn
from torch.nn import functional as F
from .common_layers import init_attn, Prenet, Linear
# NOTE: linter has a problem with the current TF release
#pylint: disable=no-value-for-parameter
#pylint: disable=unexpected-keyword-arg
class ConvBNBlock(nn.Module):
r"""Convolutions with Batch Normalization and non-linear activation.
@ -156,6 +157,7 @@ class Decoder(nn.Module):
self.separate_stopnet = separate_stopnet
self.max_decoder_steps = 1000
self.stop_threshold = 0.5
self.speaker_embedding_dim = speaker_embedding_dim
# model dimensions
self.query_dim = 1024
@ -211,8 +213,8 @@ class Decoder(nn.Module):
def get_go_frame(self, inputs):
B = inputs.size(0)
memory = torch.zeros(1, device=inputs.device).repeat(B,
self.frame_channels * self.r)
memory = torch.zeros(1, device=inputs.device).repeat(
B, self.frame_channels * self.r)
return memory
def _init_states(self, inputs, mask, keep_states=False):
@ -393,7 +395,6 @@ class Decoder(nn.Module):
self.attention.init_win_idx()
self.attention.init_states(inputs)
outputs, stop_tokens, alignments, t = [], [], [], 0
stop_flags = [True, False, False]
while True:
memory = self.prenet(self.memory_truncated)
decoder_output, alignment, stop_token = self.decode(memory)

View File

@ -3,6 +3,9 @@ from tensorflow import keras
from tensorflow.python.ops import math_ops
# from tensorflow_addons.seq2seq import BahdanauAttention
# NOTE: linter has a problem with the current TF release
#pylint: disable=no-value-for-parameter
#pylint: disable=unexpected-keyword-arg
class Linear(keras.layers.Layer):
def __init__(self, units, use_bias, **kwargs):

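Several TF modules above gain module-level `#pylint: disable=...` comments because pylint cannot resolve TensorFlow's generated call signatures in this release. A small sketch of how such disables scope, with a toy function standing in for the TF calls:

```python
# A disable comment on its own line covers the rest of the enclosing
# scope -- placed at the top of a module, effectively the whole file:
# pylint: disable=no-value-for-parameter

def fused_call(x):
    # A trailing comment limits the disable to this single line instead.
    return x * 2  # pylint: disable=unexpected-keyword-arg

print(fused_call(21))
```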
View File

@ -4,7 +4,9 @@ from mozilla_voice_tts.tts.tf.utils.tf_utils import shape_list
from mozilla_voice_tts.tts.tf.layers.common_layers import Prenet, Attention
# from tensorflow_addons.seq2seq import AttentionWrapper
# NOTE: linter has a problem with the current TF release
#pylint: disable=no-value-for-parameter
#pylint: disable=unexpected-keyword-arg
class ConvBNBlock(keras.layers.Layer):
def __init__(self, filters, kernel_size, activation, **kwargs):
super(ConvBNBlock, self).__init__(**kwargs)

View File

@ -5,7 +5,7 @@ from mozilla_voice_tts.tts.tf.layers.tacotron2 import Encoder, Decoder, Postnet
from mozilla_voice_tts.tts.tf.utils.tf_utils import shape_list
#pylint: disable=too-many-ancestors
#pylint: disable=too-many-ancestors, abstract-method
class Tacotron2(keras.models.Model):
def __init__(self,
num_chars,
@ -105,4 +105,3 @@ class Tacotron2(keras.models.Model):
# TODO: issue https://github.com/PyCQA/pylint/issues/3613
input_ids = tf.random.uniform(shape=[1, 4], maxval=10, dtype=tf.int32) #pylint: disable=unexpected-keyword-arg
self(input_ids)

View File

@ -1,6 +1,9 @@
import numpy as np
import tensorflow as tf
# NOTE: linter has a problem with the current TF release
#pylint: disable=no-value-for-parameter
#pylint: disable=unexpected-keyword-arg
def tf_create_dummy_inputs():
""" Create dummy inputs for TF Tacotron2 model """

View File

@ -1,4 +1,3 @@
import os
import datetime
import importlib
import pickle

View File

@ -39,4 +39,3 @@ def load_tflite_model(tflite_path):
tflite_model = tf.lite.Interpreter(model_path=tflite_path)
tflite_model.allocate_tensors()
return tflite_model

View File

@ -74,4 +74,3 @@ class StandardScaler():
X *= self.scale_
X += self.mean_
return X

View File

@ -1,15 +1,11 @@
# edited from https://github.com/fastai/imagenet-fast/blob/master/imagenet_nv/distributed.py
import os, sys
import math
import time
import subprocess
import argparse
import torch
import torch.distributed as dist
from torch.utils.data.sampler import Sampler
from torch.autograd import Variable
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
from mozilla_voice_tts.utils.generic_utils import create_experiment_folder
from torch.autograd import Variable
from torch.utils.data.sampler import Sampler
class DistributedSampler(Sampler):
@ -108,7 +104,7 @@ def apply_gradient_allreduce(module):
for param in list(module.parameters()):
def allreduce_hook(*_):
Variable._execution_engine.queue_callback(allreduce_params)
Variable._execution_engine.queue_callback(allreduce_params) #pylint: disable=protected-access
if param.requires_grad:
param.register_hook(allreduce_hook)

View File

@ -3,7 +3,7 @@ import torch
import datetime
def load_checkpoint(model, checkpoint_path, use_cuda=False):
def load_checkpoint(model, checkpoint_path, amp=None, use_cuda=False):
state = torch.load(checkpoint_path, map_location=torch.device('cpu'))
model.load_state_dict(state['model'])
if amp and 'amp' in state:

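The signature change above threads an optional apex `amp` handle through checkpoint loading so mixed-precision loss-scaler state can be restored when the checkpoint carries it. A sketch completing the truncated hunk; the `return` value and the exact dict keys beyond `'model'`/`'amp'` are assumptions, not part of the diff:

```python
import torch

def load_checkpoint(model, checkpoint_path, amp=None, use_cuda=False):
    """Hedged completion of the hunk above."""
    state = torch.load(checkpoint_path, map_location=torch.device('cpu'))
    model.load_state_dict(state['model'])
    if amp and 'amp' in state:
        # apex.amp exposes load_state_dict() for its loss-scaler state
        amp.load_state_dict(state['amp'])
    if use_cuda:
        model.cuda()
    return model, state  # assumption: callers may also want the raw state
```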
View File

@ -1,6 +1,3 @@
import torch
def alignment_diagonal_score(alignments, binary=False):
"""
Compute how diagonal alignment predictions are. It is useful

View File

@ -1,8 +1,6 @@
import os
import json
from mozilla_voice_tts.tts.datasets.preprocess import get_preprocessor_by_name
def make_speakers_json_path(out_path):
"""Returns conventional speakers.json location."""

View File

@ -8,6 +8,7 @@ from mozilla_voice_tts.tts.utils.text import cleaners
from mozilla_voice_tts.tts.utils.text.symbols import make_symbols, symbols, phonemes, _phoneme_punctuations, _bos, \
_eos
# pylint: disable=unnecessary-comprehension
# Mappings from symbol to numeric ID and vice versa:
_symbol_to_id = {s: i for i, s in enumerate(symbols)}
_id_to_symbol = {i: s for i, s in enumerate(symbols)}

View File

@ -41,7 +41,7 @@ def plot_spectrogram(spectrogram, ap=None, fig_size=(16, 10), output_fig=False):
plt.colorbar()
plt.tight_layout()
if not output_fig:
plt.close()
plt.close()
return fig
@ -97,4 +97,4 @@ def visualize(alignment, postnet_output, stop_tokens, text, hop_length, CONFIG,
plt.close()
if not output_fig:
plt.close()
plt.close()

View File

@ -52,7 +52,7 @@ class AudioProcessor(object):
self.mel_fmin = mel_fmin or 0
self.mel_fmax = mel_fmax
self.spec_gain = float(spec_gain)
self.stft_pad_mode = 'reflect'
self.stft_pad_mode = stft_pad_mode
self.max_norm = 1.0 if max_norm is None else float(max_norm)
self.clip_norm = clip_norm
self.do_trim_silence = do_trim_silence
@ -123,7 +123,7 @@ class AudioProcessor(object):
if self.symmetric_norm:
S_norm = ((2 * self.max_norm) * S_norm) - self.max_norm
if self.clip_norm:
S_norm = np.clip(S_norm, -self.max_norm, self.max_norm)
S_norm = np.clip(S_norm, -self.max_norm, self.max_norm) # pylint: disable=invalid-unary-operand-type
return S_norm
else:
S_norm = self.max_norm * S_norm
@ -148,7 +148,7 @@ class AudioProcessor(object):
raise RuntimeError(' [!] Mean-Var stats does not match the given feature dimensions.')
if self.symmetric_norm:
if self.clip_norm:
S_denorm = np.clip(S_denorm, -self.max_norm, self.max_norm)
S_denorm = np.clip(S_denorm, -self.max_norm, self.max_norm) #pylint: disable=invalid-unary-operand-type
S_denorm = ((S_denorm + self.max_norm) * -self.min_level_db / (2 * self.max_norm)) + self.min_level_db
return S_denorm + self.ref_level_db
else:

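The `invalid-unary-operand-type` disables above silence a pylint false positive: `self.max_norm` is set via `1.0 if max_norm is None else float(max_norm)`, and pylint's inference cannot prove the attribute is numeric, so it rejects the unary minus. A hypothetical reduction of the situation and the comment-based fix:

```python
import numpy as np

class Normalizer:
    """Hypothetical reduction of the AudioProcessor clipping logic."""
    def __init__(self, max_norm=None):
        # pylint cannot prove this attribute is numeric in all branches...
        self.max_norm = 1.0 if max_norm is None else float(max_norm)

    def clip(self, S):
        # ...so the unary minus trips invalid-unary-operand-type even though
        # it is fine at runtime; the inline disable records that.
        return np.clip(S, -self.max_norm, self.max_norm)  # pylint: disable=invalid-unary-operand-type

print(Normalizer(4).clip(np.array([-10.0, 0.5, 10.0])))
```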
View File

@ -2,7 +2,7 @@
import math
import torch
from torch.optim.optimizer import Optimizer, required
from torch.optim.optimizer import Optimizer
class RAdam(Optimizer):
@ -25,7 +25,7 @@ class RAdam(Optimizer):
defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, buffer=[[None, None, None] for _ in range(10)])
super(RAdam, self).__init__(params, defaults)
def __setstate__(self, state):
def __setstate__(self, state): # pylint: disable=useless-super-delegation
super(RAdam, self).__setstate__(state)
def step(self, closure=None):

View File

@ -47,7 +47,7 @@ class TensorboardLogger(object):
for key, value in audios.items():
try:
self.writer.add_audio('{}/{}'.format(scope_name, key), value, step, sample_rate=sample_rate)
except:
except RuntimeError:
traceback.print_exc()
def tb_train_iter_stats(self, step, stats):

View File

@ -95,4 +95,3 @@ class MelganGenerator(nn.Module):
nn.utils.remove_weight_norm(layer)
except ValueError:
layer.remove_weight_norm()

View File

@ -145,6 +145,5 @@ def setup_discriminator(c):
)
return model
def check_config(c):
    pass
# def check_config(c):
#     pass

View File

@ -71,7 +71,7 @@ def process_meta_data(path):
def get_data_points(meta_data):
x = [char_cnt for char_cnt in meta_data]
x = meta_data
y_avg = [meta_data[d]['mean'] for d in meta_data]
y_mode = [meta_data[d]['mode'] for d in meta_data]
y_median = [meta_data[d]['median'] for d in meta_data]

View File

@ -36,7 +36,7 @@ else:
pass
class build_py(setuptools.command.build_py.build_py):
class build_py(setuptools.command.build_py.build_py): # pylint: disable=too-many-ancestors
def run(self):
self.create_version_file()
setuptools.command.build_py.build_py.run(self)

View File

@ -1,35 +0,0 @@
import unittest
import torch as T
from mozilla_voice_tts.tts.utils.generic_utils import save_checkpoint, save_best_model
from mozilla_voice_tts.tts.layers.tacotron import Prenet
OUT_PATH = '/tmp/test.pth.tar'
class ModelSavingTests(unittest.TestCase):
def save_checkpoint_test(self):
# create a dummy model
model = Prenet(128, out_features=[256, 128])
model = T.nn.DataParallel(layer) #FIXME: undefined variable layer
# save the model
save_checkpoint(model, None, 100, OUT_PATH, 1, 1)
# load the model to CPU
model_dict = T.load(
MODEL_PATH, map_location=lambda storage, loc: storage) #FIXME: undefined variable MODEL_PATH
model.load_state_dict(model_dict['model'])
def save_best_model_test(self):
# create a dummy model
model = Prenet(256, out_features=[256, 256])
model = T.nn.DataParallel(layer)
# save the model
save_best_model(model, None, 0, 100, OUT_PATH, 10, 1)
# load the model to CPU
model_dict = T.load(
MODEL_PATH, map_location=lambda storage, loc: storage)
model.load_state_dict(model_dict['model'])

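The file deleted above had genuine bugs flagged in the FIXMEs: `layer` and `MODEL_PATH` were never defined, the import path for the save helpers was stale (the hunks above import them from `mozilla_voice_tts.tts.utils.io`), and the `save_checkpoint_test` naming would not be discovered by unittest. A hedged sketch of what a repaired version might have looked like instead of deletion, assuming the call signature shown in the deleted code is still valid:

```python
import unittest

import torch as T
from mozilla_voice_tts.tts.layers.tacotron import Prenet
from mozilla_voice_tts.tts.utils.io import save_checkpoint  # path per the hunks above

OUT_PATH = '/tmp/test.pth.tar'

class ModelSavingTests(unittest.TestCase):
    def test_save_checkpoint(self):  # `test_` prefix so unittest discovers it
        model = Prenet(128, out_features=[256, 128])
        model = T.nn.DataParallel(model)  # wrap the model, not the undefined `layer`
        # argument order copied from the deleted file; unverified assumption
        save_checkpoint(model, None, 100, OUT_PATH, 1, 1)
        # reload from the path we actually saved to, on CPU
        model_dict = T.load(OUT_PATH, map_location=lambda storage, loc: storage)
        model.load_state_dict(model_dict['model'])
```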
View File

@ -1,7 +1,8 @@
import os
import unittest
from tests import get_tests_path, get_tests_input_path, get_tests_output_path
from tests import get_tests_input_path, get_tests_output_path, get_tests_path
from mozilla_voice_tts.utils.audio import AudioProcessor
from mozilla_voice_tts.utils.io import load_config
@ -103,7 +104,7 @@ class TestAudio(unittest.TestCase):
assert (x_old - x).sum() == 0
# check value range
assert x_norm.max() <= self.ap.max_norm + 1, x_norm.max()
assert x_norm.min() >= -self.ap.max_norm - 2, x_norm.min()
assert x_norm.min() >= -self.ap.max_norm - 2, x_norm.min() #pylint: disable=invalid-unary-operand-type
assert x_norm.min() <= 0, x_norm.min()
# check denorm.
x_ = self.ap._denormalize(x_norm)
@ -120,7 +121,7 @@ class TestAudio(unittest.TestCase):
assert (x_old - x).sum() == 0
# check value range
assert x_norm.max() <= self.ap.max_norm, x_norm.max()
assert x_norm.min() >= -self.ap.max_norm, x_norm.min()
assert x_norm.min() >= -self.ap.max_norm, x_norm.min() #pylint: disable=invalid-unary-operand-type
assert x_norm.min() <= 0, x_norm.min()
# check denorm.
x_ = self.ap._denormalize(x_norm)
@ -148,7 +149,7 @@ class TestAudio(unittest.TestCase):
assert (x_old - x).sum() == 0
assert x_norm.max() <= self.ap.max_norm, x_norm.max()
assert x_norm.min() >= -self.ap.max_norm, x_norm.min()
assert x_norm.min() >= -self.ap.max_norm, x_norm.min() #pylint: disable=invalid-unary-operand-type
assert x_norm.min() < 0, x_norm.min()
x_ = self.ap._denormalize(x_norm)
assert (x - x_).sum() < 1e-3

View File

@ -1,13 +1,13 @@
import os
import unittest
import torch as T
from tests import get_tests_input_path, get_tests_output_path
from mozilla_voice_tts.server.synthesizer import Synthesizer
from tests import get_tests_input_path, get_tests_output_path
from mozilla_voice_tts.tts.utils.text.symbols import make_symbols, phonemes, symbols
from mozilla_voice_tts.tts.utils.generic_utils import setup_model
from mozilla_voice_tts.tts.utils.io import save_checkpoint
from mozilla_voice_tts.tts.utils.text.symbols import (make_symbols, phonemes,
symbols)
from mozilla_voice_tts.utils.io import load_config

View File

@ -1,13 +1,13 @@
import os
import unittest
import torch as T
from tests import get_tests_input_path
from tests import get_tests_path, get_tests_input_path
from mozilla_voice_tts.speaker_encoder.model import SpeakerEncoder
from mozilla_voice_tts.speaker_encoder.loss import GE2ELoss
from mozilla_voice_tts.speaker_encoder.model import SpeakerEncoder
from mozilla_voice_tts.utils.io import load_config
file_path = get_tests_input_path()
c = load_config(os.path.join(file_path, "test_config.json"))

View File

@ -9,7 +9,7 @@ from mozilla_voice_tts.tts.utils.generic_utils import sequence_mask
class PrenetTests(unittest.TestCase):
def test_in_out(self):
def test_in_out(self): #pylint: disable=no-self-use
layer = Prenet(128, out_features=[256, 128])
dummy_input = T.rand(4, 128)
@ -104,7 +104,7 @@ class DecoderTests(unittest.TestCase):
class EncoderTests(unittest.TestCase):
def test_in_out(self):
def test_in_out(self): #pylint: disable=no-self-use
layer = Encoder(128)
dummy_input = T.rand(4, 8, 128)
@ -117,7 +117,7 @@ class EncoderTests(unittest.TestCase):
class L1LossMaskedTests(unittest.TestCase):
def test_in_out(self):
def test_in_out(self): #pylint: disable=no-self-use
# test input == target
layer = L1LossMasked(seq_len_norm=False)
dummy_input = T.ones(4, 8, 128).float()

View File

@ -1,15 +1,16 @@
import os
import unittest
import shutil
import torch
import numpy as np
import unittest
from tests import get_tests_path, get_tests_input_path, get_tests_output_path
import numpy as np
import torch
from tests import get_tests_input_path, get_tests_output_path
from torch.utils.data import DataLoader
from mozilla_voice_tts.utils.io import load_config
from mozilla_voice_tts.utils.audio import AudioProcessor
from mozilla_voice_tts.tts.datasets import TTSDataset
from mozilla_voice_tts.tts.datasets.preprocess import ljspeech
from mozilla_voice_tts.utils.audio import AudioProcessor
from mozilla_voice_tts.utils.io import load_config
#pylint: disable=unused-variable
@ -32,7 +33,7 @@ class TestTTSDataset(unittest.TestCase):
self.ap = AudioProcessor(**c.audio)
def _create_dataloader(self, batch_size, r, bgs):
items = ljspeech(c.data_path,'metadata.csv')
items = ljspeech(c.data_path, 'metadata.csv')
dataset = TTSDataset.MyDataset(
r,
c.text_cleaner,
@ -74,7 +75,7 @@ class TestTTSDataset(unittest.TestCase):
assert check_count == 0, \
" !! Negative values in text_input: {}".format(check_count)
# TODO: more assertion here
assert type(speaker_name[0]) is str
assert isinstance(speaker_name[0], str)
assert linear_input.shape[0] == c.batch_size
assert linear_input.shape[2] == self.ap.fft_size // 2 + 1
assert mel_input.shape[0] == c.batch_size
@ -82,7 +83,7 @@ class TestTTSDataset(unittest.TestCase):
# check normalization ranges
if self.ap.symmetric_norm:
assert mel_input.max() <= self.ap.max_norm
assert mel_input.min() >= -self.ap.max_norm
assert mel_input.min() >= -self.ap.max_norm #pylint: disable=invalid-unary-operand-type
assert mel_input.min() < 0
else:
assert mel_input.max() <= self.ap.max_norm

View File

@ -7,7 +7,7 @@ from mozilla_voice_tts.tts.datasets.preprocess import common_voice
class TestPreprocessors(unittest.TestCase):
def test_common_voice_preprocessor(self):
def test_common_voice_preprocessor(self): #pylint: disable=no-self-use
root_path = get_tests_input_path()
meta_file = "common_voice.tsv"
items = common_voice(root_path, meta_file)

View File

@ -20,8 +20,8 @@ c = load_config(os.path.join(get_tests_input_path(), 'test_config.json'))
class TacotronTrainTest(unittest.TestCase):
def test_train_step(self):
input = torch.randint(0, 24, (8, 128)).long().to(device)
def test_train_step(self): # pylint: disable=no-self-use
input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
input_lengths = torch.randint(100, 128, (8, )).long().to(device)
input_lengths = torch.sort(input_lengths, descending=True)[0]
mel_spec = torch.rand(8, 30, c.audio['num_mels']).to(device)
@ -34,7 +34,7 @@ class TacotronTrainTest(unittest.TestCase):
for idx in mel_lengths:
stop_targets[:, int(idx.item()):, 0] = 1.0
stop_targets = stop_targets.view(input.shape[0],
stop_targets = stop_targets.view(input_dummy.shape[0],
stop_targets.size(1) // c.r, -1)
stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()
@ -51,7 +51,7 @@ class TacotronTrainTest(unittest.TestCase):
optimizer = optim.Adam(model.parameters(), lr=c.lr)
for i in range(5):
mel_out, mel_postnet_out, align, stop_tokens = model.forward(
input, input_lengths, mel_spec, mel_lengths, speaker_ids)
input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids)
assert torch.sigmoid(stop_tokens).data.max() <= 1.0
assert torch.sigmoid(stop_tokens).data.min() >= 0.0
optimizer.zero_grad()

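Renaming `input` to `input_dummy` above fixes pylint's `redefined-builtin` warning: binding the name `input` shadows the built-in function within that scope. A minimal sketch with a toy tensor:

```python
import torch

# Before: the name `input` shadows the builtin input() in this scope.
# input = torch.randint(0, 24, (8, 128)).long()

# After: a distinct name keeps the builtin reachable.
input_dummy = torch.randint(0, 24, (8, 128)).long()
print(input_dummy.shape, callable(input))  # builtin still usable
```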
View File

@ -1,15 +1,19 @@
import os
import torch
import unittest
import numpy as np
import tensorflow as tf
import torch
from tests import get_tests_input_path
from mozilla_voice_tts.tts.tf.models.tacotron2 import Tacotron2
from mozilla_voice_tts.tts.tf.utils.tflite import (convert_tacotron2_to_tflite,
load_tflite_model)
from mozilla_voice_tts.utils.io import load_config
tf.get_logger().setLevel('INFO')
from tests import get_tests_path, get_tests_input_path, get_tests_output_path
from mozilla_voice_tts.utils.io import load_config
from mozilla_voice_tts.tts.tf.models.tacotron2 import Tacotron2
from mozilla_voice_tts.tts.tf.utils.tflite import convert_tacotron2_to_tflite, load_tflite_model
#pylint: disable=unused-variable
@ -132,4 +136,3 @@ class TacotronTFTrainTest(unittest.TestCase):
postnet_output = tflite_model.get_tensor(output_details[1]['index'])
# remove tflite binary
os.remove('test_tacotron2.tflite')

View File

@ -16,7 +16,7 @@ def test_phoneme_to_sequence():
lang = "en-us"
sequence = phoneme_to_sequence(text, text_cleaner, lang)
text_hat = sequence_to_phoneme(sequence)
sequence_with_params = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters)
_ = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters)
text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters)
gt = "ɹiːsənt ɹɪːtʃ æt hɑːɹvɚd hɐz ʃoʊn mɛdᵻteɪɾɪŋ fɔːɹ æz lɪɾəl æz eɪt wiːks kæn æktʃuːəli ɪnkɹiːs, ðə ɡɹeɪ mæɾɚɹ ɪnðə pɑːɹts ʌvðə bɹeɪn ɹɪspɑːnsəbəl fɔːɹ ɪmoʊʃənəl ɹɛɡjuːleɪʃən ænd lɜːnɪŋ!"
assert text_hat == text_hat_with_params == gt
@ -25,7 +25,7 @@ def test_phoneme_to_sequence():
text = "Be a voice, not an! echo?"
sequence = phoneme_to_sequence(text, text_cleaner, lang)
text_hat = sequence_to_phoneme(sequence)
sequence_with_params = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters)
_ = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters)
text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters)
gt = "biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ?"
print(text_hat)
@ -36,7 +36,7 @@ def test_phoneme_to_sequence():
text = "Be a voice, not an! echo"
sequence = phoneme_to_sequence(text, text_cleaner, lang)
text_hat = sequence_to_phoneme(sequence)
sequence_with_params = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters)
_ = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters)
text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters)
gt = "biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ"
print(text_hat)
@ -47,7 +47,7 @@ def test_phoneme_to_sequence():
text = "Be a voice, not an echo!"
sequence = phoneme_to_sequence(text, text_cleaner, lang)
text_hat = sequence_to_phoneme(sequence)
sequence_with_params = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters)
_ = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters)
text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters)
gt = "biː ɐ vɔɪs, nɑːt ɐn ɛkoʊ!"
print(text_hat)
@ -58,7 +58,7 @@ def test_phoneme_to_sequence():
text = "Be a voice, not an! echo. "
sequence = phoneme_to_sequence(text, text_cleaner, lang)
text_hat = sequence_to_phoneme(sequence)
sequence_with_params = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters)
_ = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters)
text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters)
gt = "biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ."
print(text_hat)
@ -69,7 +69,7 @@ def test_phoneme_to_sequence():
text = "Be a voice, not an! echo. "
sequence = phoneme_to_sequence(text, text_cleaner, lang, True)
text_hat = sequence_to_phoneme(sequence)
sequence_with_params = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters)
_ = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters)
text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters)
gt = "^biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ.~"
print(text_hat)
@ -80,7 +80,7 @@ def test_phoneme_to_sequence():
text = "_Be a _voice, not an! echo_"
sequence = phoneme_to_sequence(text, text_cleaner, lang)
text_hat = sequence_to_phoneme(sequence)
sequence_with_params = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters)
_ = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters)
text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters)
gt = "biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ"
print(text_hat)

View File

@ -11,4 +11,3 @@ def test_melgan_generator():
assert np.all(output.shape == (4, 1, 64 * 256))
output = model.inference(dummy_input)
assert np.all(output.shape == (4, 1, (64 + 4) * 256))

View File

@ -25,4 +25,3 @@ def test_pqmf():
print(w2_.min())
print(w2_.mean())
sf.write('pqmf_output.wav', w2_.flatten().detach(), sr)

View File

@ -26,4 +26,3 @@ def test_pqmf():
print(w2_.min())
print(w2_.mean())
sf.write('tf_pqmf_output.wav', w2_.flatten(), sr)