mass linter fix

This commit is contained in:
erogol 2020-08-04 14:07:47 +02:00
parent f35504f187
commit e386caa071
62 changed files with 153 additions and 182 deletions

View File

@ -30,4 +30,3 @@ model = load_checkpoint(model, args.tf_model)
# create tflite model # create tflite model
tflite_model = convert_melgan_to_tflite(model, output_path=args.output_path) tflite_model = convert_melgan_to_tflite(model, output_path=args.output_path)

View File

@ -114,4 +114,3 @@ assert compare_torch_tf(output_torch, output_tf) < 1e-5, compare_torch_tf(
save_checkpoint(model_tf, checkpoint['step'], checkpoint['epoch'], save_checkpoint(model_tf, checkpoint['step'], checkpoint['epoch'],
args.output_path) args.output_path)
print(' > Model conversion is successfully completed :).') print(' > Model conversion is successfully completed :).')

View File

@ -92,7 +92,7 @@ var_map = [
# %% # %%
# get tf_model graph # get tf_model graph
mel_pred = model_tf.build_inference() model_tf.build_inference()
# get tf variables # get tf variables
tf_vars = model_tf.weights tf_vars = model_tf.weights

View File

@ -40,8 +40,6 @@ def tts(model, vocoder_model, text, CONFIG, use_cuda, ap, use_gl, speaker_id):
if __name__ == "__main__": if __name__ == "__main__":
global symbols, phonemes
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument('text', type=str, help='Text to generate speech.') parser.add_argument('text', type=str, help='Text to generate speech.')
parser.add_argument('config_path', parser.add_argument('config_path',

View File

@ -9,6 +9,8 @@ import traceback
import torch import torch
from torch.utils.data import DataLoader from torch.utils.data import DataLoader
from mozilla_voice_tts.generic_utils import count_parameters
from mozilla_voice_tts.speaker_encoder.dataset import MyDataset from mozilla_voice_tts.speaker_encoder.dataset import MyDataset
from mozilla_voice_tts.speaker_encoder.generic_utils import save_best_model from mozilla_voice_tts.speaker_encoder.generic_utils import save_best_model
from mozilla_voice_tts.speaker_encoder.loss import GE2ELoss from mozilla_voice_tts.speaker_encoder.loss import GE2ELoss
@ -16,10 +18,9 @@ from mozilla_voice_tts.speaker_encoder.model import SpeakerEncoder
from mozilla_voice_tts.speaker_encoder.visual import plot_embeddings from mozilla_voice_tts.speaker_encoder.visual import plot_embeddings
from mozilla_voice_tts.tts.datasets.preprocess import load_meta_data from mozilla_voice_tts.tts.datasets.preprocess import load_meta_data
from mozilla_voice_tts.tts.utils.audio import AudioProcessor from mozilla_voice_tts.tts.utils.audio import AudioProcessor
from mozilla_voice_tts.tts.utils.generic_utils import (create_experiment_folder, from mozilla_voice_tts.tts.utils.generic_utils import (
get_git_branch, create_experiment_folder, get_git_branch, remove_experiment_folder,
remove_experiment_folder, set_init_dict)
set_init_dict)
from mozilla_voice_tts.tts.utils.io import copy_config_file, load_config from mozilla_voice_tts.tts.utils.io import copy_config_file, load_config
from mozilla_voice_tts.tts.utils.radam import RAdam from mozilla_voice_tts.tts.utils.radam import RAdam
from mozilla_voice_tts.tts.utils.tensorboard_logger import TensorboardLogger from mozilla_voice_tts.tts.utils.tensorboard_logger import TensorboardLogger
@ -182,8 +183,8 @@ def main(args): # pylint: disable=redefined-outer-name
meta_data_train, meta_data_eval = load_meta_data(c.datasets) meta_data_train, meta_data_eval = load_meta_data(c.datasets)
global_step = args.restore_step global_step = args.restore_step
train_loss, global_step = train(model, criterion, optimizer, scheduler, ap, _, global_step = train(model, criterion, optimizer, scheduler, ap,
global_step) global_step)
if __name__ == '__main__': if __name__ == '__main__':

View File

@ -11,31 +11,40 @@ import traceback
import numpy as np import numpy as np
import torch import torch
from torch.utils.data import DataLoader from torch.utils.data import DataLoader
from mozilla_voice_tts.tts.datasets.preprocess import load_meta_data from mozilla_voice_tts.tts.datasets.preprocess import load_meta_data
from mozilla_voice_tts.tts.datasets.TTSDataset import MyDataset from mozilla_voice_tts.tts.datasets.TTSDataset import MyDataset
from mozilla_voice_tts.tts.layers.losses import TacotronLoss from mozilla_voice_tts.tts.layers.losses import TacotronLoss
from mozilla_voice_tts.tts.utils.distribute import (DistributedSampler, from mozilla_voice_tts.tts.utils.distribute import (DistributedSampler,
apply_gradient_allreduce, apply_gradient_allreduce,
init_distributed, reduce_tensor) init_distributed,
reduce_tensor)
from mozilla_voice_tts.tts.utils.generic_utils import check_config, setup_model from mozilla_voice_tts.tts.utils.generic_utils import check_config, setup_model
from mozilla_voice_tts.tts.utils.io import save_best_model, save_checkpoint from mozilla_voice_tts.tts.utils.io import save_best_model, save_checkpoint
from mozilla_voice_tts.tts.utils.measures import alignment_diagonal_score from mozilla_voice_tts.tts.utils.measures import alignment_diagonal_score
from mozilla_voice_tts.tts.utils.speakers import (get_speakers, load_speaker_mapping, from mozilla_voice_tts.tts.utils.speakers import (get_speakers,
save_speaker_mapping) load_speaker_mapping,
save_speaker_mapping)
from mozilla_voice_tts.tts.utils.synthesis import synthesis from mozilla_voice_tts.tts.utils.synthesis import synthesis
from mozilla_voice_tts.tts.utils.text.symbols import make_symbols, phonemes, symbols from mozilla_voice_tts.tts.utils.text.symbols import (make_symbols, phonemes,
symbols)
from mozilla_voice_tts.tts.utils.visual import plot_alignment, plot_spectrogram from mozilla_voice_tts.tts.utils.visual import plot_alignment, plot_spectrogram
from mozilla_voice_tts.utils.audio import AudioProcessor from mozilla_voice_tts.utils.audio import AudioProcessor
from mozilla_voice_tts.utils.console_logger import ConsoleLogger from mozilla_voice_tts.utils.console_logger import ConsoleLogger
from mozilla_voice_tts.utils.generic_utils import (KeepAverage, count_parameters, from mozilla_voice_tts.utils.generic_utils import (KeepAverage,
create_experiment_folder, get_git_branch, count_parameters,
remove_experiment_folder, set_init_dict) create_experiment_folder,
get_git_branch,
remove_experiment_folder,
set_init_dict)
from mozilla_voice_tts.utils.io import copy_config_file, load_config from mozilla_voice_tts.utils.io import copy_config_file, load_config
from mozilla_voice_tts.utils.radam import RAdam from mozilla_voice_tts.utils.radam import RAdam
from mozilla_voice_tts.utils.tensorboard_logger import TensorboardLogger from mozilla_voice_tts.utils.tensorboard_logger import TensorboardLogger
from mozilla_voice_tts.utils.training import (NoamLR, adam_weight_decay, check_update, from mozilla_voice_tts.utils.training import (NoamLR, adam_weight_decay,
gradual_training_scheduler, set_weight_decay, check_update,
setup_torch_training_env) gradual_training_scheduler,
set_weight_decay,
setup_torch_training_env)
use_cuda, num_gpus = setup_torch_training_env(True, False) use_cuda, num_gpus = setup_torch_training_env(True, False)
@ -47,7 +56,7 @@ def setup_loader(ap, r, is_val=False, verbose=False):
dataset = MyDataset( dataset = MyDataset(
r, r,
c.text_cleaner, c.text_cleaner,
compute_linear_spec=True if c.model.lower() == 'tacotron' else False, compute_linear_spec=c.model.lower() == 'tacotron',
meta_data=meta_data_eval if is_val else meta_data_train, meta_data=meta_data_eval if is_val else meta_data_train,
ap=ap, ap=ap,
tp=c.characters if 'characters' in c.keys() else None, tp=c.characters if 'characters' in c.keys() else None,
@ -156,7 +165,7 @@ def train(model, criterion, optimizer, optimizer_st, scheduler,
decoder_backward_output = None decoder_backward_output = None
alignments_backward = None alignments_backward = None
# set the alignment lengths wrt reduction factor for guided attention # set the [alignment] lengths wrt reduction factor for guided attention
if mel_lengths.max() % model.decoder.r != 0: if mel_lengths.max() % model.decoder.r != 0:
alignment_lengths = (mel_lengths + (model.decoder.r - (mel_lengths.max() % model.decoder.r))) // model.decoder.r alignment_lengths = (mel_lengths + (model.decoder.r - (mel_lengths.max() % model.decoder.r))) // model.decoder.r
else: else:
@ -171,7 +180,7 @@ def train(model, criterion, optimizer, optimizer_st, scheduler,
# backward pass # backward pass
if amp is not None: if amp is not None:
with amp.scale_loss( loss_dict['loss'], optimizer) as scaled_loss: with amp.scale_loss(loss_dict['loss'], optimizer) as scaled_loss:
scaled_loss.backward() scaled_loss.backward()
else: else:
loss_dict['loss'].backward() loss_dict['loss'].backward()
@ -425,7 +434,7 @@ def evaluate(model, criterion, ap, global_step, epoch):
style_wav = c.get("style_wav_for_test") style_wav = c.get("style_wav_for_test")
for idx, test_sentence in enumerate(test_sentences): for idx, test_sentence in enumerate(test_sentences):
try: try:
wav, alignment, decoder_output, postnet_output, stop_tokens, inputs = synthesis( wav, alignment, decoder_output, postnet_output, stop_tokens, _ = synthesis(
model, model,
test_sentence, test_sentence,
c, c,
@ -448,7 +457,7 @@ def evaluate(model, criterion, ap, global_step, epoch):
postnet_output, ap, output_fig=False) postnet_output, ap, output_fig=False)
test_figures['{}-alignment'.format(idx)] = plot_alignment( test_figures['{}-alignment'.format(idx)] = plot_alignment(
alignment, output_fig=False) alignment, output_fig=False)
except: except: #pylint: disable=bare-except
print(" !! Error creating Test Sentence -", idx) print(" !! Error creating Test Sentence -", idx)
traceback.print_exc() traceback.print_exc()
tb_logger.tb_test_audios(global_step, test_audios, tb_logger.tb_test_audios(global_step, test_audios,
@ -531,7 +540,7 @@ def main(args): # pylint: disable=redefined-outer-name
if c.reinit_layers: if c.reinit_layers:
raise RuntimeError raise RuntimeError
model.load_state_dict(checkpoint['model']) model.load_state_dict(checkpoint['model'])
except: except KeyError:
print(" > Partial model initialization.") print(" > Partial model initialization.")
model_dict = model.state_dict() model_dict = model.state_dict()
model_dict = set_init_dict(model_dict, checkpoint['model'], c) model_dict = set_init_dict(model_dict, checkpoint['model'], c)

View File

@ -8,23 +8,30 @@ from inspect import signature
import torch import torch
from torch.utils.data import DataLoader from torch.utils.data import DataLoader
from mozilla_voice_tts.utils.audio import AudioProcessor from mozilla_voice_tts.utils.audio import AudioProcessor
from mozilla_voice_tts.utils.console_logger import ConsoleLogger from mozilla_voice_tts.utils.console_logger import ConsoleLogger
from mozilla_voice_tts.utils.generic_utils import (KeepAverage, count_parameters, from mozilla_voice_tts.utils.generic_utils import (KeepAverage,
create_experiment_folder, get_git_branch, count_parameters,
remove_experiment_folder, set_init_dict) create_experiment_folder,
get_git_branch,
remove_experiment_folder,
set_init_dict)
from mozilla_voice_tts.utils.io import copy_config_file, load_config from mozilla_voice_tts.utils.io import copy_config_file, load_config
from mozilla_voice_tts.utils.radam import RAdam from mozilla_voice_tts.utils.radam import RAdam
from mozilla_voice_tts.utils.tensorboard_logger import TensorboardLogger from mozilla_voice_tts.utils.tensorboard_logger import TensorboardLogger
from mozilla_voice_tts.utils.training import setup_torch_training_env from mozilla_voice_tts.utils.training import setup_torch_training_env
from mozilla_voice_tts.vocoder.datasets.gan_dataset import GANDataset from mozilla_voice_tts.vocoder.datasets.gan_dataset import GANDataset
from mozilla_voice_tts.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data from mozilla_voice_tts.vocoder.datasets.preprocess import (load_wav_data,
load_wav_feat_data)
# from distribute import (DistributedSampler, apply_gradient_allreduce, # from distribute import (DistributedSampler, apply_gradient_allreduce,
# init_distributed, reduce_tensor) # init_distributed, reduce_tensor)
from mozilla_voice_tts.vocoder.layers.losses import DiscriminatorLoss, GeneratorLoss from mozilla_voice_tts.vocoder.layers.losses import (DiscriminatorLoss,
from mozilla_voice_tts.vocoder.utils.generic_utils import (check_config, plot_results, GeneratorLoss)
setup_discriminator, from mozilla_voice_tts.vocoder.utils.generic_utils import (check_config,
setup_generator) plot_results,
setup_discriminator,
setup_generator)
from mozilla_voice_tts.vocoder.utils.io import save_best_model, save_checkpoint from mozilla_voice_tts.vocoder.utils.io import save_best_model, save_checkpoint
use_cuda, num_gpus = setup_torch_training_env(True, True) use_cuda, num_gpus = setup_torch_training_env(True, True)

View File

@ -4,7 +4,6 @@ import time
import numpy as np import numpy as np
import torch import torch
import yaml
import pysbd import pysbd
from mozilla_voice_tts.utils.audio import AudioProcessor from mozilla_voice_tts.utils.audio import AudioProcessor

View File

@ -10,7 +10,7 @@ Below is an example showing embedding results of various speakers. You can gener
Download a pretrained model from [Released Models](https://github.com/mozilla/TTS/wiki/Released-Models) page. Download a pretrained model from [Released Models](https://github.com/mozilla/TTS/wiki/Released-Models) page.
To run the code, you need to follow the same flow as in TTS. To run the code, you need to follow the same flow as in mozilla_voice_tts.
- Define 'config.json' for your needs. Note that, audio parameters should match your TTS model. - Define 'config.json' for your needs. Note that, audio parameters should match your TTS model.
- Example training call ```python speaker_encoder/train.py --config_path speaker_encoder/config.json --data_path ~/Data/Libri-TTS/train-clean-360``` - Example training call ```python speaker_encoder/train.py --config_path speaker_encoder/config.json --data_path ~/Data/Libri-TTS/train-clean-360```

View File

@ -85,4 +85,3 @@ class SpeakerEncoder(nn.Module):
frames[cur_iter <= num_iters, :, :] frames[cur_iter <= num_iters, :, :]
) )
return embed / num_iters return embed / num_iters

View File

@ -1,6 +1,5 @@
import torch import torch
from torch import nn from torch import nn
from torch.autograd import Variable
from torch.nn import functional as F from torch.nn import functional as F
@ -52,6 +51,7 @@ class LinearBN(nn.Module):
class Prenet(nn.Module): class Prenet(nn.Module):
# pylint: disable=dangerous-default-value
def __init__(self, def __init__(self,
in_features, in_features,
prenet_type="original", prenet_type="original",
@ -300,8 +300,8 @@ class OriginalAttention(nn.Module):
def apply_forward_attention(self, alignment): def apply_forward_attention(self, alignment):
# forward attention # forward attention
fwd_shifted_alpha = F.pad(self.alpha[:, :-1].clone().to(alignment.device), fwd_shifted_alpha = F.pad(
(1, 0, 0, 0)) self.alpha[:, :-1].clone().to(alignment.device), (1, 0, 0, 0))
# compute transition potentials # compute transition potentials
alpha = ((1 - self.u) * self.alpha alpha = ((1 - self.u) * self.alpha
+ self.u * fwd_shifted_alpha + self.u * fwd_shifted_alpha
@ -309,7 +309,7 @@ class OriginalAttention(nn.Module):
# force incremental alignment # force incremental alignment
if not self.training and self.forward_attn_mask: if not self.training and self.forward_attn_mask:
_, n = fwd_shifted_alpha.max(1) _, n = fwd_shifted_alpha.max(1)
val, n2 = alpha.max(1) val, _ = alpha.max(1)
for b in range(alignment.shape[0]): for b in range(alignment.shape[0]):
alpha[b, n[b] + 3:] = 0 alpha[b, n[b] + 3:] = 0
alpha[b, :( alpha[b, :(

View File

@ -72,7 +72,7 @@ class ReferenceEncoder(nn.Module):
# x: 3D tensor [batch_size, post_conv_width, # x: 3D tensor [batch_size, post_conv_width,
# num_channels*post_conv_height] # num_channels*post_conv_height]
self.recurrence.flatten_parameters() self.recurrence.flatten_parameters()
memory, out = self.recurrence(x) _, out = self.recurrence(x)
# out: 3D tensor [seq_len==1, batch_size, encoding_size=128] # out: 3D tensor [seq_len==1, batch_size, encoding_size=128]
return out.squeeze(0) return out.squeeze(0)

View File

@ -243,4 +243,3 @@ class TacotronLoss(torch.nn.Module):
return_dict['loss'] = loss return_dict['loss'] = loss
return return_dict return return_dict

View File

@ -1,7 +1,7 @@
# coding: utf-8 # coding: utf-8
import torch import torch
from torch import nn from torch import nn
from .common_layers import Prenet, init_attn, Linear from .common_layers import Prenet, init_attn
class BatchNormConv1d(nn.Module): class BatchNormConv1d(nn.Module):
@ -46,9 +46,9 @@ class BatchNormConv1d(nn.Module):
# self.init_layers() # self.init_layers()
def init_layers(self): def init_layers(self):
if type(self.activation) == torch.nn.ReLU: if isinstance(self.activation, torch.nn.ReLU):
w_gain = 'relu' w_gain = 'relu'
elif type(self.activation) == torch.nn.Tanh: elif isinstance(self.activation, torch.nn.Tanh):
w_gain = 'tanh' w_gain = 'tanh'
elif self.activation is None: elif self.activation is None:
w_gain = 'linear' w_gain = 'linear'
@ -117,7 +117,7 @@ class CBHG(nn.Module):
- input: (B, C, T_in) - input: (B, C, T_in)
- output: (B, T_in, C*2) - output: (B, T_in, C*2)
""" """
#pylint: disable=dangerous-default-value
def __init__(self, def __init__(self,
in_features, in_features,
K=16, K=16,
@ -355,7 +355,6 @@ class Decoder(nn.Module):
Initialization of decoder states Initialization of decoder states
""" """
B = inputs.size(0) B = inputs.size(0)
T = inputs.size(1)
# go frame as zeros matrix # go frame as zeros matrix
if self.use_memory_queue: if self.use_memory_queue:
self.memory_input = torch.zeros(1, device=inputs.device).repeat(B, self.frame_channels * self.memory_size) self.memory_input = torch.zeros(1, device=inputs.device).repeat(B, self.frame_channels * self.memory_size)
@ -496,7 +495,7 @@ class Decoder(nn.Module):
if t > inputs.shape[1] / 4 and (stop_token > 0.6 if t > inputs.shape[1] / 4 and (stop_token > 0.6
or attention[:, -1].item() > 0.6): or attention[:, -1].item() > 0.6):
break break
elif t > self.max_decoder_steps: if t > self.max_decoder_steps:
print(" | > Decoder stopped with 'max_decoder_steps") print(" | > Decoder stopped with 'max_decoder_steps")
break break
return self._parse_outputs(outputs, attentions, stop_tokens) return self._parse_outputs(outputs, attentions, stop_tokens)

View File

@ -1,10 +1,11 @@
import torch import torch
from torch.autograd import Variable
from torch import nn from torch import nn
from torch.nn import functional as F from torch.nn import functional as F
from .common_layers import init_attn, Prenet, Linear from .common_layers import init_attn, Prenet, Linear
# NOTE: linter has a problem with the current TF release
#pylint: disable=no-value-for-parameter
#pylint: disable=unexpected-keyword-arg
class ConvBNBlock(nn.Module): class ConvBNBlock(nn.Module):
r"""Convolutions with Batch Normalization and non-linear activation. r"""Convolutions with Batch Normalization and non-linear activation.
@ -156,6 +157,7 @@ class Decoder(nn.Module):
self.separate_stopnet = separate_stopnet self.separate_stopnet = separate_stopnet
self.max_decoder_steps = 1000 self.max_decoder_steps = 1000
self.stop_threshold = 0.5 self.stop_threshold = 0.5
self.speaker_embedding_dim = speaker_embedding_dim
# model dimensions # model dimensions
self.query_dim = 1024 self.query_dim = 1024
@ -211,8 +213,8 @@ class Decoder(nn.Module):
def get_go_frame(self, inputs): def get_go_frame(self, inputs):
B = inputs.size(0) B = inputs.size(0)
memory = torch.zeros(1, device=inputs.device).repeat(B, memory = torch.zeros(1, device=inputs.device).repeat(
self.frame_channels * self.r) B, self.frame_channels * self.r)
return memory return memory
def _init_states(self, inputs, mask, keep_states=False): def _init_states(self, inputs, mask, keep_states=False):
@ -393,7 +395,6 @@ class Decoder(nn.Module):
self.attention.init_win_idx() self.attention.init_win_idx()
self.attention.init_states(inputs) self.attention.init_states(inputs)
outputs, stop_tokens, alignments, t = [], [], [], 0 outputs, stop_tokens, alignments, t = [], [], [], 0
stop_flags = [True, False, False]
while True: while True:
memory = self.prenet(self.memory_truncated) memory = self.prenet(self.memory_truncated)
decoder_output, alignment, stop_token = self.decode(memory) decoder_output, alignment, stop_token = self.decode(memory)

View File

@ -3,6 +3,9 @@ from tensorflow import keras
from tensorflow.python.ops import math_ops from tensorflow.python.ops import math_ops
# from tensorflow_addons.seq2seq import BahdanauAttention # from tensorflow_addons.seq2seq import BahdanauAttention
# NOTE: linter has a problem with the current TF release
#pylint: disable=no-value-for-parameter
#pylint: disable=unexpected-keyword-arg
class Linear(keras.layers.Layer): class Linear(keras.layers.Layer):
def __init__(self, units, use_bias, **kwargs): def __init__(self, units, use_bias, **kwargs):

View File

@ -4,7 +4,9 @@ from mozilla_voice_tts.tts.tf.utils.tf_utils import shape_list
from mozilla_voice_tts.tts.tf.layers.common_layers import Prenet, Attention from mozilla_voice_tts.tts.tf.layers.common_layers import Prenet, Attention
# from tensorflow_addons.seq2seq import AttentionWrapper # from tensorflow_addons.seq2seq import AttentionWrapper
# NOTE: linter has a problem with the current TF release
#pylint: disable=no-value-for-parameter
#pylint: disable=unexpected-keyword-arg
class ConvBNBlock(keras.layers.Layer): class ConvBNBlock(keras.layers.Layer):
def __init__(self, filters, kernel_size, activation, **kwargs): def __init__(self, filters, kernel_size, activation, **kwargs):
super(ConvBNBlock, self).__init__(**kwargs) super(ConvBNBlock, self).__init__(**kwargs)

View File

@ -5,7 +5,7 @@ from mozilla_voice_tts.tts.tf.layers.tacotron2 import Encoder, Decoder, Postnet
from mozilla_voice_tts.tts.tf.utils.tf_utils import shape_list from mozilla_voice_tts.tts.tf.utils.tf_utils import shape_list
#pylint: disable=too-many-ancestors #pylint: disable=too-many-ancestors, abstract-method
class Tacotron2(keras.models.Model): class Tacotron2(keras.models.Model):
def __init__(self, def __init__(self,
num_chars, num_chars,
@ -105,4 +105,3 @@ class Tacotron2(keras.models.Model):
# TODO: issue https://github.com/PyCQA/pylint/issues/3613 # TODO: issue https://github.com/PyCQA/pylint/issues/3613
input_ids = tf.random.uniform(shape=[1, 4], maxval=10, dtype=tf.int32) #pylint: disable=unexpected-keyword-arg input_ids = tf.random.uniform(shape=[1, 4], maxval=10, dtype=tf.int32) #pylint: disable=unexpected-keyword-arg
self(input_ids) self(input_ids)

View File

@ -1,6 +1,9 @@
import numpy as np import numpy as np
import tensorflow as tf import tensorflow as tf
# NOTE: linter has a problem with the current TF release
#pylint: disable=no-value-for-parameter
#pylint: disable=unexpected-keyword-arg
def tf_create_dummy_inputs(): def tf_create_dummy_inputs():
""" Create dummy inputs for TF Tacotron2 model """ """ Create dummy inputs for TF Tacotron2 model """

View File

@ -1,4 +1,3 @@
import os
import datetime import datetime
import importlib import importlib
import pickle import pickle

View File

@ -39,4 +39,3 @@ def load_tflite_model(tflite_path):
tflite_model = tf.lite.Interpreter(model_path=tflite_path) tflite_model = tf.lite.Interpreter(model_path=tflite_path)
tflite_model.allocate_tensors() tflite_model.allocate_tensors()
return tflite_model return tflite_model

View File

@ -74,4 +74,3 @@ class StandardScaler():
X *= self.scale_ X *= self.scale_
X += self.mean_ X += self.mean_
return X return X

View File

@ -1,15 +1,11 @@
# edited from https://github.com/fastai/imagenet-fast/blob/master/imagenet_nv/distributed.py # edited from https://github.com/fastai/imagenet-fast/blob/master/imagenet_nv/distributed.py
import os, sys
import math import math
import time
import subprocess
import argparse
import torch import torch
import torch.distributed as dist import torch.distributed as dist
from torch.utils.data.sampler import Sampler
from torch.autograd import Variable
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
from mozilla_voice_tts.utils.generic_utils import create_experiment_folder from torch.autograd import Variable
from torch.utils.data.sampler import Sampler
class DistributedSampler(Sampler): class DistributedSampler(Sampler):
@ -108,7 +104,7 @@ def apply_gradient_allreduce(module):
for param in list(module.parameters()): for param in list(module.parameters()):
def allreduce_hook(*_): def allreduce_hook(*_):
Variable._execution_engine.queue_callback(allreduce_params) Variable._execution_engine.queue_callback(allreduce_params) #pylint: disable=protected-access
if param.requires_grad: if param.requires_grad:
param.register_hook(allreduce_hook) param.register_hook(allreduce_hook)

View File

@ -3,7 +3,7 @@ import torch
import datetime import datetime
def load_checkpoint(model, checkpoint_path, use_cuda=False): def load_checkpoint(model, checkpoint_path, amp=None, use_cuda=False):
state = torch.load(checkpoint_path, map_location=torch.device('cpu')) state = torch.load(checkpoint_path, map_location=torch.device('cpu'))
model.load_state_dict(state['model']) model.load_state_dict(state['model'])
if amp and 'amp' in state: if amp and 'amp' in state:

View File

@ -1,6 +1,3 @@
import torch
def alignment_diagonal_score(alignments, binary=False): def alignment_diagonal_score(alignments, binary=False):
""" """
Compute how diagonal alignment predictions are. It is useful Compute how diagonal alignment predictions are. It is useful

View File

@ -1,8 +1,6 @@
import os import os
import json import json
from mozilla_voice_tts.tts.datasets.preprocess import get_preprocessor_by_name
def make_speakers_json_path(out_path): def make_speakers_json_path(out_path):
"""Returns conventional speakers.json location.""" """Returns conventional speakers.json location."""

View File

@ -8,6 +8,7 @@ from mozilla_voice_tts.tts.utils.text import cleaners
from mozilla_voice_tts.tts.utils.text.symbols import make_symbols, symbols, phonemes, _phoneme_punctuations, _bos, \ from mozilla_voice_tts.tts.utils.text.symbols import make_symbols, symbols, phonemes, _phoneme_punctuations, _bos, \
_eos _eos
# pylint: disable=unnecessary-comprehension
# Mappings from symbol to numeric ID and vice versa: # Mappings from symbol to numeric ID and vice versa:
_symbol_to_id = {s: i for i, s in enumerate(symbols)} _symbol_to_id = {s: i for i, s in enumerate(symbols)}
_id_to_symbol = {i: s for i, s in enumerate(symbols)} _id_to_symbol = {i: s for i, s in enumerate(symbols)}

View File

@ -41,7 +41,7 @@ def plot_spectrogram(spectrogram, ap=None, fig_size=(16, 10), output_fig=False):
plt.colorbar() plt.colorbar()
plt.tight_layout() plt.tight_layout()
if not output_fig: if not output_fig:
plt.close() plt.close()
return fig return fig
@ -97,4 +97,4 @@ def visualize(alignment, postnet_output, stop_tokens, text, hop_length, CONFIG,
plt.close() plt.close()
if not output_fig: if not output_fig:
plt.close() plt.close()

View File

@ -52,7 +52,7 @@ class AudioProcessor(object):
self.mel_fmin = mel_fmin or 0 self.mel_fmin = mel_fmin or 0
self.mel_fmax = mel_fmax self.mel_fmax = mel_fmax
self.spec_gain = float(spec_gain) self.spec_gain = float(spec_gain)
self.stft_pad_mode = 'reflect' self.stft_pad_mode = stft_pad_mode
self.max_norm = 1.0 if max_norm is None else float(max_norm) self.max_norm = 1.0 if max_norm is None else float(max_norm)
self.clip_norm = clip_norm self.clip_norm = clip_norm
self.do_trim_silence = do_trim_silence self.do_trim_silence = do_trim_silence
@ -123,7 +123,7 @@ class AudioProcessor(object):
if self.symmetric_norm: if self.symmetric_norm:
S_norm = ((2 * self.max_norm) * S_norm) - self.max_norm S_norm = ((2 * self.max_norm) * S_norm) - self.max_norm
if self.clip_norm: if self.clip_norm:
S_norm = np.clip(S_norm, -self.max_norm, self.max_norm) S_norm = np.clip(S_norm, -self.max_norm, self.max_norm) # pylint: disable=invalid-unary-operand-type
return S_norm return S_norm
else: else:
S_norm = self.max_norm * S_norm S_norm = self.max_norm * S_norm
@ -148,7 +148,7 @@ class AudioProcessor(object):
raise RuntimeError(' [!] Mean-Var stats does not match the given feature dimensions.') raise RuntimeError(' [!] Mean-Var stats does not match the given feature dimensions.')
if self.symmetric_norm: if self.symmetric_norm:
if self.clip_norm: if self.clip_norm:
S_denorm = np.clip(S_denorm, -self.max_norm, self.max_norm) S_denorm = np.clip(S_denorm, -self.max_norm, self.max_norm) #pylint: disable=invalid-unary-operand-type
S_denorm = ((S_denorm + self.max_norm) * -self.min_level_db / (2 * self.max_norm)) + self.min_level_db S_denorm = ((S_denorm + self.max_norm) * -self.min_level_db / (2 * self.max_norm)) + self.min_level_db
return S_denorm + self.ref_level_db return S_denorm + self.ref_level_db
else: else:

View File

@ -2,7 +2,7 @@
import math import math
import torch import torch
from torch.optim.optimizer import Optimizer, required from torch.optim.optimizer import Optimizer
class RAdam(Optimizer): class RAdam(Optimizer):
@ -25,7 +25,7 @@ class RAdam(Optimizer):
defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, buffer=[[None, None, None] for _ in range(10)]) defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, buffer=[[None, None, None] for _ in range(10)])
super(RAdam, self).__init__(params, defaults) super(RAdam, self).__init__(params, defaults)
def __setstate__(self, state): def __setstate__(self, state): # pylint: disable=useless-super-delegation
super(RAdam, self).__setstate__(state) super(RAdam, self).__setstate__(state)
def step(self, closure=None): def step(self, closure=None):

View File

@ -47,7 +47,7 @@ class TensorboardLogger(object):
for key, value in audios.items(): for key, value in audios.items():
try: try:
self.writer.add_audio('{}/{}'.format(scope_name, key), value, step, sample_rate=sample_rate) self.writer.add_audio('{}/{}'.format(scope_name, key), value, step, sample_rate=sample_rate)
except: except RuntimeError:
traceback.print_exc() traceback.print_exc()
def tb_train_iter_stats(self, step, stats): def tb_train_iter_stats(self, step, stats):

View File

@ -95,4 +95,3 @@ class MelganGenerator(nn.Module):
nn.utils.remove_weight_norm(layer) nn.utils.remove_weight_norm(layer)
except ValueError: except ValueError:
layer.remove_weight_norm() layer.remove_weight_norm()

View File

@ -145,6 +145,5 @@ def setup_discriminator(c):
) )
return model return model
# def check_config(c):
def check_config(c): # pass
pass

View File

@ -71,7 +71,7 @@ def process_meta_data(path):
def get_data_points(meta_data): def get_data_points(meta_data):
x = [char_cnt for char_cnt in meta_data] x = meta_data
y_avg = [meta_data[d]['mean'] for d in meta_data] y_avg = [meta_data[d]['mean'] for d in meta_data]
y_mode = [meta_data[d]['mode'] for d in meta_data] y_mode = [meta_data[d]['mode'] for d in meta_data]
y_median = [meta_data[d]['median'] for d in meta_data] y_median = [meta_data[d]['median'] for d in meta_data]

View File

@ -36,7 +36,7 @@ else:
pass pass
class build_py(setuptools.command.build_py.build_py): class build_py(setuptools.command.build_py.build_py): # pylint: disable=too-many-ancestors
def run(self): def run(self):
self.create_version_file() self.create_version_file()
setuptools.command.build_py.build_py.run(self) setuptools.command.build_py.build_py.run(self)

View File

@ -1,35 +0,0 @@
import unittest
import torch as T
from mozilla_voice_tts.tts.utils.generic_utils import save_checkpoint, save_best_model
from mozilla_voice_tts.tts.layers.tacotron import Prenet
OUT_PATH = '/tmp/test.pth.tar'
class ModelSavingTests(unittest.TestCase):
def save_checkpoint_test(self):
# create a dummy model
model = Prenet(128, out_features=[256, 128])
model = T.nn.DataParallel(layer) #FIXME: undefined variable layer
# save the model
save_checkpoint(model, None, 100, OUT_PATH, 1, 1)
# load the model to CPU
model_dict = T.load(
MODEL_PATH, map_location=lambda storage, loc: storage) #FIXME: undefined variable MODEL_PATH
model.load_state_dict(model_dict['model'])
def save_best_model_test(self):
# create a dummy model
model = Prenet(256, out_features=[256, 256])
model = T.nn.DataParallel(layer)
# save the model
save_best_model(model, None, 0, 100, OUT_PATH, 10, 1)
# load the model to CPU
model_dict = T.load(
MODEL_PATH, map_location=lambda storage, loc: storage)
model.load_state_dict(model_dict['model'])

View File

@ -1,7 +1,8 @@
import os import os
import unittest import unittest
from tests import get_tests_path, get_tests_input_path, get_tests_output_path from tests import get_tests_input_path, get_tests_output_path, get_tests_path
from mozilla_voice_tts.utils.audio import AudioProcessor from mozilla_voice_tts.utils.audio import AudioProcessor
from mozilla_voice_tts.utils.io import load_config from mozilla_voice_tts.utils.io import load_config
@ -103,7 +104,7 @@ class TestAudio(unittest.TestCase):
assert (x_old - x).sum() == 0 assert (x_old - x).sum() == 0
# check value range # check value range
assert x_norm.max() <= self.ap.max_norm + 1, x_norm.max() assert x_norm.max() <= self.ap.max_norm + 1, x_norm.max()
assert x_norm.min() >= -self.ap.max_norm - 2, x_norm.min() assert x_norm.min() >= -self.ap.max_norm - 2, x_norm.min() #pylint: disable=invalid-unary-operand-type
assert x_norm.min() <= 0, x_norm.min() assert x_norm.min() <= 0, x_norm.min()
# check denorm. # check denorm.
x_ = self.ap._denormalize(x_norm) x_ = self.ap._denormalize(x_norm)
@ -120,7 +121,7 @@ class TestAudio(unittest.TestCase):
assert (x_old - x).sum() == 0 assert (x_old - x).sum() == 0
# check value range # check value range
assert x_norm.max() <= self.ap.max_norm, x_norm.max() assert x_norm.max() <= self.ap.max_norm, x_norm.max()
assert x_norm.min() >= -self.ap.max_norm, x_norm.min() assert x_norm.min() >= -self.ap.max_norm, x_norm.min() #pylint: disable=invalid-unary-operand-type
assert x_norm.min() <= 0, x_norm.min() assert x_norm.min() <= 0, x_norm.min()
# check denorm. # check denorm.
x_ = self.ap._denormalize(x_norm) x_ = self.ap._denormalize(x_norm)
@ -148,7 +149,7 @@ class TestAudio(unittest.TestCase):
assert (x_old - x).sum() == 0 assert (x_old - x).sum() == 0
assert x_norm.max() <= self.ap.max_norm, x_norm.max() assert x_norm.max() <= self.ap.max_norm, x_norm.max()
assert x_norm.min() >= -self.ap.max_norm, x_norm.min() assert x_norm.min() >= -self.ap.max_norm, x_norm.min() #pylint: disable=invalid-unary-operand-type
assert x_norm.min() < 0, x_norm.min() assert x_norm.min() < 0, x_norm.min()
x_ = self.ap._denormalize(x_norm) x_ = self.ap._denormalize(x_norm)
assert (x - x_).sum() < 1e-3 assert (x - x_).sum() < 1e-3

View File

@ -1,13 +1,13 @@
import os import os
import unittest import unittest
import torch as T from tests import get_tests_input_path, get_tests_output_path
from mozilla_voice_tts.server.synthesizer import Synthesizer from mozilla_voice_tts.server.synthesizer import Synthesizer
from tests import get_tests_input_path, get_tests_output_path
from mozilla_voice_tts.tts.utils.text.symbols import make_symbols, phonemes, symbols
from mozilla_voice_tts.tts.utils.generic_utils import setup_model from mozilla_voice_tts.tts.utils.generic_utils import setup_model
from mozilla_voice_tts.tts.utils.io import save_checkpoint from mozilla_voice_tts.tts.utils.io import save_checkpoint
from mozilla_voice_tts.tts.utils.text.symbols import (make_symbols, phonemes,
symbols)
from mozilla_voice_tts.utils.io import load_config from mozilla_voice_tts.utils.io import load_config

View File

@ -1,13 +1,13 @@
import os import os
import unittest import unittest
import torch as T import torch as T
from tests import get_tests_input_path
from tests import get_tests_path, get_tests_input_path
from mozilla_voice_tts.speaker_encoder.model import SpeakerEncoder
from mozilla_voice_tts.speaker_encoder.loss import GE2ELoss from mozilla_voice_tts.speaker_encoder.loss import GE2ELoss
from mozilla_voice_tts.speaker_encoder.model import SpeakerEncoder
from mozilla_voice_tts.utils.io import load_config from mozilla_voice_tts.utils.io import load_config
file_path = get_tests_input_path() file_path = get_tests_input_path()
c = load_config(os.path.join(file_path, "test_config.json")) c = load_config(os.path.join(file_path, "test_config.json"))

View File

@ -9,7 +9,7 @@ from mozilla_voice_tts.tts.utils.generic_utils import sequence_mask
class PrenetTests(unittest.TestCase): class PrenetTests(unittest.TestCase):
def test_in_out(self): def test_in_out(self): #pylint: disable=no-self-use
layer = Prenet(128, out_features=[256, 128]) layer = Prenet(128, out_features=[256, 128])
dummy_input = T.rand(4, 128) dummy_input = T.rand(4, 128)
@ -104,7 +104,7 @@ class DecoderTests(unittest.TestCase):
class EncoderTests(unittest.TestCase): class EncoderTests(unittest.TestCase):
def test_in_out(self): def test_in_out(self): #pylint: disable=no-self-use
layer = Encoder(128) layer = Encoder(128)
dummy_input = T.rand(4, 8, 128) dummy_input = T.rand(4, 8, 128)
@ -117,7 +117,7 @@ class EncoderTests(unittest.TestCase):
class L1LossMaskedTests(unittest.TestCase): class L1LossMaskedTests(unittest.TestCase):
def test_in_out(self): def test_in_out(self): #pylint: disable=no-self-use
# test input == target # test input == target
layer = L1LossMasked(seq_len_norm=False) layer = L1LossMasked(seq_len_norm=False)
dummy_input = T.ones(4, 8, 128).float() dummy_input = T.ones(4, 8, 128).float()

View File

@ -1,15 +1,16 @@
import os import os
import unittest
import shutil import shutil
import torch import unittest
import numpy as np
from tests import get_tests_path, get_tests_input_path, get_tests_output_path import numpy as np
import torch
from tests import get_tests_input_path, get_tests_output_path
from torch.utils.data import DataLoader from torch.utils.data import DataLoader
from mozilla_voice_tts.utils.io import load_config
from mozilla_voice_tts.utils.audio import AudioProcessor
from mozilla_voice_tts.tts.datasets import TTSDataset from mozilla_voice_tts.tts.datasets import TTSDataset
from mozilla_voice_tts.tts.datasets.preprocess import ljspeech from mozilla_voice_tts.tts.datasets.preprocess import ljspeech
from mozilla_voice_tts.utils.audio import AudioProcessor
from mozilla_voice_tts.utils.io import load_config
#pylint: disable=unused-variable #pylint: disable=unused-variable
@ -32,7 +33,7 @@ class TestTTSDataset(unittest.TestCase):
self.ap = AudioProcessor(**c.audio) self.ap = AudioProcessor(**c.audio)
def _create_dataloader(self, batch_size, r, bgs): def _create_dataloader(self, batch_size, r, bgs):
items = ljspeech(c.data_path,'metadata.csv') items = ljspeech(c.data_path, 'metadata.csv')
dataset = TTSDataset.MyDataset( dataset = TTSDataset.MyDataset(
r, r,
c.text_cleaner, c.text_cleaner,
@ -74,7 +75,7 @@ class TestTTSDataset(unittest.TestCase):
assert check_count == 0, \ assert check_count == 0, \
" !! Negative values in text_input: {}".format(check_count) " !! Negative values in text_input: {}".format(check_count)
# TODO: more assertion here # TODO: more assertion here
assert type(speaker_name[0]) is str assert isinstance(speaker_name[0], str)
assert linear_input.shape[0] == c.batch_size assert linear_input.shape[0] == c.batch_size
assert linear_input.shape[2] == self.ap.fft_size // 2 + 1 assert linear_input.shape[2] == self.ap.fft_size // 2 + 1
assert mel_input.shape[0] == c.batch_size assert mel_input.shape[0] == c.batch_size
@ -82,7 +83,7 @@ class TestTTSDataset(unittest.TestCase):
# check normalization ranges # check normalization ranges
if self.ap.symmetric_norm: if self.ap.symmetric_norm:
assert mel_input.max() <= self.ap.max_norm assert mel_input.max() <= self.ap.max_norm
assert mel_input.min() >= -self.ap.max_norm assert mel_input.min() >= -self.ap.max_norm #pylint: disable=invalid-unary-operand-type
assert mel_input.min() < 0 assert mel_input.min() < 0
else: else:
assert mel_input.max() <= self.ap.max_norm assert mel_input.max() <= self.ap.max_norm

View File

@ -7,7 +7,7 @@ from mozilla_voice_tts.tts.datasets.preprocess import common_voice
class TestPreprocessors(unittest.TestCase): class TestPreprocessors(unittest.TestCase):
def test_common_voice_preprocessor(self): def test_common_voice_preprocessor(self): #pylint: disable=no-self-use
root_path = get_tests_input_path() root_path = get_tests_input_path()
meta_file = "common_voice.tsv" meta_file = "common_voice.tsv"
items = common_voice(root_path, meta_file) items = common_voice(root_path, meta_file)

View File

@ -20,8 +20,8 @@ c = load_config(os.path.join(get_tests_input_path(), 'test_config.json'))
class TacotronTrainTest(unittest.TestCase): class TacotronTrainTest(unittest.TestCase):
def test_train_step(self): def test_train_step(self): # pylint: disable=no-self-use
input = torch.randint(0, 24, (8, 128)).long().to(device) input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
input_lengths = torch.randint(100, 128, (8, )).long().to(device) input_lengths = torch.randint(100, 128, (8, )).long().to(device)
input_lengths = torch.sort(input_lengths, descending=True)[0] input_lengths = torch.sort(input_lengths, descending=True)[0]
mel_spec = torch.rand(8, 30, c.audio['num_mels']).to(device) mel_spec = torch.rand(8, 30, c.audio['num_mels']).to(device)
@ -34,7 +34,7 @@ class TacotronTrainTest(unittest.TestCase):
for idx in mel_lengths: for idx in mel_lengths:
stop_targets[:, int(idx.item()):, 0] = 1.0 stop_targets[:, int(idx.item()):, 0] = 1.0
stop_targets = stop_targets.view(input.shape[0], stop_targets = stop_targets.view(input_dummy.shape[0],
stop_targets.size(1) // c.r, -1) stop_targets.size(1) // c.r, -1)
stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze() stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()
@ -51,7 +51,7 @@ class TacotronTrainTest(unittest.TestCase):
optimizer = optim.Adam(model.parameters(), lr=c.lr) optimizer = optim.Adam(model.parameters(), lr=c.lr)
for i in range(5): for i in range(5):
mel_out, mel_postnet_out, align, stop_tokens = model.forward( mel_out, mel_postnet_out, align, stop_tokens = model.forward(
input, input_lengths, mel_spec, mel_lengths, speaker_ids) input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids)
assert torch.sigmoid(stop_tokens).data.max() <= 1.0 assert torch.sigmoid(stop_tokens).data.max() <= 1.0
assert torch.sigmoid(stop_tokens).data.min() >= 0.0 assert torch.sigmoid(stop_tokens).data.min() >= 0.0
optimizer.zero_grad() optimizer.zero_grad()

View File

@ -1,15 +1,19 @@
import os import os
import torch
import unittest import unittest
import numpy as np import numpy as np
import tensorflow as tf import tensorflow as tf
import torch
from tests import get_tests_input_path
from mozilla_voice_tts.tts.tf.models.tacotron2 import Tacotron2
from mozilla_voice_tts.tts.tf.utils.tflite import (convert_tacotron2_to_tflite,
load_tflite_model)
from mozilla_voice_tts.utils.io import load_config
tf.get_logger().setLevel('INFO') tf.get_logger().setLevel('INFO')
from tests import get_tests_path, get_tests_input_path, get_tests_output_path
from mozilla_voice_tts.utils.io import load_config
from mozilla_voice_tts.tts.tf.models.tacotron2 import Tacotron2
from mozilla_voice_tts.tts.tf.utils.tflite import convert_tacotron2_to_tflite, load_tflite_model
#pylint: disable=unused-variable #pylint: disable=unused-variable
@ -132,4 +136,3 @@ class TacotronTFTrainTest(unittest.TestCase):
postnet_output = tflite_model.get_tensor(output_details[1]['index']) postnet_output = tflite_model.get_tensor(output_details[1]['index'])
# remove tflite binary # remove tflite binary
os.remove('test_tacotron2.tflite') os.remove('test_tacotron2.tflite')

View File

@ -16,7 +16,7 @@ def test_phoneme_to_sequence():
lang = "en-us" lang = "en-us"
sequence = phoneme_to_sequence(text, text_cleaner, lang) sequence = phoneme_to_sequence(text, text_cleaner, lang)
text_hat = sequence_to_phoneme(sequence) text_hat = sequence_to_phoneme(sequence)
sequence_with_params = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters) _ = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters)
text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters) text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters)
gt = "ɹiːsənt ɹɪːtʃ æt hɑːɹvɚd hɐz ʃoʊn mɛdᵻteɪɾɪŋ fɔːɹ æz lɪɾəl æz eɪt wiːks kæn æktʃuːəli ɪnkɹiːs, ðə ɡɹeɪ mæɾɚɹ ɪnðə pɑːɹts ʌvðə bɹeɪn ɹɪspɑːnsəbəl fɔːɹ ɪmoʊʃənəl ɹɛɡjuːleɪʃən ænd lɜːnɪŋ!" gt = "ɹiːsənt ɹɪːtʃ æt hɑːɹvɚd hɐz ʃoʊn mɛdᵻteɪɾɪŋ fɔːɹ æz lɪɾəl æz eɪt wiːks kæn æktʃuːəli ɪnkɹiːs, ðə ɡɹeɪ mæɾɚɹ ɪnðə pɑːɹts ʌvðə bɹeɪn ɹɪspɑːnsəbəl fɔːɹ ɪmoʊʃənəl ɹɛɡjuːleɪʃən ænd lɜːnɪŋ!"
assert text_hat == text_hat_with_params == gt assert text_hat == text_hat_with_params == gt
@ -25,7 +25,7 @@ def test_phoneme_to_sequence():
text = "Be a voice, not an! echo?" text = "Be a voice, not an! echo?"
sequence = phoneme_to_sequence(text, text_cleaner, lang) sequence = phoneme_to_sequence(text, text_cleaner, lang)
text_hat = sequence_to_phoneme(sequence) text_hat = sequence_to_phoneme(sequence)
sequence_with_params = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters) _ = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters)
text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters) text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters)
gt = "biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ?" gt = "biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ?"
print(text_hat) print(text_hat)
@ -36,7 +36,7 @@ def test_phoneme_to_sequence():
text = "Be a voice, not an! echo" text = "Be a voice, not an! echo"
sequence = phoneme_to_sequence(text, text_cleaner, lang) sequence = phoneme_to_sequence(text, text_cleaner, lang)
text_hat = sequence_to_phoneme(sequence) text_hat = sequence_to_phoneme(sequence)
sequence_with_params = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters) _ = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters)
text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters) text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters)
gt = "biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ" gt = "biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ"
print(text_hat) print(text_hat)
@ -47,7 +47,7 @@ def test_phoneme_to_sequence():
text = "Be a voice, not an echo!" text = "Be a voice, not an echo!"
sequence = phoneme_to_sequence(text, text_cleaner, lang) sequence = phoneme_to_sequence(text, text_cleaner, lang)
text_hat = sequence_to_phoneme(sequence) text_hat = sequence_to_phoneme(sequence)
sequence_with_params = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters) _ = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters)
text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters) text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters)
gt = "biː ɐ vɔɪs, nɑːt ɐn ɛkoʊ!" gt = "biː ɐ vɔɪs, nɑːt ɐn ɛkoʊ!"
print(text_hat) print(text_hat)
@ -58,7 +58,7 @@ def test_phoneme_to_sequence():
text = "Be a voice, not an! echo. " text = "Be a voice, not an! echo. "
sequence = phoneme_to_sequence(text, text_cleaner, lang) sequence = phoneme_to_sequence(text, text_cleaner, lang)
text_hat = sequence_to_phoneme(sequence) text_hat = sequence_to_phoneme(sequence)
sequence_with_params = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters) _ = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters)
text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters) text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters)
gt = "biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ." gt = "biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ."
print(text_hat) print(text_hat)
@ -69,7 +69,7 @@ def test_phoneme_to_sequence():
text = "Be a voice, not an! echo. " text = "Be a voice, not an! echo. "
sequence = phoneme_to_sequence(text, text_cleaner, lang, True) sequence = phoneme_to_sequence(text, text_cleaner, lang, True)
text_hat = sequence_to_phoneme(sequence) text_hat = sequence_to_phoneme(sequence)
sequence_with_params = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters) _ = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters)
text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters) text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters)
gt = "^biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ.~" gt = "^biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ.~"
print(text_hat) print(text_hat)
@ -80,7 +80,7 @@ def test_phoneme_to_sequence():
text = "_Be a _voice, not an! echo_" text = "_Be a _voice, not an! echo_"
sequence = phoneme_to_sequence(text, text_cleaner, lang) sequence = phoneme_to_sequence(text, text_cleaner, lang)
text_hat = sequence_to_phoneme(sequence) text_hat = sequence_to_phoneme(sequence)
sequence_with_params = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters) _ = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters)
text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters) text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters)
gt = "biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ" gt = "biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ"
print(text_hat) print(text_hat)

View File

@ -11,4 +11,3 @@ def test_melgan_generator():
assert np.all(output.shape == (4, 1, 64 * 256)) assert np.all(output.shape == (4, 1, 64 * 256))
output = model.inference(dummy_input) output = model.inference(dummy_input)
assert np.all(output.shape == (4, 1, (64 + 4) * 256)) assert np.all(output.shape == (4, 1, (64 + 4) * 256))

View File

@ -25,4 +25,3 @@ def test_pqmf():
print(w2_.min()) print(w2_.min())
print(w2_.mean()) print(w2_.mean())
sf.write('pqmf_output.wav', w2_.flatten().detach(), sr) sf.write('pqmf_output.wav', w2_.flatten().detach(), sr)

View File

@ -26,4 +26,3 @@ def test_pqmf():
print(w2_.min()) print(w2_.min())
print(w2_.mean()) print(w2_.mean())
sf.write('tf_pqmf_output.wav', w2_.flatten(), sr) sf.write('tf_pqmf_output.wav', w2_.flatten(), sr)