mirror of https://github.com/coqui-ai/TTS.git
mass linter fix
This commit is contained in:
parent
f35504f187
commit
e386caa071
|
@ -30,4 +30,3 @@ model = load_checkpoint(model, args.tf_model)
|
||||||
|
|
||||||
# create tflite model
|
# create tflite model
|
||||||
tflite_model = convert_melgan_to_tflite(model, output_path=args.output_path)
|
tflite_model = convert_melgan_to_tflite(model, output_path=args.output_path)
|
||||||
|
|
||||||
|
|
|
@ -114,4 +114,3 @@ assert compare_torch_tf(output_torch, output_tf) < 1e-5, compare_torch_tf(
|
||||||
save_checkpoint(model_tf, checkpoint['step'], checkpoint['epoch'],
|
save_checkpoint(model_tf, checkpoint['step'], checkpoint['epoch'],
|
||||||
args.output_path)
|
args.output_path)
|
||||||
print(' > Model conversion is successfully completed :).')
|
print(' > Model conversion is successfully completed :).')
|
||||||
|
|
||||||
|
|
|
@ -92,7 +92,7 @@ var_map = [
|
||||||
|
|
||||||
# %%
|
# %%
|
||||||
# get tf_model graph
|
# get tf_model graph
|
||||||
mel_pred = model_tf.build_inference()
|
model_tf.build_inference()
|
||||||
|
|
||||||
# get tf variables
|
# get tf variables
|
||||||
tf_vars = model_tf.weights
|
tf_vars = model_tf.weights
|
||||||
|
|
|
@ -40,8 +40,6 @@ def tts(model, vocoder_model, text, CONFIG, use_cuda, ap, use_gl, speaker_id):
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
||||||
global symbols, phonemes
|
|
||||||
|
|
||||||
parser = argparse.ArgumentParser()
|
parser = argparse.ArgumentParser()
|
||||||
parser.add_argument('text', type=str, help='Text to generate speech.')
|
parser.add_argument('text', type=str, help='Text to generate speech.')
|
||||||
parser.add_argument('config_path',
|
parser.add_argument('config_path',
|
||||||
|
|
|
@ -9,6 +9,8 @@ import traceback
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
from torch.utils.data import DataLoader
|
from torch.utils.data import DataLoader
|
||||||
|
|
||||||
|
from mozilla_voice_tts.generic_utils import count_parameters
|
||||||
from mozilla_voice_tts.speaker_encoder.dataset import MyDataset
|
from mozilla_voice_tts.speaker_encoder.dataset import MyDataset
|
||||||
from mozilla_voice_tts.speaker_encoder.generic_utils import save_best_model
|
from mozilla_voice_tts.speaker_encoder.generic_utils import save_best_model
|
||||||
from mozilla_voice_tts.speaker_encoder.loss import GE2ELoss
|
from mozilla_voice_tts.speaker_encoder.loss import GE2ELoss
|
||||||
|
@ -16,10 +18,9 @@ from mozilla_voice_tts.speaker_encoder.model import SpeakerEncoder
|
||||||
from mozilla_voice_tts.speaker_encoder.visual import plot_embeddings
|
from mozilla_voice_tts.speaker_encoder.visual import plot_embeddings
|
||||||
from mozilla_voice_tts.tts.datasets.preprocess import load_meta_data
|
from mozilla_voice_tts.tts.datasets.preprocess import load_meta_data
|
||||||
from mozilla_voice_tts.tts.utils.audio import AudioProcessor
|
from mozilla_voice_tts.tts.utils.audio import AudioProcessor
|
||||||
from mozilla_voice_tts.tts.utils.generic_utils import (create_experiment_folder,
|
from mozilla_voice_tts.tts.utils.generic_utils import (
|
||||||
get_git_branch,
|
create_experiment_folder, get_git_branch, remove_experiment_folder,
|
||||||
remove_experiment_folder,
|
set_init_dict)
|
||||||
set_init_dict)
|
|
||||||
from mozilla_voice_tts.tts.utils.io import copy_config_file, load_config
|
from mozilla_voice_tts.tts.utils.io import copy_config_file, load_config
|
||||||
from mozilla_voice_tts.tts.utils.radam import RAdam
|
from mozilla_voice_tts.tts.utils.radam import RAdam
|
||||||
from mozilla_voice_tts.tts.utils.tensorboard_logger import TensorboardLogger
|
from mozilla_voice_tts.tts.utils.tensorboard_logger import TensorboardLogger
|
||||||
|
@ -182,8 +183,8 @@ def main(args): # pylint: disable=redefined-outer-name
|
||||||
meta_data_train, meta_data_eval = load_meta_data(c.datasets)
|
meta_data_train, meta_data_eval = load_meta_data(c.datasets)
|
||||||
|
|
||||||
global_step = args.restore_step
|
global_step = args.restore_step
|
||||||
train_loss, global_step = train(model, criterion, optimizer, scheduler, ap,
|
_, global_step = train(model, criterion, optimizer, scheduler, ap,
|
||||||
global_step)
|
global_step)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|
|
@ -11,31 +11,40 @@ import traceback
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import torch
|
import torch
|
||||||
from torch.utils.data import DataLoader
|
from torch.utils.data import DataLoader
|
||||||
|
|
||||||
from mozilla_voice_tts.tts.datasets.preprocess import load_meta_data
|
from mozilla_voice_tts.tts.datasets.preprocess import load_meta_data
|
||||||
from mozilla_voice_tts.tts.datasets.TTSDataset import MyDataset
|
from mozilla_voice_tts.tts.datasets.TTSDataset import MyDataset
|
||||||
from mozilla_voice_tts.tts.layers.losses import TacotronLoss
|
from mozilla_voice_tts.tts.layers.losses import TacotronLoss
|
||||||
from mozilla_voice_tts.tts.utils.distribute import (DistributedSampler,
|
from mozilla_voice_tts.tts.utils.distribute import (DistributedSampler,
|
||||||
apply_gradient_allreduce,
|
apply_gradient_allreduce,
|
||||||
init_distributed, reduce_tensor)
|
init_distributed,
|
||||||
|
reduce_tensor)
|
||||||
from mozilla_voice_tts.tts.utils.generic_utils import check_config, setup_model
|
from mozilla_voice_tts.tts.utils.generic_utils import check_config, setup_model
|
||||||
from mozilla_voice_tts.tts.utils.io import save_best_model, save_checkpoint
|
from mozilla_voice_tts.tts.utils.io import save_best_model, save_checkpoint
|
||||||
from mozilla_voice_tts.tts.utils.measures import alignment_diagonal_score
|
from mozilla_voice_tts.tts.utils.measures import alignment_diagonal_score
|
||||||
from mozilla_voice_tts.tts.utils.speakers import (get_speakers, load_speaker_mapping,
|
from mozilla_voice_tts.tts.utils.speakers import (get_speakers,
|
||||||
save_speaker_mapping)
|
load_speaker_mapping,
|
||||||
|
save_speaker_mapping)
|
||||||
from mozilla_voice_tts.tts.utils.synthesis import synthesis
|
from mozilla_voice_tts.tts.utils.synthesis import synthesis
|
||||||
from mozilla_voice_tts.tts.utils.text.symbols import make_symbols, phonemes, symbols
|
from mozilla_voice_tts.tts.utils.text.symbols import (make_symbols, phonemes,
|
||||||
|
symbols)
|
||||||
from mozilla_voice_tts.tts.utils.visual import plot_alignment, plot_spectrogram
|
from mozilla_voice_tts.tts.utils.visual import plot_alignment, plot_spectrogram
|
||||||
from mozilla_voice_tts.utils.audio import AudioProcessor
|
from mozilla_voice_tts.utils.audio import AudioProcessor
|
||||||
from mozilla_voice_tts.utils.console_logger import ConsoleLogger
|
from mozilla_voice_tts.utils.console_logger import ConsoleLogger
|
||||||
from mozilla_voice_tts.utils.generic_utils import (KeepAverage, count_parameters,
|
from mozilla_voice_tts.utils.generic_utils import (KeepAverage,
|
||||||
create_experiment_folder, get_git_branch,
|
count_parameters,
|
||||||
remove_experiment_folder, set_init_dict)
|
create_experiment_folder,
|
||||||
|
get_git_branch,
|
||||||
|
remove_experiment_folder,
|
||||||
|
set_init_dict)
|
||||||
from mozilla_voice_tts.utils.io import copy_config_file, load_config
|
from mozilla_voice_tts.utils.io import copy_config_file, load_config
|
||||||
from mozilla_voice_tts.utils.radam import RAdam
|
from mozilla_voice_tts.utils.radam import RAdam
|
||||||
from mozilla_voice_tts.utils.tensorboard_logger import TensorboardLogger
|
from mozilla_voice_tts.utils.tensorboard_logger import TensorboardLogger
|
||||||
from mozilla_voice_tts.utils.training import (NoamLR, adam_weight_decay, check_update,
|
from mozilla_voice_tts.utils.training import (NoamLR, adam_weight_decay,
|
||||||
gradual_training_scheduler, set_weight_decay,
|
check_update,
|
||||||
setup_torch_training_env)
|
gradual_training_scheduler,
|
||||||
|
set_weight_decay,
|
||||||
|
setup_torch_training_env)
|
||||||
|
|
||||||
use_cuda, num_gpus = setup_torch_training_env(True, False)
|
use_cuda, num_gpus = setup_torch_training_env(True, False)
|
||||||
|
|
||||||
|
@ -47,7 +56,7 @@ def setup_loader(ap, r, is_val=False, verbose=False):
|
||||||
dataset = MyDataset(
|
dataset = MyDataset(
|
||||||
r,
|
r,
|
||||||
c.text_cleaner,
|
c.text_cleaner,
|
||||||
compute_linear_spec=True if c.model.lower() == 'tacotron' else False,
|
compute_linear_spec=c.model.lower() == 'tacotron',
|
||||||
meta_data=meta_data_eval if is_val else meta_data_train,
|
meta_data=meta_data_eval if is_val else meta_data_train,
|
||||||
ap=ap,
|
ap=ap,
|
||||||
tp=c.characters if 'characters' in c.keys() else None,
|
tp=c.characters if 'characters' in c.keys() else None,
|
||||||
|
@ -156,7 +165,7 @@ def train(model, criterion, optimizer, optimizer_st, scheduler,
|
||||||
decoder_backward_output = None
|
decoder_backward_output = None
|
||||||
alignments_backward = None
|
alignments_backward = None
|
||||||
|
|
||||||
# set the alignment lengths wrt reduction factor for guided attention
|
# set the [alignment] lengths wrt reduction factor for guided attention
|
||||||
if mel_lengths.max() % model.decoder.r != 0:
|
if mel_lengths.max() % model.decoder.r != 0:
|
||||||
alignment_lengths = (mel_lengths + (model.decoder.r - (mel_lengths.max() % model.decoder.r))) // model.decoder.r
|
alignment_lengths = (mel_lengths + (model.decoder.r - (mel_lengths.max() % model.decoder.r))) // model.decoder.r
|
||||||
else:
|
else:
|
||||||
|
@ -171,7 +180,7 @@ def train(model, criterion, optimizer, optimizer_st, scheduler,
|
||||||
|
|
||||||
# backward pass
|
# backward pass
|
||||||
if amp is not None:
|
if amp is not None:
|
||||||
with amp.scale_loss( loss_dict['loss'], optimizer) as scaled_loss:
|
with amp.scale_loss(loss_dict['loss'], optimizer) as scaled_loss:
|
||||||
scaled_loss.backward()
|
scaled_loss.backward()
|
||||||
else:
|
else:
|
||||||
loss_dict['loss'].backward()
|
loss_dict['loss'].backward()
|
||||||
|
@ -425,7 +434,7 @@ def evaluate(model, criterion, ap, global_step, epoch):
|
||||||
style_wav = c.get("style_wav_for_test")
|
style_wav = c.get("style_wav_for_test")
|
||||||
for idx, test_sentence in enumerate(test_sentences):
|
for idx, test_sentence in enumerate(test_sentences):
|
||||||
try:
|
try:
|
||||||
wav, alignment, decoder_output, postnet_output, stop_tokens, inputs = synthesis(
|
wav, alignment, decoder_output, postnet_output, stop_tokens, _ = synthesis(
|
||||||
model,
|
model,
|
||||||
test_sentence,
|
test_sentence,
|
||||||
c,
|
c,
|
||||||
|
@ -448,7 +457,7 @@ def evaluate(model, criterion, ap, global_step, epoch):
|
||||||
postnet_output, ap, output_fig=False)
|
postnet_output, ap, output_fig=False)
|
||||||
test_figures['{}-alignment'.format(idx)] = plot_alignment(
|
test_figures['{}-alignment'.format(idx)] = plot_alignment(
|
||||||
alignment, output_fig=False)
|
alignment, output_fig=False)
|
||||||
except:
|
except: #pylint: disable=bare-except
|
||||||
print(" !! Error creating Test Sentence -", idx)
|
print(" !! Error creating Test Sentence -", idx)
|
||||||
traceback.print_exc()
|
traceback.print_exc()
|
||||||
tb_logger.tb_test_audios(global_step, test_audios,
|
tb_logger.tb_test_audios(global_step, test_audios,
|
||||||
|
@ -531,7 +540,7 @@ def main(args): # pylint: disable=redefined-outer-name
|
||||||
if c.reinit_layers:
|
if c.reinit_layers:
|
||||||
raise RuntimeError
|
raise RuntimeError
|
||||||
model.load_state_dict(checkpoint['model'])
|
model.load_state_dict(checkpoint['model'])
|
||||||
except:
|
except KeyError:
|
||||||
print(" > Partial model initialization.")
|
print(" > Partial model initialization.")
|
||||||
model_dict = model.state_dict()
|
model_dict = model.state_dict()
|
||||||
model_dict = set_init_dict(model_dict, checkpoint['model'], c)
|
model_dict = set_init_dict(model_dict, checkpoint['model'], c)
|
||||||
|
|
|
@ -8,23 +8,30 @@ from inspect import signature
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
from torch.utils.data import DataLoader
|
from torch.utils.data import DataLoader
|
||||||
|
|
||||||
from mozilla_voice_tts.utils.audio import AudioProcessor
|
from mozilla_voice_tts.utils.audio import AudioProcessor
|
||||||
from mozilla_voice_tts.utils.console_logger import ConsoleLogger
|
from mozilla_voice_tts.utils.console_logger import ConsoleLogger
|
||||||
from mozilla_voice_tts.utils.generic_utils import (KeepAverage, count_parameters,
|
from mozilla_voice_tts.utils.generic_utils import (KeepAverage,
|
||||||
create_experiment_folder, get_git_branch,
|
count_parameters,
|
||||||
remove_experiment_folder, set_init_dict)
|
create_experiment_folder,
|
||||||
|
get_git_branch,
|
||||||
|
remove_experiment_folder,
|
||||||
|
set_init_dict)
|
||||||
from mozilla_voice_tts.utils.io import copy_config_file, load_config
|
from mozilla_voice_tts.utils.io import copy_config_file, load_config
|
||||||
from mozilla_voice_tts.utils.radam import RAdam
|
from mozilla_voice_tts.utils.radam import RAdam
|
||||||
from mozilla_voice_tts.utils.tensorboard_logger import TensorboardLogger
|
from mozilla_voice_tts.utils.tensorboard_logger import TensorboardLogger
|
||||||
from mozilla_voice_tts.utils.training import setup_torch_training_env
|
from mozilla_voice_tts.utils.training import setup_torch_training_env
|
||||||
from mozilla_voice_tts.vocoder.datasets.gan_dataset import GANDataset
|
from mozilla_voice_tts.vocoder.datasets.gan_dataset import GANDataset
|
||||||
from mozilla_voice_tts.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data
|
from mozilla_voice_tts.vocoder.datasets.preprocess import (load_wav_data,
|
||||||
|
load_wav_feat_data)
|
||||||
# from distribute import (DistributedSampler, apply_gradient_allreduce,
|
# from distribute import (DistributedSampler, apply_gradient_allreduce,
|
||||||
# init_distributed, reduce_tensor)
|
# init_distributed, reduce_tensor)
|
||||||
from mozilla_voice_tts.vocoder.layers.losses import DiscriminatorLoss, GeneratorLoss
|
from mozilla_voice_tts.vocoder.layers.losses import (DiscriminatorLoss,
|
||||||
from mozilla_voice_tts.vocoder.utils.generic_utils import (check_config, plot_results,
|
GeneratorLoss)
|
||||||
setup_discriminator,
|
from mozilla_voice_tts.vocoder.utils.generic_utils import (check_config,
|
||||||
setup_generator)
|
plot_results,
|
||||||
|
setup_discriminator,
|
||||||
|
setup_generator)
|
||||||
from mozilla_voice_tts.vocoder.utils.io import save_best_model, save_checkpoint
|
from mozilla_voice_tts.vocoder.utils.io import save_best_model, save_checkpoint
|
||||||
|
|
||||||
use_cuda, num_gpus = setup_torch_training_env(True, True)
|
use_cuda, num_gpus = setup_torch_training_env(True, True)
|
||||||
|
|
|
@ -4,7 +4,6 @@ import time
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import torch
|
import torch
|
||||||
import yaml
|
|
||||||
import pysbd
|
import pysbd
|
||||||
|
|
||||||
from mozilla_voice_tts.utils.audio import AudioProcessor
|
from mozilla_voice_tts.utils.audio import AudioProcessor
|
||||||
|
|
|
@ -10,7 +10,7 @@ Below is an example showing embedding results of various speakers. You can gener
|
||||||
|
|
||||||
Download a pretrained model from [Released Models](https://github.com/mozilla/TTS/wiki/Released-Models) page.
|
Download a pretrained model from [Released Models](https://github.com/mozilla/TTS/wiki/Released-Models) page.
|
||||||
|
|
||||||
To run the code, you need to follow the same flow as in TTS.
|
To run the code, you need to follow the same flow as in mozilla_voice_tts.
|
||||||
|
|
||||||
- Define 'config.json' for your needs. Note that, audio parameters should match your TTS model.
|
- Define 'config.json' for your needs. Note that, audio parameters should match your TTS model.
|
||||||
- Example training call ```python speaker_encoder/train.py --config_path speaker_encoder/config.json --data_path ~/Data/Libri-TTS/train-clean-360```
|
- Example training call ```python speaker_encoder/train.py --config_path speaker_encoder/config.json --data_path ~/Data/Libri-TTS/train-clean-360```
|
||||||
|
|
|
@ -85,4 +85,3 @@ class SpeakerEncoder(nn.Module):
|
||||||
frames[cur_iter <= num_iters, :, :]
|
frames[cur_iter <= num_iters, :, :]
|
||||||
)
|
)
|
||||||
return embed / num_iters
|
return embed / num_iters
|
||||||
|
|
||||||
|
|
|
@ -1,6 +1,5 @@
|
||||||
import torch
|
import torch
|
||||||
from torch import nn
|
from torch import nn
|
||||||
from torch.autograd import Variable
|
|
||||||
from torch.nn import functional as F
|
from torch.nn import functional as F
|
||||||
|
|
||||||
|
|
||||||
|
@ -52,6 +51,7 @@ class LinearBN(nn.Module):
|
||||||
|
|
||||||
|
|
||||||
class Prenet(nn.Module):
|
class Prenet(nn.Module):
|
||||||
|
# pylint: disable=dangerous-default-value
|
||||||
def __init__(self,
|
def __init__(self,
|
||||||
in_features,
|
in_features,
|
||||||
prenet_type="original",
|
prenet_type="original",
|
||||||
|
@ -300,8 +300,8 @@ class OriginalAttention(nn.Module):
|
||||||
|
|
||||||
def apply_forward_attention(self, alignment):
|
def apply_forward_attention(self, alignment):
|
||||||
# forward attention
|
# forward attention
|
||||||
fwd_shifted_alpha = F.pad(self.alpha[:, :-1].clone().to(alignment.device),
|
fwd_shifted_alpha = F.pad(
|
||||||
(1, 0, 0, 0))
|
self.alpha[:, :-1].clone().to(alignment.device), (1, 0, 0, 0))
|
||||||
# compute transition potentials
|
# compute transition potentials
|
||||||
alpha = ((1 - self.u) * self.alpha
|
alpha = ((1 - self.u) * self.alpha
|
||||||
+ self.u * fwd_shifted_alpha
|
+ self.u * fwd_shifted_alpha
|
||||||
|
@ -309,7 +309,7 @@ class OriginalAttention(nn.Module):
|
||||||
# force incremental alignment
|
# force incremental alignment
|
||||||
if not self.training and self.forward_attn_mask:
|
if not self.training and self.forward_attn_mask:
|
||||||
_, n = fwd_shifted_alpha.max(1)
|
_, n = fwd_shifted_alpha.max(1)
|
||||||
val, n2 = alpha.max(1)
|
val, _ = alpha.max(1)
|
||||||
for b in range(alignment.shape[0]):
|
for b in range(alignment.shape[0]):
|
||||||
alpha[b, n[b] + 3:] = 0
|
alpha[b, n[b] + 3:] = 0
|
||||||
alpha[b, :(
|
alpha[b, :(
|
||||||
|
|
|
@ -72,7 +72,7 @@ class ReferenceEncoder(nn.Module):
|
||||||
# x: 3D tensor [batch_size, post_conv_width,
|
# x: 3D tensor [batch_size, post_conv_width,
|
||||||
# num_channels*post_conv_height]
|
# num_channels*post_conv_height]
|
||||||
self.recurrence.flatten_parameters()
|
self.recurrence.flatten_parameters()
|
||||||
memory, out = self.recurrence(x)
|
_, out = self.recurrence(x)
|
||||||
# out: 3D tensor [seq_len==1, batch_size, encoding_size=128]
|
# out: 3D tensor [seq_len==1, batch_size, encoding_size=128]
|
||||||
|
|
||||||
return out.squeeze(0)
|
return out.squeeze(0)
|
||||||
|
|
|
@ -243,4 +243,3 @@ class TacotronLoss(torch.nn.Module):
|
||||||
|
|
||||||
return_dict['loss'] = loss
|
return_dict['loss'] = loss
|
||||||
return return_dict
|
return return_dict
|
||||||
|
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
# coding: utf-8
|
# coding: utf-8
|
||||||
import torch
|
import torch
|
||||||
from torch import nn
|
from torch import nn
|
||||||
from .common_layers import Prenet, init_attn, Linear
|
from .common_layers import Prenet, init_attn
|
||||||
|
|
||||||
|
|
||||||
class BatchNormConv1d(nn.Module):
|
class BatchNormConv1d(nn.Module):
|
||||||
|
@ -46,9 +46,9 @@ class BatchNormConv1d(nn.Module):
|
||||||
# self.init_layers()
|
# self.init_layers()
|
||||||
|
|
||||||
def init_layers(self):
|
def init_layers(self):
|
||||||
if type(self.activation) == torch.nn.ReLU:
|
if isinstance(self.activation, torch.nn.ReLU):
|
||||||
w_gain = 'relu'
|
w_gain = 'relu'
|
||||||
elif type(self.activation) == torch.nn.Tanh:
|
elif isinstance(self.activation, torch.nn.Tanh):
|
||||||
w_gain = 'tanh'
|
w_gain = 'tanh'
|
||||||
elif self.activation is None:
|
elif self.activation is None:
|
||||||
w_gain = 'linear'
|
w_gain = 'linear'
|
||||||
|
@ -117,7 +117,7 @@ class CBHG(nn.Module):
|
||||||
- input: (B, C, T_in)
|
- input: (B, C, T_in)
|
||||||
- output: (B, T_in, C*2)
|
- output: (B, T_in, C*2)
|
||||||
"""
|
"""
|
||||||
|
#pylint: disable=dangerous-default-value
|
||||||
def __init__(self,
|
def __init__(self,
|
||||||
in_features,
|
in_features,
|
||||||
K=16,
|
K=16,
|
||||||
|
@ -355,7 +355,6 @@ class Decoder(nn.Module):
|
||||||
Initialization of decoder states
|
Initialization of decoder states
|
||||||
"""
|
"""
|
||||||
B = inputs.size(0)
|
B = inputs.size(0)
|
||||||
T = inputs.size(1)
|
|
||||||
# go frame as zeros matrix
|
# go frame as zeros matrix
|
||||||
if self.use_memory_queue:
|
if self.use_memory_queue:
|
||||||
self.memory_input = torch.zeros(1, device=inputs.device).repeat(B, self.frame_channels * self.memory_size)
|
self.memory_input = torch.zeros(1, device=inputs.device).repeat(B, self.frame_channels * self.memory_size)
|
||||||
|
@ -496,7 +495,7 @@ class Decoder(nn.Module):
|
||||||
if t > inputs.shape[1] / 4 and (stop_token > 0.6
|
if t > inputs.shape[1] / 4 and (stop_token > 0.6
|
||||||
or attention[:, -1].item() > 0.6):
|
or attention[:, -1].item() > 0.6):
|
||||||
break
|
break
|
||||||
elif t > self.max_decoder_steps:
|
if t > self.max_decoder_steps:
|
||||||
print(" | > Decoder stopped with 'max_decoder_steps")
|
print(" | > Decoder stopped with 'max_decoder_steps")
|
||||||
break
|
break
|
||||||
return self._parse_outputs(outputs, attentions, stop_tokens)
|
return self._parse_outputs(outputs, attentions, stop_tokens)
|
||||||
|
|
|
@ -1,10 +1,11 @@
|
||||||
import torch
|
import torch
|
||||||
from torch.autograd import Variable
|
|
||||||
from torch import nn
|
from torch import nn
|
||||||
from torch.nn import functional as F
|
from torch.nn import functional as F
|
||||||
from .common_layers import init_attn, Prenet, Linear
|
from .common_layers import init_attn, Prenet, Linear
|
||||||
|
|
||||||
|
# NOTE: linter has a problem with the current TF release
|
||||||
|
#pylint: disable=no-value-for-parameter
|
||||||
|
#pylint: disable=unexpected-keyword-arg
|
||||||
class ConvBNBlock(nn.Module):
|
class ConvBNBlock(nn.Module):
|
||||||
r"""Convolutions with Batch Normalization and non-linear activation.
|
r"""Convolutions with Batch Normalization and non-linear activation.
|
||||||
|
|
||||||
|
@ -156,6 +157,7 @@ class Decoder(nn.Module):
|
||||||
self.separate_stopnet = separate_stopnet
|
self.separate_stopnet = separate_stopnet
|
||||||
self.max_decoder_steps = 1000
|
self.max_decoder_steps = 1000
|
||||||
self.stop_threshold = 0.5
|
self.stop_threshold = 0.5
|
||||||
|
self.speaker_embedding_dim = speaker_embedding_dim
|
||||||
|
|
||||||
# model dimensions
|
# model dimensions
|
||||||
self.query_dim = 1024
|
self.query_dim = 1024
|
||||||
|
@ -211,8 +213,8 @@ class Decoder(nn.Module):
|
||||||
|
|
||||||
def get_go_frame(self, inputs):
|
def get_go_frame(self, inputs):
|
||||||
B = inputs.size(0)
|
B = inputs.size(0)
|
||||||
memory = torch.zeros(1, device=inputs.device).repeat(B,
|
memory = torch.zeros(1, device=inputs.device).repeat(
|
||||||
self.frame_channels * self.r)
|
B, self.frame_channels * self.r)
|
||||||
return memory
|
return memory
|
||||||
|
|
||||||
def _init_states(self, inputs, mask, keep_states=False):
|
def _init_states(self, inputs, mask, keep_states=False):
|
||||||
|
@ -393,7 +395,6 @@ class Decoder(nn.Module):
|
||||||
self.attention.init_win_idx()
|
self.attention.init_win_idx()
|
||||||
self.attention.init_states(inputs)
|
self.attention.init_states(inputs)
|
||||||
outputs, stop_tokens, alignments, t = [], [], [], 0
|
outputs, stop_tokens, alignments, t = [], [], [], 0
|
||||||
stop_flags = [True, False, False]
|
|
||||||
while True:
|
while True:
|
||||||
memory = self.prenet(self.memory_truncated)
|
memory = self.prenet(self.memory_truncated)
|
||||||
decoder_output, alignment, stop_token = self.decode(memory)
|
decoder_output, alignment, stop_token = self.decode(memory)
|
||||||
|
|
|
@ -3,6 +3,9 @@ from tensorflow import keras
|
||||||
from tensorflow.python.ops import math_ops
|
from tensorflow.python.ops import math_ops
|
||||||
# from tensorflow_addons.seq2seq import BahdanauAttention
|
# from tensorflow_addons.seq2seq import BahdanauAttention
|
||||||
|
|
||||||
|
# NOTE: linter has a problem with the current TF release
|
||||||
|
#pylint: disable=no-value-for-parameter
|
||||||
|
#pylint: disable=unexpected-keyword-arg
|
||||||
|
|
||||||
class Linear(keras.layers.Layer):
|
class Linear(keras.layers.Layer):
|
||||||
def __init__(self, units, use_bias, **kwargs):
|
def __init__(self, units, use_bias, **kwargs):
|
||||||
|
|
|
@ -4,7 +4,9 @@ from mozilla_voice_tts.tts.tf.utils.tf_utils import shape_list
|
||||||
from mozilla_voice_tts.tts.tf.layers.common_layers import Prenet, Attention
|
from mozilla_voice_tts.tts.tf.layers.common_layers import Prenet, Attention
|
||||||
# from tensorflow_addons.seq2seq import AttentionWrapper
|
# from tensorflow_addons.seq2seq import AttentionWrapper
|
||||||
|
|
||||||
|
# NOTE: linter has a problem with the current TF release
|
||||||
|
#pylint: disable=no-value-for-parameter
|
||||||
|
#pylint: disable=unexpected-keyword-arg
|
||||||
class ConvBNBlock(keras.layers.Layer):
|
class ConvBNBlock(keras.layers.Layer):
|
||||||
def __init__(self, filters, kernel_size, activation, **kwargs):
|
def __init__(self, filters, kernel_size, activation, **kwargs):
|
||||||
super(ConvBNBlock, self).__init__(**kwargs)
|
super(ConvBNBlock, self).__init__(**kwargs)
|
||||||
|
|
|
@ -5,7 +5,7 @@ from mozilla_voice_tts.tts.tf.layers.tacotron2 import Encoder, Decoder, Postnet
|
||||||
from mozilla_voice_tts.tts.tf.utils.tf_utils import shape_list
|
from mozilla_voice_tts.tts.tf.utils.tf_utils import shape_list
|
||||||
|
|
||||||
|
|
||||||
#pylint: disable=too-many-ancestors
|
#pylint: disable=too-many-ancestors, abstract-method
|
||||||
class Tacotron2(keras.models.Model):
|
class Tacotron2(keras.models.Model):
|
||||||
def __init__(self,
|
def __init__(self,
|
||||||
num_chars,
|
num_chars,
|
||||||
|
@ -105,4 +105,3 @@ class Tacotron2(keras.models.Model):
|
||||||
# TODO: issue https://github.com/PyCQA/pylint/issues/3613
|
# TODO: issue https://github.com/PyCQA/pylint/issues/3613
|
||||||
input_ids = tf.random.uniform(shape=[1, 4], maxval=10, dtype=tf.int32) #pylint: disable=unexpected-keyword-arg
|
input_ids = tf.random.uniform(shape=[1, 4], maxval=10, dtype=tf.int32) #pylint: disable=unexpected-keyword-arg
|
||||||
self(input_ids)
|
self(input_ids)
|
||||||
|
|
||||||
|
|
|
@ -1,6 +1,9 @@
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import tensorflow as tf
|
import tensorflow as tf
|
||||||
|
|
||||||
|
# NOTE: linter has a problem with the current TF release
|
||||||
|
#pylint: disable=no-value-for-parameter
|
||||||
|
#pylint: disable=unexpected-keyword-arg
|
||||||
|
|
||||||
def tf_create_dummy_inputs():
|
def tf_create_dummy_inputs():
|
||||||
""" Create dummy inputs for TF Tacotron2 model """
|
""" Create dummy inputs for TF Tacotron2 model """
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
import os
|
|
||||||
import datetime
|
import datetime
|
||||||
import importlib
|
import importlib
|
||||||
import pickle
|
import pickle
|
||||||
|
|
|
@ -39,4 +39,3 @@ def load_tflite_model(tflite_path):
|
||||||
tflite_model = tf.lite.Interpreter(model_path=tflite_path)
|
tflite_model = tf.lite.Interpreter(model_path=tflite_path)
|
||||||
tflite_model.allocate_tensors()
|
tflite_model.allocate_tensors()
|
||||||
return tflite_model
|
return tflite_model
|
||||||
|
|
||||||
|
|
|
@ -74,4 +74,3 @@ class StandardScaler():
|
||||||
X *= self.scale_
|
X *= self.scale_
|
||||||
X += self.mean_
|
X += self.mean_
|
||||||
return X
|
return X
|
||||||
|
|
||||||
|
|
|
@ -1,15 +1,11 @@
|
||||||
# edited from https://github.com/fastai/imagenet-fast/blob/master/imagenet_nv/distributed.py
|
# edited from https://github.com/fastai/imagenet-fast/blob/master/imagenet_nv/distributed.py
|
||||||
import os, sys
|
|
||||||
import math
|
import math
|
||||||
import time
|
|
||||||
import subprocess
|
|
||||||
import argparse
|
|
||||||
import torch
|
import torch
|
||||||
import torch.distributed as dist
|
import torch.distributed as dist
|
||||||
from torch.utils.data.sampler import Sampler
|
|
||||||
from torch.autograd import Variable
|
|
||||||
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
|
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
|
||||||
from mozilla_voice_tts.utils.generic_utils import create_experiment_folder
|
from torch.autograd import Variable
|
||||||
|
from torch.utils.data.sampler import Sampler
|
||||||
|
|
||||||
|
|
||||||
class DistributedSampler(Sampler):
|
class DistributedSampler(Sampler):
|
||||||
|
@ -108,7 +104,7 @@ def apply_gradient_allreduce(module):
|
||||||
for param in list(module.parameters()):
|
for param in list(module.parameters()):
|
||||||
|
|
||||||
def allreduce_hook(*_):
|
def allreduce_hook(*_):
|
||||||
Variable._execution_engine.queue_callback(allreduce_params)
|
Variable._execution_engine.queue_callback(allreduce_params) #pylint: disable=protected-access
|
||||||
|
|
||||||
if param.requires_grad:
|
if param.requires_grad:
|
||||||
param.register_hook(allreduce_hook)
|
param.register_hook(allreduce_hook)
|
||||||
|
|
|
@ -3,7 +3,7 @@ import torch
|
||||||
import datetime
|
import datetime
|
||||||
|
|
||||||
|
|
||||||
def load_checkpoint(model, checkpoint_path, use_cuda=False):
|
def load_checkpoint(model, checkpoint_path, amp=None, use_cuda=False):
|
||||||
state = torch.load(checkpoint_path, map_location=torch.device('cpu'))
|
state = torch.load(checkpoint_path, map_location=torch.device('cpu'))
|
||||||
model.load_state_dict(state['model'])
|
model.load_state_dict(state['model'])
|
||||||
if amp and 'amp' in state:
|
if amp and 'amp' in state:
|
||||||
|
|
|
@ -1,6 +1,3 @@
|
||||||
import torch
|
|
||||||
|
|
||||||
|
|
||||||
def alignment_diagonal_score(alignments, binary=False):
|
def alignment_diagonal_score(alignments, binary=False):
|
||||||
"""
|
"""
|
||||||
Compute how diagonal alignment predictions are. It is useful
|
Compute how diagonal alignment predictions are. It is useful
|
||||||
|
|
|
@ -1,8 +1,6 @@
|
||||||
import os
|
import os
|
||||||
import json
|
import json
|
||||||
|
|
||||||
from mozilla_voice_tts.tts.datasets.preprocess import get_preprocessor_by_name
|
|
||||||
|
|
||||||
|
|
||||||
def make_speakers_json_path(out_path):
|
def make_speakers_json_path(out_path):
|
||||||
"""Returns conventional speakers.json location."""
|
"""Returns conventional speakers.json location."""
|
||||||
|
|
|
@ -8,6 +8,7 @@ from mozilla_voice_tts.tts.utils.text import cleaners
|
||||||
from mozilla_voice_tts.tts.utils.text.symbols import make_symbols, symbols, phonemes, _phoneme_punctuations, _bos, \
|
from mozilla_voice_tts.tts.utils.text.symbols import make_symbols, symbols, phonemes, _phoneme_punctuations, _bos, \
|
||||||
_eos
|
_eos
|
||||||
|
|
||||||
|
# pylint: disable=unnecessary-comprehension
|
||||||
# Mappings from symbol to numeric ID and vice versa:
|
# Mappings from symbol to numeric ID and vice versa:
|
||||||
_symbol_to_id = {s: i for i, s in enumerate(symbols)}
|
_symbol_to_id = {s: i for i, s in enumerate(symbols)}
|
||||||
_id_to_symbol = {i: s for i, s in enumerate(symbols)}
|
_id_to_symbol = {i: s for i, s in enumerate(symbols)}
|
||||||
|
|
|
@ -41,7 +41,7 @@ def plot_spectrogram(spectrogram, ap=None, fig_size=(16, 10), output_fig=False):
|
||||||
plt.colorbar()
|
plt.colorbar()
|
||||||
plt.tight_layout()
|
plt.tight_layout()
|
||||||
if not output_fig:
|
if not output_fig:
|
||||||
plt.close()
|
plt.close()
|
||||||
return fig
|
return fig
|
||||||
|
|
||||||
|
|
||||||
|
@ -97,4 +97,4 @@ def visualize(alignment, postnet_output, stop_tokens, text, hop_length, CONFIG,
|
||||||
plt.close()
|
plt.close()
|
||||||
|
|
||||||
if not output_fig:
|
if not output_fig:
|
||||||
plt.close()
|
plt.close()
|
||||||
|
|
|
@ -52,7 +52,7 @@ class AudioProcessor(object):
|
||||||
self.mel_fmin = mel_fmin or 0
|
self.mel_fmin = mel_fmin or 0
|
||||||
self.mel_fmax = mel_fmax
|
self.mel_fmax = mel_fmax
|
||||||
self.spec_gain = float(spec_gain)
|
self.spec_gain = float(spec_gain)
|
||||||
self.stft_pad_mode = 'reflect'
|
self.stft_pad_mode = stft_pad_mode
|
||||||
self.max_norm = 1.0 if max_norm is None else float(max_norm)
|
self.max_norm = 1.0 if max_norm is None else float(max_norm)
|
||||||
self.clip_norm = clip_norm
|
self.clip_norm = clip_norm
|
||||||
self.do_trim_silence = do_trim_silence
|
self.do_trim_silence = do_trim_silence
|
||||||
|
@ -123,7 +123,7 @@ class AudioProcessor(object):
|
||||||
if self.symmetric_norm:
|
if self.symmetric_norm:
|
||||||
S_norm = ((2 * self.max_norm) * S_norm) - self.max_norm
|
S_norm = ((2 * self.max_norm) * S_norm) - self.max_norm
|
||||||
if self.clip_norm:
|
if self.clip_norm:
|
||||||
S_norm = np.clip(S_norm, -self.max_norm, self.max_norm)
|
S_norm = np.clip(S_norm, -self.max_norm, self.max_norm) # pylint: disable=invalid-unary-operand-type
|
||||||
return S_norm
|
return S_norm
|
||||||
else:
|
else:
|
||||||
S_norm = self.max_norm * S_norm
|
S_norm = self.max_norm * S_norm
|
||||||
|
@ -148,7 +148,7 @@ class AudioProcessor(object):
|
||||||
raise RuntimeError(' [!] Mean-Var stats does not match the given feature dimensions.')
|
raise RuntimeError(' [!] Mean-Var stats does not match the given feature dimensions.')
|
||||||
if self.symmetric_norm:
|
if self.symmetric_norm:
|
||||||
if self.clip_norm:
|
if self.clip_norm:
|
||||||
S_denorm = np.clip(S_denorm, -self.max_norm, self.max_norm)
|
S_denorm = np.clip(S_denorm, -self.max_norm, self.max_norm) #pylint: disable=invalid-unary-operand-type
|
||||||
S_denorm = ((S_denorm + self.max_norm) * -self.min_level_db / (2 * self.max_norm)) + self.min_level_db
|
S_denorm = ((S_denorm + self.max_norm) * -self.min_level_db / (2 * self.max_norm)) + self.min_level_db
|
||||||
return S_denorm + self.ref_level_db
|
return S_denorm + self.ref_level_db
|
||||||
else:
|
else:
|
||||||
|
|
|
@ -2,7 +2,7 @@
|
||||||
|
|
||||||
import math
|
import math
|
||||||
import torch
|
import torch
|
||||||
from torch.optim.optimizer import Optimizer, required
|
from torch.optim.optimizer import Optimizer
|
||||||
|
|
||||||
|
|
||||||
class RAdam(Optimizer):
|
class RAdam(Optimizer):
|
||||||
|
@ -25,7 +25,7 @@ class RAdam(Optimizer):
|
||||||
defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, buffer=[[None, None, None] for _ in range(10)])
|
defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, buffer=[[None, None, None] for _ in range(10)])
|
||||||
super(RAdam, self).__init__(params, defaults)
|
super(RAdam, self).__init__(params, defaults)
|
||||||
|
|
||||||
def __setstate__(self, state):
|
def __setstate__(self, state): # pylint: disable=useless-super-delegation
|
||||||
super(RAdam, self).__setstate__(state)
|
super(RAdam, self).__setstate__(state)
|
||||||
|
|
||||||
def step(self, closure=None):
|
def step(self, closure=None):
|
||||||
|
|
|
@ -47,7 +47,7 @@ class TensorboardLogger(object):
|
||||||
for key, value in audios.items():
|
for key, value in audios.items():
|
||||||
try:
|
try:
|
||||||
self.writer.add_audio('{}/{}'.format(scope_name, key), value, step, sample_rate=sample_rate)
|
self.writer.add_audio('{}/{}'.format(scope_name, key), value, step, sample_rate=sample_rate)
|
||||||
except:
|
except RuntimeError:
|
||||||
traceback.print_exc()
|
traceback.print_exc()
|
||||||
|
|
||||||
def tb_train_iter_stats(self, step, stats):
|
def tb_train_iter_stats(self, step, stats):
|
||||||
|
|
|
@ -95,4 +95,3 @@ class MelganGenerator(nn.Module):
|
||||||
nn.utils.remove_weight_norm(layer)
|
nn.utils.remove_weight_norm(layer)
|
||||||
except ValueError:
|
except ValueError:
|
||||||
layer.remove_weight_norm()
|
layer.remove_weight_norm()
|
||||||
|
|
||||||
|
|
|
@ -145,6 +145,5 @@ def setup_discriminator(c):
|
||||||
)
|
)
|
||||||
return model
|
return model
|
||||||
|
|
||||||
|
# def check_config(c):
|
||||||
def check_config(c):
|
# pass
|
||||||
pass
|
|
||||||
|
|
|
@ -71,7 +71,7 @@ def process_meta_data(path):
|
||||||
|
|
||||||
|
|
||||||
def get_data_points(meta_data):
|
def get_data_points(meta_data):
|
||||||
x = [char_cnt for char_cnt in meta_data]
|
x = meta_data
|
||||||
y_avg = [meta_data[d]['mean'] for d in meta_data]
|
y_avg = [meta_data[d]['mean'] for d in meta_data]
|
||||||
y_mode = [meta_data[d]['mode'] for d in meta_data]
|
y_mode = [meta_data[d]['mode'] for d in meta_data]
|
||||||
y_median = [meta_data[d]['median'] for d in meta_data]
|
y_median = [meta_data[d]['median'] for d in meta_data]
|
||||||
|
|
2
setup.py
2
setup.py
|
@ -36,7 +36,7 @@ else:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
class build_py(setuptools.command.build_py.build_py):
|
class build_py(setuptools.command.build_py.build_py): # pylint: disable=too-many-ancestors
|
||||||
def run(self):
|
def run(self):
|
||||||
self.create_version_file()
|
self.create_version_file()
|
||||||
setuptools.command.build_py.build_py.run(self)
|
setuptools.command.build_py.build_py.run(self)
|
||||||
|
|
|
@ -1,35 +0,0 @@
|
||||||
import unittest
|
|
||||||
import torch as T
|
|
||||||
|
|
||||||
from mozilla_voice_tts.tts.utils.generic_utils import save_checkpoint, save_best_model
|
|
||||||
from mozilla_voice_tts.tts.layers.tacotron import Prenet
|
|
||||||
|
|
||||||
OUT_PATH = '/tmp/test.pth.tar'
|
|
||||||
|
|
||||||
|
|
||||||
class ModelSavingTests(unittest.TestCase):
|
|
||||||
def save_checkpoint_test(self):
|
|
||||||
# create a dummy model
|
|
||||||
model = Prenet(128, out_features=[256, 128])
|
|
||||||
model = T.nn.DataParallel(layer) #FIXME: undefined variable layer
|
|
||||||
|
|
||||||
# save the model
|
|
||||||
save_checkpoint(model, None, 100, OUT_PATH, 1, 1)
|
|
||||||
|
|
||||||
# load the model to CPU
|
|
||||||
model_dict = T.load(
|
|
||||||
MODEL_PATH, map_location=lambda storage, loc: storage) #FIXME: undefined variable MODEL_PATH
|
|
||||||
model.load_state_dict(model_dict['model'])
|
|
||||||
|
|
||||||
def save_best_model_test(self):
|
|
||||||
# create a dummy model
|
|
||||||
model = Prenet(256, out_features=[256, 256])
|
|
||||||
model = T.nn.DataParallel(layer)
|
|
||||||
|
|
||||||
# save the model
|
|
||||||
save_best_model(model, None, 0, 100, OUT_PATH, 10, 1)
|
|
||||||
|
|
||||||
# load the model to CPU
|
|
||||||
model_dict = T.load(
|
|
||||||
MODEL_PATH, map_location=lambda storage, loc: storage)
|
|
||||||
model.load_state_dict(model_dict['model'])
|
|
|
@ -1,7 +1,8 @@
|
||||||
import os
|
import os
|
||||||
import unittest
|
import unittest
|
||||||
|
|
||||||
from tests import get_tests_path, get_tests_input_path, get_tests_output_path
|
from tests import get_tests_input_path, get_tests_output_path, get_tests_path
|
||||||
|
|
||||||
from mozilla_voice_tts.utils.audio import AudioProcessor
|
from mozilla_voice_tts.utils.audio import AudioProcessor
|
||||||
from mozilla_voice_tts.utils.io import load_config
|
from mozilla_voice_tts.utils.io import load_config
|
||||||
|
|
||||||
|
@ -103,7 +104,7 @@ class TestAudio(unittest.TestCase):
|
||||||
assert (x_old - x).sum() == 0
|
assert (x_old - x).sum() == 0
|
||||||
# check value range
|
# check value range
|
||||||
assert x_norm.max() <= self.ap.max_norm + 1, x_norm.max()
|
assert x_norm.max() <= self.ap.max_norm + 1, x_norm.max()
|
||||||
assert x_norm.min() >= -self.ap.max_norm - 2, x_norm.min()
|
assert x_norm.min() >= -self.ap.max_norm - 2, x_norm.min() #pylint: disable=invalid-unary-operand-type
|
||||||
assert x_norm.min() <= 0, x_norm.min()
|
assert x_norm.min() <= 0, x_norm.min()
|
||||||
# check denorm.
|
# check denorm.
|
||||||
x_ = self.ap._denormalize(x_norm)
|
x_ = self.ap._denormalize(x_norm)
|
||||||
|
@ -120,7 +121,7 @@ class TestAudio(unittest.TestCase):
|
||||||
assert (x_old - x).sum() == 0
|
assert (x_old - x).sum() == 0
|
||||||
# check value range
|
# check value range
|
||||||
assert x_norm.max() <= self.ap.max_norm, x_norm.max()
|
assert x_norm.max() <= self.ap.max_norm, x_norm.max()
|
||||||
assert x_norm.min() >= -self.ap.max_norm, x_norm.min()
|
assert x_norm.min() >= -self.ap.max_norm, x_norm.min() #pylint: disable=invalid-unary-operand-type
|
||||||
assert x_norm.min() <= 0, x_norm.min()
|
assert x_norm.min() <= 0, x_norm.min()
|
||||||
# check denorm.
|
# check denorm.
|
||||||
x_ = self.ap._denormalize(x_norm)
|
x_ = self.ap._denormalize(x_norm)
|
||||||
|
@ -148,7 +149,7 @@ class TestAudio(unittest.TestCase):
|
||||||
|
|
||||||
assert (x_old - x).sum() == 0
|
assert (x_old - x).sum() == 0
|
||||||
assert x_norm.max() <= self.ap.max_norm, x_norm.max()
|
assert x_norm.max() <= self.ap.max_norm, x_norm.max()
|
||||||
assert x_norm.min() >= -self.ap.max_norm, x_norm.min()
|
assert x_norm.min() >= -self.ap.max_norm, x_norm.min() #pylint: disable=invalid-unary-operand-type
|
||||||
assert x_norm.min() < 0, x_norm.min()
|
assert x_norm.min() < 0, x_norm.min()
|
||||||
x_ = self.ap._denormalize(x_norm)
|
x_ = self.ap._denormalize(x_norm)
|
||||||
assert (x - x_).sum() < 1e-3
|
assert (x - x_).sum() < 1e-3
|
||||||
|
|
|
@ -1,13 +1,13 @@
|
||||||
import os
|
import os
|
||||||
import unittest
|
import unittest
|
||||||
|
|
||||||
import torch as T
|
from tests import get_tests_input_path, get_tests_output_path
|
||||||
|
|
||||||
from mozilla_voice_tts.server.synthesizer import Synthesizer
|
from mozilla_voice_tts.server.synthesizer import Synthesizer
|
||||||
from tests import get_tests_input_path, get_tests_output_path
|
|
||||||
from mozilla_voice_tts.tts.utils.text.symbols import make_symbols, phonemes, symbols
|
|
||||||
from mozilla_voice_tts.tts.utils.generic_utils import setup_model
|
from mozilla_voice_tts.tts.utils.generic_utils import setup_model
|
||||||
from mozilla_voice_tts.tts.utils.io import save_checkpoint
|
from mozilla_voice_tts.tts.utils.io import save_checkpoint
|
||||||
|
from mozilla_voice_tts.tts.utils.text.symbols import (make_symbols, phonemes,
|
||||||
|
symbols)
|
||||||
from mozilla_voice_tts.utils.io import load_config
|
from mozilla_voice_tts.utils.io import load_config
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,13 +1,13 @@
|
||||||
import os
|
import os
|
||||||
import unittest
|
import unittest
|
||||||
|
|
||||||
import torch as T
|
import torch as T
|
||||||
|
from tests import get_tests_input_path
|
||||||
|
|
||||||
from tests import get_tests_path, get_tests_input_path
|
|
||||||
from mozilla_voice_tts.speaker_encoder.model import SpeakerEncoder
|
|
||||||
from mozilla_voice_tts.speaker_encoder.loss import GE2ELoss
|
from mozilla_voice_tts.speaker_encoder.loss import GE2ELoss
|
||||||
|
from mozilla_voice_tts.speaker_encoder.model import SpeakerEncoder
|
||||||
from mozilla_voice_tts.utils.io import load_config
|
from mozilla_voice_tts.utils.io import load_config
|
||||||
|
|
||||||
|
|
||||||
file_path = get_tests_input_path()
|
file_path = get_tests_input_path()
|
||||||
c = load_config(os.path.join(file_path, "test_config.json"))
|
c = load_config(os.path.join(file_path, "test_config.json"))
|
||||||
|
|
||||||
|
|
|
@ -9,7 +9,7 @@ from mozilla_voice_tts.tts.utils.generic_utils import sequence_mask
|
||||||
|
|
||||||
|
|
||||||
class PrenetTests(unittest.TestCase):
|
class PrenetTests(unittest.TestCase):
|
||||||
def test_in_out(self):
|
def test_in_out(self): #pylint: disable=no-self-use
|
||||||
layer = Prenet(128, out_features=[256, 128])
|
layer = Prenet(128, out_features=[256, 128])
|
||||||
dummy_input = T.rand(4, 128)
|
dummy_input = T.rand(4, 128)
|
||||||
|
|
||||||
|
@ -104,7 +104,7 @@ class DecoderTests(unittest.TestCase):
|
||||||
|
|
||||||
|
|
||||||
class EncoderTests(unittest.TestCase):
|
class EncoderTests(unittest.TestCase):
|
||||||
def test_in_out(self):
|
def test_in_out(self): #pylint: disable=no-self-use
|
||||||
layer = Encoder(128)
|
layer = Encoder(128)
|
||||||
dummy_input = T.rand(4, 8, 128)
|
dummy_input = T.rand(4, 8, 128)
|
||||||
|
|
||||||
|
@ -117,7 +117,7 @@ class EncoderTests(unittest.TestCase):
|
||||||
|
|
||||||
|
|
||||||
class L1LossMaskedTests(unittest.TestCase):
|
class L1LossMaskedTests(unittest.TestCase):
|
||||||
def test_in_out(self):
|
def test_in_out(self): #pylint: disable=no-self-use
|
||||||
# test input == target
|
# test input == target
|
||||||
layer = L1LossMasked(seq_len_norm=False)
|
layer = L1LossMasked(seq_len_norm=False)
|
||||||
dummy_input = T.ones(4, 8, 128).float()
|
dummy_input = T.ones(4, 8, 128).float()
|
||||||
|
|
|
@ -1,15 +1,16 @@
|
||||||
import os
|
import os
|
||||||
import unittest
|
|
||||||
import shutil
|
import shutil
|
||||||
import torch
|
import unittest
|
||||||
import numpy as np
|
|
||||||
|
|
||||||
from tests import get_tests_path, get_tests_input_path, get_tests_output_path
|
import numpy as np
|
||||||
|
import torch
|
||||||
|
from tests import get_tests_input_path, get_tests_output_path
|
||||||
from torch.utils.data import DataLoader
|
from torch.utils.data import DataLoader
|
||||||
from mozilla_voice_tts.utils.io import load_config
|
|
||||||
from mozilla_voice_tts.utils.audio import AudioProcessor
|
|
||||||
from mozilla_voice_tts.tts.datasets import TTSDataset
|
from mozilla_voice_tts.tts.datasets import TTSDataset
|
||||||
from mozilla_voice_tts.tts.datasets.preprocess import ljspeech
|
from mozilla_voice_tts.tts.datasets.preprocess import ljspeech
|
||||||
|
from mozilla_voice_tts.utils.audio import AudioProcessor
|
||||||
|
from mozilla_voice_tts.utils.io import load_config
|
||||||
|
|
||||||
#pylint: disable=unused-variable
|
#pylint: disable=unused-variable
|
||||||
|
|
||||||
|
@ -32,7 +33,7 @@ class TestTTSDataset(unittest.TestCase):
|
||||||
self.ap = AudioProcessor(**c.audio)
|
self.ap = AudioProcessor(**c.audio)
|
||||||
|
|
||||||
def _create_dataloader(self, batch_size, r, bgs):
|
def _create_dataloader(self, batch_size, r, bgs):
|
||||||
items = ljspeech(c.data_path,'metadata.csv')
|
items = ljspeech(c.data_path, 'metadata.csv')
|
||||||
dataset = TTSDataset.MyDataset(
|
dataset = TTSDataset.MyDataset(
|
||||||
r,
|
r,
|
||||||
c.text_cleaner,
|
c.text_cleaner,
|
||||||
|
@ -74,7 +75,7 @@ class TestTTSDataset(unittest.TestCase):
|
||||||
assert check_count == 0, \
|
assert check_count == 0, \
|
||||||
" !! Negative values in text_input: {}".format(check_count)
|
" !! Negative values in text_input: {}".format(check_count)
|
||||||
# TODO: more assertion here
|
# TODO: more assertion here
|
||||||
assert type(speaker_name[0]) is str
|
assert isinstance(speaker_name[0], str)
|
||||||
assert linear_input.shape[0] == c.batch_size
|
assert linear_input.shape[0] == c.batch_size
|
||||||
assert linear_input.shape[2] == self.ap.fft_size // 2 + 1
|
assert linear_input.shape[2] == self.ap.fft_size // 2 + 1
|
||||||
assert mel_input.shape[0] == c.batch_size
|
assert mel_input.shape[0] == c.batch_size
|
||||||
|
@ -82,7 +83,7 @@ class TestTTSDataset(unittest.TestCase):
|
||||||
# check normalization ranges
|
# check normalization ranges
|
||||||
if self.ap.symmetric_norm:
|
if self.ap.symmetric_norm:
|
||||||
assert mel_input.max() <= self.ap.max_norm
|
assert mel_input.max() <= self.ap.max_norm
|
||||||
assert mel_input.min() >= -self.ap.max_norm
|
assert mel_input.min() >= -self.ap.max_norm #pylint: disable=invalid-unary-operand-type
|
||||||
assert mel_input.min() < 0
|
assert mel_input.min() < 0
|
||||||
else:
|
else:
|
||||||
assert mel_input.max() <= self.ap.max_norm
|
assert mel_input.max() <= self.ap.max_norm
|
||||||
|
|
|
@ -7,7 +7,7 @@ from mozilla_voice_tts.tts.datasets.preprocess import common_voice
|
||||||
|
|
||||||
class TestPreprocessors(unittest.TestCase):
|
class TestPreprocessors(unittest.TestCase):
|
||||||
|
|
||||||
def test_common_voice_preprocessor(self):
|
def test_common_voice_preprocessor(self): #pylint: disable=no-self-use
|
||||||
root_path = get_tests_input_path()
|
root_path = get_tests_input_path()
|
||||||
meta_file = "common_voice.tsv"
|
meta_file = "common_voice.tsv"
|
||||||
items = common_voice(root_path, meta_file)
|
items = common_voice(root_path, meta_file)
|
||||||
|
|
|
@ -20,8 +20,8 @@ c = load_config(os.path.join(get_tests_input_path(), 'test_config.json'))
|
||||||
|
|
||||||
|
|
||||||
class TacotronTrainTest(unittest.TestCase):
|
class TacotronTrainTest(unittest.TestCase):
|
||||||
def test_train_step(self):
|
def test_train_step(self): # pylint: disable=no-self-use
|
||||||
input = torch.randint(0, 24, (8, 128)).long().to(device)
|
input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
|
||||||
input_lengths = torch.randint(100, 128, (8, )).long().to(device)
|
input_lengths = torch.randint(100, 128, (8, )).long().to(device)
|
||||||
input_lengths = torch.sort(input_lengths, descending=True)[0]
|
input_lengths = torch.sort(input_lengths, descending=True)[0]
|
||||||
mel_spec = torch.rand(8, 30, c.audio['num_mels']).to(device)
|
mel_spec = torch.rand(8, 30, c.audio['num_mels']).to(device)
|
||||||
|
@ -34,7 +34,7 @@ class TacotronTrainTest(unittest.TestCase):
|
||||||
for idx in mel_lengths:
|
for idx in mel_lengths:
|
||||||
stop_targets[:, int(idx.item()):, 0] = 1.0
|
stop_targets[:, int(idx.item()):, 0] = 1.0
|
||||||
|
|
||||||
stop_targets = stop_targets.view(input.shape[0],
|
stop_targets = stop_targets.view(input_dummy.shape[0],
|
||||||
stop_targets.size(1) // c.r, -1)
|
stop_targets.size(1) // c.r, -1)
|
||||||
stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()
|
stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()
|
||||||
|
|
||||||
|
@ -51,7 +51,7 @@ class TacotronTrainTest(unittest.TestCase):
|
||||||
optimizer = optim.Adam(model.parameters(), lr=c.lr)
|
optimizer = optim.Adam(model.parameters(), lr=c.lr)
|
||||||
for i in range(5):
|
for i in range(5):
|
||||||
mel_out, mel_postnet_out, align, stop_tokens = model.forward(
|
mel_out, mel_postnet_out, align, stop_tokens = model.forward(
|
||||||
input, input_lengths, mel_spec, mel_lengths, speaker_ids)
|
input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids)
|
||||||
assert torch.sigmoid(stop_tokens).data.max() <= 1.0
|
assert torch.sigmoid(stop_tokens).data.max() <= 1.0
|
||||||
assert torch.sigmoid(stop_tokens).data.min() >= 0.0
|
assert torch.sigmoid(stop_tokens).data.min() >= 0.0
|
||||||
optimizer.zero_grad()
|
optimizer.zero_grad()
|
||||||
|
|
|
@ -1,15 +1,19 @@
|
||||||
import os
|
import os
|
||||||
import torch
|
|
||||||
import unittest
|
import unittest
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import tensorflow as tf
|
import tensorflow as tf
|
||||||
|
import torch
|
||||||
|
from tests import get_tests_input_path
|
||||||
|
|
||||||
|
from mozilla_voice_tts.tts.tf.models.tacotron2 import Tacotron2
|
||||||
|
from mozilla_voice_tts.tts.tf.utils.tflite import (convert_tacotron2_to_tflite,
|
||||||
|
load_tflite_model)
|
||||||
|
from mozilla_voice_tts.utils.io import load_config
|
||||||
|
|
||||||
tf.get_logger().setLevel('INFO')
|
tf.get_logger().setLevel('INFO')
|
||||||
|
|
||||||
from tests import get_tests_path, get_tests_input_path, get_tests_output_path
|
|
||||||
|
|
||||||
from mozilla_voice_tts.utils.io import load_config
|
|
||||||
from mozilla_voice_tts.tts.tf.models.tacotron2 import Tacotron2
|
|
||||||
from mozilla_voice_tts.tts.tf.utils.tflite import convert_tacotron2_to_tflite, load_tflite_model
|
|
||||||
|
|
||||||
#pylint: disable=unused-variable
|
#pylint: disable=unused-variable
|
||||||
|
|
||||||
|
@ -132,4 +136,3 @@ class TacotronTFTrainTest(unittest.TestCase):
|
||||||
postnet_output = tflite_model.get_tensor(output_details[1]['index'])
|
postnet_output = tflite_model.get_tensor(output_details[1]['index'])
|
||||||
# remove tflite binary
|
# remove tflite binary
|
||||||
os.remove('test_tacotron2.tflite')
|
os.remove('test_tacotron2.tflite')
|
||||||
|
|
||||||
|
|
|
@ -16,7 +16,7 @@ def test_phoneme_to_sequence():
|
||||||
lang = "en-us"
|
lang = "en-us"
|
||||||
sequence = phoneme_to_sequence(text, text_cleaner, lang)
|
sequence = phoneme_to_sequence(text, text_cleaner, lang)
|
||||||
text_hat = sequence_to_phoneme(sequence)
|
text_hat = sequence_to_phoneme(sequence)
|
||||||
sequence_with_params = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters)
|
_ = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters)
|
||||||
text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters)
|
text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters)
|
||||||
gt = "ɹiːsənt ɹɪsɜːtʃ æt hɑːɹvɚd hɐz ʃoʊn mɛdᵻteɪɾɪŋ fɔːɹ æz lɪɾəl æz eɪt wiːks kæn æktʃuːəli ɪnkɹiːs, ðə ɡɹeɪ mæɾɚɹ ɪnðə pɑːɹts ʌvðə bɹeɪn ɹɪspɑːnsəbəl fɔːɹ ɪmoʊʃənəl ɹɛɡjuːleɪʃən ænd lɜːnɪŋ!"
|
gt = "ɹiːsənt ɹɪsɜːtʃ æt hɑːɹvɚd hɐz ʃoʊn mɛdᵻteɪɾɪŋ fɔːɹ æz lɪɾəl æz eɪt wiːks kæn æktʃuːəli ɪnkɹiːs, ðə ɡɹeɪ mæɾɚɹ ɪnðə pɑːɹts ʌvðə bɹeɪn ɹɪspɑːnsəbəl fɔːɹ ɪmoʊʃənəl ɹɛɡjuːleɪʃən ænd lɜːnɪŋ!"
|
||||||
assert text_hat == text_hat_with_params == gt
|
assert text_hat == text_hat_with_params == gt
|
||||||
|
@ -25,7 +25,7 @@ def test_phoneme_to_sequence():
|
||||||
text = "Be a voice, not an! echo?"
|
text = "Be a voice, not an! echo?"
|
||||||
sequence = phoneme_to_sequence(text, text_cleaner, lang)
|
sequence = phoneme_to_sequence(text, text_cleaner, lang)
|
||||||
text_hat = sequence_to_phoneme(sequence)
|
text_hat = sequence_to_phoneme(sequence)
|
||||||
sequence_with_params = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters)
|
_ = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters)
|
||||||
text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters)
|
text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters)
|
||||||
gt = "biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ?"
|
gt = "biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ?"
|
||||||
print(text_hat)
|
print(text_hat)
|
||||||
|
@ -36,7 +36,7 @@ def test_phoneme_to_sequence():
|
||||||
text = "Be a voice, not an! echo"
|
text = "Be a voice, not an! echo"
|
||||||
sequence = phoneme_to_sequence(text, text_cleaner, lang)
|
sequence = phoneme_to_sequence(text, text_cleaner, lang)
|
||||||
text_hat = sequence_to_phoneme(sequence)
|
text_hat = sequence_to_phoneme(sequence)
|
||||||
sequence_with_params = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters)
|
_ = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters)
|
||||||
text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters)
|
text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters)
|
||||||
gt = "biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ"
|
gt = "biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ"
|
||||||
print(text_hat)
|
print(text_hat)
|
||||||
|
@ -47,7 +47,7 @@ def test_phoneme_to_sequence():
|
||||||
text = "Be a voice, not an echo!"
|
text = "Be a voice, not an echo!"
|
||||||
sequence = phoneme_to_sequence(text, text_cleaner, lang)
|
sequence = phoneme_to_sequence(text, text_cleaner, lang)
|
||||||
text_hat = sequence_to_phoneme(sequence)
|
text_hat = sequence_to_phoneme(sequence)
|
||||||
sequence_with_params = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters)
|
_ = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters)
|
||||||
text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters)
|
text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters)
|
||||||
gt = "biː ɐ vɔɪs, nɑːt ɐn ɛkoʊ!"
|
gt = "biː ɐ vɔɪs, nɑːt ɐn ɛkoʊ!"
|
||||||
print(text_hat)
|
print(text_hat)
|
||||||
|
@ -58,7 +58,7 @@ def test_phoneme_to_sequence():
|
||||||
text = "Be a voice, not an! echo. "
|
text = "Be a voice, not an! echo. "
|
||||||
sequence = phoneme_to_sequence(text, text_cleaner, lang)
|
sequence = phoneme_to_sequence(text, text_cleaner, lang)
|
||||||
text_hat = sequence_to_phoneme(sequence)
|
text_hat = sequence_to_phoneme(sequence)
|
||||||
sequence_with_params = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters)
|
_ = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters)
|
||||||
text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters)
|
text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters)
|
||||||
gt = "biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ."
|
gt = "biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ."
|
||||||
print(text_hat)
|
print(text_hat)
|
||||||
|
@ -69,7 +69,7 @@ def test_phoneme_to_sequence():
|
||||||
text = "Be a voice, not an! echo. "
|
text = "Be a voice, not an! echo. "
|
||||||
sequence = phoneme_to_sequence(text, text_cleaner, lang, True)
|
sequence = phoneme_to_sequence(text, text_cleaner, lang, True)
|
||||||
text_hat = sequence_to_phoneme(sequence)
|
text_hat = sequence_to_phoneme(sequence)
|
||||||
sequence_with_params = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters)
|
_ = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters)
|
||||||
text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters)
|
text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters)
|
||||||
gt = "^biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ.~"
|
gt = "^biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ.~"
|
||||||
print(text_hat)
|
print(text_hat)
|
||||||
|
@ -80,7 +80,7 @@ def test_phoneme_to_sequence():
|
||||||
text = "_Be a _voice, not an! echo_"
|
text = "_Be a _voice, not an! echo_"
|
||||||
sequence = phoneme_to_sequence(text, text_cleaner, lang)
|
sequence = phoneme_to_sequence(text, text_cleaner, lang)
|
||||||
text_hat = sequence_to_phoneme(sequence)
|
text_hat = sequence_to_phoneme(sequence)
|
||||||
sequence_with_params = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters)
|
_ = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters)
|
||||||
text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters)
|
text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters)
|
||||||
gt = "biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ"
|
gt = "biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ"
|
||||||
print(text_hat)
|
print(text_hat)
|
||||||
|
|
|
@ -11,4 +11,3 @@ def test_melgan_generator():
|
||||||
assert np.all(output.shape == (4, 1, 64 * 256))
|
assert np.all(output.shape == (4, 1, 64 * 256))
|
||||||
output = model.inference(dummy_input)
|
output = model.inference(dummy_input)
|
||||||
assert np.all(output.shape == (4, 1, (64 + 4) * 256))
|
assert np.all(output.shape == (4, 1, (64 + 4) * 256))
|
||||||
|
|
||||||
|
|
|
@ -25,4 +25,3 @@ def test_pqmf():
|
||||||
print(w2_.min())
|
print(w2_.min())
|
||||||
print(w2_.mean())
|
print(w2_.mean())
|
||||||
sf.write('pqmf_output.wav', w2_.flatten().detach(), sr)
|
sf.write('pqmf_output.wav', w2_.flatten().detach(), sr)
|
||||||
|
|
||||||
|
|
|
@ -26,4 +26,3 @@ def test_pqmf():
|
||||||
print(w2_.min())
|
print(w2_.min())
|
||||||
print(w2_.mean())
|
print(w2_.mean())
|
||||||
sf.write('tf_pqmf_output.wav', w2_.flatten(), sr)
|
sf.write('tf_pqmf_output.wav', w2_.flatten(), sr)
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue