mirror of https://github.com/coqui-ai/TTS.git

linter updates

This commit is contained in:
parent 496ff68dec
commit f75b0a6439
@@ -9,7 +9,6 @@ import torch.distributed as dist
 from torch.utils.data.sampler import Sampler
-from torch.autograd import Variable
 from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
 from TTS.utils.io import load_config
 from TTS.utils.generic_utils import create_experiment_folder

@@ -11,9 +11,9 @@ class ConvBNBlock(nn.Module):
         assert (kernel_size - 1) % 2 == 0
         padding = (kernel_size - 1) // 2
         self.convolution1d = nn.Conv1d(in_channels,
-                                        out_channels,
-                                        kernel_size,
-                                        padding=padding)
+                                       out_channels,
+                                       kernel_size,
+                                       padding=padding)
         self.batch_normalization = nn.BatchNorm1d(out_channels, momentum=0.1, eps=1e-5)
         self.dropout = nn.Dropout(p=0.5)
         if activation == 'relu':

@@ -171,12 +171,12 @@ class Synthesizer(object):
             speaker_id = id_to_torch(speaker_id)
             if speaker_id is not None and self.use_cuda:
                 speaker_id = speaker_id.cuda()

         for sen in sens:
             # preprocess the given text
             inputs = text_to_seqvec(sen, self.tts_config, self.use_cuda)
             # synthesize voice
-            decoder_output, postnet_output, alignments, _ = run_model(
+            decoder_output, postnet_output, alignments, _ = run_model_torch(
                 self.tts_model, inputs, self.tts_config, False, speaker_id, None)
             # convert outputs to numpy
             postnet_output, decoder_output, _ = parse_outputs(

@@ -25,7 +25,7 @@ def tts(model,
         figures=False):
     t_1 = time.time()
     use_vocoder_model = vocoder_model is not None
-    waveform, alignment, _, postnet_output, stop_tokens = synthesis(
+    waveform, alignment, _, postnet_output, stop_tokens, _ = synthesis(
         model, text, C, use_cuda, ap, speaker_id, style_wav=False,
         truncated=False, enable_eos_bos_chars=C.enable_eos_bos_chars,
         use_griffin_lim=(not use_vocoder_model), do_trim_silence=True)

@@ -1,14 +1,10 @@
 import os
-import copy
 import torch
 import unittest
 import numpy as np
 import tensorflow as tf

-from torch import optim
-from torch import nn
 from TTS.utils.io import load_config
-from TTS.layers.losses import MSELossMasked
 from TTS.tf.models.tacotron2 import Tacotron2

 #pylint: disable=unused-variable

@@ -22,36 +18,44 @@ c = load_config(os.path.join(file_path, 'test_config.json'))


 class TacotronTFTrainTest(unittest.TestCase):
-    def test_train_step(self):
-        ''' test forward pass '''
-        input = torch.randint(0, 24, (8, 128)).long().to(device)
-        input_lengths = torch.randint(100, 128, (8, )).long().to(device)
-        input_lengths = torch.sort(input_lengths, descending=True)[0]
+
+    @staticmethod
+    def generate_dummy_inputs():
+        chars_seq = torch.randint(0, 24, (8, 128)).long().to(device)
+        chars_seq_lengths = torch.randint(100, 128, (8, )).long().to(device)
+        chars_seq_lengths = torch.sort(chars_seq_lengths, descending=True)[0]
         mel_spec = torch.rand(8, 30, c.audio['num_mels']).to(device)
         mel_postnet_spec = torch.rand(8, 30, c.audio['num_mels']).to(device)
         mel_lengths = torch.randint(20, 30, (8, )).long().to(device)
         stop_targets = torch.zeros(8, 30, 1).float().to(device)
         speaker_ids = torch.randint(0, 5, (8, )).long().to(device)

-        input = tf.convert_to_tensor(input.cpu().numpy())
-        input_lengths = tf.convert_to_tensor(input_lengths.cpu().numpy())
+        chars_seq = tf.convert_to_tensor(chars_seq.cpu().numpy())
+        chars_seq_lengths = tf.convert_to_tensor(chars_seq_lengths.cpu().numpy())
         mel_spec = tf.convert_to_tensor(mel_spec.cpu().numpy())
+        return chars_seq, chars_seq_lengths, mel_spec, mel_postnet_spec, mel_lengths,\
+            stop_targets, speaker_ids
+
+    def test_train_step(self):
+        ''' test forward pass '''
+        chars_seq, chars_seq_lengths, mel_spec, mel_postnet_spec, mel_lengths,\
+            stop_targets, speaker_ids = self.generate_dummy_inputs()

         for idx in mel_lengths:
             stop_targets[:, int(idx.item()):, 0] = 1.0

-        stop_targets = stop_targets.view(input.shape[0],
+        stop_targets = stop_targets.view(chars_seq.shape[0],
                                          stop_targets.size(1) // c.r, -1)
         stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()

         model = Tacotron2(num_chars=24, r=c.r, num_speakers=5)
         # training pass
-        output = model(input, input_lengths, mel_spec, training=True)
+        output = model(chars_seq, chars_seq_lengths, mel_spec, training=True)

         # check model output shapes
         assert np.all(output[0].shape == mel_spec.shape)
         assert np.all(output[1].shape == mel_spec.shape)
-        assert output[2].shape[2] == input.shape[1]
+        assert output[2].shape[2] == chars_seq.shape[1]
         assert output[2].shape[1] == (mel_spec.shape[1] // model.decoder.r)
         assert output[3].shape[1] == (mel_spec.shape[1] // model.decoder.r)

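A note on the hunk above: beyond renaming `input` (a shadowed builtin) to `chars_seq`, the dummy-batch construction moves into a `generate_dummy_inputs` @staticmethod so any future test method can reuse one canonical batch. A minimal, self-contained sketch of that pattern (class name and shapes here are illustrative, not from the repo):

import unittest
import numpy as np

class ModelTrainTest(unittest.TestCase):

    @staticmethod
    def generate_dummy_inputs(batch_size=8, seq_len=128):
        # one canonical fake batch that every test method shares
        chars = np.random.randint(0, 24, (batch_size, seq_len))
        lengths = np.sort(np.random.randint(100, seq_len, (batch_size,)))[::-1]
        return chars, lengths

    def test_shapes(self):
        chars, lengths = self.generate_dummy_inputs()
        self.assertEqual(chars.shape, (8, 128))
        self.assertTrue((np.diff(lengths) <= 0).all())  # sorted descending

if __name__ == '__main__':
    unittest.main()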
@@ -10,27 +10,23 @@ import torch
 import tensorflow as tf
 from fuzzywuzzy import fuzz

-from TTS.utils.text.symbols import make_symbols, phonemes, symbols
-from TTS.utils.generic_utils import setup_model, count_parameters
+from TTS.utils.text.symbols import phonemes, symbols
+from TTS.utils.generic_utils import setup_model
 from TTS.utils.io import load_config
 from TTS_tf.models.tacotron2 import Tacotron2
 from TTS_tf.utils.convert_torch_to_tf_utils import compare_torch_tf, tf_create_dummy_inputs, transfer_weights_torch_to_tf, convert_tf_name
 from TTS_tf.utils.generic_utils import save_checkpoint


 parser = argparse.ArgumentParser()
-parser.add_argument(
-    '--torch_model_path',
-    type=str,
-    help='Path to target torch model to be converted to TF.')
-parser.add_argument(
-    '--config_path',
-    type=str,
-    help='Path to config file of torch model.')
-parser.add_argument(
-    '--output_path',
-    type=str,
-    help='path to save TF model weights.')
+parser.add_argument('--torch_model_path',
+                    type=str,
+                    help='Path to target torch model to be converted to TF.')
+parser.add_argument('--config_path',
+                    type=str,
+                    help='Path to config file of torch model.')
+parser.add_argument('--output_path',
+                    type=str,
+                    help='path to save TF model weights.')
 args = parser.parse_args()

 # load model config

@@ -41,7 +37,8 @@ num_speakers = 0
 # init torch model
 num_chars = len(phonemes) if c.use_phonemes else len(symbols)
 model = setup_model(num_chars, num_speakers, c)
-checkpoint = torch.load(args.torch_model_path, map_location=torch.device('cpu'))
+checkpoint = torch.load(args.torch_model_path,
+                        map_location=torch.device('cpu'))
 state_dict = checkpoint['model']
 model.load_state_dict(state_dict)

@@ -69,18 +66,24 @@ model_tf = Tacotron2(num_chars=num_chars,
 common_sufix = '/.ATTRIBUTES/VARIABLE_VALUE'
 var_map = [
     ('tacotron2/embedding/embeddings:0', 'embedding.weight'),
-    ('tacotron2/encoder/lstm/forward_lstm/lstm_cell_1/kernel:0', 'encoder.lstm.weight_ih_l0'),
-    ('tacotron2/encoder/lstm/forward_lstm/lstm_cell_1/recurrent_kernel:0', 'encoder.lstm.weight_hh_l0'),
-    ('tacotron2/encoder/lstm/backward_lstm/lstm_cell_2/kernel:0', 'encoder.lstm.weight_ih_l0_reverse'),
-    ('tacotron2/encoder/lstm/backward_lstm/lstm_cell_2/recurrent_kernel:0', 'encoder.lstm.weight_hh_l0_reverse'),
-    ('tacotron2/encoder/lstm/forward_lstm/lstm_cell_1/bias:0', ('encoder.lstm.bias_ih_l0', 'encoder.lstm.bias_hh_l0')),
-    ('tacotron2/encoder/lstm/backward_lstm/lstm_cell_2/bias:0', ('encoder.lstm.bias_ih_l0_reverse', 'encoder.lstm.bias_hh_l0_reverse')),
+    ('tacotron2/encoder/lstm/forward_lstm/lstm_cell_1/kernel:0',
+     'encoder.lstm.weight_ih_l0'),
+    ('tacotron2/encoder/lstm/forward_lstm/lstm_cell_1/recurrent_kernel:0',
+     'encoder.lstm.weight_hh_l0'),
+    ('tacotron2/encoder/lstm/backward_lstm/lstm_cell_2/kernel:0',
+     'encoder.lstm.weight_ih_l0_reverse'),
+    ('tacotron2/encoder/lstm/backward_lstm/lstm_cell_2/recurrent_kernel:0',
+     'encoder.lstm.weight_hh_l0_reverse'),
+    ('tacotron2/encoder/lstm/forward_lstm/lstm_cell_1/bias:0',
+     ('encoder.lstm.bias_ih_l0', 'encoder.lstm.bias_hh_l0')),
+    ('tacotron2/encoder/lstm/backward_lstm/lstm_cell_2/bias:0',
+     ('encoder.lstm.bias_ih_l0_reverse', 'encoder.lstm.bias_hh_l0_reverse')),
     ('attention/v/kernel:0', 'decoder.attention.v.linear_layer.weight'),
-    ('decoder/linear_projection/kernel:0', 'decoder.linear_projection.linear_layer.weight'),
+    ('decoder/linear_projection/kernel:0',
+     'decoder.linear_projection.linear_layer.weight'),
     ('decoder/stopnet/kernel:0', 'decoder.stopnet.1.linear_layer.weight')
 ]


 # %%
 # get tf_model graph
 input_ids, input_lengths, mel_outputs, mel_lengths = tf_create_dummy_inputs()

@@ -95,15 +98,17 @@ tf_var_names = [we.name for we in model_tf.weights]
 for tf_name in tf_var_names:
     # skip re-mapped layer names
     if tf_name in [name[0] for name in var_map]:
-      continue
+        continue
     tf_name_edited = convert_tf_name(tf_name)
-    ratios = [fuzz.ratio(torch_name, tf_name_edited) for torch_name in torch_var_names]
+    ratios = [
+        fuzz.ratio(torch_name, tf_name_edited)
+        for torch_name in torch_var_names
+    ]
     max_idx = np.argmax(ratios)
     matching_name = torch_var_names[max_idx]
     del torch_var_names[max_idx]
     var_map.append((tf_name, matching_name))


 # %%
 # print variable match
 from pprint import pprint

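For reference, the loop above greedily pairs each still-unmapped TF variable with the PyTorch parameter whose (normalized) name is the closest fuzzy match, popping each winner so no torch name is claimed twice. A standalone sketch of the idea (the two name lists are illustrative, not from the repo; fuzz.ratio returns a 0-100 similarity score):

import numpy as np
from fuzzywuzzy import fuzz

torch_names = ['encoder.lstm.weight_ih_l0',
               'decoder.linear_projection.linear_layer.weight']
tf_names = ['encoder/lstm/kernel:0', 'decoder/linear_projection/kernel:0']

var_map = []
for tf_name in tf_names:
    # score this TF name against every still-unclaimed torch name
    ratios = [fuzz.ratio(torch_name, tf_name) for torch_name in torch_names]
    best = int(np.argmax(ratios))
    var_map.append((tf_name, torch_names.pop(best)))  # claim the best match
print(var_map)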
@@ -121,20 +126,25 @@ input_ids = torch.randint(0, 24, (1, 128)).long()

 o_t = model.embedding(input_ids)
 o_tf = model_tf.embedding(input_ids.detach().numpy())
-assert abs(o_t.detach().numpy() - o_tf.numpy()).sum() < 1e-5, abs(o_t.detach().numpy() - o_tf.numpy()).sum()
+assert abs(o_t.detach().numpy() -
+           o_tf.numpy()).sum() < 1e-5, abs(o_t.detach().numpy() -
+                                           o_tf.numpy()).sum()

 # compare encoder outputs
-oo_en = model.encoder.inference(o_t.transpose(1,2))
+oo_en = model.encoder.inference(o_t.transpose(1, 2))
 ooo_en = model_tf.encoder(o_t.detach().numpy(), training=False)
 assert compare_torch_tf(oo_en, ooo_en) < 1e-5

+#pylint: disable=redefined-builtin
 # compare decoder.attention_rnn
 inp = torch.rand([1, 768])
 inp_tf = inp.numpy()
-model.decoder._init_states(oo_en, mask=None)
+model.decoder._init_states(oo_en, mask=None) #pylint: disable=protected-access
 output, cell_state = model.decoder.attention_rnn(inp)
-states = model_tf.decoder.build_decoder_initial_states(1,512,128)
-output_tf, memory_state = model_tf.decoder.attention_rnn(inp_tf, states[2], training=False)
+states = model_tf.decoder.build_decoder_initial_states(1, 512, 128)
+output_tf, memory_state = model_tf.decoder.attention_rnn(inp_tf,
+                                                         states[2],
+                                                         training=False)
 assert compare_torch_tf(output, output_tf).mean() < 1e-5

 # compare decoder.attention

@@ -145,7 +155,8 @@ inputs_tf = inputs.numpy()

 model.decoder.attention.init_states(inputs)
 processes_inputs = model.decoder.attention.preprocess_inputs(inputs)
-loc_attn, proc_query = model.decoder.attention.get_location_attention(query, processes_inputs)
+loc_attn, proc_query = model.decoder.attention.get_location_attention(
+    query, processes_inputs)
 context = model.decoder.attention(query, inputs, processes_inputs, None)

 model_tf.decoder.attention.process_values(tf.convert_to_tensor(inputs_tf))

@@ -159,10 +170,13 @@ assert compare_torch_tf(context, context_tf) < 1e-5
 # compare decoder.decoder_rnn
 input = torch.rand([1, 1536])
 input_tf = input.numpy()
-model.decoder._init_states(oo_en, mask=None)
-output, cell_state = model.decoder.decoder_rnn(input, [model.decoder.decoder_hidden, model.decoder.decoder_cell])
-states = model_tf.decoder.build_decoder_initial_states(1,512,128)
-output_tf, memory_state = model_tf.decoder.decoder_rnn(input_tf, states[3], training=False)
+model.decoder._init_states(oo_en, mask=None) #pylint: disable=protected-access
+output, cell_state = model.decoder.decoder_rnn(
+    input, [model.decoder.decoder_hidden, model.decoder.decoder_cell])
+states = model_tf.decoder.build_decoder_initial_states(1, 512, 128)
+output_tf, memory_state = model_tf.decoder.decoder_rnn(input_tf,
+                                                       states[3],
+                                                       training=False)
 assert abs(input - input_tf).mean() < 1e-5
 assert compare_torch_tf(output, output_tf).mean() < 1e-5

@@ -177,15 +191,16 @@ assert compare_torch_tf(output, output_tf) < 1e-5
 model.decoder.max_decoder_steps = 100
 model_tf.decoder.set_max_decoder_steps(100)
 output, align, stop = model.decoder.inference(oo_en)
-states = model_tf.decoder.build_decoder_initial_states(1,512,128)
+states = model_tf.decoder.build_decoder_initial_states(1, 512, 128)
 output_tf, align_tf, stop_tf = model_tf.decoder(ooo_en, states, training=False)
-assert compare_torch_tf(output.transpose(1,2), output_tf) < 1e-4
+assert compare_torch_tf(output.transpose(1, 2), output_tf) < 1e-4

 # compare the whole model output
 outputs_torch = model.inference(input_ids)
 outputs_tf = model_tf(tf.convert_to_tensor(input_ids.numpy()))
-print(abs(outputs_torch[0].numpy()[:, 0] - outputs_tf[0].numpy()[:, 0]).mean() )
-assert compare_torch_tf(outputs_torch[2][:, 50, :], outputs_tf[2][:, 50, :]) < 1e-5
+print(abs(outputs_torch[0].numpy()[:, 0] - outputs_tf[0].numpy()[:, 0]).mean())
+assert compare_torch_tf(outputs_torch[2][:, 50, :],
+                        outputs_tf[2][:, 50, :]) < 1e-5
 assert compare_torch_tf(outputs_torch[0], outputs_tf[0]) < 1e-4

 # %%

@@ -193,4 +208,3 @@ assert compare_torch_tf(outputs_torch[0], outputs_tf[0]) < 1e-4
 save_checkpoint(model_tf, None, checkpoint['step'], checkpoint['epoch'],
                 checkpoint['r'], args.output_path)
 print(' > Model conversion is successfully completed :).')
-

@@ -3,8 +3,6 @@ from tensorflow import keras
 from tensorflow.python.ops import math_ops
-# from tensorflow_addons.seq2seq import BahdanauAttention
-
 from TTS.tf.utils.tf_utils import shape_list


 class Linear(keras.layers.Layer):
     def __init__(self, units, use_bias, **kwargs):

@@ -12,7 +10,7 @@ class Linear(keras.layers.Layer):
         self.linear_layer = keras.layers.Dense(units, use_bias=use_bias, name='linear_layer')
         self.activation = keras.layers.ReLU()

-    def call(self, x, training=None):
+    def call(self, x):
         """
         shapes:
             x: B x T x C

@@ -77,9 +75,9 @@ def _sigmoid_norm(score):


 class Attention(keras.layers.Layer):
-    """TODO: implement forward_attention"""
-    """TODO: location sensitive attention"""
-    """TODO: implement attention windowing """
+    """TODO: implement forward_attention
+    TODO: location sensitive attention
+    TODO: implement attention windowing """
     def __init__(self, attn_dim, use_loc_attn, loc_attn_n_filters,
                  loc_attn_kernel_size, use_windowing, norm, use_forward_attn,
                  use_trans_agent, use_forward_attn_mask, **kwargs):

@@ -120,6 +118,7 @@ class Attention(keras.layers.Layer):

     def process_values(self, values):
         """ cache values for decoder iterations """
+        #pylint: disable=attribute-defined-outside-init
         self.processed_values = self.inputs_layer(values)
         self.values = values

@@ -127,8 +126,7 @@ class Attention(keras.layers.Layer):
         """ compute location attention, query layer and
        unnorm. attention weights"""
         attention_cum, attention_old = states
-        attn_cat = tf.stack([attention_old, attention_cum],
-                            axis=2)
+        attn_cat = tf.stack([attention_old, attention_cum], axis=2)

         processed_query = self.query_layer(tf.expand_dims(query, 1))
         processed_attn = self.location_dense(self.location_conv1d(attn_cat))

@@ -145,7 +143,7 @@ class Attention(keras.layers.Layer):
         score = tf.squeeze(score, axis=2)
         return score, processed_query

-    def apply_score_masking(self, score, mask):
+    def apply_score_masking(self, score, mask): #pylint: disable=no-self-use
         """ ignore sequence paddings """
         padding_mask = tf.expand_dims(math_ops.logical_not(mask), 2)
         # Bias so padding positions do not contribute to attention distribution.

@@ -158,13 +156,13 @@ class Attention(keras.layers.Layer):
             query: B x D
         """
         if self.use_loc_attn:
-            score, processed_query = self.get_loc_attn(query, states)
+            score, _ = self.get_loc_attn(query, states)
         else:
-            score, processed_query = self.get_attn(query)
+            score, _ = self.get_attn(query)

         # TODO: masking
         # if mask is not None:
-        #    self.apply_score_masking(score, mask)
+        #     self.apply_score_masking(score, mask)
         # attn_weights shape == (batch_size, max_length, 1)

         attn_weights = self.norm_func(score)

@@ -55,6 +55,7 @@ class Encoder(keras.layers.Layer):


 class Decoder(keras.layers.Layer):
+    #pylint: disable=unused-argument
     def __init__(self, frame_dim, r, attn_type, use_attn_win, attn_norm, prenet_type,
                  prenet_dropout, use_forward_attn, use_trans_agent, use_forward_attn_mask,
                  use_location_attn, attn_K, separate_stopnet, speaker_emb_dim, **kwargs):

@@ -135,7 +136,7 @@ class Decoder(keras.layers.Layer):
         return output_frame, stopnet_output, states, attention

     def decode(self, memory, states, frames, memory_seq_length=None):
-        B, T, D = shape_list(memory)
+        B, _, _ = shape_list(memory)
         num_iter = shape_list(frames)[1] // self.r
         # init states
         frame_zero = tf.expand_dims(states[0], 1)

@@ -159,25 +160,25 @@ class Decoder(keras.layers.Layer):
             return step + 1, memory, prenet_output, states, outputs, stop_tokens, attentions
         _, memory, _, states, outputs, stop_tokens, attentions = \
             tf.while_loop(lambda *arg: True,
-                _body,
-                loop_vars=(step_count, memory, prenet_output, states, outputs,
-                           stop_tokens, attentions),
-                parallel_iterations=32,
-                swap_memory=True,
-                maximum_iterations=num_iter)
+                          _body,
+                          loop_vars=(step_count, memory, prenet_output,
+                                     states, outputs, stop_tokens, attentions),
+                          parallel_iterations=32,
+                          swap_memory=True,
+                          maximum_iterations=num_iter)

         outputs = outputs.stack()
         attentions = attentions.stack()
         stop_tokens = stop_tokens.stack()
         outputs = tf.transpose(outputs, [1, 0, 2])
-        attentions = tf.transpose(attentions, [1, 0 ,2])
+        attentions = tf.transpose(attentions, [1, 0, 2])
         stop_tokens = tf.transpose(stop_tokens, [1, 0, 2])
         stop_tokens = tf.squeeze(stop_tokens, axis=2)
         outputs = tf.reshape(outputs, [B, -1, self.frame_dim])
         return outputs, stop_tokens, attentions

     def decode_inference(self, memory, states):
-        B, T, D = shape_list(memory)
+        B, _, _ = shape_list(memory)
         # init states
         outputs = tf.TensorArray(dtype=tf.float32, size=0, clear_after_read=False, dynamic_size=True)
         attentions = tf.TensorArray(dtype=tf.float32, size=0, clear_after_read=False, dynamic_size=True)

@@ -207,12 +208,12 @@ class Decoder(keras.layers.Layer):
         cond = lambda step, m, s, o, st, a, stop_flag: tf.equal(stop_flag, tf.constant(False, dtype=tf.bool))
         _, memory, states, outputs, stop_tokens, attentions, stop_flag = \
             tf.while_loop(cond,
-                _body,
-                loop_vars=(step_count, memory, states, outputs,
-                           stop_tokens, attentions, stop_flag),
-                parallel_iterations=32,
-                swap_memory=True,
-                maximum_iterations=self.max_decoder_steps)
+                          _body,
+                          loop_vars=(step_count, memory, states, outputs,
+                                     stop_tokens, attentions, stop_flag),
+                          parallel_iterations=32,
+                          swap_memory=True,
+                          maximum_iterations=self.max_decoder_steps)

         outputs = outputs.stack()
         attentions = attentions.stack()

@@ -1,10 +1,10 @@
 import tensorflow as tf
 from tensorflow import keras

 from TTS.tf.layers.tacotron2 import Encoder, Decoder, Postnet
 from TTS.tf.utils.tf_utils import shape_list

-
+#pylint: disable=too-many-ancestors
 class Tacotron2(keras.models.Model):
     def __init__(self,
                  num_chars,

@@ -35,16 +35,28 @@ class Tacotron2(keras.models.Model):
         self.embedding = keras.layers.Embedding(num_chars, 512, name='embedding')
         self.encoder = Encoder(512, name='encoder')
         # TODO: most of the decoder args have no use at the momment
-        self.decoder = Decoder(decoder_output_dim, r, attn_type=attn_type, use_attn_win=attn_win, attn_norm=attn_norm, prenet_type=prenet_type,
-                               prenet_dropout=prenet_dropout, use_forward_attn=forward_attn, use_trans_agent=trans_agent, use_forward_attn_mask=forward_attn_mask,
-                               use_location_attn=location_attn, attn_K=attn_K, separate_stopnet=separate_stopnet, speaker_emb_dim=self.speaker_embed_dim)
+        self.decoder = Decoder(decoder_output_dim,
+                               r,
+                               attn_type=attn_type,
+                               use_attn_win=attn_win,
+                               attn_norm=attn_norm,
+                               prenet_type=prenet_type,
+                               prenet_dropout=prenet_dropout,
+                               use_forward_attn=forward_attn,
+                               use_trans_agent=trans_agent,
+                               use_forward_attn_mask=forward_attn_mask,
+                               use_location_attn=location_attn,
+                               attn_K=attn_K,
+                               separate_stopnet=separate_stopnet,
+                               speaker_emb_dim=self.speaker_embed_dim)
         self.postnet = Postnet(postnet_output_dim, 5, name='postnet')

     def call(self, characters, text_lengths=None, frames=None, training=None):
-        if training == True:
+        if training:
             return self.training(characters, text_lengths, frames)
-        else:
+        if not training:
             return self.inference(characters)
+        raise RuntimeError(' [!] Set model training mode True or False')

     def training(self, characters, text_lengths, frames):
         B, T = shape_list(characters)

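The call() change above is pylint's singleton-comparison fix: `training == True` is replaced by plain truthiness, with an explicit RuntimeError as a guard. A small self-contained demo of the resulting control flow (note that None is falsy, so an unset flag still routes to inference):

def route(training=None):
    if training:
        return 'training'
    if not training:     # False and None both satisfy this branch
        return 'inference'
    raise RuntimeError(' [!] Set model training mode True or False')

assert route(True) == 'training'
assert route(False) == 'inference'
assert route() == 'inference'   # None falls through to inference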
@@ -67,6 +79,3 @@ class Tacotron2(keras.models.Model):
         print(output_frames.shape)
         return decoder_frames, output_frames, attentions, stop_tokens

-
-
-

@@ -1,8 +1,5 @@
 import numpy as np
 import torch
-import re
 import tensorflow as tf
-import tensorflow.keras.backend as K
-

 def tf_create_dummy_inputs():

@@ -17,7 +14,7 @@ def tf_create_dummy_inputs():
     input_lengths[-1] = max_input_length
     input_lengths = tf.convert_to_tensor(input_lengths, dtype=tf.int32)
     mel_outputs = tf.random.uniform(shape=[batch_size, max_mel_length + pad, 80])
-    mel_lengths = np.random.randint(0, high=max_mel_length+1 + pad, size=[batch_size])
+    mel_lengths = np.random.randint(0, high=max_mel_length + 1 + pad, size=[batch_size])
     mel_lengths[-1] = max_mel_length
     mel_lengths = tf.convert_to_tensor(mel_lengths, dtype=tf.int32)
     return input_ids, input_lengths, mel_outputs, mel_lengths

@@ -49,7 +46,7 @@ def transfer_weights_torch_to_tf(tf_vars, var_map_dict, state_dict):
         torch_var_name = var_map_dict[tf_var.name]
         print(f' | > {tf_var.name} <-- {torch_var_name}')
         # if tuple, it is a bias variable
-        if type(torch_var_name) is not tuple:
+        if not isinstance(torch_var_name, tuple):
             torch_layer_name = '.'.join(torch_var_name.split('.')[-2:])
             torch_weight = state_dict[torch_var_name]
         if 'convolution1d/kernel' in tf_var.name or 'conv1d/kernel' in tf_var.name:

@@ -1,14 +1,8 @@
 import os
-import re
-import glob
-import shutil
 import datetime
-import json
-import subprocess
 import importlib
 import pickle
 import numpy as np
-from collections import OrderedDict, Counter
 import tensorflow as tf

@@ -29,7 +23,7 @@ def save_checkpoint(model, optimizer, current_step, epoch, r, output_folder, **k

 def load_checkpoint(model, checkpoint_path):
     checkpoint = pickle.load(open(checkpoint_path, 'rb'))
-    chkp_var_dict = dict([(var.name, var.numpy()) for var in checkpoint['model']])
+    chkp_var_dict = {var.name: var.numpy() for var in checkpoint['model']}
     tf_vars = model.weights
     for tf_var in tf_vars:
         layer_name = tf_var.name

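The load_checkpoint change above is pylint's consider-using-dict-comprehension fix: dict([...]) builds an intermediate list of tuples that the comprehension avoids. A tiny self-contained demo with a stand-in variable class (FakeVar is illustrative, not from the repo):

class FakeVar:
    def __init__(self, name, value):
        self.name = name
        self._value = value
    def numpy(self):
        return self._value

checkpoint_model = [FakeVar('dense/kernel:0', 1.5), FakeVar('dense/bias:0', 0.1)]

old_style = dict([(var.name, var.numpy()) for var in checkpoint_model])
new_style = {var.name: var.numpy() for var in checkpoint_model}
assert old_style == new_style == {'dense/kernel:0': 1.5, 'dense/bias:0': 0.1}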
@@ -64,7 +58,7 @@ def check_gradient(x, grad_clip):
 def count_parameters(model, c):
     try:
         return model.count_params()
-    except:
+    except RuntimeError:
         input_dummy = tf.convert_to_tensor(np.random.rand(8, 128).astype('int32'))
         input_lengths = np.random.randint(100, 129, (8, ))
         input_lengths[-1] = 128

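Narrowing the bare `except:` above to `except RuntimeError:` matters because a bare clause also swallows KeyboardInterrupt, SystemExit and genuine bugs. A self-contained sketch of the fallback pattern (LazyModel is a stand-in for a model that cannot count parameters before it is built, which is what the diff implies here):

class LazyModel:
    def __init__(self):
        self.built = False
    def build(self):
        self.built = True
    def count_params(self):
        if not self.built:
            raise RuntimeError('model is not built yet')
        return 42

model = LazyModel()
try:
    n_params = model.count_params()
except RuntimeError:        # only the expected failure triggers the fallback
    model.build()           # e.g. run a dummy forward pass, then retry
    n_params = model.count_params()
assert n_params == 42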
@@ -74,7 +68,7 @@ def count_parameters(model, c):
         mel_spec = tf.convert_to_tensor(mel_spec)
         speaker_ids = np.random.randint(
             0, 5, (8, )) if c.use_speaker_embedding else None
-        _ = model(input_dummy, input_lengths, mel_spec)
+        _ = model(input_dummy, input_lengths, mel_spec, speaker_ids=speaker_ids)
         return model.count_params()

@@ -83,23 +77,23 @@ def setup_model(num_chars, num_speakers, c):
     MyModel = importlib.import_module('TTS.tf.models.' + c.model.lower())
     MyModel = getattr(MyModel, c.model)
     if c.model.lower() in "tacotron":
-        raise NotImplemented(' [!] Tacotron model is not ready.')
-    elif c.model.lower() == "tacotron2":
-        model = MyModel(num_chars=num_chars,
-                        num_speakers=num_speakers,
-                        r=c.r,
-                        postnet_output_dim=c.audio['num_mels'],
-                        decoder_output_dim=c.audio['num_mels'],
-                        attn_type=c.attention_type,
-                        attn_win=c.windowing,
-                        attn_norm=c.attention_norm,
-                        prenet_type=c.prenet_type,
-                        prenet_dropout=c.prenet_dropout,
-                        forward_attn=c.use_forward_attn,
-                        trans_agent=c.transition_agent,
-                        forward_attn_mask=c.forward_attn_mask,
-                        location_attn=c.location_attn,
-                        attn_K=c.attention_heads,
-                        separate_stopnet=c.separate_stopnet,
-                        bidirectional_decoder=c.bidirectional_decoder)
+        raise NotImplementedError(' [!] Tacotron model is not ready.')
+    # tacotron2
+    model = MyModel(num_chars=num_chars,
+                    num_speakers=num_speakers,
+                    r=c.r,
+                    postnet_output_dim=c.audio['num_mels'],
+                    decoder_output_dim=c.audio['num_mels'],
+                    attn_type=c.attention_type,
+                    attn_win=c.windowing,
+                    attn_norm=c.attention_norm,
+                    prenet_type=c.prenet_type,
+                    prenet_dropout=c.prenet_dropout,
+                    forward_attn=c.use_forward_attn,
+                    trans_agent=c.transition_agent,
+                    forward_attn_mask=c.forward_attn_mask,
+                    location_attn=c.location_attn,
+                    attn_K=c.attention_heads,
+                    separate_stopnet=c.separate_stopnet,
+                    bidirectional_decoder=c.bidirectional_decoder)
     return model

train.py
@@ -190,7 +190,7 @@ def train(model, criterion, optimizer, optimizer_st, scheduler,
         # backward pass
         loss_dict['loss'].backward()
         optimizer, current_lr = adam_weight_decay(optimizer)
-        grad_norm, grad_flag = check_update(model, c.grad_clip, ignore_stopnet=True)
+        grad_norm, _ = check_update(model, c.grad_clip, ignore_stopnet=True)
         optimizer.step()

         # compute alignment error (the lower the better )

@@ -232,8 +232,7 @@ def train(model, criterion, optimizer, optimizer_st, scheduler,
             loss_dict['postnet_loss'] = reduce_tensor(loss_dict['postnet_loss'].data, num_gpus)
             loss_dict['decoder_loss'] = reduce_tensor(loss_dict['decoder_loss'].data, num_gpus)
             loss_dict['loss'] = reduce_tensor(loss_dict['loss'] .data, num_gpus)
-            loss_dict['stopnet_loss'] = reduce_tensor(loss_dict['stopnet_loss'].data,
-                                                      num_gpus) if c.stopnet else loss_dict['stopnet_loss']
+            loss_dict['stopnet_loss'] = reduce_tensor(loss_dict['stopnet_loss'].data, num_gpus) if c.stopnet else loss_dict['stopnet_loss']

         if args.rank == 0:
             # Plot Training Iter Stats

@@ -308,8 +307,6 @@ def train(model, criterion, optimizer, optimizer_st, scheduler,
 @torch.no_grad()
 def evaluate(model, criterion, ap, global_step, epoch):
     data_loader = setup_loader(ap, model.decoder.r, is_val=True)
-    if c.use_speaker_embedding:
-        speaker_mapping = load_speaker_mapping(OUT_PATH)
     model.eval()
     epoch_time = 0
     eval_values_dict = {

@@ -6,6 +6,7 @@ import datetime
 import subprocess
 import importlib
 import numpy as np
+from collections import Counter


 def get_git_branch():

@@ -40,10 +41,10 @@ def get_commit_hash():
 def create_experiment_folder(root_path, model_name, debug):
     """ Create a folder with the current date and time """
     date_str = datetime.datetime.now().strftime("%B-%d-%Y_%I+%M%p")
-    # if debug:
-    #     commit_hash = 'debug'
-    # else:
-    commit_hash = get_commit_hash()
+    if debug:
+        commit_hash = 'debug'
+    else:
+        commit_hash = get_commit_hash()
     output_folder = os.path.join(
         root_path, model_name + '-' + date_str + '-' + commit_hash)
     os.makedirs(output_folder, exist_ok=True)

@@ -87,8 +88,7 @@ def split_dataset(items):
             items_eval.append(items[item_idx])
             del items[item_idx]
         return items_eval, items
-    else:
-        return items[:eval_split_size], items[eval_split_size:]
+    return items[:eval_split_size], items[eval_split_size:]


 # from https://gist.github.com/jihunchoi/f1434a77df9db1bb337417854b398df1

utils/io.py
@@ -26,7 +26,7 @@ def copy_config_file(config_file, out_path, new_fields):
     config_lines = open(config_file, "r").readlines()
     # add extra information fields
     for key, value in new_fields.items():
-        if type(value) == str:
+        if isinstance(value, str):
            new_line = '"{}":"{}",\n'.format(key, value)
         else:
             new_line = '"{}":{},\n'.format(key, value)

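The isinstance fix above (pylint's unidiomatic-typecheck) is also a behavioural improvement for str subclasses, which the exact-type check silently rejects. A short demo:

class Token(str):
    """a str subclass, e.g. a tagged config value"""

value = Token('hello')
assert type(value) != str          # exact-type check fails for the subclass
assert isinstance(value, str)      # isinstance respects inheritance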
@@ -37,7 +37,7 @@ def copy_config_file(config_file, out_path, new_fields):


 def load_checkpoint(model, checkpoint_path, use_cuda=False):
-    state = torch.load(checkpoint_path, map_location=torch.device('cpu'))
+    state = torch.load(checkpoint_path, map_location=torch.device('cpu'))
     model.load_state_dict(state['model'])
     if use_cuda:
         model.cuda()

@@ -55,7 +55,7 @@ def save_model(model, optimizer, current_step, epoch, r, output_path, **kwargs):
         'step': current_step,
         'epoch': epoch,
         'date': datetime.date.today().strftime("%B %d, %Y"),
-        'r': model.decoder.r
+        'r': r
     }
     state.update(kwargs)
     torch.save(state, output_path)

@@ -65,7 +65,7 @@ def save_checkpoint(model, optimizer, current_step, epoch, r, output_folder, **k
     file_name = 'checkpoint_{}.pth.tar'.format(current_step)
     checkpoint_path = os.path.join(output_folder, file_name)
     print(" > CHECKPOINT : {}".format(checkpoint_path))
-    save_model(model, optimizer, current_step, epoch ,r, checkpoint_path, **kwargs)
+    save_model(model, optimizer, current_step, epoch, r, checkpoint_path, **kwargs)


 def save_best_model(target_loss, best_loss, model, optimizer, current_step, epoch, r, output_folder, **kwargs):

@@ -73,6 +73,6 @@ def save_best_model(target_loss, best_loss, model, optimizer, current_step, epoc
         file_name = 'best_model.pth.tar'
         checkpoint_path = os.path.join(output_folder, file_name)
         print(" > BEST MODEL : {}".format(checkpoint_path))
-        save_model(model, optimizer, current_step, epoch ,r, checkpoint_path, model_loss=target_loss)
+        save_model(model, optimizer, current_step, epoch, r, checkpoint_path, model_loss=target_loss, **kwargs)
         best_loss = target_loss
-    return best_loss
+    return best_loss

@@ -8,9 +8,9 @@ from torch.optim.optimizer import Optimizer, required
 class RAdam(Optimizer):

     def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, degenerated_to_sgd=True):
-        if not 0.0 <= lr:
+        if lr < 0.0:
             raise ValueError("Invalid learning rate: {}".format(lr))
-        if not 0.0 <= eps:
+        if eps < 0.0:
             raise ValueError("Invalid epsilon value: {}".format(eps))
         if not 0.0 <= betas[0] < 1.0:
             raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))

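The guard rewrites above (`if not 0.0 <= lr:` to `if lr < 0.0:`) drop the double negation pylint flags. For ordinary floats the two forms agree; NaN is the one input where they differ, as this quick check shows:

for lr in (-1e-3, 0.0, 1e-3):
    assert (not 0.0 <= lr) == (lr < 0.0)

nan = float('nan')
assert (not 0.0 <= nan) and not (nan < 0.0)   # the NaN corner case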
@@ -94,4 +94,4 @@ class RAdam(Optimizer):
                     p_data_fp32.add_(exp_avg, alpha=-step_size * group['lr'])
                     p.data.copy_(p_data_fp32)

-        return loss
+        return loss

@@ -1,5 +1,5 @@
 import pkg_resources
-installed = {pkg.key for pkg in pkg_resources.working_set}
+installed = {pkg.key for pkg in pkg_resources.working_set} #pylint: disable=not-an-iterable
 if 'tensorflow' in installed or 'tensorflow-gpu' in installed:
     import tensorflow as tf
 import torch

@@ -7,7 +7,7 @@ import numpy as np
 from .text import text_to_sequence, phoneme_to_sequence


-def text_to_seqvec(text, CONFIG, use_cuda):
+def text_to_seqvec(text, CONFIG):
     text_cleaner = [CONFIG.text_cleaner]
     # text ot phonemes to sequence vector
     if CONFIG.use_phonemes:

@@ -37,7 +37,7 @@ def numpy_to_tf(np_array, dtype):
     return tensor


-def compute_style_mel(style_wav, ap, use_cuda):
+def compute_style_mel(style_wav, ap):
     style_mel = ap.melspectrogram(
         ap.load_wav(style_wav)).expand_dims(0)
     return style_mel

@@ -58,13 +58,13 @@ def run_model_torch(model, inputs, CONFIG, truncated, speaker_id=None, style_mel


 def run_model_tf(model, inputs, CONFIG, truncated, speaker_id=None, style_mel=None):
-    if CONFIG.use_gst:
-        raise NotImplemented(' [!] GST inference not implemented for TF')
+    if CONFIG.use_gst and style_mel is not None:
+        raise NotImplementedError(' [!] GST inference not implemented for TF')
     if truncated:
-        raise NotImplemented(' [!] Truncated inference not implemented for TF')
+        raise NotImplementedError(' [!] Truncated inference not implemented for TF')
     # TODO: handle multispeaker case
     decoder_output, postnet_output, alignments, stop_tokens = model(
-        inputs)
+        inputs, speaker_ids=speaker_id)
     return decoder_output, postnet_output, alignments, stop_tokens

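The NotImplemented to NotImplementedError fix above is substantive, not cosmetic: NotImplemented is a constant used by rich-comparison methods, not an exception class, so calling it raises a TypeError instead of the intended error. A quick demonstration:

try:
    raise NotImplemented(' [!] this never works')   # the constant is not callable
except TypeError as exc:
    print(type(exc).__name__)    # -> TypeError, not the message above

try:
    raise NotImplementedError(' [!] Truncated inference not implemented for TF')
except NotImplementedError as exc:
    print(exc)                   # the real exception carries the message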
@@ -153,9 +153,9 @@ def synthesis(model,
     # GST processing
     style_mel = None
     if CONFIG.model == "TacotronGST" and style_wav is not None:
-        style_mel = compute_style_mel(style_wav, ap, use_cuda)
+        style_mel = compute_style_mel(style_wav, ap)
     # preprocess the given text
-    inputs = text_to_seqvec(text, CONFIG, use_cuda)
+    inputs = text_to_seqvec(text, CONFIG)
     # pass tensors to backend
     if backend == 'torch':
         speaker_id = id_to_torch(speaker_id)

@@ -9,7 +9,7 @@ def check_update(model, grad_clip, ignore_stopnet=False):
         grad_norm = torch.nn.utils.clip_grad_norm_([param for name, param in model.named_parameters() if 'stopnet' not in name], grad_clip)
     else:
         grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
-    if torch.isinf(grad_norm):
+    if np.isinf(grad_norm):
         print(" | > Gradient is INF !!")
         skip_flag = True
     return grad_norm, skip_flag

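A note on the torch.isinf to np.isinf swap above: this assumes clip_grad_norm_ returned a plain Python float in the torch versions this commit targets; np.isinf accepts floats (and arrays, and anything array-like) while torch.isinf requires a Tensor. A minimal illustration:

import numpy as np

grad_norm = float('inf')        # a plain float, as assumed here
assert np.isinf(grad_norm)      # np.isinf is happy with a Python float
# torch.isinf(grad_norm) would raise TypeError: it expects a torch.Tensor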
@@ -62,6 +62,7 @@ def set_weight_decay(model, weight_decay, skip_list={"decoder.attention.v", "rnn
     }]


+# pylint: disable=protected-access
 class NoamLR(torch.optim.lr_scheduler._LRScheduler):
     def __init__(self, optimizer, warmup_steps=0.1, last_epoch=-1):
         self.warmup_steps = float(warmup_steps)

@@ -87,4 +88,4 @@ def gradual_training_scheduler(global_step, config):
     for values in config.gradual_training:
         if global_step * num_gpus >= values[0]:
             new_values = values
-    return new_values[1], new_values[2]
+    return new_values[1], new_values[2]