mirror of https://github.com/coqui-ai/TTS.git
linter and test updates for speaker_encoder, gmm_Attention
parent 1401a0db6b
commit df1b8b3ec7

@@ -136,6 +136,8 @@ class GravesAttention(nn.Module):
        self.attention_weights = torch.zeros(inputs.shape[0], inputs.shape[1]).to(inputs.device)
        self.mu_prev = torch.zeros(inputs.shape[0], self.K).to(inputs.device)

    # pylint: disable=R0201
    # pylint: disable=unused-argument
    def preprocess_inputs(self, inputs):
        return None

@@ -376,8 +378,7 @@ def init_attn(attn_type, query_dim, embedding_dim, attention_dim,
                                 attention_location_kernel_size, windowing,
                                 norm, forward_attn, trans_agent,
                                 forward_attn_mask)
    elif attn_type == "graves":
    if attn_type == "graves":
        return GravesAttention(query_dim, attn_K)
    else:
        raise RuntimeError(
            " [!] Given Attention Type '{attn_type}' is not exist.")
    raise RuntimeError(
        " [!] Given Attention Type '{attn_type}' is not exist.")

@@ -27,6 +27,7 @@ class Tacotron2(nn.Module):
                 separate_stopnet=True,
                 bidirectional_decoder=False):
        super(Tacotron2, self).__init__()
        self.postnet_output_dim = postnet_output_dim
        self.decoder_output_dim = decoder_output_dim
        self.n_frames_per_step = r
        self.bidirectional_decoder = bidirectional_decoder

@@ -50,7 +51,7 @@ class Tacotron2(nn.Module):
                               location_attn, attn_K, separate_stopnet, proj_speaker_dim)
        if self.bidirectional_decoder:
            self.decoder_backward = copy.deepcopy(self.decoder)
        self.postnet = Postnet(self.decoder_output_dim)
        self.postnet = Postnet(self.postnet_output_dim)

    def _init_states(self):
        self.speaker_embeddings = None

@@ -6,51 +6,41 @@ import numpy as np
from tqdm import tqdm

import torch
from torch.utils.data import DataLoader
from TTS.datasets.preprocess import get_preprocessor_by_name
from TTS.speaker_encoder.dataset import MyDataset
from TTS.speaker_encoder.model import SpeakerEncoder
from TTS.speaker_encoder.visual import plot_embeddings
from TTS.utils.audio import AudioProcessor
from TTS.utils.generic_utils import load_config

parser = argparse.ArgumentParser(
    description='Compute embedding vectors for each wav file in a dataset. ')
parser.add_argument(
    'model_path',
    type=str,
    help='Path to model outputs (checkpoint, tensorboard etc.).')
parser.add_argument(
    'config_path',
    type=str,
    help='Path to config file for training.',
    description="Compute embedding vectors for each wav file in a dataset. "
)
parser.add_argument(
    'data_path',
    type=str,
    help='Defines the data path. It overwrites config.json.')
parser.add_argument(
    'output_path',
    type=str,
    help='path for training outputs.')
parser.add_argument(
    '--use_cuda', type=bool, help='flag to set cuda.', default=False
    "model_path", type=str, help="Path to model outputs (checkpoint, tensorboard etc.)."
)
parser.add_argument(
    "config_path", type=str, help="Path to config file for training.",
)
parser.add_argument(
    "data_path", type=str, help="Defines the data path. It overwrites config.json."
)
parser.add_argument("output_path", type=str, help="path for training outputs.")
parser.add_argument("--use_cuda", type=bool, help="flag to set cuda.", default=False)
args = parser.parse_args()


c = load_config(args.config_path)
ap = AudioProcessor(**c['audio'])
ap = AudioProcessor(**c["audio"])

wav_files = glob.glob(args.data_path + '/**/*.wav', recursive=True)
output_files = [wav_file.replace(args.data_path, args.output_path).replace(
    '.wav', '.npy') for wav_file in wav_files]
wav_files = glob.glob(args.data_path + "/**/*.wav", recursive=True)
output_files = [
    wav_file.replace(args.data_path, args.output_path).replace(".wav", ".npy")
    for wav_file in wav_files
]

for output_file in output_files:
    os.makedirs(os.path.dirname(output_file), exist_ok=True)

model = SpeakerEncoder(**c.model)
model.load_state_dict(torch.load(args.model_path)['model'])
model.load_state_dict(torch.load(args.model_path)["model"])
model.eval()
if args.use_cuda:
    model.cuda()

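The hunk above only covers the argument parsing and model setup of the embedding script; the per-file loop is not part of this commit's diff. As a rough sketch of what the remainder presumably does (the use of ap.load_wav, the mel transposition, and the compute_embedding call are assumptions pieced together from other files in this diff, not the script's verified body):

import numpy as np
import torch

# Hypothetical sketch of the per-file embedding loop (not shown in the hunk).
for wav_file, output_file in tqdm(list(zip(wav_files, output_files))):
    wav = ap.load_wav(wav_file)                     # assumed AudioProcessor helper
    mel = ap.melspectrogram(wav)                    # (num_mels, T), as in dataset.py
    frames = torch.FloatTensor(mel.T).unsqueeze(0)  # B x T x D, as in the unit tests
    if args.use_cuda:
        frames = frames.cuda()
    embed = model.compute_embedding(frames, num_frames=160, overlap=0.5)
    np.save(output_file, embed.detach().cpu().numpy())
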
@@ -1,34 +1,24 @@
import os
import numpy as np
import collections
import torch
import random
from torch.utils.data import Dataset

from TTS.utils.text import text_to_sequence, phoneme_to_sequence, pad_with_eos_bos
from TTS.utils.data import prepare_data, prepare_tensor, prepare_stop_target


class MyDataset(Dataset):
    def __init__(self,
                 ap,
                 meta_data,
                 voice_len=1.6,
                 num_speakers_in_batch=64,
                 num_utter_per_speaker=10,
                 skip_speakers=False,
                 verbose=False):
    def __init__(self, ap, meta_data, voice_len=1.6, num_speakers_in_batch=64,
                 num_utter_per_speaker=10, skip_speakers=False, verbose=False):
        """
        Args:
            ap (TTS.utils.AudioProcessor): audio processor object.
            meta_data (list): list of dataset instances.
            seq_len (int): voice segment length in seconds.
            seq_len (int): voice segment length in seconds.
            verbose (bool): print diagnostic information.
        """
        self.items = meta_data
        self.sample_rate = ap.sample_rate
        self.voice_len = voice_len
        self.seq_len = int(voice_len * self.sample_rate)
        self.num_speakers_in_batch = num_speakers_in_batch
        self.num_utter_per_speaker = num_utter_per_speaker
        self.skip_speakers = skip_speakers
        self.ap = ap

@@ -47,16 +37,16 @@ class MyDataset(Dataset):
    def load_data(self, idx):
        text, wav_file, speaker_name = self.items[idx]
        wav = np.asarray(self.load_wav(wav_file), dtype=np.float32)
        mel = self.ap.melspectrogram(wav).astype('float32')
        mel = self.ap.melspectrogram(wav).astype("float32")
        # sample seq_len

        assert text.size > 0, self.items[idx][1]
        assert wav.size > 0, self.items[idx][1]

        sample = {
            'mel': mel,
            'item_idx': self.items[idx][1],
            'speaker_name': speaker_name
            "mel": mel,
            "item_idx": self.items[idx][1],
            "speaker_name": speaker_name,
        }
        return sample

@@ -64,26 +54,32 @@ class MyDataset(Dataset):
        """
        Find unique speaker ids and create a dict mapping utterances from speaker id
        """
        speakers = list(set([item[-1] for item in self.items]))
        speakers = list({item[-1] for item in self.items})
        self.speaker_to_utters = {}
        self.speakers = []
        for speaker in speakers:
            speaker_utters = [item[1] for item in self.items if item[2] == speaker]
            if len(speaker_utters) < self.num_utter_per_speaker and self.skip_speakers:
                print(f" [!] Skipped speaker {speaker}. Not enough utterances {self.num_utter_per_speaker} vs {len(speaker_utters)}.")
                print(
                    f" [!] Skipped speaker {speaker}. Not enough utterances {self.num_utter_per_speaker} vs {len(speaker_utters)}."
                )
            else:
                self.speakers.append(speaker)
                self.speaker_to_utters[speaker] = speaker_utters

    def __len__(self):
        return int(1e+10)
        return int(1e10)

    def __sample_speaker(self):
        speaker = random.sample(self.speakers, 1)[0]
        if self.num_utter_per_speaker > len(self.speaker_to_utters[speaker]):
            utters = random.choices(self.speaker_to_utters[speaker], k=self.num_utter_per_speaker)
            utters = random.choices(
                self.speaker_to_utters[speaker], k=self.num_utter_per_speaker
            )
        else:
            utters = random.sample(self.speaker_to_utters[speaker], self.num_utter_per_speaker)
            utters = random.sample(
                self.speaker_to_utters[speaker], self.num_utter_per_speaker
            )
        return speaker, utters

    def __sample_speaker_utterances(self, speaker):

@@ -92,7 +88,7 @@ class MyDataset(Dataset):
        """
        feats = []
        labels = []
        for idx in range(self.num_utter_per_speaker):
        for _ in range(self.num_utter_per_speaker):
            # TODO:dummy but works
            while True:
                if len(self.speaker_to_utters[speaker]) > 0:

@@ -104,11 +100,10 @@ class MyDataset(Dataset):
                wav = self.load_wav(utter)
                if wav.shape[0] - self.seq_len > 0:
                    break
                else:
                    self.speaker_to_utters[speaker].remove(utter)
                self.speaker_to_utters[speaker].remove(utter)

            offset = random.randint(0, wav.shape[0] - self.seq_len)
            mel = self.ap.melspectrogram(wav[offset:offset+self.seq_len])
            mel = self.ap.melspectrogram(wav[offset : offset + self.seq_len])
            feats.append(torch.FloatTensor(mel))
            labels.append(speaker)
        return feats, labels

@@ -124,5 +119,5 @@ class MyDataset(Dataset):
            feats_, labels_ = self.__sample_speaker_utterances(speaker)
            labels.append(labels_)
            feats.extend(feats_)
        feats = torch.stack(feats)
        return feats.transpose(1, 2), labels
        feats = torch.stack(feats)
        return feats.transpose(1, 2), labels

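The final hunk above stacks one mel per sampled utterance and transposes it to time-major order before returning the batch. A small, standalone shape check of that convention (the sizes below are illustrative, not values from the project config):

import torch

num_speakers_in_batch, num_utter_per_speaker = 4, 5
num_mels, frames = 80, 100  # illustrative sizes

# one (num_mels, frames) mel per sampled utterance, as in __sample_speaker_utterances
feats = [torch.rand(num_mels, frames)
         for _ in range(num_speakers_in_batch * num_utter_per_speaker)]
batch = torch.stack(feats).transpose(1, 2)  # same ops as the batching code above
assert batch.shape == (num_speakers_in_batch * num_utter_per_speaker, frames, num_mels)
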
@@ -5,9 +5,8 @@ import torch.nn.functional as F

# adapted from https://github.com/cvqluu/GE2E-Loss
class GE2ELoss(nn.Module):

    def __init__(self, init_w=10.0, init_b=-5.0, loss_method='softmax'):
        '''
    def __init__(self, init_w=10.0, init_b=-5.0, loss_method="softmax"):
        """
        Implementation of the Generalized End-to-End loss defined in https://arxiv.org/abs/1710.10467 [1]
        Accepts an input of size (N, M, D)
            where N is the number of speakers in the batch,

@@ -16,24 +15,27 @@ class GE2ELoss(nn.Module):
        Args:
            - init_w (float): defines the initial value of w in Equation (5) of [1]
            - init_b (float): definies the initial value of b in Equation (5) of [1]
        '''
        """
        super(GE2ELoss, self).__init__()
        # pylint: disable=E1102
        self.w = nn.Parameter(torch.tensor(init_w))
        # pylint: disable=E1102
        self.b = nn.Parameter(torch.tensor(init_b))
        self.loss_method = loss_method

        assert self.loss_method in ['softmax', 'contrast']
        assert self.loss_method in ["softmax", "contrast"]

        if self.loss_method == 'softmax':
        if self.loss_method == "softmax":
            self.embed_loss = self.embed_loss_softmax
        if self.loss_method == 'contrast':
        if self.loss_method == "contrast":
            self.embed_loss = self.embed_loss_contrast

    # pylint: disable=R0201
    def calc_new_centroids(self, dvecs, centroids, spkr, utt):
        '''
        """
        Calculates the new centroids excluding the reference utterance
        '''
        excl = torch.cat((dvecs[spkr, :utt], dvecs[spkr, utt+1:]))
        """
        excl = torch.cat((dvecs[spkr, :utt], dvecs[spkr, utt + 1 :]))
        excl = torch.mean(excl, 0)
        new_centroids = []
        for i, centroid in enumerate(centroids):

@@ -44,26 +46,36 @@ class GE2ELoss(nn.Module):
        return torch.stack(new_centroids)

    def calc_cosine_sim(self, dvecs, centroids):
        '''
        """
        Make the cosine similarity matrix with dims (N,M,N)
        '''
        """
        cos_sim_matrix = []
        for spkr_idx, speaker in enumerate(dvecs):
            cs_row = []
            for utt_idx, utterance in enumerate(speaker):
                new_centroids = self.calc_new_centroids(
                    dvecs, centroids, spkr_idx, utt_idx)
                    dvecs, centroids, spkr_idx, utt_idx
                )
                # vector based cosine similarity for speed
                cs_row.append(torch.clamp(torch.mm(utterance.unsqueeze(1).transpose(0, 1), new_centroids.transpose(
                    0, 1)) / (torch.norm(utterance) * torch.norm(new_centroids, dim=1)), 1e-6))
                cs_row.append(
                    torch.clamp(
                        torch.mm(
                            utterance.unsqueeze(1).transpose(0, 1),
                            new_centroids.transpose(0, 1),
                        )
                        / (torch.norm(utterance) * torch.norm(new_centroids, dim=1)),
                        1e-6,
                    )
                )
            cs_row = torch.cat(cs_row, dim=0)
            cos_sim_matrix.append(cs_row)
        return torch.stack(cos_sim_matrix)

    # pylint: disable=R0201
    def embed_loss_softmax(self, dvecs, cos_sim_matrix):
        '''
        """
        Calculates the loss on each embedding $L(e_{ji})$ by taking softmax
        '''
        """
        N, M, _ = dvecs.shape
        L = []
        for j in range(N):

@@ -74,10 +86,11 @@ class GE2ELoss(nn.Module):
            L.append(L_row)
        return torch.stack(L)

    # pylint: disable=R0201
    def embed_loss_contrast(self, dvecs, cos_sim_matrix):
        '''
        """
        Calculates the loss on each embedding $L(e_{ji})$ by contrast loss with closest centroid
        '''
        """
        N, M, _ = dvecs.shape
        L = []
        for j in range(N):

@@ -85,17 +98,21 @@ class GE2ELoss(nn.Module):
            for i in range(M):
                centroids_sigmoids = torch.sigmoid(cos_sim_matrix[j, i])
                excl_centroids_sigmoids = torch.cat(
                    (centroids_sigmoids[:j], centroids_sigmoids[j+1:]))
                    (centroids_sigmoids[:j], centroids_sigmoids[j + 1 :])
                )
                L_row.append(
                    1. - torch.sigmoid(cos_sim_matrix[j, i, j]) + torch.max(excl_centroids_sigmoids))
                    1.0
                    - torch.sigmoid(cos_sim_matrix[j, i, j])
                    + torch.max(excl_centroids_sigmoids)
                )
            L_row = torch.stack(L_row)
            L.append(L_row)
        return torch.stack(L)

    def forward(self, dvecs):
        '''
        """
        Calculates the GE2E loss for an input of dimensions (num_speakers, num_utts_per_speaker, dvec_feats)
        '''
        """
        centroids = torch.mean(dvecs, 1)
        cos_sim_matrix = self.calc_cosine_sim(dvecs, centroids)
        torch.clamp(self.w, 1e-6)

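For orientation, the loss above consumes d-vectors grouped as (num_speakers N, num_utterances M, dim D), so a flat batch of N * M embeddings has to be regrouped first; the same reshape appears in the (commented-out) evaluation code further down in this diff. A minimal usage sketch mirroring the unit tests elsewhere in this commit (the sizes are illustrative):

import torch
from TTS.speaker_encoder.loss import GE2ELoss

N, M, D = 4, 5, 64                   # speakers, utterances per speaker, d-vector dim
flat_dvecs = torch.rand(N * M, D)    # e.g. speaker encoder outputs for one batch
dvecs = flat_dvecs.reshape(N, M, D)  # regroup per speaker before the loss
criterion = GE2ELoss(loss_method="softmax")
loss = criterion(dvecs)              # scalar tensor, non-negative per the tests
print(loss.item())
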
@@ -10,10 +10,10 @@ class LSTMWithProjection(nn.Module):
        self.proj_size = proj_size
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.linear = nn.Linear(hidden_size, proj_size, bias=False)


    def forward(self, x):
        self.lstm.flatten_parameters()
        o, (h, c) = self.lstm(x)
        o, (_, _) = self.lstm(x)
        return self.linear(o)


@@ -22,16 +22,16 @@ class SpeakerEncoder(nn.Module):
        super().__init__()
        layers = []
        layers.append(LSTMWithProjection(input_dim, lstm_dim, proj_dim))
        for _ in range(num_lstm_layers-1):
        for _ in range(num_lstm_layers - 1):
            layers.append(LSTMWithProjection(proj_dim, lstm_dim, proj_dim))
        self.layers = nn.Sequential(*layers)
        self._init_layers()

    def _init_layers(self):
        for name, param in self.layers.named_parameters():
            if 'bias' in name:
            if "bias" in name:
                nn.init.constant_(param, 0.0)
            elif 'weight' in name:
            elif "weight" in name:
                nn.init.xavier_normal_(param)

    def forward(self, x):

@@ -81,7 +81,8 @@ class SpeakerEncoder(nn.Module):
        if embed is None:
            embed = self.inference(frames)
        else:
            embed[cur_iter <= num_iters, :] += self.inference(frames[cur_iter <= num_iters, :, :])
            embed[cur_iter <= num_iters, :] += self.inference(
                frames[cur_iter <= num_iters, :, :]
            )
        return embed / num_iters

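As a quick, self-contained check of the projection layer touched above, the sketch below re-creates its structure locally rather than importing it, and the layer sizes are arbitrary examples:

import torch
from torch import nn

class LSTMWithProjection(nn.Module):
    # same structure as in this diff: an LSTM followed by a bias-free projection
    def __init__(self, input_size, hidden_size, proj_size):
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.linear = nn.Linear(hidden_size, proj_size, bias=False)

    def forward(self, x):
        o, (_, _) = self.lstm(x)
        return self.linear(o)

layer = LSTMWithProjection(input_size=80, hidden_size=768, proj_size=256)
x = torch.rand(4, 20, 80)              # B x T x D, as in the unit tests below
assert layer(x).shape == (4, 20, 256)  # projected to proj_size at every time step
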
@@ -4,22 +4,21 @@ import torch as T

from TTS.speaker_encoder.model import SpeakerEncoder
from TTS.speaker_encoder.loss import GE2ELoss
from TTS.speaker_encoder.dataset import MyDataset
from TTS.utils.audio import AudioProcessor
from torch.utils.data import DataLoader
from TTS.datasets.preprocess import libri_tts
from TTS.utils.generic_utils import load_config


file_path = os.path.dirname(os.path.realpath(__file__)) + "/../tests/"
c = load_config(os.path.join(file_path, 'test_config.json'))
file_path = os.path.dirname(os.path.realpath(__file__)) + "/../tests/"
c = load_config(os.path.join(file_path, "test_config.json"))


class SpeakerEncoderTests(unittest.TestCase):
    # pylint: disable=R0201
    def test_in_out(self):
        dummy_input = T.rand(4, 20, 80)  # B x T x D
        dummy_hidden = [T.rand(2, 4, 128), T.rand(2, 4, 128)]
        model = SpeakerEncoder(input_dim=80, proj_dim=256, lstm_dim=768, num_lstm_layers=3)
        model = SpeakerEncoder(
            input_dim=80, proj_dim=256, lstm_dim=768, num_lstm_layers=3
        )
        # computing d vectors
        output = model.forward(dummy_input)
        assert output.shape[0] == 4

@@ -35,8 +34,10 @@ class SpeakerEncoderTests(unittest.TestCase):
        # check normalization
        output_norm = T.nn.functional.normalize(output, dim=1, p=2)
        assert_diff = (output_norm - output).sum().item()
        assert output.type() == 'torch.FloatTensor'
        assert abs(assert_diff) < 1e-4, f" [!] output_norm has wrong values - {assert_diff}"
        assert output.type() == "torch.FloatTensor"
        assert (
            abs(assert_diff) < 1e-4
        ), f" [!] output_norm has wrong values - {assert_diff}"
        # compute d for a given batch
        dummy_input = T.rand(1, 240, 80)  # B x T x D
        output = model.compute_embedding(dummy_input, num_frames=160, overlap=0.5)

@@ -45,23 +46,29 @@ class SpeakerEncoderTests(unittest.TestCase):
        assert len(output.shape) == 2


class GE2ELossTests(unittest.TestCase):
    # pylint: disable=R0201
    def test_in_out(self):
        # check random input
        dummy_input = T.rand(4, 5, 64)  # num_speaker x num_utterance x dim
        loss = GE2ELoss(loss_method='softmax')
        loss = GE2ELoss(loss_method="softmax")
        output = loss.forward(dummy_input)
        assert output.item() >= 0.
        assert output.item() >= 0.0
        # check all zeros
        dummy_input = T.ones(4, 5, 64)  # num_speaker x num_utterance x dim
        loss = GE2ELoss(loss_method='softmax')
        loss = GE2ELoss(loss_method="softmax")
        output = loss.forward(dummy_input)
        # check speaker loss with orthogonal d-vectors
        dummy_input = T.empty(3, 64)
        dummy_input = T.nn.init.orthogonal(dummy_input)
        dummy_input = T.cat([dummy_input[0].repeat(5, 1, 1).transpose(0, 1), dummy_input[1].repeat(5, 1, 1).transpose(0, 1), dummy_input[2].repeat(5, 1, 1).transpose(0, 1)])  # num_speaker x num_utterance x dim
        loss = GE2ELoss(loss_method='softmax')
        dummy_input = T.cat(
            [
                dummy_input[0].repeat(5, 1, 1).transpose(0, 1),
                dummy_input[1].repeat(5, 1, 1).transpose(0, 1),
                dummy_input[2].repeat(5, 1, 1).transpose(0, 1),
            ]
        )  # num_speaker x num_utterance x dim
        loss = GE2ELoss(loss_method="softmax")
        output = loss.forward(dummy_input)
        assert output.item() < 0.005

@@ -77,4 +84,4 @@ class GE2ELossTests(unittest.TestCase):
# print(mel.shape)
# if count == 4:
# break
# count += 1
# count += 1

@@ -5,24 +5,21 @@ import time
import traceback

import torch
from torch import optim
from torch.utils.data import DataLoader
from TTS.datasets.preprocess import load_meta_data
from TTS.speaker_encoder.dataset import MyDataset
from TTS.speaker_encoder.generic_utils import save_best_model, save_checkpoint
from TTS.speaker_encoder.loss import GE2ELoss
from TTS.speaker_encoder.model import SpeakerEncoder
from TTS.speaker_encoder.visual import plot_embeddings
from TTS.speaker_encoder.generic_utils import save_best_model, save_checkpoint
from TTS.utils.audio import AudioProcessor
from TTS.utils.generic_utils import (NoamLR, check_update, copy_config_file,
                                     count_parameters,
                                     create_experiment_folder, get_git_branch,
                                     gradual_training_scheduler, load_config,
                                     remove_experiment_folder, set_init_dict,
                                     setup_model, split_dataset)
                                     load_config,
                                     remove_experiment_folder, set_init_dict)
from TTS.utils.logger import Logger
from TTS.utils.radam import RAdam
from TTS.utils.visual import plot_alignment, plot_spectrogram

torch.backends.cudnn.enabled = True
torch.backends.cudnn.benchmark = True

@@ -34,10 +31,6 @@ print(" > Number of GPUs: ", num_gpus)


def setup_loader(ap, is_val=False, verbose=False):
    global meta_data_train
    global meta_data_eval
    if "meta_data_train" not in globals():
        meta_data_train, meta_data_eval = load_meta_data(c.datasets)
    if is_val:
        loader = None
    else:

@@ -63,12 +56,11 @@ def train(model, criterion, optimizer, scheduler, ap, global_step):
    best_loss = float('inf')
    avg_loss = 0
    end_time = time.time()
    for num_iter, data in enumerate(data_loader):
    for _, data in enumerate(data_loader):
        start_time = time.time()

        # setup input data
        inputs = data[0]
        labels = data[1]
        loader_time = time.time() - end_time
        global_step += 1

@@ -132,68 +124,11 @@ def train(model, criterion, optimizer, scheduler, ap, global_step):
    return avg_loss, global_step


# def evaluate(model, criterion, ap, global_step, epoch):
# data_loader = setup_loader(ap, is_val=True)
# model.eval()
# epoch_time = 0
# avg_loss = 0
# print("\n > Validation")
# with torch.no_grad():
# if data_loader is not None:
# for num_iter, data in enumerate(data_loader):
# start_time = time.time()

# # setup input data
# inputs = data[0]
# labels = data[1]

# # dispatch data to GPU
# if use_cuda:
# inputs = inputs.cuda()
# # labels = labels.cuda()

# # forward pass
# outputs = model.forward(inputs)

# # loss computation
# loss = criterion(outputs.reshape(
# c.num_speakers_in_batch, outputs.shape[0] // c.num_speakers_in_batch, -1))
# step_time = time.time() - start_time
# epoch_time += step_time

# if num_iter % c.print_step == 0:
# print(
# " | > Loss: {:.5f} ".format(loss.item()),
# flush=True)

# avg_loss += float(loss.item())

# eval_figures = {
# "prediction": plot_spectrogram(const_spec, ap),
# "ground_truth": plot_spectrogram(gt_spec, ap),
# "alignment": plot_alignment(align_img)
# }
# tb_logger.tb_eval_figures(global_step, eval_figures)

# # Sample audio
# if c.model in ["Tacotron", "TacotronGST"]:
# eval_audio = ap.inv_spectrogram(const_spec.T)
# else:
# eval_audio = ap.inv_mel_spectrogram(const_spec.T)
# tb_logger.tb_eval_audios(
# global_step, {"ValAudio": eval_audio}, c.audio["sample_rate"])

# # compute average losses
# avg_loss /= (num_iter + 1)

# # Plot Validation Stats
# epoch_stats = {"GE2Eloss": avg_loss}
# tb_logger.tb_eval_stats(global_step, epoch_stats)
# return avg_loss


# FIXME: move args definition/parsing inside of main?
def main(args):  # pylint: disable=redefined-outer-name
    # pylint: disable=global-variable-undefined
    global meta_data_train
    global meta_data_eval

    ap = AudioProcessor(**c.audio)
    model = SpeakerEncoder(input_dim=40,
                           proj_dim=128,

@@ -211,7 +146,7 @@ def main(args):  # pylint: disable=redefined-outer-name
        if c.reinit_layers:
            raise RuntimeError
        model.load_state_dict(checkpoint['model'])
    except:
    except KeyError:
        print(" > Partial model initialization.")
        model_dict = model.state_dict()
        model_dict = set_init_dict(model_dict, checkpoint, c)

@@ -239,6 +174,9 @@ def main(args):  # pylint: disable=redefined-outer-name
    num_params = count_parameters(model)
    print("\n > Model has {} parameters".format(num_params), flush=True)

    # pylint: disable=redefined-outer-name
    meta_data_train, meta_data_eval = load_meta_data(c.datasets)

    global_step = args.restore_step
    train_loss, global_step = train(model, criterion, optimizer, scheduler, ap,
                                    global_step)

@@ -3,28 +3,34 @@ import numpy as np
import matplotlib
import matplotlib.pyplot as plt

matplotlib.use('Agg')
matplotlib.use("Agg")


colormap = np.array([
    [76, 255, 0],
    [0, 127, 70],
    [255, 0, 0],
    [255, 217, 38],
    [0, 135, 255],
    [165, 0, 165],
    [255, 167, 255],
    [0, 255, 255],
    [255, 96, 38],
    [142, 76, 0],
    [33, 0, 127],
    [0, 0, 0],
    [183, 183, 183],
], dtype=np.float) / 255
colormap = (
    np.array(
        [
            [76, 255, 0],
            [0, 127, 70],
            [255, 0, 0],
            [255, 217, 38],
            [0, 135, 255],
            [165, 0, 165],
            [255, 167, 255],
            [0, 255, 255],
            [255, 96, 38],
            [142, 76, 0],
            [33, 0, 127],
            [0, 0, 0],
            [183, 183, 183],
        ],
        dtype=np.float,
    )
    / 255
)


def plot_embeddings(embeddings, num_utter_per_speaker):
    embeddings = embeddings[:10*num_utter_per_speaker]
    embeddings = embeddings[: 10 * num_utter_per_speaker]
    model = umap.UMAP()
    projection = model.fit_transform(embeddings)
    num_speakers = embeddings.shape[0] // num_utter_per_speaker

@@ -32,7 +38,7 @@ def plot_embeddings(embeddings, num_utter_per_speaker):
    colors = [colormap[i] for i in ground_truth]

    fig, ax = plt.subplots(figsize=(16, 10))
    im = ax.scatter(projection[:, 0], projection[:, 1], c=colors)
    _ = ax.scatter(projection[:, 0], projection[:, 1], c=colors)
    plt.gca().set_aspect("equal", "datalim")
    plt.title("UMAP projection")
    plt.tight_layout()

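plot_embeddings above keeps only the first 10 speakers and projects their d-vectors with UMAP before scattering them with the fixed colormap. A hypothetical call, with made-up array sizes and assuming the umap package used by this module is installed:

import numpy as np
from TTS.speaker_encoder.visual import plot_embeddings

# 10 speakers x 10 utterances of 256-dim d-vectors, grouped per speaker
embeddings = np.random.rand(10 * 10, 256)
plot_embeddings(embeddings, num_utter_per_speaker=10)
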
@@ -44,6 +44,8 @@
    "prenet_dropout": true, // ONLY TACOTRON2 - enable/disable dropout at prenet.
    "use_forward_attn": true, // ONLY TACOTRON2 - if it uses forward attention. In general, it aligns faster.
    "forward_attn_mask": false,
    "attention_type": "original",
    "attention_heads": 5,
    "bidirectional_decoder": false,
    "transition_agent": false, // ONLY TACOTRON2 - enable/disable transition agent of forward attention.
    "location_attn": false, // ONLY TACOTRON2 - enable_disable location sensitive attention. It is enabled for TACOTRON by default.

@@ -5,7 +5,8 @@ from TTS.layers.tacotron import Prenet, CBHG, Decoder, Encoder
from TTS.layers.losses import L1LossMasked
from TTS.utils.generic_utils import sequence_mask

#pylint: disable=unused-variable
# pylint: disable=unused-variable


class PrenetTests(unittest.TestCase):
    def test_in_out(self):

@@ -49,6 +50,8 @@ class DecoderTests(unittest.TestCase):
            memory_size=4,
            attn_windowing=False,
            attn_norm="sigmoid",
            attn_K=5,
            attn_type="original",
            prenet_type='original',
            prenet_dropout=True,
            forward_attn=True,

@@ -77,6 +80,8 @@ class DecoderTests(unittest.TestCase):
            memory_size=4,
            attn_windowing=False,
            attn_norm="sigmoid",
            attn_K=5,
            attn_type="graves",
            prenet_type='original',
            prenet_dropout=True,
            forward_attn=True,

train.py
@@ -117,7 +117,8 @@ def format_data(data):

def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler,
          ap, global_step, epoch):
    data_loader = setup_loader(ap, model.decoder.r, is_val=False, verbose=(epoch == 0))
    data_loader = setup_loader(ap, model.decoder.r, is_val=False,
                               verbose=(epoch == 0))
    model.train()
    epoch_time = 0
    train_values = {

@@ -15,7 +15,7 @@ class AudioProcessor(object):
                 ref_level_db=None,
                 num_freq=None,
                 power=None,
                 preemphasis=None,
                 preemphasis=0.0,
                 signal_norm=None,
                 symmetric_norm=None,
                 max_norm=None,

@@ -48,7 +48,7 @@ class AudioProcessor(object):
        self.do_trim_silence = do_trim_silence
        self.sound_norm = sound_norm
        self.n_fft, self.hop_length, self.win_length = self._stft_parameters()
        assert min_level_db ~= 0.0, " [!] min_level_db is 0"
        assert min_level_db != 0.0, " [!] min_level_db is 0"
        members = vars(self)
        for key, value in members.items():
            print(" | > {}:{}".format(key, value))

@@ -132,12 +132,12 @@ class AudioProcessor(object):

    def apply_preemphasis(self, x):
        if self.preemphasis == 0:
            raise RuntimeError(" !! Preemphasis is applied with factor 0.0. ")
            raise RuntimeError(" [!] Preemphasis is set 0.0.")
        return scipy.signal.lfilter([1, -self.preemphasis], [1], x)

    def apply_inv_preemphasis(self, x):
        if self.preemphasis == 0:
            raise RuntimeError(" !! Preemphasis is applied with factor 0.0. ")
            raise RuntimeError(" [!] Preemphasis is set 0.0.")
        return scipy.signal.lfilter([1], [1, -self.preemphasis], x)

    def spectrogram(self, y):

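The two methods above are a standard first-order pre-emphasis filter, y[n] = x[n] - a * x[n-1], and its exact inverse. A small, standalone round-trip check of that identity (the 0.97 coefficient is only an example value, not taken from the project config):

import numpy as np
import scipy.signal

a = 0.97  # example pre-emphasis coefficient
x = np.random.randn(1000).astype(np.float32)

emphasized = scipy.signal.lfilter([1, -a], [1], x)         # as in apply_preemphasis
restored = scipy.signal.lfilter([1], [1, -a], emphasized)  # as in apply_inv_preemphasis
assert np.allclose(restored, x, atol=1e-4)
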
@@ -13,8 +13,8 @@ class Logger(object):
        for name, param in model.named_parameters():
            if param.numel() == 1:
                self.writer.add_scalar(
                    "layer{}-{}/value".format(layer_num, name),
                    param.max(), step)
                    "layer{}-{}/value".format(layer_num, name),
                    param.max(), step)
            else:
                self.writer.add_scalar(
                    "layer{}-{}/max".format(layer_num, name),