mirror of https://github.com/coqui-ai/TTS.git
Add evaluation during encoder training
This commit is contained in:
parent 0e372e0b9b
commit 33fd07a209
@@ -33,148 +33,218 @@ print(" > Number of GPUs: ", num_gpus)


 def setup_loader(ap: AudioProcessor, is_val: bool = False, verbose: bool = False):
-    if is_val:
-        loader = None
-    else:
-        dataset = EncoderDataset(
-            ap,
-            meta_data_eval if is_val else meta_data_train,
-            voice_len=c.voice_len,
-            num_utter_per_class=c.num_utter_per_class,
-            num_classes_in_batch=c.num_classes_in_batch,
-            verbose=verbose,
-            augmentation_config=c.audio_augmentation if not is_val else None,
-            use_torch_spec=c.model_params.get("use_torch_spec", False),
-        )
-
-        sampler = PerfectBatchSampler(
-            dataset.items,
-            dataset.get_class_list(),
-            batch_size=c.num_classes_in_batch * c.num_utter_per_class,  # total batch size
-            num_classes_in_batch=c.num_classes_in_batch,
-            num_gpus=1,
-            shuffle=False if is_val else True,
-            drop_last=True)
-        loader = DataLoader(
-            dataset,
-            num_workers=c.num_loader_workers,
-            batch_sampler=sampler,
-            collate_fn=dataset.collate_fn,
-        )
-    return loader, dataset.get_num_classes(), dataset.get_map_classid_to_classname()
+    num_utter_per_class = c.num_utter_per_class if not is_val else c.eval_num_utter_per_class
+    num_classes_in_batch = c.num_classes_in_batch if not is_val else c.eval_num_classes_in_batch
+
+    dataset = EncoderDataset(
+        ap,
+        meta_data_eval if is_val else meta_data_train,
+        voice_len=c.voice_len,
+        num_utter_per_class=num_utter_per_class,
+        num_classes_in_batch=num_classes_in_batch,
+        verbose=verbose,
+        augmentation_config=c.audio_augmentation if not is_val else None,
+        use_torch_spec=c.model_params.get("use_torch_spec", False),
+    )
+    # get classes list
+    classes = dataset.get_class_list()
+
+    sampler = PerfectBatchSampler(
+        dataset.items,
+        classes,
+        batch_size=num_classes_in_batch * num_utter_per_class,  # total batch size
+        num_classes_in_batch=num_classes_in_batch,
+        num_gpus=1,
+        shuffle=False if is_val else True,
+        drop_last=True)
+
+    if len(classes) < num_classes_in_batch:
+        if is_val:
+            raise RuntimeError(f"config.eval_num_classes_in_batch ({num_classes_in_batch}) needs to be <= {len(classes)} (total number of classes in the eval dataset)!")
+        else:
+            raise RuntimeError(f"config.num_classes_in_batch ({num_classes_in_batch}) needs to be <= {len(classes)} (total number of classes in the train dataset)!")
+
+    # set the classes to avoid getting a wrong class_id when the numbers of training and eval classes are not equal
+    if is_val:
+        dataset.set_classes(train_classes)
+
+    loader = DataLoader(
+        dataset,
+        num_workers=c.num_loader_workers,
+        batch_sampler=sampler,
+        collate_fn=dataset.collate_fn,
+    )
+
+    return loader, classes, dataset.get_map_classid_to_classname()
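
Note: the sampler above is configured so every batch holds exactly num_classes_in_batch classes with num_utter_per_class utterances each, interleaved class by class. A toy sketch of that layout (the counts and speaker names are illustrative, not taken from any config):

# What a "perfect batch" looks like: classes interleaved, repeated
# num_utter_per_class times, so batch_size = num_classes_in_batch * num_utter_per_class.
num_classes_in_batch, num_utter_per_class = 3, 2  # illustrative values
classes = ["spk_a", "spk_b", "spk_c"]
batch = [classes[i % num_classes_in_batch] for i in range(num_classes_in_batch * num_utter_per_class)]
print(batch)  # ['spk_a', 'spk_b', 'spk_c', 'spk_a', 'spk_b', 'spk_c']

This interleaved order is why both the new evaluation() and train() regroup the batch before computing the loss, as shown below.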

+def evaluation(model, criterion, data_loader, global_step):
+    eval_loss = 0
+    for step, data in enumerate(data_loader):
+        with torch.no_grad():
+            start_time = time.time()
+
+            # setup input data
+            inputs, labels = data
+
+            # group samples of each class in the batch: the perfect sampler produces [3,2,1,3,2,1]; we need [3,3,2,2,1,1]
+            labels = torch.transpose(labels.view(c.eval_num_utter_per_class, c.eval_num_classes_in_batch), 0, 1).reshape(labels.shape)
+            inputs = torch.transpose(inputs.view(c.eval_num_utter_per_class, c.eval_num_classes_in_batch, -1), 0, 1).reshape(inputs.shape)
+
+            # dispatch data to GPU
+            if use_cuda:
+                inputs = inputs.cuda(non_blocking=True)
+                labels = labels.cuda(non_blocking=True)
+
+            # forward pass model
+            outputs = model(inputs)
+
+            # loss computation
+            loss = criterion(outputs.view(c.eval_num_classes_in_batch, outputs.shape[0] // c.eval_num_classes_in_batch, -1), labels)
+
+            eval_loss += loss.item()
+
+    eval_avg_loss = eval_loss / len(data_loader)
+    # save stats
+    dashboard_logger.eval_stats(global_step, {"loss": eval_avg_loss})
+    # plot the last batch in the evaluation
+    figures = {
+        "UMAP Plot": plot_embeddings(outputs.detach().cpu().numpy(), c.num_classes_in_batch),
+    }
+    dashboard_logger.eval_figures(global_step, figures)
+    return eval_avg_loss
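
The regrouping above is the view/transpose/reshape trick also used in train(): the sampler emits utterances interleaved by class ([3,2,1,3,2,1]) while the loss wants them grouped per class ([3,3,2,2,1,1]). A minimal, runnable check of that claim, assuming only torch:

import torch

# 3 classes, 2 utterances per class, interleaved as the perfect sampler yields them.
num_utter_per_class, num_classes_in_batch = 2, 3
labels = torch.tensor([3, 2, 1, 3, 2, 1])
regrouped = torch.transpose(labels.view(num_utter_per_class, num_classes_in_batch), 0, 1).reshape(labels.shape)
print(regrouped.tolist())  # [3, 3, 2, 2, 1, 1] -- grouped per class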


-def train(model, optimizer, scheduler, criterion, data_loader, global_step):
-    model.train()
-    epoch_time = 0
-    best_loss = float("inf")
-    avg_loss = 0
-    avg_loss_all = 0
-    avg_loader_time = 0
-    end_time = time.time()
-    print(len(data_loader))
-    for _, data in enumerate(data_loader):
-        start_time = time.time()
-
-        # setup input data
-        inputs, labels = data
-        # group samples of each class in the batch: the perfect sampler produces [3,2,1,3,2,1]; we need [3,3,2,2,1,1]
-        labels = torch.transpose(labels.view(c.num_utter_per_class, c.num_classes_in_batch), 0, 1).reshape(labels.shape)
-        inputs = torch.transpose(inputs.view(c.num_utter_per_class, c.num_classes_in_batch, -1), 0, 1).reshape(inputs.shape)
-        """
-        labels_converted = torch.transpose(labels.view(c.num_utter_per_class, c.num_classes_in_batch), 0, 1).reshape(labels.shape)
-        inputs_converted = torch.transpose(inputs.view(c.num_utter_per_class, c.num_classes_in_batch, -1), 0, 1).reshape(inputs.shape)
-        idx = 0
-        for j in range(0, c.num_classes_in_batch, 1):
-            for i in range(j, len(labels), c.num_classes_in_batch):
-                if not torch.all(labels[i].eq(labels_converted[idx])) or not torch.all(inputs[i].eq(inputs_converted[idx])):
-                    print("Invalid")
-                    print(labels)
-                    exit()
-                idx += 1
-        labels = labels_converted
-        inputs = inputs_converted
-        print(labels)
-        print(inputs.shape)"""
-
-        loader_time = time.time() - end_time
-        global_step += 1
-
-        # setup lr
-        if c.lr_decay:
-            scheduler.step()
-        optimizer.zero_grad()
-
-        # dispatch data to GPU
-        if use_cuda:
-            inputs = inputs.cuda(non_blocking=True)
-            labels = labels.cuda(non_blocking=True)
-
-        # forward pass model
-        outputs = model(inputs)
-
-        # loss computation
-        loss = criterion(outputs.view(c.num_classes_in_batch, outputs.shape[0] // c.num_classes_in_batch, -1), labels)
-        loss.backward()
-        grad_norm, _ = check_update(model, c.grad_clip)
-        optimizer.step()
-
-        step_time = time.time() - start_time
-        epoch_time += step_time
-
-        # Averaged Loss and Averaged Loader Time
-        avg_loss = 0.01 * loss.item() + 0.99 * avg_loss if avg_loss != 0 else loss.item()
-        num_loader_workers = c.num_loader_workers if c.num_loader_workers > 0 else 1
-        avg_loader_time = (
-            1 / num_loader_workers * loader_time + (num_loader_workers - 1) / num_loader_workers * avg_loader_time
-            if avg_loader_time != 0
-            else loader_time
-        )
-        current_lr = optimizer.param_groups[0]["lr"]
-
-        if global_step % c.steps_plot_stats == 0:
-            # Plot Training Epoch Stats
-            train_stats = {
-                "loss": avg_loss,
-                "lr": current_lr,
-                "grad_norm": grad_norm,
-                "step_time": step_time,
-                "avg_loader_time": avg_loader_time,
-            }
-            dashboard_logger.train_epoch_stats(global_step, train_stats)
-            figures = {
-                "UMAP Plot": plot_embeddings(outputs.detach().cpu().numpy(), c.num_classes_in_batch),
-            }
-            dashboard_logger.train_figures(global_step, figures)
-
-        if global_step % c.print_step == 0:
-            print(
-                " | > Step:{} Loss:{:.5f} AvgLoss:{:.5f} GradNorm:{:.5f} "
-                "StepTime:{:.2f} LoaderTime:{:.2f} AvGLoaderTime:{:.2f} LR:{:.6f}".format(
-                    global_step, loss.item(), avg_loss, grad_norm, step_time, loader_time, avg_loader_time, current_lr
-                ),
-                flush=True,
-            )
-        avg_loss_all += avg_loss
-
-        if global_step >= c.max_train_step or global_step % c.save_step == 0:
-            # save best model only
-            best_loss = save_best_model(model, optimizer, criterion, avg_loss, best_loss, OUT_PATH, global_step)
-            avg_loss_all = 0
-            if global_step >= c.max_train_step:
-                break
-
-        end_time = time.time()
-
-    return avg_loss, global_step
+def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader, global_step):
+    model.train()
+    best_loss = float("inf")
+    avg_loader_time = 0
+    end_time = time.time()
+    for epoch in range(c.epochs):
+        tot_loss = 0
+        epoch_time = 0
+        for step, data in enumerate(data_loader):
+            start_time = time.time()
+
+            # setup input data
+            inputs, labels = data
+            # group samples of each class in the batch: the perfect sampler produces [3,2,1,3,2,1]; we need [3,3,2,2,1,1]
+            labels = torch.transpose(labels.view(c.num_utter_per_class, c.num_classes_in_batch), 0, 1).reshape(labels.shape)
+            inputs = torch.transpose(inputs.view(c.num_utter_per_class, c.num_classes_in_batch, -1), 0, 1).reshape(inputs.shape)
+            """
+            # ToDo: move it to a unit test
+            labels_converted = torch.transpose(labels.view(c.num_utter_per_class, c.num_classes_in_batch), 0, 1).reshape(labels.shape)
+            inputs_converted = torch.transpose(inputs.view(c.num_utter_per_class, c.num_classes_in_batch, -1), 0, 1).reshape(inputs.shape)
+            idx = 0
+            for j in range(0, c.num_classes_in_batch, 1):
+                for i in range(j, len(labels), c.num_classes_in_batch):
+                    if not torch.all(labels[i].eq(labels_converted[idx])) or not torch.all(inputs[i].eq(inputs_converted[idx])):
+                        print("Invalid")
+                        print(labels)
+                        exit()
+                    idx += 1
+            labels = labels_converted
+            inputs = inputs_converted
+            print(labels)
+            print(inputs.shape)"""
+
+            loader_time = time.time() - end_time
+            global_step += 1
+
+            # setup lr
+            if c.lr_decay:
+                scheduler.step()
+            optimizer.zero_grad()
+
+            # dispatch data to GPU
+            if use_cuda:
+                inputs = inputs.cuda(non_blocking=True)
+                labels = labels.cuda(non_blocking=True)
+
+            # forward pass model
+            outputs = model(inputs)
+
+            # loss computation
+            loss = criterion(outputs.view(c.num_classes_in_batch, outputs.shape[0] // c.num_classes_in_batch, -1), labels)
+            loss.backward()
+            grad_norm, _ = check_update(model, c.grad_clip)
+            optimizer.step()
+
+            step_time = time.time() - start_time
+            epoch_time += step_time
+
+            # accumulate the total epoch loss
+            tot_loss += loss.item()
+
+            # Averaged Loader Time
+            num_loader_workers = c.num_loader_workers if c.num_loader_workers > 0 else 1
+            avg_loader_time = (
+                1 / num_loader_workers * loader_time + (num_loader_workers - 1) / num_loader_workers * avg_loader_time
+                if avg_loader_time != 0
+                else loader_time
+            )
+            current_lr = optimizer.param_groups[0]["lr"]
+
+            if global_step % c.steps_plot_stats == 0:
+                # Plot Training Epoch Stats
+                train_stats = {
+                    "loss": loss.item(),
+                    "lr": current_lr,
+                    "grad_norm": grad_norm,
+                    "step_time": step_time,
+                    "avg_loader_time": avg_loader_time,
+                }
+                dashboard_logger.train_epoch_stats(global_step, train_stats)
+                figures = {
+                    "UMAP Plot": plot_embeddings(outputs.detach().cpu().numpy(), c.num_classes_in_batch),
+                }
+                dashboard_logger.train_figures(global_step, figures)
+
+            if global_step % c.print_step == 0:
+                print(
+                    " | > Step:{} Loss:{:.5f} GradNorm:{:.5f} "
+                    "StepTime:{:.2f} LoaderTime:{:.2f} AvGLoaderTime:{:.2f} LR:{:.6f}".format(
+                        global_step, loss.item(), grad_norm, step_time, loader_time, avg_loader_time, current_lr
+                    ),
+                    flush=True,
+                )
+
+            if global_step % c.save_step == 0:
+                # save model
+                save_checkpoint(model, optimizer, criterion, loss.item(), OUT_PATH, global_step, epoch)
+
+            end_time = time.time()
+
+        print("")
+        print(
+            " | > Epoch:{} AvgLoss: {:.5f} GradNorm:{:.5f} "
+            "EpochTime:{:.2f} AvGLoaderTime:{:.2f} ".format(
+                epoch, tot_loss / len(data_loader), grad_norm, epoch_time, avg_loader_time
+            ),
+            flush=True,
+        )
+        # evaluation
+        if c.run_eval:
+            model.eval()
+            eval_loss = evaluation(model, criterion, eval_data_loader, global_step)
+            print("\n\n")
+            print("--> EVAL PERFORMANCE")
+            print(
+                " | > Epoch:{} AvgLoss: {:.5f} ".format(epoch, eval_loss),
+                flush=True,
+            )
+            # save the best checkpoint
+            best_loss = save_best_model(model, optimizer, criterion, eval_loss, best_loss, OUT_PATH, global_step, epoch)
+            model.train()
+
+    return best_loss, global_step
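
The commented-out block above asks for a unit test ("ToDo: move it to a unit test"). A minimal sketch of what that test could look like, using plain torch and the same naive per-class gather the commented code performs (the function name and sizes are hypothetical):

import torch

def test_perfect_batch_regrouping(num_classes_in_batch=3, num_utter_per_class=4, feat_dim=5):
    # interleaved batch, as produced by the sampler: class ids [0, 1, 2, 0, 1, 2, ...]
    inputs = torch.randn(num_classes_in_batch * num_utter_per_class, feat_dim)
    labels = torch.arange(num_classes_in_batch).repeat(num_utter_per_class)

    # vectorized regrouping, as done in train()/evaluation()
    labels_converted = torch.transpose(labels.view(num_utter_per_class, num_classes_in_batch), 0, 1).reshape(labels.shape)
    inputs_converted = torch.transpose(inputs.view(num_utter_per_class, num_classes_in_batch, -1), 0, 1).reshape(inputs.shape)

    # naive per-class gather, as in the commented-out check
    idx = 0
    for j in range(num_classes_in_batch):
        for i in range(j, len(labels), num_classes_in_batch):
            assert torch.all(labels[i].eq(labels_converted[idx]))
            assert torch.all(inputs[i].eq(inputs_converted[idx]))
            idx += 1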


 def main(args):  # pylint: disable=redefined-outer-name
     # pylint: disable=global-variable-undefined
     global meta_data_train
     global meta_data_eval
+    global train_classes

     ap = AudioProcessor(**c.audio)
     model = setup_speaker_encoder_model(c)
@@ -184,8 +254,12 @@ def main(args):  # pylint: disable=redefined-outer-name
     # pylint: disable=redefined-outer-name
     meta_data_train, meta_data_eval = load_tts_samples(c.datasets, eval_split=True)

-    train_data_loader, num_classes, map_classid_to_classname = setup_loader(ap, is_val=False, verbose=True)
-    # eval_data_loader, _, _ = setup_loader(ap, is_val=True, verbose=True)
+    train_data_loader, train_classes, map_classid_to_classname = setup_loader(ap, is_val=False, verbose=True)
+    if c.run_eval:
+        eval_data_loader, _, _ = setup_loader(ap, is_val=True, verbose=True)
+    else:
+        eval_data_loader = None
+    num_classes = len(train_classes)

     if c.loss == "ge2e":
         criterion = GE2ELoss(loss_method="softmax")
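
For reference, the criterion calls in train() and evaluation() both reshape the flat [batch, dim] embeddings to [num_classes_in_batch, num_utter_per_class, dim] first. A hedged sketch of that shape contract (the GE2ELoss import path is assumed from this repo's encoder module; sizes are illustrative):

import torch
from TTS.encoder.losses import GE2ELoss  # assumed import path

num_classes_in_batch, num_utter_per_class, emb_dim = 4, 3, 256
outputs = torch.randn(num_classes_in_batch * num_utter_per_class, emb_dim)  # stand-in for model(inputs)
labels = torch.arange(num_classes_in_batch).repeat_interleave(num_utter_per_class)  # grouped, like after the regrouping

criterion = GE2ELoss(loss_method="softmax")
loss = criterion(outputs.view(num_classes_in_batch, outputs.shape[0] // num_classes_in_batch, -1), labels)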

@@ -235,7 +309,7 @@ def main(args):  # pylint: disable=redefined-outer-name
         criterion.cuda()

     global_step = args.restore_step
-    _, global_step = train(model, optimizer, scheduler, criterion, train_data_loader, global_step)
+    _, global_step = train(model, optimizer, scheduler, criterion, train_data_loader, eval_data_loader, global_step)


 if __name__ == "__main__":
@@ -39,15 +39,18 @@ class BaseEncoderConfig(BaseTrainingConfig):
     # logging params
     tb_model_param_stats: bool = False
     steps_plot_stats: int = 10
     checkpoint: bool = True
     epochs: int = 10000
     save_step: int = 1000
     print_step: int = 20
+    run_eval: bool = False

     # data loader
     num_classes_in_batch: int = MISSING
     num_utter_per_class: int = MISSING
+    eval_num_classes_in_batch: int = MISSING
+    eval_num_utter_per_class: int = MISSING

     num_loader_workers: int = MISSING
     skip_classes: bool = False
     voice_len: float = 1.6

     def check_values(self):
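
A hypothetical config snippet exercising the new fields (SpeakerEncoderConfig is one concrete BaseEncoderConfig subclass in this repo; the exact import path and the values are illustrative, not taken from this commit):

from TTS.encoder.configs.speaker_encoder_config import SpeakerEncoderConfig

config = SpeakerEncoderConfig(
    run_eval=True,                 # new: run the per-epoch evaluation loop
    num_classes_in_batch=32,
    num_utter_per_class=4,
    eval_num_classes_in_batch=16,  # new: eval batch geometry
    eval_num_utter_per_class=4,    # new
    num_loader_workers=4,
)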
@@ -104,7 +104,11 @@ class EncoderDataset(Dataset):
         return len(self.classes)

     def get_class_list(self):
-        return list(self.classes)
+        return self.classes
+
+    def set_classes(self, classes):
+        self.classes = classes
+        self.classname_to_classid = {key: i for i, key in enumerate(self.classes)}

     def get_map_classid_to_classname(self):
         return dict((c_id, c_n) for c_n, c_id in self.classname_to_classid.items())
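
Why set_classes() matters, in miniature: class ids are positions in the class list, so building the mapping from the eval metadata alone would assign different ids to the same speaker. A self-contained illustration (the speaker names are hypothetical):

train_classes = ["spk_a", "spk_b", "spk_c"]
eval_classes = ["spk_b", "spk_c"]  # the eval split happens to lack spk_a

ids_from_eval = {key: i for i, key in enumerate(eval_classes)}    # spk_b -> 0
ids_from_train = {key: i for i, key in enumerate(train_classes)}  # spk_b -> 1
assert ids_from_eval["spk_b"] != ids_from_train["spk_b"]  # mismatched labels without set_classes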
@@ -209,7 +209,7 @@ def save_checkpoint(model, optimizer, criterion, model_loss, out_path, current_s
     save_fsspec(state, checkpoint_path)


-def save_best_model(model, optimizer, criterion, model_loss, best_loss, out_path, current_step):
+def save_best_model(model, optimizer, criterion, model_loss, best_loss, out_path, current_step, epoch):
     if model_loss < best_loss:
         new_state_dict = model.state_dict()
         state = {
@@ -217,6 +217,7 @@ def save_best_model(model, optimizer, criterion, model_loss, best_loss, out_path
             "optimizer": optimizer.state_dict(),
             "criterion": criterion.state_dict(),
             "step": current_step,
+            "epoch": epoch,
             "loss": model_loss,
             "date": datetime.date.today().strftime("%B %d, %Y"),
         }
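
Storing "epoch" alongside "step" lets a resume path restore both counters. A hedged sketch of reading it back (the file name is hypothetical; the keys match the state dict above):

import torch

checkpoint = torch.load("best_model.pth.tar", map_location="cpu")  # hypothetical path
global_step = checkpoint["step"]
epoch = checkpoint["epoch"]  # only present in checkpoints saved after this change
print(f"resuming at step {global_step}, epoch {epoch}")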
@@ -36,7 +36,7 @@ class PerfectBatchSampler(Sampler):

     def __init__(self, dataset_items, classes, batch_size, num_classes_in_batch, num_gpus=1, shuffle=True, drop_last=False):

-        assert batch_size % (len(classes) * num_gpus) == 0, (
+        assert batch_size % (num_classes_in_batch * num_gpus) == 0, (
             'Batch size must be divisible by number of classes times the number of data parallel devices (if enabled).')

         label_indices = {}
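
The relaxed assert matters whenever the dataset has more classes than fit in one batch, which is the normal case. In numbers (illustrative values):

total_classes, num_classes_in_batch, num_utter_per_class, num_gpus = 100, 8, 4, 1
batch_size = num_classes_in_batch * num_utter_per_class  # 32, as built in setup_loader

assert batch_size % (num_classes_in_batch * num_gpus) == 0  # new check: passes
assert batch_size % (total_classes * num_gpus) != 0         # old check (len(classes)) would have rejected this valid setup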