add initial wavernn support

Alex K 2020-10-15 19:14:50 +02:00
parent 1a87ad82e3
commit d6bd3cd8b8
6 changed files with 838 additions and 175 deletions

View File

@@ -11,20 +11,27 @@ from TTS.tts.datasets.preprocess import load_meta_data
from TTS.utils.io import load_config
from TTS.utils.audio import AudioProcessor


def main():
    """Run preprocessing process."""
    parser = argparse.ArgumentParser(
        description="Compute mean and variance of spectrogram features."
    )
    parser.add_argument(
        "--config_path",
        type=str,
        required=True,
        help="TTS config file path to define audio processing parameters.",
    )
    parser.add_argument(
        "--out_path", default=None, type=str, help="directory to save the output file."
    )
    args = parser.parse_args()

    # load config
    CONFIG = load_config(args.config_path)
    CONFIG.audio["signal_norm"] = False  # do not apply earlier normalization
    CONFIG.audio["stats_path"] = None  # discard pre-defined stats

    # load audio processor
    ap = AudioProcessor(**CONFIG.audio)
@@ -58,27 +65,27 @@ def main():
    output_file_path = os.path.join(args.out_path, "scale_stats.npy")
    stats = {}
    stats["mel_mean"] = mel_mean
    stats["mel_std"] = mel_scale
    stats["linear_mean"] = linear_mean
    stats["linear_std"] = linear_scale

    print(f" > Avg mel spec mean: {mel_mean.mean()}")
    print(f" > Avg mel spec scale: {mel_scale.mean()}")
    print(f" > Avg linear spec mean: {linear_mean.mean()}")
    print(f" > Avg linear spec scale: {linear_scale.mean()}")

    # set default config values for mean-var scaling
    CONFIG.audio["stats_path"] = output_file_path
    CONFIG.audio["signal_norm"] = True

    # remove redundant values
    del CONFIG.audio["max_norm"]
    del CONFIG.audio["min_level_db"]
    del CONFIG.audio["symmetric_norm"]
    del CONFIG.audio["clip_norm"]
    stats["audio_config"] = CONFIG.audio
    np.save(output_file_path, stats, allow_pickle=True)
    print(f" > scale_stats.npy is saved to {output_file_path}")


if __name__ == "__main__":
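Note: the stats file written above is a pickled dict, so it has to be read back with allow_pickle. A minimal sketch of loading it (not part of this commit; the path is assumed):

import numpy as np

stats = np.load("scale_stats.npy", allow_pickle=True).item()
print(stats["mel_mean"].shape)  # per-bin mean of the mel spectrogram
print(stats["audio_config"])    # audio params the stats were computed with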

View File

@@ -10,20 +10,29 @@ import torch
from torch.utils.data import DataLoader

from TTS.utils.audio import AudioProcessor
from TTS.utils.console_logger import ConsoleLogger
from TTS.utils.generic_utils import (
    KeepAverage,
    count_parameters,
    create_experiment_folder,
    get_git_branch,
    remove_experiment_folder,
    set_init_dict,
)
from TTS.utils.io import copy_config_file, load_config
from TTS.utils.radam import RAdam
from TTS.utils.tensorboard_logger import TensorboardLogger
from TTS.utils.training import setup_torch_training_env
from TTS.vocoder.datasets.gan_dataset import GANDataset
from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data

# from distribute import (DistributedSampler, apply_gradient_allreduce,
#                         init_distributed, reduce_tensor)
from TTS.vocoder.layers.losses import DiscriminatorLoss, GeneratorLoss
from TTS.vocoder.utils.generic_utils import (
    plot_results,
    setup_discriminator,
    setup_generator,
)
from TTS.vocoder.utils.io import save_best_model, save_checkpoint


use_cuda, num_gpus = setup_torch_training_env(True, True)
@@ -33,27 +42,30 @@ def setup_loader(ap, is_val=False, verbose=False):
    if is_val and not c.run_eval:
        loader = None
    else:
        dataset = GANDataset(
            ap=ap,
            items=eval_data if is_val else train_data,
            seq_len=c.seq_len,
            hop_len=ap.hop_length,
            pad_short=c.pad_short,
            conv_pad=c.conv_pad,
            is_training=not is_val,
            return_segments=not is_val,
            use_noise_augment=c.use_noise_augment,
            use_cache=c.use_cache,
            verbose=verbose,
        )
        dataset.shuffle_mapping()
        # sampler = DistributedSampler(dataset) if num_gpus > 1 else None
        loader = DataLoader(
            dataset,
            batch_size=1 if is_val else c.batch_size,
            shuffle=True,
            drop_last=False,
            sampler=None,
            num_workers=c.num_val_loader_workers if is_val else c.num_loader_workers,
            pin_memory=False,
        )
    return loader
@@ -80,16 +92,26 @@ def format_data(data):
    return co, x, None, None


def train(
    model_G,
    criterion_G,
    optimizer_G,
    model_D,
    criterion_D,
    optimizer_D,
    scheduler_G,
    scheduler_D,
    ap,
    global_step,
    epoch,
):
    data_loader = setup_loader(ap, is_val=False, verbose=(epoch == 0))
    model_G.train()
    model_D.train()
    epoch_time = 0
    keep_avg = KeepAverage()
    if use_cuda:
        batch_n_iter = int(len(data_loader.dataset) / (c.batch_size * num_gpus))
    else:
        batch_n_iter = int(len(data_loader.dataset) / c.batch_size)
    end_time = time.time()
@@ -145,16 +167,16 @@ def train(model_G, criterion_G, optimizer_G, model_D, criterion_D, optimizer_D,
                scores_fake = D_out_fake

        # compute losses
        loss_G_dict = criterion_G(
            y_hat, y_G, scores_fake, feats_fake, feats_real, y_hat_sub, y_G_sub
        )
        loss_G = loss_G_dict["G_loss"]

        # optimizer generator
        optimizer_G.zero_grad()
        loss_G.backward()
        if c.gen_clip_grad > 0:
            torch.nn.utils.clip_grad_norm_(model_G.parameters(), c.gen_clip_grad)
        optimizer_G.step()
        if scheduler_G is not None:
            scheduler_G.step()
@@ -199,14 +221,13 @@ def train(model_G, criterion_G, optimizer_G, model_D, criterion_D, optimizer_D,
            # compute losses
            loss_D_dict = criterion_D(scores_fake, scores_real)
            loss_D = loss_D_dict["D_loss"]

            # optimizer discriminator
            optimizer_D.zero_grad()
            loss_D.backward()
            if c.disc_clip_grad > 0:
                torch.nn.utils.clip_grad_norm_(model_D.parameters(), c.disc_clip_grad)
            optimizer_D.step()
            if scheduler_D is not None:
                scheduler_D.step()
@@ -221,34 +242,40 @@ def train(model_G, criterion_G, optimizer_G, model_D, criterion_D, optimizer_D,
        epoch_time += step_time

        # get current learning rates
        current_lr_G = list(optimizer_G.param_groups)[0]["lr"]
        current_lr_D = list(optimizer_D.param_groups)[0]["lr"]

        # update avg stats
        update_train_values = dict()
        for key, value in loss_dict.items():
            update_train_values["avg_" + key] = value
        update_train_values["avg_loader_time"] = loader_time
        update_train_values["avg_step_time"] = step_time
        keep_avg.update_values(update_train_values)

        # print training stats
        if global_step % c.print_step == 0:
            log_dict = {
                "step_time": [step_time, 2],
                "loader_time": [loader_time, 4],
                "current_lr_G": current_lr_G,
                "current_lr_D": current_lr_D,
            }
            c_logger.print_train_step(
                batch_n_iter,
                num_iter,
                global_step,
                log_dict,
                loss_dict,
                keep_avg.avg_values,
            )

        # plot step stats
        if global_step % 10 == 0:
            iter_stats = {
                "lr_G": current_lr_G,
                "lr_D": current_lr_D,
                "step_time": step_time,
            }
            iter_stats.update(loss_dict)
            tb_logger.tb_train_iter_stats(global_step, iter_stats)
@@ -257,27 +284,28 @@ def train(model_G, criterion_G, optimizer_G, model_D, criterion_D, optimizer_D,
        if global_step % c.save_step == 0:
            if c.checkpoint:
                # save model
                save_checkpoint(
                    model_G,
                    optimizer_G,
                    scheduler_G,
                    model_D,
                    optimizer_D,
                    scheduler_D,
                    global_step,
                    epoch,
                    OUT_PATH,
                    model_losses=loss_dict,
                )

            # compute spectrograms
            figures = plot_results(y_hat_vis, y_G, ap, global_step, "train")
            tb_logger.tb_train_figures(global_step, figures)

            # Sample audio
            sample_voice = y_hat_vis[0].squeeze(0).detach().cpu().numpy()
            tb_logger.tb_train_audios(
                global_step, {"train/audio": sample_voice}, c.audio["sample_rate"]
            )

        end_time = time.time()

    # print epoch stats
@@ -326,7 +354,6 @@ def evaluate(model_G, criterion_G, model_D, criterion_D, ap, global_step, epoch)
            y_hat = model_G.pqmf_synthesis(y_hat)
            y_G_sub = model_G.pqmf_analysis(y_G)

        scores_fake, feats_fake, feats_real = None, None, None
        if global_step > c.steps_to_start_discriminator:
@@ -352,8 +379,9 @@ def evaluate(model_G, criterion_G, model_D, criterion_D, ap, global_step, epoch)
                feats_fake, feats_real = None, None

        # compute losses
        loss_G_dict = criterion_G(
            y_hat, y_G, scores_fake, feats_fake, feats_real, y_hat_sub, y_G_sub
        )

        loss_dict = dict()
        for key, value in loss_G_dict.items():
@@ -403,16 +431,15 @@ def evaluate(model_G, criterion_G, model_D, criterion_D, ap, global_step, epoch)
            else:
                loss_dict[key] = value.item()

        step_time = time.time() - start_time
        epoch_time += step_time

        # update avg stats
        update_eval_values = dict()
        for key, value in loss_dict.items():
            update_eval_values["avg_" + key] = value
        update_eval_values["avg_loader_time"] = loader_time
        update_eval_values["avg_step_time"] = step_time
        keep_avg.update_values(update_eval_values)

        # print eval stats
@@ -420,13 +447,14 @@ def evaluate(model_G, criterion_G, model_D, criterion_D, ap, global_step, epoch)
            c_logger.print_eval_step(num_iter, loss_dict, keep_avg.avg_values)

    # compute spectrograms
    figures = plot_results(y_hat, y_G, ap, global_step, "eval")
    tb_logger.tb_eval_figures(global_step, figures)

    # Sample audio
    sample_voice = y_hat[0].squeeze(0).detach().cpu().numpy()
    tb_logger.tb_eval_audios(
        global_step, {"eval/audio": sample_voice}, c.audio["sample_rate"]
    )

    # synthesize a full voice
    data_loader.return_segments = False
@@ -443,7 +471,9 @@ def main(args):  # pylint: disable=redefined-outer-name
    print(f" > Loading wavs from: {c.data_path}")
    if c.feature_path is not None:
        print(f" > Loading features from: {c.feature_path}")
        eval_data, train_data = load_wav_feat_data(
            c.data_path, c.feature_path, c.eval_split_size
        )
    else:
        eval_data, train_data = load_wav_data(c.data_path, c.eval_split_size)
@@ -461,17 +491,15 @@ def main(args):  # pylint: disable=redefined-outer-name
    # setup optimizers
    optimizer_gen = RAdam(model_gen.parameters(), lr=c.lr_gen, weight_decay=0)
    optimizer_disc = RAdam(model_disc.parameters(), lr=c.lr_disc, weight_decay=0)

    # schedulers
    scheduler_gen = None
    scheduler_disc = None
    if "lr_scheduler_gen" in c:
        scheduler_gen = getattr(torch.optim.lr_scheduler, c.lr_scheduler_gen)
        scheduler_gen = scheduler_gen(optimizer_gen, **c.lr_scheduler_gen_params)
    if "lr_scheduler_disc" in c:
        scheduler_disc = getattr(torch.optim.lr_scheduler, c.lr_scheduler_disc)
        scheduler_disc = scheduler_disc(optimizer_disc, **c.lr_scheduler_disc_params)
@@ -480,47 +508,46 @@ def main(args):  # pylint: disable=redefined-outer-name
    criterion_disc = DiscriminatorLoss(c)

    if args.restore_path:
        checkpoint = torch.load(args.restore_path, map_location="cpu")
        try:
            print(" > Restoring Generator Model...")
            model_gen.load_state_dict(checkpoint["model"])
            print(" > Restoring Generator Optimizer...")
            optimizer_gen.load_state_dict(checkpoint["optimizer"])
            print(" > Restoring Discriminator Model...")
            model_disc.load_state_dict(checkpoint["model_disc"])
            print(" > Restoring Discriminator Optimizer...")
            optimizer_disc.load_state_dict(checkpoint["optimizer_disc"])
            if "scheduler" in checkpoint:
                print(" > Restoring Generator LR Scheduler...")
                scheduler_gen.load_state_dict(checkpoint["scheduler"])
                # NOTE: Not sure if necessary
                scheduler_gen.optimizer = optimizer_gen
            if "scheduler_disc" in checkpoint:
                print(" > Restoring Discriminator LR Scheduler...")
                scheduler_disc.load_state_dict(checkpoint["scheduler_disc"])
                scheduler_disc.optimizer = optimizer_disc
        except RuntimeError:
            # restore only matching layers.
            print(" > Partial model initialization...")
            model_dict = model_gen.state_dict()
            model_dict = set_init_dict(model_dict, checkpoint["model"], c)
            model_gen.load_state_dict(model_dict)

            model_dict = model_disc.state_dict()
            model_dict = set_init_dict(model_dict, checkpoint["model_disc"], c)
            model_disc.load_state_dict(model_dict)
            del model_dict

        # reset lr if not continuing training.
        for group in optimizer_gen.param_groups:
            group["lr"] = c.lr_gen
        for group in optimizer_disc.param_groups:
            group["lr"] = c.lr_disc

        print(" > Model restored from step %d" % checkpoint["step"], flush=True)
        args.restore_step = checkpoint["step"]
    else:
        args.restore_step = 0
@@ -539,75 +566,92 @@ def main(args):  # pylint: disable=redefined-outer-name
    num_params = count_parameters(model_disc)
    print(" > Discriminator has {} parameters".format(num_params), flush=True)

    if "best_loss" not in locals():
        best_loss = float("inf")

    global_step = args.restore_step
    for epoch in range(0, c.epochs):
        c_logger.print_epoch_start(epoch, c.epochs)
        _, global_step = train(
            model_gen,
            criterion_gen,
            optimizer_gen,
            model_disc,
            criterion_disc,
            optimizer_disc,
            scheduler_gen,
            scheduler_disc,
            ap,
            global_step,
            epoch,
        )
        eval_avg_loss_dict = evaluate(
            model_gen, criterion_gen, model_disc, criterion_disc, ap, global_step, epoch
        )
        c_logger.print_epoch_end(epoch, eval_avg_loss_dict)
        target_loss = eval_avg_loss_dict[c.target_loss]
        best_loss = save_best_model(
            target_loss,
            best_loss,
            model_gen,
            optimizer_gen,
            scheduler_gen,
            model_disc,
            optimizer_disc,
            scheduler_disc,
            global_step,
            epoch,
            OUT_PATH,
            model_losses=eval_avg_loss_dict,
        )


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--continue_path",
        type=str,
        help='Training output folder to continue training. Use to continue a training. If it is used, "config_path" is ignored.',
        default="",
        required="--config_path" not in sys.argv,
    )
    parser.add_argument(
        "--restore_path",
        type=str,
        help="Model file to be restored. Use to finetune a model.",
        default="",
    )
    parser.add_argument(
        "--config_path",
        type=str,
        help="Path to config file for training.",
        required="--continue_path" not in sys.argv,
    )
    parser.add_argument(
        "--debug",
        type=bool,
        default=False,
        help="Do not verify commit integrity to run training.",
    )

    # DISTRIBUTED
    parser.add_argument(
        "--rank",
        type=int,
        default=0,
        help="DISTRIBUTED: process rank for distributed training.",
    )
    parser.add_argument(
        "--group_id", type=str, default="", help="DISTRIBUTED: process group id."
    )
    args = parser.parse_args()

    if args.continue_path != "":
        args.output_path = args.continue_path
        args.config_path = os.path.join(args.continue_path, "config.json")
        list_of_files = glob.glob(
            args.continue_path + "/*.pth.tar"
        )  # * means all if need specific format then *.csv
        latest_model_file = max(list_of_files, key=os.path.getctime)
        args.restore_path = latest_model_file
        print(f" > Training continues for {args.restore_path}")
@@ -618,11 +662,10 @@ if __name__ == '__main__':
    _ = os.path.dirname(os.path.realpath(__file__))

    OUT_PATH = args.continue_path
    if args.continue_path == "":
        OUT_PATH = create_experiment_folder(c.output_path, c.run_name, args.debug)

    AUDIO_PATH = os.path.join(OUT_PATH, "test_audios")

    c_logger = ConsoleLogger()
@@ -632,16 +675,17 @@ if __name__ == '__main__':
        if args.restore_path:
            new_fields["restore_path"] = args.restore_path
        new_fields["github_branch"] = get_git_branch()
        copy_config_file(
            args.config_path, os.path.join(OUT_PATH, "config.json"), new_fields
        )
        os.chmod(AUDIO_PATH, 0o775)
        os.chmod(OUT_PATH, 0o775)

    LOG_DIR = OUT_PATH
    tb_logger = TensorboardLogger(LOG_DIR, model_name="VOCODER")

    # write model desc to tensorboard
    tb_logger.tb_add_text("model-description", c["run_description"], 0)

    try:
        main(args)

View File

@@ -0,0 +1,493 @@
import argparse
import math
import os
import pickle
import shutil
import sys
import traceback
import time
import glob
import random

import torch
from torch import optim
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler

from TTS.utils.audio import AudioProcessor
from TTS.tts.utils.visual import plot_spectrogram
from TTS.utils.io import copy_config_file, load_config
from TTS.vocoder.datasets.wavernn_dataset import WaveRNNDataset
from TTS.utils.tensorboard_logger import TensorboardLogger
from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data
from TTS.vocoder.utils.distribution import discretized_mix_logistic_loss, gaussian_loss
from TTS.vocoder.utils.generic_utils import setup_wavernn
from TTS.utils.training import setup_torch_training_env
from TTS.utils.console_logger import ConsoleLogger
from TTS.utils.generic_utils import (
    KeepAverage,
    count_parameters,
    create_experiment_folder,
    get_git_branch,
    remove_experiment_folder,
    set_init_dict,
)
from TTS.vocoder.utils.io import save_best_model, save_checkpoint


use_cuda, num_gpus = setup_torch_training_env(True, True)
def setup_loader(ap, is_val=False, verbose=False):
    if is_val and not CONFIG.run_eval:
        loader = None
    else:
        dataset = WaveRNNDataset(
            ap=ap,
            items=eval_data if is_val else train_data,
            seq_len=CONFIG.seq_len,
            hop_len=ap.hop_length,
            pad=CONFIG.padding,
            mode=CONFIG.mode,
            is_training=not is_val,
            verbose=verbose,
        )
        # sampler = DistributedSampler(dataset) if num_gpus > 1 else None
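        # NOTE: unlike the GAN loader above, the same batch_size is used for
        # training and validation; only the worker count differs.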
        loader = DataLoader(
            dataset,
            shuffle=True,
            collate_fn=dataset.collate,
            batch_size=CONFIG.batch_size,
            num_workers=CONFIG.num_val_loader_workers
            if is_val
            else CONFIG.num_loader_workers,
            pin_memory=True,
        )
    return loader


def format_data(data):
    # setup input data
    x = data[0]
    m = data[1]
    y = data[2]

    # dispatch data to GPU
    if use_cuda:
        x = x.cuda(non_blocking=True)
        m = m.cuda(non_blocking=True)
        y = y.cuda(non_blocking=True)

    return x, m, y


def train(model, optimizer, criterion, scheduler, ap, global_step, epoch):
    # create train loader
    data_loader = setup_loader(ap, is_val=False, verbose=(epoch == 0))
    model.train()
    epoch_time = 0
    keep_avg = KeepAverage()
    if use_cuda:
        batch_n_iter = int(len(data_loader.dataset) / (CONFIG.batch_size * num_gpus))
    else:
        batch_n_iter = int(len(data_loader.dataset) / CONFIG.batch_size)
    end_time = time.time()
    c_logger.print_train_start()
    # train loop
    print(" > Training", flush=True)
    for num_iter, data in enumerate(data_loader):
        start_time = time.time()
        x, m, y = format_data(data)
        loader_time = time.time() - end_time
        global_step += 1

        ##################
        # MODEL TRAINING #
        ##################
        y_hat = model(x, m)
        y_hat_vis = y_hat  # for visualization

        # y_hat = y_hat.transpose(1, 2)
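        # "mode" selects the output head: an int means discrete bit classes
        # trained with CrossEntropyLoss, while "mold" / "gauss" predict
        # continuous distribution parameters (see criterion setup in main()).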
        if isinstance(model.mode, int):
            y_hat = y_hat.transpose(1, 2).unsqueeze(-1)
        else:
            y = y.float()
        y = y.unsqueeze(-1)

        # m_scaled, _ = model.upsample(m)

        # compute losses
        loss = criterion(y_hat, y)
        if torch.isnan(loss):
            raise RuntimeError(" [!] NaN loss. Exiting ...")
        optimizer.zero_grad()
        loss.backward()
        if CONFIG.grad_clip > 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), CONFIG.grad_clip)
        optimizer.step()

        if scheduler is not None:
            scheduler.step()

        # get the current learning rate
        cur_lr = list(optimizer.param_groups)[0]["lr"]

        step_time = time.time() - start_time
        epoch_time += step_time

        update_train_values = dict()
        loss_dict = dict()
        loss_dict["model_loss"] = loss.item()
        for key, value in loss_dict.items():
            update_train_values["avg_" + key] = value
        update_train_values["avg_loader_time"] = loader_time
        update_train_values["avg_step_time"] = step_time
        keep_avg.update_values(update_train_values)

        # print training stats
        if global_step % CONFIG.print_step == 0:
            log_dict = {
                "step_time": [step_time, 2],
                "loader_time": [loader_time, 4],
                "current_lr": cur_lr,
            }
            c_logger.print_train_step(
                batch_n_iter,
                num_iter,
                global_step,
                log_dict,
                loss_dict,
                keep_avg.avg_values,
            )

        # plot step stats
        if global_step % 10 == 0:
            iter_stats = {"lr": cur_lr, "step_time": step_time}
            iter_stats.update(loss_dict)
            tb_logger.tb_train_iter_stats(global_step, iter_stats)
        # save checkpoint
        if global_step % CONFIG.save_step == 0:
            if CONFIG.checkpoint:
                # save model
                save_checkpoint(
                    model,
                    optimizer,
                    scheduler,
                    None,
                    None,
                    None,
                    global_step,
                    epoch,
                    OUT_PATH,
                    model_losses=loss_dict,
                )

            # synthesize a full voice
            wav_path = train_data[random.randrange(0, len(train_data))][0]
            wav = ap.load_wav(wav_path)
            ground_mel = ap.melspectrogram(wav)
            sample_wav = model.generate(
                ground_mel,
                CONFIG.batched,
                CONFIG.target_samples,
                CONFIG.overlap_samples,
            )
            predict_mel = ap.melspectrogram(sample_wav)

            # Sample audio
            tb_logger.tb_train_audios(
                global_step, {"train/audio": sample_wav}, CONFIG.audio["sample_rate"]
            )

            # compute spectrograms
            figures = {
                "prediction": plot_spectrogram(predict_mel.T, ap, output_fig=False),
                "ground_truth": plot_spectrogram(ground_mel.T, ap, output_fig=False),
            }
            tb_logger.tb_train_figures(global_step, figures)
        end_time = time.time()

    # print epoch stats
    c_logger.print_train_epoch_end(global_step, epoch, epoch_time, keep_avg)

    # Plot Training Epoch Stats
    epoch_stats = {"epoch_time": epoch_time}
    epoch_stats.update(keep_avg.avg_values)
    tb_logger.tb_train_epoch_stats(global_step, epoch_stats)
    # TODO: plot model stats
    # if c.tb_model_param_stats:
    #     tb_logger.tb_model_weights(model, global_step)
    return keep_avg.avg_values, global_step


@torch.no_grad()
def evaluate(model, criterion, ap, global_step, epoch):
    # create eval loader
    data_loader = setup_loader(ap, is_val=True, verbose=(epoch == 0))
    model.eval()
    epoch_time = 0
    keep_avg = KeepAverage()
    end_time = time.time()
    c_logger.print_eval_start()
    with torch.no_grad():
        for num_iter, data in enumerate(data_loader):
            start_time = time.time()
            # format data
            x, m, y = format_data(data)
            loader_time = time.time() - end_time
            global_step += 1

            y_hat = model(x, m)
            if isinstance(model.mode, int):
                y_hat = y_hat.transpose(1, 2).unsqueeze(-1)
            else:
                y = y.float()
            y = y.unsqueeze(-1)
            loss = criterion(y_hat, y)

            # Compute avg loss
            # if num_gpus > 1:
            #     loss = reduce_tensor(loss.data, num_gpus)
            loss_dict = dict()
            loss_dict["model_loss"] = loss.item()

            step_time = time.time() - start_time
            epoch_time += step_time

            # update avg stats
            update_eval_values = dict()
            for key, value in loss_dict.items():
                update_eval_values["avg_" + key] = value
            update_eval_values["avg_loader_time"] = loader_time
            update_eval_values["avg_step_time"] = step_time
            keep_avg.update_values(update_eval_values)

            # print eval stats
            if CONFIG.print_eval:
                c_logger.print_eval_step(num_iter, loss_dict, keep_avg.avg_values)

        if epoch > CONFIG.test_delay_epochs:
            # synthesize a full voice
            wav_path = eval_data[random.randrange(0, len(eval_data))][0]
            wav = ap.load_wav(wav_path)
            ground_mel = ap.melspectrogram(wav)
            sample_wav = model.generate(
                ground_mel,
                CONFIG.batched,
                CONFIG.target_samples,
                CONFIG.overlap_samples,
            )
            predict_mel = ap.melspectrogram(sample_wav)

            # Sample audio
            tb_logger.tb_eval_audios(
                global_step, {"eval/audio": sample_wav}, CONFIG.audio["sample_rate"]
            )

            # compute spectrograms
            figures = {
                "prediction": plot_spectrogram(predict_mel.T, ap, output_fig=False),
                "ground_truth": plot_spectrogram(ground_mel.T, ap, output_fig=False),
            }
            tb_logger.tb_eval_figures(global_step, figures)

    tb_logger.tb_eval_stats(global_step, keep_avg.avg_values)
    return keep_avg.avg_values


# FIXME: move args definition/parsing inside of main?
def main(args):  # pylint: disable=redefined-outer-name
    # pylint: disable=global-variable-undefined
    global train_data, eval_data

    print(f" > Loading wavs from: {CONFIG.data_path}")
    if CONFIG.feature_path is not None:
        print(f" > Loading features from: {CONFIG.feature_path}")
        eval_data, train_data = load_wav_feat_data(
            CONFIG.data_path, CONFIG.feature_path, CONFIG.eval_split_size
        )
    else:
        eval_data, train_data = load_wav_data(CONFIG.data_path, CONFIG.eval_split_size)

    # setup audio processor
    ap = AudioProcessor(**CONFIG.audio)

    # setup model
    model_wavernn = setup_wavernn(CONFIG)

    # define train functions
    if CONFIG.mode == "mold":
        criterion = discretized_mix_logistic_loss
    elif CONFIG.mode == "gauss":
        criterion = gaussian_loss
    elif isinstance(CONFIG.mode, int):
        criterion = torch.nn.CrossEntropyLoss()
    else:
        raise ValueError(" [!] Unknown model mode: {}".format(CONFIG.mode))

    if use_cuda:
        model_wavernn.cuda()
        if isinstance(CONFIG.mode, int):
            criterion.cuda()

    optimizer = optim.Adam(model_wavernn.parameters(), lr=CONFIG.lr, weight_decay=0)

    scheduler = None
    if "lr_scheduler" in CONFIG:
        scheduler = getattr(torch.optim.lr_scheduler, CONFIG.lr_scheduler)
        scheduler = scheduler(optimizer, **CONFIG.lr_scheduler_params)
    # slow start for the first 5 epochs
    # lr_lambda = lambda epoch: min(epoch / CONFIG.warmup_steps, 1)
    # scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)

    # restore any checkpoint
    if args.restore_path:
        checkpoint = torch.load(args.restore_path, map_location="cpu")
        try:
            print(" > Restoring Model...")
            model_wavernn.load_state_dict(checkpoint["model"])
            print(" > Restoring Optimizer...")
            optimizer.load_state_dict(checkpoint["optimizer"])
            if "scheduler" in checkpoint:
                print(" > Restoring LR Scheduler...")
                scheduler.load_state_dict(checkpoint["scheduler"])
                scheduler.optimizer = optimizer
            # TODO: fix resetting restored optimizer lr
            # optimizer.load_state_dict(checkpoint["optimizer"])
        except RuntimeError:
            # restore only matching layers.
            print(" > Partial model initialization...")
            model_dict = model_wavernn.state_dict()
            model_dict = set_init_dict(model_dict, checkpoint["model"], CONFIG)
            model_wavernn.load_state_dict(model_dict)

        print(" > Model restored from step %d" % checkpoint["step"], flush=True)
        args.restore_step = checkpoint["step"]
    else:
        args.restore_step = 0

    # DISTRIBUTED
    # if num_gpus > 1:
    #     model = apply_gradient_allreduce(model)

    num_parameters = count_parameters(model_wavernn)
    print(" > Model has {} parameters".format(num_parameters), flush=True)

    if "best_loss" not in locals():
        best_loss = float("inf")

    global_step = args.restore_step
    for epoch in range(0, CONFIG.epochs):
        c_logger.print_epoch_start(epoch, CONFIG.epochs)
        _, global_step = train(
            model_wavernn, optimizer, criterion, scheduler, ap, global_step, epoch
        )
        eval_avg_loss_dict = evaluate(model_wavernn, criterion, ap, global_step, epoch)
        c_logger.print_epoch_end(epoch, eval_avg_loss_dict)
        target_loss = eval_avg_loss_dict["avg_model_loss"]
        best_loss = save_best_model(
            target_loss,
            best_loss,
            model_wavernn,
            optimizer,
            scheduler,
            None,
            None,
            None,
            global_step,
            epoch,
            OUT_PATH,
            model_losses=eval_avg_loss_dict,
        )
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--continue_path",
type=str,
help='Training output folder to continue training. Use to continue a training. If it is used, "config_path" is ignored.',
default="",
required="--config_path" not in sys.argv,
)
parser.add_argument(
"--restore_path",
type=str,
help="Model file to be restored. Use to finetune a model.",
default="",
)
parser.add_argument(
"--config_path",
type=str,
help="Path to config file for training.",
required="--continue_path" not in sys.argv,
)
parser.add_argument(
"--debug",
type=bool,
default=False,
help="Do not verify commit integrity to run training.",
)
# DISTRUBUTED
parser.add_argument(
"--rank",
type=int,
default=0,
help="DISTRIBUTED: process rank for distributed training.",
)
parser.add_argument(
"--group_id", type=str, default="", help="DISTRIBUTED: process group id."
)
args = parser.parse_args()
if args.continue_path != "":
args.output_path = args.continue_path
args.config_path = os.path.join(args.continue_path, "config.json")
list_of_files = glob.glob(
args.continue_path + "/*.pth.tar"
) # * means all if need specific format then *.csv
latest_model_file = max(list_of_files, key=os.path.getctime)
args.restore_path = latest_model_file
print(f" > Training continues for {args.restore_path}")

    # setup output paths and read configs
    CONFIG = load_config(args.config_path)
    # check_config(c)
    _ = os.path.dirname(os.path.realpath(__file__))

    OUT_PATH = args.continue_path
    if args.continue_path == "":
        OUT_PATH = create_experiment_folder(
            CONFIG.output_path, CONFIG.run_name, args.debug
        )

    AUDIO_PATH = os.path.join(OUT_PATH, "test_audios")

    c_logger = ConsoleLogger()

    if args.rank == 0:
        os.makedirs(AUDIO_PATH, exist_ok=True)
        new_fields = {}
        if args.restore_path:
            new_fields["restore_path"] = args.restore_path
        new_fields["github_branch"] = get_git_branch()
        copy_config_file(
            args.config_path, os.path.join(OUT_PATH, "config.json"), new_fields
        )
        os.chmod(AUDIO_PATH, 0o775)
        os.chmod(OUT_PATH, 0o775)

    LOG_DIR = OUT_PATH
    tb_logger = TensorboardLogger(LOG_DIR, model_name="VOCODER")

    # write model desc to tensorboard
    tb_logger.tb_add_text("model-description", CONFIG["run_description"], 0)

    try:
        main(args)
    except KeyboardInterrupt:
        remove_experiment_folder(OUT_PATH)
        try:
            sys.exit(0)
        except SystemExit:
            os._exit(0)  # pylint: disable=protected-access
    except Exception:  # pylint: disable=broad-except
        remove_experiment_folder(OUT_PATH)
        traceback.print_exc()
        sys.exit(1)
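The generate() call used for the Tensorboard samples above doubles as an offline inference path. A hedged sketch of running a trained checkpoint standalone (checkpoint, config, and wav file names are assumptions, not from this commit; generate() arguments as used above):

import torch
from TTS.utils.audio import AudioProcessor
from TTS.utils.io import load_config
from TTS.vocoder.utils.generic_utils import setup_wavernn

CONFIG = load_config("config.json")  # assumed path
ap = AudioProcessor(**CONFIG.audio)
model = setup_wavernn(CONFIG)
model.load_state_dict(torch.load("best_model.pth.tar", map_location="cpu")["model"])
model.eval()

# condition on a ground-truth mel and vocode it back to a waveform
mel = ap.melspectrogram(ap.load_wav("sample.wav"))
wav = model.generate(mel, CONFIG.batched, CONFIG.target_samples, CONFIG.overlap_samples)
ap.save_wav(wav, "generated.wav")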

View File

@@ -0,0 +1,95 @@
{
    "model": "wavernn",
    "run_name": "wavernn_test",
    "run_description": "wavernn_test training",

    // AUDIO PARAMETERS
    "audio": {
        "fft_size": 1024,         // number of stft frequency levels. Size of the linear spectrogram frame.
        "win_length": 1024,       // stft window length in samples.
        "hop_length": 256,        // stft window hop-length in samples.
        "frame_length_ms": null,  // stft window length in ms. If null, 'win_length' is used.
        "frame_shift_ms": null,   // stft window hop-length in ms. If null, 'hop_length' is used.

        // Audio processing parameters
        "sample_rate": 22050,     // DATASET-RELATED: wav sample rate. If different than the original data, it is resampled.
        "preemphasis": 0.98,      // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no pre-emphasis.
        "ref_level_db": 20,       // reference level db, theoretically 20 db is the sound of air.

        // Silence trimming
        "do_trim_silence": false, // enable trimming of silence of audio as you load it. LJSpeech (false), TWEB (false), Nancy (true)
        "trim_db": 60,            // threshold for trimming silence. Set this according to your dataset.

        // MelSpectrogram parameters
        "num_mels": 80,           // size of the mel spec frame.
        "mel_fmin": 40.0,         // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
        "mel_fmax": 8000.0,       // maximum freq level for mel-spec. Tune for dataset!!
        "spec_gain": 20.0,        // scaler value applied after log transform of spectrogram.

        // Normalization parameters
        "signal_norm": true,      // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params.
        "min_level_db": -100,     // lower bound for normalization
        "symmetric_norm": true,   // move normalization to range [-1, 1]
        "max_norm": 4.0,          // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
        "clip_norm": true,        // clip normalized values into the range.
        "stats_path": null        // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based normalization is used and other normalization params are ignored.
    },

    // Generating / Synthesizing
    "batched": true,
    "target_samples": 11000,      // target number of samples to be generated in each batch entry
    "overlap_samples": 550,       // number of samples for crossfading between batches

    // DISTRIBUTED TRAINING
    // "distributed": {
    //     "backend": "nccl",
    //     "url": "tcp:\/\/localhost:54321"
    // },

    // MODEL PARAMETERS
    "use_aux_net": true,
    "use_upsample_net": true,
    "upsample_factors": [4, 8, 8], // this needs to correctly factorise hop_length
    "seq_len": 1280,              // has to be divisible by hop_length
    "mode": "mold",               // mold [string], gauss [string], bits [int]
    "mulaw": false,               // apply mulaw if mode is bits
    "padding": 2,                 // pad the input for resnet to see wider input length

    // DATASET
    "data_path": "/media/alexander/LinuxFS/SpeechData/GothicSpeech/NPC_Speech/",                 // path containing training wav files
    "feature_path": "/media/alexander/LinuxFS/SpeechData/GothicSpeech/NPC_Speech_Computed/mel/", // path containing extracted features .npy (mels / quant)

    // TRAINING
    "batch_size": 32,             // Batch size for training. Lower values than 32 might cause hard to learn attention.
    "epochs": 10000,              // total number of epochs to train.
    "warmup_steps": 10,

    // VALIDATION
    "run_eval": true,
    "test_delay_epochs": 10,      // early testing only wastes computation time.

    // OPTIMIZER
    "grad_clip": 4,               // apply gradient clipping if > 0
    "lr_scheduler": "MultiStepLR", // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
    "lr_scheduler_params": {
        "gamma": 0.5,
        "milestones": [200000, 400000, 600000]
    },
    "lr": 1e-4,                   // initial learning rate

    // TENSORBOARD and LOGGING
    "print_step": 25,             // Number of steps to log training on console.
    "print_eval": false,          // If true, it prints loss values for each step in eval run.
    "save_step": 25000,           // Number of training steps expected to plot training stats on TB and save model checkpoints.
    "checkpoint": true,           // If true, it saves checkpoints per "save_step".
    "tb_model_param_stats": false, // If true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.

    // DATA LOADING
    "num_loader_workers": 4,      // number of training data loader processes. Don't set it too big. 4-8 are good values.
    "num_val_loader_workers": 4,  // number of evaluation data loader processes.
    "eval_split_size": 50,        // number of samples for testing

    // PATHS
    "output_path": "/media/alexander/LinuxFS/Projects/wavernn/Trainings/"
}
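Two of the comments above are hard constraints rather than hints: the product of upsample_factors must equal hop_length, and seq_len must be divisible by hop_length. A quick standalone check with the values copied from this config:

import functools
import operator

hop_length = 256
upsample_factors = [4, 8, 8]
seq_len = 1280

assert functools.reduce(operator.mul, upsample_factors) == hop_length  # 4 * 8 * 8 == 256
assert seq_len % hop_length == 0  # 1280 / 256 == 5 mel frames per training window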

View File

@@ -23,8 +23,12 @@ def load_wav_data(data_path, eval_split_size):
def load_wav_feat_data(data_path, feat_path, eval_split_size):
    wav_paths = find_wav_files(data_path)
    feat_paths = find_feat_files(feat_path)

    wav_paths.sort(key=lambda x: Path(x).stem)
    feat_paths.sort(key=lambda x: Path(x).stem)

    assert len(wav_paths) == len(feat_paths)
    for wav, feat in zip(wav_paths, feat_paths):
        wav_name = Path(wav).stem
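The change above replaces plain sorted() on full paths with sorting by file stem: when wavs and features live in different directory layouts, lexicographic order of the full paths can differ between the two lists and silently mis-pair them. An illustration with hypothetical paths:

from pathlib import Path

wavs = ["wavs/b/0002.wav", "wavs/a/0001.wav"]
feats = ["mel/0001.npy", "mel/0002.npy"]
wavs.sort(key=lambda x: Path(x).stem)
feats.sort(key=lambda x: Path(x).stem)
assert [Path(w).stem for w in wavs] == [Path(f).stem for f in feats]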

View File

@@ -41,6 +41,26 @@ def to_camel(text):
    text = text.capitalize()
    return re.sub(r'(?!^)_([a-zA-Z])', lambda m: m.group(1).upper(), text)


def setup_wavernn(c):
    print(" > Model: {}".format(c.model))
    MyModel = importlib.import_module('TTS.vocoder.models.wavernn')
    MyModel = getattr(MyModel, "WaveRNN")
    model = MyModel(
        rnn_dims=512,
        fc_dims=512,
        mode=c.mode,
        mulaw=c.mulaw,
        pad=c.padding,
        use_aux_net=c.use_aux_net,
        use_upsample_net=c.use_upsample_net,
        upsample_factors=c.upsample_factors,
        feat_dims=80,
        compute_dims=128,
        res_out_dims=128,
        res_blocks=10,
        hop_length=c.audio['hop_length'],
        sample_rate=c.audio['sample_rate'],
    )
    return model


def setup_generator(c):
    print(" > Generator Model: {}".format(c.generator_model))