diff --git a/TTS/bin/train_encoder.py b/TTS/bin/train_encoder.py
index 95030309..8d1f14fa 100644
--- a/TTS/bin/train_encoder.py
+++ b/TTS/bin/train_encoder.py
@@ -9,20 +9,19 @@ import traceback
 
 import torch
 from torch.utils.data import DataLoader
-
 from TTS.speaker_encoder.dataset import MyDataset
-from TTS.speaker_encoder.losses import GE2ELoss, AngleProtoLoss
+from TTS.speaker_encoder.losses import AngleProtoLoss, GE2ELoss
 from TTS.speaker_encoder.model import SpeakerEncoder
-from TTS.speaker_encoder.utils import check_config_speaker_encoder
-from TTS.speaker_encoder.visuals import plot_embeddings
+from TTS.speaker_encoder.utils.generic_utils import \
+    check_config_speaker_encoder
+from TTS.speaker_encoder.utils.visual import plot_embeddings
 from TTS.tts.datasets.preprocess import load_meta_data
 from TTS.tts.utils.io import save_best_model
-from TTS.utils.generic_utils import (
-    create_experiment_folder, get_git_branch, remove_experiment_folder,
-    set_init_dict)
-from TTS.utils.io import copy_config_file, load_config
 from TTS.utils.audio import AudioProcessor
-from TTS.utils.generic_utils import count_parameters
+from TTS.utils.generic_utils import (count_parameters,
+                                     create_experiment_folder, get_git_branch,
+                                     remove_experiment_folder, set_init_dict)
+from TTS.utils.io import copy_config_file, load_config
 from TTS.utils.radam import RAdam
 from TTS.utils.tensorboard_logger import TensorboardLogger
 from TTS.utils.training import NoamLR, check_update
diff --git a/TTS/speaker_encoder/compute_embeddings.py b/TTS/speaker_encoder/compute_embeddings.py
deleted file mode 100644
index c8608755..00000000
--- a/TTS/speaker_encoder/compute_embeddings.py
+++ /dev/null
@@ -1,88 +0,0 @@
-import argparse
-import glob
-import os
-
-import numpy as np
-from tqdm import tqdm
-
-import torch
-from TTS.speaker_encoder.model import SpeakerEncoder
-from TTS.utils.audio import AudioProcessor
-from TTS.utils.io import load_config
-
-parser = argparse.ArgumentParser(
-    description='Compute embedding vectors for each wav file in a dataset. ')
-parser.add_argument(
-    'model_path',
-    type=str,
-    help='Path to model outputs (checkpoint, tensorboard etc.).')
-parser.add_argument(
-    'config_path',
-    type=str,
-    help='Path to config file for training.',
-)
-parser.add_argument(
-    'data_path',
-    type=str,
-    help='Data path for wav files - directory or CSV file')
-parser.add_argument(
-    'output_path',
-    type=str,
-    help='path for training outputs.')
-parser.add_argument(
-    '--use_cuda', type=bool, help='flag to set cuda.', default=False
-)
-parser.add_argument(
-    '--separator', type=str, help='Separator used in file if CSV is passed for data_path', default='|'
-)
-args = parser.parse_args()
-
-
-c = load_config(args.config_path)
-ap = AudioProcessor(**c['audio'])
-
-data_path = args.data_path
-split_ext = os.path.splitext(data_path)
-sep = args.separator
-
-if len(split_ext) > 0 and split_ext[1].lower() == '.csv':
-    # Parse CSV
-    print(f'CSV file: {data_path}')
-    with open(data_path) as f:
-        wav_path = os.path.join(os.path.dirname(data_path), 'wavs')
-        wav_files = []
-        print(f'Separator is: {sep}')
-        for line in f:
-            components = line.split(sep)
-            if len(components) != 2:
-                print("Invalid line")
-                continue
-            wav_file = os.path.join(wav_path, components[0] + '.wav')
-            #print(f'wav_file: {wav_file}')
-            if os.path.exists(wav_file):
-                wav_files.append(wav_file)
-        print(f'Count of wavs imported: {len(wav_files)}')
-else:
-    # Parse all wav files in data_path
-    wav_path = data_path
-    wav_files = glob.glob(data_path + '/**/*.wav', recursive=True)
-
-output_files = [wav_file.replace(wav_path, args.output_path).replace(
-    '.wav', '.npy') for wav_file in wav_files]
-
-for output_file in output_files:
-    os.makedirs(os.path.dirname(output_file), exist_ok=True)
-
-model = SpeakerEncoder(**c.model)
-model.load_state_dict(torch.load(args.model_path)['model'])
-model.eval()
-if args.use_cuda:
-    model.cuda()
-
-for idx, wav_file in enumerate(tqdm(wav_files)):
-    mel_spec = ap.melspectrogram(ap.load_wav(wav_file, sr=ap.sample_rate)).T
-    mel_spec = torch.FloatTensor(mel_spec[None, :, :])
-    if args.use_cuda:
-        mel_spec = mel_spec.cuda()
-    embedd = model.compute_embedding(mel_spec)
-    np.save(output_files[idx], embedd.detach().cpu().numpy())
diff --git a/TTS/speaker_encoder/utils.py b/TTS/speaker_encoder/utils.py
deleted file mode 100644
index 95c222f2..00000000
--- a/TTS/speaker_encoder/utils.py
+++ /dev/null
@@ -1,61 +0,0 @@
-from TTS.utils.generic_utils import check_argument
-
-
-def check_config_speaker_encoder(c):
-    """Check the config.json file of the speaker encoder"""
-    check_argument('run_name', c, restricted=True, val_type=str)
-    check_argument('run_description', c, val_type=str)
-
-    # audio processing parameters
-    check_argument('audio', c, restricted=True, val_type=dict)
-    check_argument('num_mels', c['audio'], restricted=True, val_type=int, min_val=10, max_val=2056)
-    check_argument('fft_size', c['audio'], restricted=True, val_type=int, min_val=128, max_val=4058)
-    check_argument('sample_rate', c['audio'], restricted=True, val_type=int, min_val=512, max_val=100000)
-    check_argument('frame_length_ms', c['audio'], restricted=True, val_type=float, min_val=10, max_val=1000, alternative='win_length')
-    check_argument('frame_shift_ms', c['audio'], restricted=True, val_type=float, min_val=1, max_val=1000, alternative='hop_length')
-    check_argument('preemphasis', c['audio'], restricted=True, val_type=float, min_val=0, max_val=1)
-    check_argument('min_level_db', c['audio'], restricted=True, val_type=int, min_val=-1000, max_val=10)
-    check_argument('ref_level_db', c['audio'], restricted=True, val_type=int, min_val=0, max_val=1000)
-    check_argument('power', c['audio'], restricted=True, val_type=float, min_val=1, max_val=5)
-    check_argument('griffin_lim_iters', c['audio'], restricted=True, val_type=int, min_val=10, max_val=1000)
-
-    # training parameters
-    check_argument('loss', c, enum_list=['ge2e', 'angleproto'], restricted=True, val_type=str)
-    check_argument('grad_clip', c, restricted=True, val_type=float)
-    check_argument('epochs', c, restricted=True, val_type=int, min_val=1)
-    check_argument('lr', c, restricted=True, val_type=float, min_val=0)
-    check_argument('lr_decay', c, restricted=True, val_type=bool)
-    check_argument('warmup_steps', c, restricted=True, val_type=int, min_val=0)
-    check_argument('tb_model_param_stats', c, restricted=True, val_type=bool)
-    check_argument('num_speakers_in_batch', c, restricted=True, val_type=int)
-    check_argument('num_loader_workers', c, restricted=True, val_type=int)
-    check_argument('wd', c, restricted=True, val_type=float, min_val=0.0, max_val=1.0)
-
-    # checkpoint and output parameters
-    check_argument('steps_plot_stats', c, restricted=True, val_type=int)
-    check_argument('checkpoint', c, restricted=True, val_type=bool)
-    check_argument('save_step', c, restricted=True, val_type=int)
-    check_argument('print_step', c, restricted=True, val_type=int)
-    check_argument('output_path', c, restricted=True, val_type=str)
-
-    # model parameters
-    check_argument('model', c, restricted=True, val_type=dict)
-    check_argument('input_dim', c['model'], restricted=True, val_type=int)
-    check_argument('proj_dim', c['model'], restricted=True, val_type=int)
-    check_argument('lstm_dim', c['model'], restricted=True, val_type=int)
-    check_argument('num_lstm_layers', c['model'], restricted=True, val_type=int)
-    check_argument('use_lstm_with_projection', c['model'], restricted=True, val_type=bool)
-
-    # in-memory storage parameters
-    check_argument('storage', c, restricted=True, val_type=dict)
-    check_argument('sample_from_storage_p', c['storage'], restricted=True, val_type=float, min_val=0.0, max_val=1.0)
-    check_argument('storage_size', c['storage'], restricted=True, val_type=int, min_val=1, max_val=100)
-    check_argument('additive_noise', c['storage'], restricted=True, val_type=float, min_val=0.0, max_val=1.0)
-
-    # datasets - checking only the first entry
-    check_argument('datasets', c, restricted=True, val_type=list)
-    for dataset_entry in c['datasets']:
-        check_argument('name', dataset_entry, restricted=True, val_type=str)
-        check_argument('path', dataset_entry, restricted=True, val_type=str)
-        check_argument('meta_file_train', dataset_entry, restricted=True, val_type=[str, list])
-        check_argument('meta_file_val', dataset_entry, restricted=True, val_type=str)
diff --git a/TTS/speaker_encoder/visuals.py b/TTS/speaker_encoder/visuals.py
deleted file mode 100644
index 68c48f12..00000000
--- a/TTS/speaker_encoder/visuals.py
+++ /dev/null
@@ -1,46 +0,0 @@
-import umap
-import numpy as np
-import matplotlib
-import matplotlib.pyplot as plt
-
-matplotlib.use("Agg")
-
-
-colormap = (
-    np.array(
-        [
-            [76, 255, 0],
-            [0, 127, 70],
-            [255, 0, 0],
-            [255, 217, 38],
-            [0, 135, 255],
-            [165, 0, 165],
-            [255, 167, 255],
-            [0, 255, 255],
-            [255, 96, 38],
-            [142, 76, 0],
-            [33, 0, 127],
-            [0, 0, 0],
-            [183, 183, 183],
-        ],
-        dtype=np.float,
-    )
-    / 255
-)
-
-
-def plot_embeddings(embeddings, num_utter_per_speaker):
-    embeddings = embeddings[: 10 * num_utter_per_speaker]
-    model = umap.UMAP()
-    projection = model.fit_transform(embeddings)
-    num_speakers = embeddings.shape[0] // num_utter_per_speaker
-    ground_truth = np.repeat(np.arange(num_speakers), num_utter_per_speaker)
-    colors = [colormap[i] for i in ground_truth]
-
-    fig, ax = plt.subplots(figsize=(16, 10))
-    _ = ax.scatter(projection[:, 0], projection[:, 1], c=colors)
-    plt.gca().set_aspect("equal", "datalim")
-    plt.title("UMAP projection")
-    plt.tight_layout()
-    plt.savefig("umap")
-    return fig
diff --git a/TTS/vocoder/configs/multiband_melgan_config.json b/TTS/vocoder/configs/multiband_melgan_config.json
index a89d43bb..7a5a13e3 100644
--- a/TTS/vocoder/configs/multiband_melgan_config.json
+++ b/TTS/vocoder/configs/multiband_melgan_config.json
@@ -40,12 +40,9 @@
 //    "url": "tcp:\/\/localhost:54321"
 // },
 
-    // MODEL PARAMETERS
-    "use_pqmf": true,
-
     // LOSS PARAMETERS
     "use_stft_loss": true,
-    "use_subband_stft_loss": true,
+    "use_subband_stft_loss": true,  // use only with multi-band models.
     "use_mse_gan_loss": true,
     "use_hinge_gan_loss": false,
     "use_feat_match_loss": false,  // use only with melgan discriminators
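
Downstream note (not part of the patch itself): this change moves the speaker-encoder helpers into TTS.speaker_encoder.utils.generic_utils and TTS.speaker_encoder.utils.visual and deletes the old single-module paths. A minimal sketch of the import update an external caller would need, using only the module paths visible in the train_encoder.py hunk above; any other usage details are assumptions:

    # Hypothetical caller-side update; new paths are taken from the train_encoder.py hunk.

    # Before this change (modules deleted by this patch):
    # from TTS.speaker_encoder.utils import check_config_speaker_encoder
    # from TTS.speaker_encoder.visuals import plot_embeddings

    # After this change:
    from TTS.speaker_encoder.utils.generic_utils import check_config_speaker_encoder
    from TTS.speaker_encoder.utils.visual import plot_embeddings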