diff --git a/layers/common_layers.py b/layers/common_layers.py
index 37de2209..6a1e4b6e 100644
--- a/layers/common_layers.py
+++ b/layers/common_layers.py
@@ -136,6 +136,8 @@ class GravesAttention(nn.Module):
         self.attention_weights = torch.zeros(inputs.shape[0], inputs.shape[1]).to(inputs.device)
         self.mu_prev = torch.zeros(inputs.shape[0], self.K).to(inputs.device)

+    # pylint: disable=R0201
+    # pylint: disable=unused-argument
     def preprocess_inputs(self, inputs):
         return None

@@ -376,8 +378,7 @@ def init_attn(attn_type, query_dim, embedding_dim, attention_dim,
                          attention_location_kernel_size, windowing, norm,
                          forward_attn, trans_agent, forward_attn_mask)
-    elif attn_type == "graves":
+    if attn_type == "graves":
         return GravesAttention(query_dim, attn_K)
-    else:
-        raise RuntimeError(
-            " [!] Given Attention Type '{attn_type}' is not exist.")
+    raise RuntimeError(
+        " [!] Given Attention Type '{attn_type}' is not exist.")
diff --git a/models/tacotron2.py b/models/tacotron2.py
index c8fd9242..852b1886 100644
--- a/models/tacotron2.py
+++ b/models/tacotron2.py
@@ -27,6 +27,7 @@ class Tacotron2(nn.Module):
                  separate_stopnet=True,
                  bidirectional_decoder=False):
         super(Tacotron2, self).__init__()
+        self.postnet_output_dim = postnet_output_dim
         self.decoder_output_dim = decoder_output_dim
         self.n_frames_per_step = r
         self.bidirectional_decoder = bidirectional_decoder
@@ -50,7 +51,7 @@ class Tacotron2(nn.Module):
             location_attn, attn_K, separate_stopnet, proj_speaker_dim)
         if self.bidirectional_decoder:
             self.decoder_backward = copy.deepcopy(self.decoder)
-        self.postnet = Postnet(self.decoder_output_dim)
+        self.postnet = Postnet(self.postnet_output_dim)

     def _init_states(self):
         self.speaker_embeddings = None
diff --git a/speaker_encoder/compute_embeddings.py b/speaker_encoder/compute_embeddings.py
index ff90acb7..e746463a 100644
--- a/speaker_encoder/compute_embeddings.py
+++ b/speaker_encoder/compute_embeddings.py
@@ -6,51 +6,41 @@ import numpy as np
 from tqdm import tqdm

 import torch
-from torch.utils.data import DataLoader

-from TTS.datasets.preprocess import get_preprocessor_by_name
-from TTS.speaker_encoder.dataset import MyDataset
 from TTS.speaker_encoder.model import SpeakerEncoder
-from TTS.speaker_encoder.visual import plot_embeddings
 from TTS.utils.audio import AudioProcessor
 from TTS.utils.generic_utils import load_config

 parser = argparse.ArgumentParser(
-    description='Compute embedding vectors for each wav file in a dataset. ')
-parser.add_argument(
-    'model_path',
-    type=str,
-    help='Path to model outputs (checkpoint, tensorboard etc.).')
-parser.add_argument(
-    'config_path',
-    type=str,
-    help='Path to config file for training.',
+    description="Compute embedding vectors for each wav file in a dataset. "
 )
 parser.add_argument(
-    'data_path',
-    type=str,
-    help='Defines the data path. It overwrites config.json.')
-parser.add_argument(
-    'output_path',
-    type=str,
-    help='path for training outputs.')
-parser.add_argument(
-    '--use_cuda', type=bool, help='flag to set cuda.', default=False
+    "model_path", type=str, help="Path to model outputs (checkpoint, tensorboard etc.)."
 )
+parser.add_argument(
+    "config_path", type=str, help="Path to config file for training.",
+)
+parser.add_argument(
+    "data_path", type=str, help="Defines the data path. It overwrites config.json."
+)
+parser.add_argument("output_path", type=str, help="path for training outputs.")
+parser.add_argument("--use_cuda", type=bool, help="flag to set cuda.", default=False)

 args = parser.parse_args()

 c = load_config(args.config_path)
-ap = AudioProcessor(**c['audio'])
+ap = AudioProcessor(**c["audio"])

-wav_files = glob.glob(args.data_path + '/**/*.wav', recursive=True)
-output_files = [wav_file.replace(args.data_path, args.output_path).replace(
-    '.wav', '.npy') for wav_file in wav_files]
+wav_files = glob.glob(args.data_path + "/**/*.wav", recursive=True)
+output_files = [
+    wav_file.replace(args.data_path, args.output_path).replace(".wav", ".npy")
+    for wav_file in wav_files
+]

 for output_file in output_files:
     os.makedirs(os.path.dirname(output_file), exist_ok=True)

 model = SpeakerEncoder(**c.model)
-model.load_state_dict(torch.load(args.model_path)['model'])
+model.load_state_dict(torch.load(args.model_path)["model"])
 model.eval()
 if args.use_cuda:
     model.cuda()
diff --git a/speaker_encoder/dataset.py b/speaker_encoder/dataset.py
index 2dd50c7d..913b7a6d 100644
--- a/speaker_encoder/dataset.py
+++ b/speaker_encoder/dataset.py
@@ -1,34 +1,24 @@
-import os
 import numpy as np
-import collections
 import torch
 import random
 from torch.utils.data import Dataset
-from TTS.utils.text import text_to_sequence, phoneme_to_sequence, pad_with_eos_bos
-from TTS.utils.data import prepare_data, prepare_tensor, prepare_stop_target
-

 class MyDataset(Dataset):
-    def __init__(self,
-                 ap,
-                 meta_data,
-                 voice_len=1.6,
-                 num_speakers_in_batch=64,
-                 num_utter_per_speaker=10,
-                 skip_speakers=False,
-                 verbose=False):
+    def __init__(self, ap, meta_data, voice_len=1.6, num_speakers_in_batch=64,
+                 num_utter_per_speaker=10, skip_speakers=False, verbose=False):
         """
         Args:
             ap (TTS.utils.AudioProcessor): audio processor object.
             meta_data (list): list of dataset instances.
-            seq_len (int): voice segment length in seconds.
+            seq_len (int): voice segment length in seconds.
             verbose (bool): print diagnostic information.
         """
         self.items = meta_data
         self.sample_rate = ap.sample_rate
         self.voice_len = voice_len
         self.seq_len = int(voice_len * self.sample_rate)
+        self.num_speakers_in_batch = num_speakers_in_batch
         self.num_utter_per_speaker = num_utter_per_speaker
         self.skip_speakers = skip_speakers
         self.ap = ap
@@ -47,16 +37,16 @@ class MyDataset(Dataset):
     def load_data(self, idx):
         text, wav_file, speaker_name = self.items[idx]
         wav = np.asarray(self.load_wav(wav_file), dtype=np.float32)
-        mel = self.ap.melspectrogram(wav).astype('float32')
+        mel = self.ap.melspectrogram(wav).astype("float32")
         # sample seq_len

         assert text.size > 0, self.items[idx][1]
         assert wav.size > 0, self.items[idx][1]

         sample = {
-            'mel': mel,
-            'item_idx': self.items[idx][1],
-            'speaker_name': speaker_name
+            "mel": mel,
+            "item_idx": self.items[idx][1],
+            "speaker_name": speaker_name,
         }
         return sample
@@ -64,26 +54,32 @@ class MyDataset(Dataset):
         """
         Find unique speaker ids and create a dict mapping utterances from speaker id
         """
-        speakers = list(set([item[-1] for item in self.items]))
+        speakers = list({item[-1] for item in self.items})
         self.speaker_to_utters = {}
         self.speakers = []
         for speaker in speakers:
             speaker_utters = [item[1] for item in self.items if item[2] == speaker]
             if len(speaker_utters) < self.num_utter_per_speaker and self.skip_speakers:
-                print(f" [!] Skipped speaker {speaker}. Not enough utterances {self.num_utter_per_speaker} vs {len(speaker_utters)}.")
+                print(
+                    f" [!] Skipped speaker {speaker}. Not enough utterances {self.num_utter_per_speaker} vs {len(speaker_utters)}."
+                )
             else:
                 self.speakers.append(speaker)
                 self.speaker_to_utters[speaker] = speaker_utters

     def __len__(self):
-        return int(1e+10)
+        return int(1e10)

     def __sample_speaker(self):
         speaker = random.sample(self.speakers, 1)[0]
         if self.num_utter_per_speaker > len(self.speaker_to_utters[speaker]):
-            utters = random.choices(self.speaker_to_utters[speaker], k=self.num_utter_per_speaker)
+            utters = random.choices(
+                self.speaker_to_utters[speaker], k=self.num_utter_per_speaker
+            )
         else:
-            utters = random.sample(self.speaker_to_utters[speaker], self.num_utter_per_speaker)
+            utters = random.sample(
+                self.speaker_to_utters[speaker], self.num_utter_per_speaker
+            )
         return speaker, utters

     def __sample_speaker_utterances(self, speaker):
@@ -92,7 +88,7 @@ class MyDataset(Dataset):
         """
         feats = []
         labels = []
-        for idx in range(self.num_utter_per_speaker):
+        for _ in range(self.num_utter_per_speaker):
            # TODO:dummy but works
            while True:
                if len(self.speaker_to_utters[speaker]) > 0:
@@ -104,11 +100,10 @@ class MyDataset(Dataset):
                 wav = self.load_wav(utter)
                 if wav.shape[0] - self.seq_len > 0:
                     break
-                else:
-                    self.speaker_to_utters[speaker].remove(utter)
+                self.speaker_to_utters[speaker].remove(utter)

             offset = random.randint(0, wav.shape[0] - self.seq_len)
-            mel = self.ap.melspectrogram(wav[offset:offset+self.seq_len])
+            mel = self.ap.melspectrogram(wav[offset : offset + self.seq_len])
             feats.append(torch.FloatTensor(mel))
             labels.append(speaker)
         return feats, labels
@@ -124,5 +119,5 @@ class MyDataset(Dataset):
             feats_, labels_ = self.__sample_speaker_utterances(speaker)
             labels.append(labels_)
             feats.extend(feats_)
-        feats = torch.stack(feats)
-        return feats.transpose(1, 2), labels
\ No newline at end of file
+        feats = torch.stack(feats)
+        return feats.transpose(1, 2), labels
diff --git a/speaker_encoder/loss.py b/speaker_encoder/loss.py
index 9b5a29b8..ab290547 100644
--- a/speaker_encoder/loss.py
+++ b/speaker_encoder/loss.py
@@ -5,9 +5,8 @@ import torch.nn.functional as F

 # adapted from https://github.com/cvqluu/GE2E-Loss
 class GE2ELoss(nn.Module):
-
-    def __init__(self, init_w=10.0, init_b=-5.0, loss_method='softmax'):
-        '''
+    def __init__(self, init_w=10.0, init_b=-5.0, loss_method="softmax"):
+        """
         Implementation of the Generalized End-to-End loss defined in https://arxiv.org/abs/1710.10467 [1]
         Accepts an input of size (N, M, D)
             where N is the number of speakers in the batch,
@@ -16,24 +15,27 @@ class GE2ELoss(nn.Module):
         Args:
             - init_w (float): defines the initial value of w in Equation (5) of [1]
             - init_b (float): definies the initial value of b in Equation (5) of [1]
-        '''
+        """
         super(GE2ELoss, self).__init__()
+        # pylint: disable=E1102
         self.w = nn.Parameter(torch.tensor(init_w))
+        # pylint: disable=E1102
         self.b = nn.Parameter(torch.tensor(init_b))
         self.loss_method = loss_method

-        assert self.loss_method in ['softmax', 'contrast']
+        assert self.loss_method in ["softmax", "contrast"]

-        if self.loss_method == 'softmax':
+        if self.loss_method == "softmax":
             self.embed_loss = self.embed_loss_softmax
-        if self.loss_method == 'contrast':
+        if self.loss_method == "contrast":
             self.embed_loss = self.embed_loss_contrast

+    # pylint: disable=R0201
     def calc_new_centroids(self, dvecs, centroids, spkr, utt):
-        '''
+        """
         Calculates the new centroids excluding the reference utterance
-        '''
-        excl = torch.cat((dvecs[spkr, :utt], dvecs[spkr, utt+1:]))
+        """
+        excl = torch.cat((dvecs[spkr, :utt], dvecs[spkr, utt + 1 :]))
         excl = torch.mean(excl, 0)
         new_centroids = []
         for i, centroid in enumerate(centroids):
@@ -44,26 +46,36 @@ class GE2ELoss(nn.Module):
         return torch.stack(new_centroids)

     def calc_cosine_sim(self, dvecs, centroids):
-        '''
+        """
         Make the cosine similarity matrix with dims (N,M,N)
-        '''
+        """
         cos_sim_matrix = []
         for spkr_idx, speaker in enumerate(dvecs):
             cs_row = []
             for utt_idx, utterance in enumerate(speaker):
                 new_centroids = self.calc_new_centroids(
-                    dvecs, centroids, spkr_idx, utt_idx)
+                    dvecs, centroids, spkr_idx, utt_idx
+                )
                 # vector based cosine similarity for speed
-                cs_row.append(torch.clamp(torch.mm(utterance.unsqueeze(1).transpose(0, 1), new_centroids.transpose(
-                    0, 1)) / (torch.norm(utterance) * torch.norm(new_centroids, dim=1)), 1e-6))
+                cs_row.append(
+                    torch.clamp(
+                        torch.mm(
+                            utterance.unsqueeze(1).transpose(0, 1),
+                            new_centroids.transpose(0, 1),
+                        )
+                        / (torch.norm(utterance) * torch.norm(new_centroids, dim=1)),
+                        1e-6,
+                    )
+                )
             cs_row = torch.cat(cs_row, dim=0)
             cos_sim_matrix.append(cs_row)
         return torch.stack(cos_sim_matrix)

+    # pylint: disable=R0201
     def embed_loss_softmax(self, dvecs, cos_sim_matrix):
-        '''
+        """
         Calculates the loss on each embedding $L(e_{ji})$ by taking softmax
-        '''
+        """
         N, M, _ = dvecs.shape
         L = []
         for j in range(N):
@@ -74,10 +86,11 @@ class GE2ELoss(nn.Module):
             L.append(L_row)
         return torch.stack(L)

+    # pylint: disable=R0201
     def embed_loss_contrast(self, dvecs, cos_sim_matrix):
-        '''
+        """
         Calculates the loss on each embedding $L(e_{ji})$ by contrast loss with closest centroid
-        '''
+        """
         N, M, _ = dvecs.shape
         L = []
         for j in range(N):
@@ -85,17 +98,21 @@ class GE2ELoss(nn.Module):
             for i in range(M):
                 centroids_sigmoids = torch.sigmoid(cos_sim_matrix[j, i])
                 excl_centroids_sigmoids = torch.cat(
-                    (centroids_sigmoids[:j], centroids_sigmoids[j+1:]))
+                    (centroids_sigmoids[:j], centroids_sigmoids[j + 1 :])
+                )
                 L_row.append(
-                    1. - torch.sigmoid(cos_sim_matrix[j, i, j]) + torch.max(excl_centroids_sigmoids))
+                    1.0
+                    - torch.sigmoid(cos_sim_matrix[j, i, j])
+                    + torch.max(excl_centroids_sigmoids)
+                )
             L_row = torch.stack(L_row)
             L.append(L_row)
         return torch.stack(L)

     def forward(self, dvecs):
-        '''
+        """
         Calculates the GE2E loss for an input of dimensions (num_speakers, num_utts_per_speaker, dvec_feats)
-        '''
+        """
         centroids = torch.mean(dvecs, 1)
         cos_sim_matrix = self.calc_cosine_sim(dvecs, centroids)
         torch.clamp(self.w, 1e-6)
diff --git a/speaker_encoder/model.py b/speaker_encoder/model.py
index 000cc967..b3bd71ff 100644
--- a/speaker_encoder/model.py
+++ b/speaker_encoder/model.py
@@ -10,10 +10,10 @@ class LSTMWithProjection(nn.Module):
         self.proj_size = proj_size
         self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
         self.linear = nn.Linear(hidden_size, proj_size, bias=False)
-
+
     def forward(self, x):
         self.lstm.flatten_parameters()
-        o, (h, c) = self.lstm(x)
+        o, (_, _) = self.lstm(x)
         return self.linear(o)

@@ -22,16 +22,16 @@ class SpeakerEncoder(nn.Module):
         super().__init__()
         layers = []
         layers.append(LSTMWithProjection(input_dim, lstm_dim, proj_dim))
-        for _ in range(num_lstm_layers-1):
+        for _ in range(num_lstm_layers - 1):
             layers.append(LSTMWithProjection(proj_dim, lstm_dim, proj_dim))
         self.layers = nn.Sequential(*layers)
         self._init_layers()

     def _init_layers(self):
         for name, param in self.layers.named_parameters():
-            if 'bias' in name:
+            if "bias" in name:
                 nn.init.constant_(param, 0.0)
-            elif 'weight' in name:
+            elif "weight" in name:
                 nn.init.xavier_normal_(param)

     def forward(self, x):
@@ -81,7 +81,8 @@ class SpeakerEncoder(nn.Module):
             if embed is None:
                 embed = self.inference(frames)
             else:
-                embed[cur_iter <= num_iters, :] += self.inference(frames[cur_iter <= num_iters, :, :])
+                embed[cur_iter <= num_iters, :] += self.inference(
+                    frames[cur_iter <= num_iters, :, :]
+                )
         return embed / num_iters
-
diff --git a/speaker_encoder/tests.py b/speaker_encoder/tests.py
index bb5ac28e..220ba360 100644
--- a/speaker_encoder/tests.py
+++ b/speaker_encoder/tests.py
@@ -4,22 +4,21 @@ import torch as T

 from TTS.speaker_encoder.model import SpeakerEncoder
 from TTS.speaker_encoder.loss import GE2ELoss
-from TTS.speaker_encoder.dataset import MyDataset
-from TTS.utils.audio import AudioProcessor
-from torch.utils.data import DataLoader
-from TTS.datasets.preprocess import libri_tts
 from TTS.utils.generic_utils import load_config

-file_path = os.path.dirname(os.path.realpath(__file__)) + "/../tests/"
-c = load_config(os.path.join(file_path, 'test_config.json'))
+file_path = os.path.dirname(os.path.realpath(__file__)) + "/../tests/"
+c = load_config(os.path.join(file_path, "test_config.json"))


 class SpeakerEncoderTests(unittest.TestCase):
+    # pylint: disable=R0201
     def test_in_out(self):
         dummy_input = T.rand(4, 20, 80)  # B x T x D
         dummy_hidden = [T.rand(2, 4, 128), T.rand(2, 4, 128)]
-        model = SpeakerEncoder(input_dim=80, proj_dim=256, lstm_dim=768, num_lstm_layers=3)
+        model = SpeakerEncoder(
+            input_dim=80, proj_dim=256, lstm_dim=768, num_lstm_layers=3
+        )
         # computing d vectors
         output = model.forward(dummy_input)
         assert output.shape[0] == 4
@@ -35,8 +34,10 @@ class SpeakerEncoderTests(unittest.TestCase):
         # check normalization
         output_norm = T.nn.functional.normalize(output, dim=1, p=2)
         assert_diff = (output_norm - output).sum().item()
-        assert output.type() == 'torch.FloatTensor'
-        assert abs(assert_diff) < 1e-4, f" [!] output_norm has wrong values - {assert_diff}"
+        assert output.type() == "torch.FloatTensor"
+        assert (
+            abs(assert_diff) < 1e-4
+        ), f" [!] output_norm has wrong values - {assert_diff}"
         # compute d for a given batch
         dummy_input = T.rand(1, 240, 80)  # B x T x D
         output = model.compute_embedding(dummy_input, num_frames=160, overlap=0.5)
@@ -45,23 +46,29 @@ class SpeakerEncoderTests(unittest.TestCase):
         assert len(output.shape) == 2

-
 class GE2ELossTests(unittest.TestCase):
+    # pylint: disable=R0201
     def test_in_out(self):
         # check random input
         dummy_input = T.rand(4, 5, 64)  # num_speaker x num_utterance x dim
-        loss = GE2ELoss(loss_method='softmax')
+        loss = GE2ELoss(loss_method="softmax")
         output = loss.forward(dummy_input)
-        assert output.item() >= 0.
+        assert output.item() >= 0.0
         # check all zeros
         dummy_input = T.ones(4, 5, 64)  # num_speaker x num_utterance x dim
-        loss = GE2ELoss(loss_method='softmax')
+        loss = GE2ELoss(loss_method="softmax")
         output = loss.forward(dummy_input)
         # check speaker loss with orthogonal d-vectors
         dummy_input = T.empty(3, 64)
         dummy_input = T.nn.init.orthogonal(dummy_input)
-        dummy_input = T.cat([dummy_input[0].repeat(5, 1, 1).transpose(0, 1), dummy_input[1].repeat(5, 1, 1).transpose(0, 1), dummy_input[2].repeat(5, 1, 1).transpose(0, 1)])  # num_speaker x num_utterance x dim
-        loss = GE2ELoss(loss_method='softmax')
+        dummy_input = T.cat(
+            [
+                dummy_input[0].repeat(5, 1, 1).transpose(0, 1),
+                dummy_input[1].repeat(5, 1, 1).transpose(0, 1),
+                dummy_input[2].repeat(5, 1, 1).transpose(0, 1),
+            ]
+        )  # num_speaker x num_utterance x dim
+        loss = GE2ELoss(loss_method="softmax")
         output = loss.forward(dummy_input)
         assert output.item() < 0.005
@@ -77,4 +84,4 @@ class GE2ELossTests(unittest.TestCase):
 #             print(mel.shape)
 #             if count == 4:
 #                 break
-#             count += 1
\ No newline at end of file
+#             count += 1
diff --git a/speaker_encoder/train.py b/speaker_encoder/train.py
index e154f616..962bb881 100644
--- a/speaker_encoder/train.py
+++ b/speaker_encoder/train.py
@@ -5,24 +5,21 @@ import time
 import traceback

 import torch
-from torch import optim
 from torch.utils.data import DataLoader

 from TTS.datasets.preprocess import load_meta_data
 from TTS.speaker_encoder.dataset import MyDataset
-from TTS.speaker_encoder.generic_utils import save_best_model, save_checkpoint
 from TTS.speaker_encoder.loss import GE2ELoss
 from TTS.speaker_encoder.model import SpeakerEncoder
 from TTS.speaker_encoder.visual import plot_embeddings
+from TTS.speaker_encoder.generic_utils import save_best_model, save_checkpoint
 from TTS.utils.audio import AudioProcessor
 from TTS.utils.generic_utils import (NoamLR, check_update, copy_config_file,
                                      count_parameters,
                                      create_experiment_folder, get_git_branch,
-                                     gradual_training_scheduler, load_config,
-                                     remove_experiment_folder, set_init_dict,
-                                     setup_model, split_dataset)
+                                     load_config,
+                                     remove_experiment_folder, set_init_dict)
 from TTS.utils.logger import Logger
 from TTS.utils.radam import RAdam
-from TTS.utils.visual import plot_alignment, plot_spectrogram

 torch.backends.cudnn.enabled = True
 torch.backends.cudnn.benchmark = True
@@ -34,10 +31,6 @@ print(" > Number of GPUs: ", num_gpus)


 def setup_loader(ap, is_val=False, verbose=False):
-    global meta_data_train
-    global meta_data_eval
-    if "meta_data_train" not in globals():
-        meta_data_train, meta_data_eval = load_meta_data(c.datasets)
     if is_val:
         loader = None
     else:
@@ -63,12 +56,11 @@ def train(model, criterion, optimizer, scheduler, ap, global_step):
     best_loss = float('inf')
     avg_loss = 0
     end_time = time.time()
-    for num_iter, data in enumerate(data_loader):
+    for _, data in enumerate(data_loader):
         start_time = time.time()

         # setup input data
         inputs = data[0]
-        labels = data[1]
         loader_time = time.time() - end_time
         global_step += 1
@@ -132,68 +124,11 @@ def train(model, criterion, optimizer, scheduler, ap, global_step):
     return avg_loss, global_step


-# def evaluate(model, criterion, ap, global_step, epoch):
-#     data_loader = setup_loader(ap, is_val=True)
-#     model.eval()
-#     epoch_time = 0
-#     avg_loss = 0
-#     print("\n > Validation")
-#     with torch.no_grad():
-#         if data_loader is not None:
-#             for num_iter, data in enumerate(data_loader):
-#                 start_time = time.time()
-
-#                 # setup input data
-#                 inputs = data[0]
-#                 labels = data[1]
-
-#                 # dispatch data to GPU
-#                 if use_cuda:
-#                     inputs = inputs.cuda()
-#                     # labels = labels.cuda()
-
-#                 # forward pass
-#                 outputs = model.forward(inputs)
-
-#                 # loss computation
-#                 loss = criterion(outputs.reshape(
-#                     c.num_speakers_in_batch, outputs.shape[0] // c.num_speakers_in_batch, -1))
-#                 step_time = time.time() - start_time
-#                 epoch_time += step_time
-
-#                 if num_iter % c.print_step == 0:
-#                     print(
-#                         " | > Loss: {:.5f} ".format(loss.item()),
-#                         flush=True)
-
-#                 avg_loss += float(loss.item())
-
-#         eval_figures = {
-#             "prediction": plot_spectrogram(const_spec, ap),
-#             "ground_truth": plot_spectrogram(gt_spec, ap),
-#             "alignment": plot_alignment(align_img)
-#         }
-#         tb_logger.tb_eval_figures(global_step, eval_figures)
-
-#         # Sample audio
-#         if c.model in ["Tacotron", "TacotronGST"]:
-#             eval_audio = ap.inv_spectrogram(const_spec.T)
-#         else:
-#             eval_audio = ap.inv_mel_spectrogram(const_spec.T)
-#         tb_logger.tb_eval_audios(
-#             global_step, {"ValAudio": eval_audio}, c.audio["sample_rate"])
-
-#     # compute average losses
-#     avg_loss /= (num_iter + 1)
-
-#     # Plot Validation Stats
-#     epoch_stats = {"GE2Eloss": avg_loss}
-#     tb_logger.tb_eval_stats(global_step, epoch_stats)
-#     return avg_loss
-
-
-# FIXME: move args definition/parsing inside of main?
 def main(args):  # pylint: disable=redefined-outer-name
+    # pylint: disable=global-variable-undefined
+    global meta_data_train
+    global meta_data_eval
+
     ap = AudioProcessor(**c.audio)
     model = SpeakerEncoder(input_dim=40,
                            proj_dim=128,
@@ -211,7 +146,7 @@ def main(args):  # pylint: disable=redefined-outer-name
             if c.reinit_layers:
                 raise RuntimeError
             model.load_state_dict(checkpoint['model'])
-        except:
+        except KeyError:
             print(" > Partial model initialization.")
             model_dict = model.state_dict()
             model_dict = set_init_dict(model_dict, checkpoint, c)
@@ -239,6 +174,9 @@ def main(args):  # pylint: disable=redefined-outer-name
     num_params = count_parameters(model)
     print("\n > Model has {} parameters".format(num_params), flush=True)

+    # pylint: disable=redefined-outer-name
+    meta_data_train, meta_data_eval = load_meta_data(c.datasets)
+
     global_step = args.restore_step
     train_loss, global_step = train(model, criterion, optimizer, scheduler,
                                     ap, global_step)
diff --git a/speaker_encoder/visual.py b/speaker_encoder/visual.py
index e8acde94..68c48f12 100644
--- a/speaker_encoder/visual.py
+++ b/speaker_encoder/visual.py
@@ -3,28 +3,34 @@ import numpy as np
 import matplotlib
 import matplotlib.pyplot as plt

-matplotlib.use('Agg')
+matplotlib.use("Agg")

-colormap = np.array([
-    [76, 255, 0],
-    [0, 127, 70],
-    [255, 0, 0],
-    [255, 217, 38],
-    [0, 135, 255],
-    [165, 0, 165],
-    [255, 167, 255],
-    [0, 255, 255],
-    [255, 96, 38],
-    [142, 76, 0],
-    [33, 0, 127],
-    [0, 0, 0],
-    [183, 183, 183],
-], dtype=np.float) / 255
+colormap = (
+    np.array(
+        [
+            [76, 255, 0],
+            [0, 127, 70],
+            [255, 0, 0],
+            [255, 217, 38],
+            [0, 135, 255],
+            [165, 0, 165],
+            [255, 167, 255],
+            [0, 255, 255],
+            [255, 96, 38],
+            [142, 76, 0],
+            [33, 0, 127],
+            [0, 0, 0],
+            [183, 183, 183],
+        ],
+        dtype=np.float,
+    )
+    / 255
+)


 def plot_embeddings(embeddings, num_utter_per_speaker):
-    embeddings = embeddings[:10*num_utter_per_speaker]
+    embeddings = embeddings[: 10 * num_utter_per_speaker]
     model = umap.UMAP()
     projection = model.fit_transform(embeddings)
     num_speakers = embeddings.shape[0] // num_utter_per_speaker
@@ -32,7 +38,7 @@ def plot_embeddings(embeddings, num_utter_per_speaker):
     colors = [colormap[i] for i in ground_truth]

     fig, ax = plt.subplots(figsize=(16, 10))
-    im = ax.scatter(projection[:, 0], projection[:, 1], c=colors)
+    _ = ax.scatter(projection[:, 0], projection[:, 1], c=colors)
     plt.gca().set_aspect("equal", "datalim")
     plt.title("UMAP projection")
     plt.tight_layout()
diff --git a/tests/outputs/dummy_model_config.json b/tests/outputs/dummy_model_config.json
index 6bf90138..2f56c6ce 100644
--- a/tests/outputs/dummy_model_config.json
+++ b/tests/outputs/dummy_model_config.json
@@ -44,6 +44,8 @@
     "prenet_dropout": true,       // ONLY TACOTRON2 - enable/disable dropout at prenet.
     "use_forward_attn": true,     // ONLY TACOTRON2 - if it uses forward attention. In general, it aligns faster.
     "forward_attn_mask": false,
+    "attention_type": "original",
+    "attention_heads": 5,
     "bidirectional_decoder": false,
     "transition_agent": false,    // ONLY TACOTRON2 - enable/disable transition agent of forward attention.
     "location_attn": false,       // ONLY TACOTRON2 - enable_disable location sensitive attention. It is enabled for TACOTRON by default.
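Note (not part of the patch): the two keys added to dummy_model_config.json line up with the `attn_type`/`attn_K` arguments that `init_attn` in layers/common_layers.py dispatches on, and that the decoder tests below now pass explicitly. A minimal sketch of that mapping, assuming the repo's `load_config` helper (which returns attribute-style access, as compute_embeddings.py and train.py rely on) handles the `//` comments in the test config and that the path is resolved from the repository root:

```python
# Sketch only: map the new config keys onto the attention kwargs used by the decoder tests.
from TTS.utils.generic_utils import load_config

c = load_config("tests/outputs/dummy_model_config.json")  # assumed run from repo root
decoder_kwargs = {
    "attn_type": c.attention_type,  # "original" (location-sensitive) or "graves"
    "attn_K": c.attention_heads,    # number of Gaussian heads used by GravesAttention
}
print(decoder_kwargs)
```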
diff --git a/tests/test_layers.py b/tests/test_layers.py
index f9e60363..f4f8b804 100644
--- a/tests/test_layers.py
+++ b/tests/test_layers.py
@@ -5,7 +5,8 @@ from TTS.layers.tacotron import Prenet, CBHG, Decoder, Encoder
 from TTS.layers.losses import L1LossMasked
 from TTS.utils.generic_utils import sequence_mask

-#pylint: disable=unused-variable
+# pylint: disable=unused-variable
+

 class PrenetTests(unittest.TestCase):
     def test_in_out(self):
@@ -49,6 +50,8 @@ class DecoderTests(unittest.TestCase):
                           memory_size=4,
                           attn_windowing=False,
                           attn_norm="sigmoid",
+                          attn_K=5,
+                          attn_type="original",
                           prenet_type='original',
                           prenet_dropout=True,
                           forward_attn=True,
@@ -77,6 +80,8 @@ class DecoderTests(unittest.TestCase):
                           memory_size=4,
                           attn_windowing=False,
                           attn_norm="sigmoid",
+                          attn_K=5,
+                          attn_type="graves",
                           prenet_type='original',
                           prenet_dropout=True,
                           forward_attn=True,
diff --git a/train.py b/train.py
index 50942a7f..7ab213a6 100644
--- a/train.py
+++ b/train.py
@@ -117,7 +117,8 @@ def format_data(data):

 def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler,
           ap, global_step, epoch):
-    data_loader = setup_loader(ap, model.decoder.r, is_val=False, verbose=(epoch == 0))
+    data_loader = setup_loader(ap, model.decoder.r, is_val=False,
+                               verbose=(epoch == 0))
     model.train()
     epoch_time = 0
     train_values = {
diff --git a/utils/audio.py b/utils/audio.py
index f57635df..cba5b344 100644
--- a/utils/audio.py
+++ b/utils/audio.py
@@ -15,7 +15,7 @@ class AudioProcessor(object):
                  ref_level_db=None,
                  num_freq=None,
                  power=None,
-                 preemphasis=None,
+                 preemphasis=0.0,
                  signal_norm=None,
                  symmetric_norm=None,
                  max_norm=None,
@@ -48,7 +48,7 @@ class AudioProcessor(object):
         self.do_trim_silence = do_trim_silence
         self.sound_norm = sound_norm
         self.n_fft, self.hop_length, self.win_length = self._stft_parameters()
-        assert min_level_db ~= 0.0, " [!] min_level_db is 0"
+        assert min_level_db != 0.0, " [!] min_level_db is 0"
         members = vars(self)
         for key, value in members.items():
             print(" | > {}:{}".format(key, value))
@@ -132,12 +132,12 @@ class AudioProcessor(object):

     def apply_preemphasis(self, x):
         if self.preemphasis == 0:
-            raise RuntimeError(" !! Preemphasis is applied with factor 0.0. ")
+            raise RuntimeError(" [!] Preemphasis is set 0.0.")
         return scipy.signal.lfilter([1, -self.preemphasis], [1], x)

     def apply_inv_preemphasis(self, x):
         if self.preemphasis == 0:
-            raise RuntimeError(" !! Preemphasis is applied with factor 0.0. ")
+            raise RuntimeError(" [!] Preemphasis is set 0.0.")
         return scipy.signal.lfilter([1], [1, -self.preemphasis], x)

     def spectrogram(self, y):
diff --git a/utils/logger.py b/utils/logger.py
index f83c9d0f..51a10422 100644
--- a/utils/logger.py
+++ b/utils/logger.py
@@ -13,8 +13,8 @@ class Logger(object):
         for name, param in model.named_parameters():
             if param.numel() == 1:
                 self.writer.add_scalar(
-                    "layer{}-{}/value".format(layer_num, name),
-                    param.max(), step)
+                    "layer{}-{}/value".format(layer_num, name),
+                    param.max(), step)
             else:
                 self.writer.add_scalar(
                     "layer{}-{}/max".format(layer_num, name),
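Illustration (not part of the patch): the shape contract tying the refactored speaker-encoder pieces together. SpeakerEncoder produces one d-vector per utterance, and GE2ELoss.forward expects them grouped as (num_speakers, num_utts_per_speaker, dvec_feats), which is the reshape speaker_encoder/train.py performs on the model output. A toy sketch with made-up sizes, assuming the TTS package is importable and that `forward` returns one d-vector per input sequence, as the updated tests exercise:

```python
# Toy sketch: feed random mel windows through SpeakerEncoder and score them with GE2ELoss.
import torch

from TTS.speaker_encoder.loss import GE2ELoss
from TTS.speaker_encoder.model import SpeakerEncoder

num_speakers, num_utter, num_frames, num_mels = 2, 5, 20, 80  # made-up sizes
model = SpeakerEncoder(input_dim=num_mels, proj_dim=256, lstm_dim=768, num_lstm_layers=3)
criterion = GE2ELoss(loss_method="softmax")  # or "contrast"

# One row per utterance, speakers kept contiguous: (N * M, T, num_mels)
mels = torch.rand(num_speakers * num_utter, num_frames, num_mels)
dvecs = model(mels)  # (N * M, proj_dim) d-vectors

# GE2E wants (num_speakers, num_utts_per_speaker, dvec_feats)
loss = criterion(dvecs.reshape(num_speakers, num_utter, -1))
print(loss.item())
```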