From 134ed5ece6dc576ad83a5bd2ddca469a06696386 Mon Sep 17 00:00:00 2001
From: Aya Jafari
Date: Fri, 17 Jun 2022 15:01:02 +0000
Subject: [PATCH] fixed bugs, added model in torch

---
 spkr-attr/classifier/model_torch.py | 130 ++++++++++++++++++
 spkr-attr/cv_data_processing.py     | 212 ++++++++++++++--------------
 2 files changed, 237 insertions(+), 105 deletions(-)
 create mode 100644 spkr-attr/classifier/model_torch.py

diff --git a/spkr-attr/classifier/model_torch.py b/spkr-attr/classifier/model_torch.py
new file mode 100644
index 00000000..bff4f880
--- /dev/null
+++ b/spkr-attr/classifier/model_torch.py
@@ -0,0 +1,130 @@
+import argparse
+import os
+import pickle
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from clearml import Task
+from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
+from torch.utils.data import DataLoader, TensorDataset
+
+
+class baseline_model(nn.Module):
+    def __init__(self, class_num, emb_dim):
+        super().__init__()
+        self.fc1 = nn.Linear(emb_dim, 1024)
+        self.fc2 = nn.Linear(1024, 512)
+        self.fc3 = nn.Linear(512, 128)
+        self.fc4 = nn.Linear(128, 64)
+        self.fc5 = nn.Linear(64, class_num)
+        self.dropout1 = nn.Dropout(0.2)
+        self.dropout2 = nn.Dropout(0.5)
+
+    def forward(self, x):
+        output = self.dropout1(F.relu(self.fc1(x)))
+        output = self.dropout2(F.relu(self.fc2(output)))
+        output = F.relu(self.fc3(output))
+        output = F.relu(self.fc4(output))
+        # Return raw logits: nn.CrossEntropyLoss applies log-softmax itself,
+        # and a softmax over dim=0 would normalize across the batch, not the classes.
+        return self.fc5(output)
+
+
+def load_data(data, labels):
+    t = torch.Tensor(np.load(data))
+    l = torch.Tensor(np.load(labels))
+    return TensorDataset(t, l)
+
+
+def compute_metrics(y_pred, y_true):
+    acc = accuracy_score(y_true.argmax(axis=1), y_pred.argmax(axis=1))
+    print(f"Model accuracy on held-out data: {acc}")
+    conf_matrix = confusion_matrix(y_true.argmax(axis=1), y_pred.argmax(axis=1))
+    print(f"Confusion matrix on {y_true.shape[-1]} classes: \n {conf_matrix}")
+    f1 = f1_score(y_true.argmax(axis=1), y_pred.argmax(axis=1), average="weighted")
+    print(f"Weighted f1 score: {f1}")
+
+
+def test_model(test, test_labels, model):
+    x = torch.Tensor(np.load(test))
+    y_true = np.load(test_labels)
+    model.eval()
+    with torch.no_grad():
+        y_pred = model(x).numpy()
+    compute_metrics(y_pred, y_true)
+
+
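+# Example invocations (paths are illustrative; the .npy arrays and the optional
+# class-weights pickle are the ones written by cv_data_processing.py):
+#   python model_torch.py --mode train --data age_train_data.npy --labels age_train_labels.npy
+#   python model_torch.py --mode evaluate --data age_test_data.npy --labels age_test_labels.npy --load_ckpt checkpoint/checkpoint.pth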
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--data", required=True)
+    parser.add_argument("--labels", required=True)
+    parser.add_argument("--mode", help="train or evaluate", required=True)
+    parser.add_argument("--embedding_dim", type=int, default=512)
+    parser.add_argument("--load_ckpt")
+    parser.add_argument("--class_weights")
+    parser.add_argument("--epoch", type=int, default=20)
+    parser.add_argument("--batch", type=int, default=50)
+    parser.add_argument("--clearml_project", default="YourTTS-sprint2")
+    parser.add_argument("--clearml_task", default="attribute-classifier")
+
+    args = parser.parse_args()
+    Task.init(project_name=args.clearml_project, task_name=f"{args.clearml_task}-{args.labels}")
+
+    # Initialize model
+    class_num = len(np.unique(np.load(args.labels), axis=0))
+    model = baseline_model(class_num, args.embedding_dim)
+    if args.load_ckpt:
+        model.load_state_dict(torch.load(args.load_ckpt))
+
+    if args.mode == "evaluate":
+        test_model(args.data, args.labels, model)
+        return
+
+    # Load data
+    dataset = load_data(args.data, args.labels)
+    dataloader = DataLoader(dataset, batch_size=args.batch, shuffle=True, num_workers=2)
+
+    # Optional inverse-frequency class weights written by cv_data_processing.py
+    weight = None
+    if args.class_weights:
+        with open(args.class_weights, "rb") as f:
+            w = pickle.load(f)
+        weight = torch.Tensor([w[i] for i in sorted(w)])
+    criterion = nn.CrossEntropyLoss(weight=weight)
+    optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
+
+    # Train
+    model.train()
+    for epoch in range(args.epoch):
+        for i, data in enumerate(dataloader, 0):
+            # get the inputs; data is a list of [inputs, labels]
+            inputs, labels = data
+
+            # zero the parameter gradients
+            optimizer.zero_grad()
+
+            # forward + backward + optimize; the labels are one-hot vectors,
+            # which CrossEntropyLoss accepts as probability targets (torch >= 1.10)
+            outputs = model(inputs)
+            loss = criterion(outputs, labels)
+            loss.backward()
+            optimizer.step()
+
+            # print statistics
+            print(f"[Epoch:{epoch + 1}, step:{i + 1:5d}] loss: {loss.item():.3f}")
+
+    checkpoint_dir = "checkpoint"
+    if not os.path.exists(checkpoint_dir):
+        os.makedirs(checkpoint_dir)
+    checkpoint_filepath = f"{checkpoint_dir}/checkpoint.pth"
+    torch.save(model.state_dict(), checkpoint_filepath)
+
+
+if __name__ == "__main__":
+    main()

diff --git a/spkr-attr/cv_data_processing.py b/spkr-attr/cv_data_processing.py
index b8ae6e17..d2818599 100644
--- a/spkr-attr/cv_data_processing.py
+++ b/spkr-attr/cv_data_processing.py
@@ -4,49 +4,52 @@ import os
 import pickle
 import random
 import subprocess
-from collections import Counter
+from argparse import RawTextHelpFormatter
 
 import numpy as np
 import pandas as pd
-from colorama import Back, Fore, Style
 from pydub import AudioSegment
 from sklearn.preprocessing import LabelEncoder, OneHotEncoder
+from sklearn.utils import shuffle
 from tqdm import tqdm
 
 
-def load_df(filename):
-    df = pd.read_csv(filename, sep ='\t')
+def load_df(filename, n):
+    if n == "All":
+        df = pd.read_csv(filename, sep="\t")
+    else:
+        df = shuffle(pd.read_csv(filename, sep="\t")).head(n=int(n))
     return df
 
-def analyze_df(df,label):
+
+def analyze_df(df, label):
     label_dict = {}
-    df_filtered = df[df[label].notnull() & df[label].notna()]
-    df_final = df_filtered[df_filtered[label]!="other"][label]
+    df_filtered = df[df[label].notnull() & df[label].notna()]
+    df_final = df_filtered[df_filtered[label] != "other"][label]
     for ac in df_final.unique():
-        speakers = df[df[label]==ac]['client_id'].unique()
+        speakers = df[df[label] == ac]["client_id"].unique()
         no_speakers = len(speakers)
-        if(no_speakers<50):
-            continue
-        label_dict[ac]=speakers
-        print(Fore.YELLOW, f"\"{ac}\" unique speakers no.: {no_speakers}")
-        print(Style.RESET_ALL)
+        label_dict[ac] = speakers
+        print(f'"{ac}" unique speakers no.: {no_speakers}')
     return label_dict
 
+
 def train_test_split(df, label, label_dict, split=0.1):
-    print(len(label_dict.keys()),label_dict.keys())
+    print(len(label_dict.keys()), label_dict.keys())
     train = pd.DataFrame()
     test = pd.DataFrame()
     for l in label_dict.keys():
         spkrs = label_dict[l]
-        train_spkrs = spkrs[:int(len(spkrs)*(1-split))]
-        test_spkrs = spkrs[int(len(spkrs)*(1-split)):]
-        train = pd.concat([train,df[df.client_id.isin(train_spkrs)]])
-        test = pd.concat([test,df[df.client_id.isin(test_spkrs)]])
-        train = train[train[label]!="other"]
-        test = test[test[label]!="other"]
+        train_spkrs = spkrs[: int(len(spkrs) * (1 - split))]
+        test_spkrs = spkrs[int(len(spkrs) * (1 - split)) :]
+        train = pd.concat([train, df[df.client_id.isin(train_spkrs)]])
+        test = pd.concat([test, df[df.client_id.isin(test_spkrs)]])
+        train = train[train[label] != "other"]
+        test = test[test[label] != "other"]
     return train, test
 
-def mp3_to_wav(mp3_list,data_path,data_split_path,json_file):
+
+def mp3_to_wav(mp3_list, data_path, data_split_path, json_file):
     waves = []
     for i in tqdm(mp3_list):
         sound = AudioSegment.from_mp3(f"{data_path}/{i}")
@@ -54,131 +57,130 @@
         waves.append(wav)
         sound.export(wav, format="wav")
 
-    ff = open(f"{data_split_path}",'w')
-    ff.write("wav_filename|gender|text|speaker_name\n")
-    for i,j in enumerate(waves):
-        ff.write(f"{j}|m|blabla|ID_{i}\n")
-    ff.close()
-    write_config_dataset(data_path,data_split_path,json_file)
+    with open(f"{data_split_path}", "w") as f:
+        f.write("wav_filename|gender|text|speaker_name\n")
+        for i, j in enumerate(waves):
+            f.write(f"{j}|m|blabla|ID_{i}\n")
+    write_config_dataset(data_path, data_split_path, json_file)
 
-def write_config_dataset(data_path,data_split_path,json_path):
+
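+# A minimal Coqui-TTS dataset config: compute_embeddings.py reads the dataset
+# path and the meta file from it to locate the wavs written by mp3_to_wav().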
open(f"{data_split_path}", "w") as f: + f.write("wav_filename|gender|text|speaker_name\n") + for i, j in enumerate(waves): + f.write(f"{j}|m|blabla|ID_{i}\n") + write_config_dataset(data_path, data_split_path, json_file) -def write_config_dataset(data_path,data_split_path,json_path): + +def write_config_dataset(data_path, data_split_path, json_path): + cwd = os.getcwd() + data_split_full_path = os.path.join(cwd, data_split_path) data = { - "model": "vits", - "datasets": [ + "model": "vits", + "datasets": [ { - "name": "brspeech", - "path": data_path, - "meta_file_train": data_split_path, - "language": "en", - "meta_file_val": "null", - "meta_file_attn_mask": "" + "name": "brspeech", + "path": data_path, + "meta_file_train": data_split_full_path, + "language": "en", + "meta_file_val": "null", + "meta_file_attn_mask": "", } - ] + ], } - with open(json_path, 'w') as outfile: + with open(json_path, "w") as outfile: json.dump(data, outfile) -def compute_speaker_emb(tts_root_dir,spkr_emb_model, spkr_emb_config, config_dataset, out_emb_json): - cmd = ["python", f"{tts_root_dir}/TTS/bin/compute_embeddings.py", "--use_cuda", "True" , - "--no_eval", "True", spkr_emb_model, - spkr_emb_config, config_dataset, out_emb_json] + +def compute_speaker_emb(tts_root_dir, spkr_emb_model, spkr_emb_config, config_dataset, out_emb_json): + cmd = [ + "python", + f"{tts_root_dir}/TTS/bin/compute_embeddings.py", + "--no_eval", + "True", + spkr_emb_model, + spkr_emb_config, + config_dataset, + "--output_path", + out_emb_json, + ] print(" ".join(cmd)) print(subprocess.check_output(cmd).decode("utf-8")) -def compose_dataset(embeddings_json,df,label,split,out_array_path): - f = open(embeddings_json) - embs = json.load(f) + +def compose_dataset(embeddings_json, df, label, out_array_path): + with open(embeddings_json) as f: + embs = json.load(f) e = [] l = [] for i in tqdm(df.path): - id_=i.split('.mp3')[0]+".wav" - e.append(embs[id_]['embedding']) - l.append(df[df['path']==i][label].item()) - ''' - for i in tqdm(embs): - id_ = i.split('/')[-1].split('.wav')[0]+".mp3" - e.append(embs[i]['embedding']) - l.append(df[df['path']==id_][label].item()) - ''' - #import pdb; pdb.set_trace() + id_ = i.split(".mp3")[0] + ".wav" + e.append(embs[id_]["embedding"]) + l.append(df[df["path"] == i][label].item()) values = np.array(l) label_encoder = LabelEncoder() - #print(f"{l} {label_encoder}") integer_encoded = label_encoder.fit_transform(values) - print(np.unique(values,return_counts=True),np.unique(integer_encoded)) + print(np.unique(values, return_counts=True), np.unique(integer_encoded)) onehot_encoder = OneHotEncoder(sparse=False) integer_encoded = integer_encoded.reshape(len(integer_encoded), 1) onehot = onehot_encoder.fit_transform(integer_encoded) - d = list(zip(e,onehot)) + d = list(zip(e, onehot)) random.shuffle(d) - data , labels = zip(*d) + data, labels = zip(*d) data_name = f"{out_array_path}_data.npy" label_name = f"{out_array_path}_labels.npy" np.save(data_name, data) - np.save(label_name,labels) - uniq, counts = np.unique(values,return_counts=True) - weight={} + np.save(label_name, labels) + _, counts = np.unique(values, return_counts=True) + weight = {} for i in np.unique(integer_encoded): - weight[i]=(1/counts[i])*(len(values)/2.0) + weight[i] = (1 / counts[i]) * (len(values) / 2.0) print(weight) - with open(f'{out_array_path}-weights.pkl', 'wb') as f: + with open(f"{out_array_path}-weights.pkl", "wb") as f: pickle.dump(weight, f) print(f"Data: {np.array(data).shape} ,{data_name} \n Labels: {np.array(labels).shape} , 
{label_name}") + def main(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--data", - help="Full path of CV data in tsv format", - required=True + parser = argparse.ArgumentParser( + description="A scirpt to prepare CV data for speaker embedding classification.\n" + "Example runs:\n" + "python cv_data_processing.py --data /datasets/cv/8.0/en/train.tsv --attribute age --out_dir result --num_rec 100 --tts_root_dir /mount-storage/TTS/TTS --spkr_emb_model models/model_se.pth.tar --spkr_emb_config models/config_se.json", + formatter_class=RawTextHelpFormatter, ) + parser.add_argument("--data", help="Full path of CV data in tsv format", required=True) parser.add_argument( - "--label", - required=True - ) - parser.add_argument( - "--out_dir", - required=True - ) - parser.add_argument( - "--spkr_emb_model", - required=True - ) - parser.add_argument( - "--spkr_emb_config", - required=True - ) - parser.add_argument( - "--tts_root_dir", - required=True + "--num_rec", help="Number of records to use out of --data. Supply All to use all of the records", required=True ) + parser.add_argument("--attribute", help="Speaker attribute to sample from", required=True) + parser.add_argument("--out_dir", required=True) + + parser.add_argument("--spkr_emb_model", required=True) + parser.add_argument("--spkr_emb_config", required=True) + parser.add_argument("--tts_root_dir", required=True) args = parser.parse_args() - abs_path = '/'.join(args.data.split("/")[:-1]) - data_path = os.path.join(abs_path,"clips") + + abs_path = "/".join(args.data.split("/")[:-1]) + data_path = os.path.join(abs_path, "clips") if not os.path.exists(args.out_dir): os.makedirs(args.out_dir) - args.out_dir = os.path.join(abs_path,args.out_dir) - df = load_df(args.data) + df = load_df(args.data, args.num_rec) - print(Fore.RED + f"Data header: {list(df)}") - print(Style.RESET_ALL) - assert args.label in list(df) - label_dict = analyze_df(df,args.label) - train_df, test_df = train_test_split(df, args.label, label_dict) + print(f"Data header: {list(df)}") + assert args.attribute in list(df) + label_dict = analyze_df(df, args.attribute) + train_df, test_df = train_test_split(df, args.attribute, label_dict) for split in ["train", "test"]: - if split=='train': + if split == "train": df_subset = train_df else: df_subset = test_df - tts_csv = os.path.join(args.out_dir,f"{args.label}_{split}_tts.csv") - config_dataset = os.path.join(args.out_dir, f"{args.label}_{split}_config_dataset.json") - #mp3_to_wav(df_subset['path'],data_path,tts_csv,config_dataset) - out_emb_json = "/datasets/cv/8.0/en/accent/filtered_spkr_embs.json" #os.path.join(args.out_dir,f"{args.label}_{split}_spkr_embs.json") - #compute_speaker_emb(args.tts_root_dir, args.spkr_emb_model, args.spkr_emb_config, config_dataset, out_emb_json) - out_array_path = os.path.join(args.out_dir, f"{args.label}_{args.label}_{split}") - compose_dataset(out_emb_json,df_subset,args.label,split,out_array_path) + tts_csv = os.path.join(args.out_dir, f"{args.attribute}_{split}_tts.csv") + config_dataset = os.path.join(args.out_dir, f"{args.attribute}_{split}_config_dataset.json") + mp3_to_wav(df_subset["path"], data_path, tts_csv, config_dataset) + out_emb_json = os.path.join(args.out_dir, f"{args.attribute}_{split}_spkr_embs.json") + compute_speaker_emb(args.tts_root_dir, args.spkr_emb_model, args.spkr_emb_config, config_dataset, out_emb_json) + out_array_path = os.path.join(args.out_dir, f"{args.attribute}_{split}") + compose_dataset(out_emb_json, df_subset, args.attribute, 
+
+    print("Done.")
 
-    print ("Done.")
 
 if __name__ == "__main__":
     main()