Fixed bugs and added a PyTorch model.

This commit is contained in:
Aya Jafari 2022-06-17 15:01:02 +00:00
parent 1187d4df5f
commit 134ed5ece6
2 changed files with 219 additions and 105 deletions

View File

@ -0,0 +1,116 @@
import argparse
import os
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from clearml import Task
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
class baseline_model(nn.Module):
    """MLP classifier over fixed-size speaker embeddings.

    Maps an embedding of size ``emb_dim`` to ``class_num`` class logits
    through five fully connected layers with ReLU activations; dropout is
    applied after the first two hidden layers.
    """

    def __init__(self, class_num, emb_dim):
        super().__init__()
        self.fc1 = nn.Linear(emb_dim, 1024)
        self.fc2 = nn.Linear(1024, 512)
        self.fc3 = nn.Linear(512, 128)
        self.fc4 = nn.Linear(128, 64)
        self.fc5 = nn.Linear(64, class_num)
        self.dropout1 = nn.Dropout(0.2)
        self.dropout2 = nn.Dropout(0.5)

    def forward(self, x):
        """Return raw class logits of shape (batch, class_num).

        BUG FIX: the original applied ``F.softmax(..., dim=0)``, which
        normalizes across the *batch* dimension (wrong axis), and the
        training loop feeds the output to ``nn.CrossEntropyLoss``, which
        applies log_softmax internally — so the extra softmax was also
        redundant. Returning raw logits is the correct contract.
        """
        output = self.dropout1(F.relu(self.fc1(x)))
        output = self.dropout2(F.relu(self.fc2(output)))
        output = F.relu(self.fc3(output))
        output = F.relu(self.fc4(output))
        return self.fc5(output)
def load_data(train, labels):
    """Load feature and label ``.npy`` files into a TensorDataset.

    train: path to the feature array file.
    labels: path to the (one-hot) label array file.
    """
    features = torch.Tensor(np.load(train))
    targets = torch.Tensor(np.load(labels))
    return torch.utils.data.TensorDataset(features, targets)
def compute_metrics(y_pred, y_true):
    """Print accuracy, confusion matrix and weighted F1 score.

    Both arguments are (n_samples, n_classes) one-hot/probability arrays;
    class ids are recovered via argmax along axis 1.
    """
    pred_ids = y_pred.argmax(axis=1)
    true_ids = y_true.argmax(axis=1)
    acc = accuracy_score(true_ids, pred_ids)
    print(f"Model Accuracy on untrained data: {acc}")
    conf_matrix = confusion_matrix(true_ids, pred_ids)
    print(f"Confusion matrix on {y_true.shape[-1]} classes: \n {conf_matrix}")
    f1 = f1_score(true_ids, pred_ids, average="weighted")
    print(f"Weighted f1 score: {f1}")
def test_model(test, test_labels, model):
    """Evaluate ``model`` on .npy feature/label files and print metrics.

    BUG FIX: the original unpacked the TensorDataset returned by
    load_data into two variables (only valid when the dataset has exactly
    two samples) and called the Keras-style ``model.predict(..., verbose=1)``,
    which torch modules do not have. Load the tensors directly and run a
    normal no-grad forward pass instead.
    """
    features = torch.Tensor(np.load(test))
    y_true = torch.Tensor(np.load(test_labels))
    model.eval()  # disable dropout for evaluation
    with torch.no_grad():
        y_pred = model(features)
    compute_metrics(y_pred.numpy(), y_true.numpy())
def main():
    """Train the baseline attribute classifier on precomputed embeddings.

    Command-line driven: loads feature/label .npy files, logs the run to
    ClearML, trains for --epoch epochs and saves a checkpoint under
    ./checkpoint.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--data", required=True)
    parser.add_argument("--labels", required=True)
    parser.add_argument("--mode", help="train or evaluate", required=True)
    # BUG FIX: argparse yields strings by default, but these three values are
    # used as numbers (nn.Linear size, range(), DataLoader batch_size), so
    # they must be parsed as ints.
    parser.add_argument("--embedding_dim", type=int, default=512)
    parser.add_argument("--load_ckpt")  # NOTE(review): currently unused here
    parser.add_argument("--class_weights")  # NOTE(review): currently unused here
    parser.add_argument("--epoch", type=int, default=20)
    parser.add_argument("--batch", type=int, default=50)
    parser.add_argument("--clearml_project", default="YourTTS-sprint2")
    parser.add_argument("--clearml_task", default="attribute-classifier")
    args = parser.parse_args()
    Task.init(project_name=args.clearml_project, task_name=f"{args.clearml_task}-{args.labels}")
    # Load data.
    dataset = load_data(args.data, args.labels)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=args.batch, shuffle=True, num_workers=2)
    # Initialize model; one class per distinct one-hot row in the label file.
    class_num = len(np.unique(np.load(args.labels), axis=0))
    model = baseline_model(class_num, args.embedding_dim)
    # NOTE(review): labels are one-hot floats; CrossEntropyLoss accepts them
    # as class probabilities only in torch >= 1.10 — confirm installed version.
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
    # Train.
    for epoch in range(args.epoch):
        for i, data in enumerate(dataloader, 0):
            # data is a list of [inputs, labels]
            inputs, labels = data
            optimizer.zero_grad()
            # forward + backward + optimize
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            # print statistics
            print(f"[Epoch:{epoch + 1}, step:{i + 1:5d}] loss: {loss.item() :.3f}")
    # Persist the final weights.
    checkpoint_dir = "checkpoint"
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    checkpoint_filepath = f"{checkpoint_dir}/checkpoint.pth"
    torch.save(model.state_dict(), checkpoint_filepath)


if __name__ == "__main__":
    main()

View File

@ -4,49 +4,52 @@ import os
import pickle
import random
import subprocess
from collections import Counter
from argparse import RawTextHelpFormatter
import numpy as np
import pandas as pd
from colorama import Back, Fore, Style
from pydub import AudioSegment
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.utils import shuffle
from tqdm import tqdm
def load_df(filename, n):
    """Read a tab-separated CV metadata file into a DataFrame.

    n: the string "All" to keep every row, otherwise the number of rows
    to keep after shuffling.

    NOTE(review): this span of the mangled diff contained both the old
    one-argument definition and this post-commit two-argument one; only
    the post-commit version is kept.
    """
    if n == "All":
        df = pd.read_csv(filename, sep="\t")
    else:
        df = shuffle(pd.read_csv(filename, sep="\t")).head(n=int(n))
    return df
def analyze_df(df, label):
    """Map each value of ``label`` to its array of unique speaker ids.

    Rows with a null or "other" label are ignored, and label values with
    fewer than 50 unique speakers are skipped entirely.

    Returns: dict {label_value: ndarray of client_id strings}.
    """
    label_dict = {}
    df_filtered = df[df[label].notnull() & df[label].notna()]
    df_final = df_filtered[df_filtered[label] != "other"][label]
    for ac in df_final.unique():
        speakers = df[df[label] == ac]["client_id"].unique()
        no_speakers = len(speakers)
        # Too few speakers to train on — skip this label value.
        if no_speakers < 50:
            continue
        label_dict[ac] = speakers
        print(f'"{ac}" unique speakers no.: {no_speakers}')
    return label_dict
def train_test_split(df, label, label_dict, split=0.1):
    """Split ``df`` into train/test frames by *speaker*, not by row.

    For each label value, the first (1 - split) fraction of its speakers
    goes to train and the rest to test, so no speaker appears in both.
    Rows labelled "other" are dropped from both splits.

    Returns: (train_df, test_df).
    """
    print(len(label_dict.keys()), label_dict.keys())
    train = pd.DataFrame()
    test = pd.DataFrame()
    for lab in label_dict.keys():
        spkrs = label_dict[lab]
        cutoff = int(len(spkrs) * (1 - split))
        train_spkrs = spkrs[:cutoff]
        test_spkrs = spkrs[cutoff:]
        train = pd.concat([train, df[df.client_id.isin(train_spkrs)]])
        test = pd.concat([test, df[df.client_id.isin(test_spkrs)]])
    train = train[train[label] != "other"]
    test = test[test[label] != "other"]
    return train, test
# NOTE(review): this region is mangled diff residue — it contains BOTH the
# pre-commit and post-commit bodies of mp3_to_wav, and a hunk header sits in
# the middle of the loop, so the line that defines `wav` is not visible here.
# The post-commit half (the `with open(...)` block) is the one the commit kept.
def mp3_to_wav(mp3_list,data_path,data_split_path,json_file):
# post-commit signature (same parameters, reformatted spacing):
def mp3_to_wav(mp3_list, data_path, data_split_path, json_file):
waves = []
for i in tqdm(mp3_list):
# Decode each mp3 under data_path and export it as a wav.
sound = AudioSegment.from_mp3(f"{data_path}/{i}")
@ -54,133 +57,128 @@ def mp3_to_wav(mp3_list,data_path,data_split_path,json_file):
# ^ hunk header: the line(s) computing `wav` fall in the gap — TODO confirm
# against the full file before editing this function.
waves.append(wav)
sound.export(wav, format="wav")
# pre-commit metadata writing (superseded by the with-block below):
ff = open(f"{data_split_path}",'w')
ff.write("wav_filename|gender|text|speaker_name\n")
for i,j in enumerate(waves):
ff.write(f"{j}|m|blabla|ID_{i}\n")
ff.close()
write_config_dataset(data_path,data_split_path,json_file)
# post-commit metadata writing (context-managed file handle):
with open(f"{data_split_path}", "w") as f:
f.write("wav_filename|gender|text|speaker_name\n")
for i, j in enumerate(waves):
f.write(f"{j}|m|blabla|ID_{i}\n")
write_config_dataset(data_path, data_split_path, json_file)
def write_config_dataset(data_path, data_split_path, json_path):
    """Write a TTS dataset-config JSON describing the generated split.

    data_path: directory holding the audio clips.
    data_split_path: metadata CSV path, resolved against the current
        working directory before being stored as ``meta_file_train``.
    json_path: destination for the JSON config.
    """
    cwd = os.getcwd()
    data_split_full_path = os.path.join(cwd, data_split_path)
    data = {
        "model": "vits",
        "datasets": [
            {
                "name": "brspeech",
                "path": data_path,
                # Post-commit behavior: store the absolute metadata path.
                "meta_file_train": data_split_full_path,
                "language": "en",
                "meta_file_val": "null",
                "meta_file_attn_mask": "",
            }
        ],
    }
    with open(json_path, "w") as outfile:
        json.dump(data, outfile)
def compute_speaker_emb(tts_root_dir, spkr_emb_model, spkr_emb_config, config_dataset, out_emb_json):
    """Run TTS's compute_embeddings.py as a subprocess and echo its output.

    NOTE(review): the post-commit command (kept here) drops the old
    ``--use_cuda True`` flag and passes the output path via
    ``--output_path`` instead of positionally.
    """
    cmd = [
        "python",
        f"{tts_root_dir}/TTS/bin/compute_embeddings.py",
        "--no_eval",
        "True",
        spkr_emb_model,
        spkr_emb_config,
        config_dataset,
        "--output_path",
        out_emb_json,
    ]
    # Log the exact command for reproducibility, then run it; check_output
    # raises CalledProcessError on a non-zero exit.
    print(" ".join(cmd))
    print(subprocess.check_output(cmd).decode("utf-8"))
def compose_dataset(embeddings_json, df, label, out_array_path):
    """Pair precomputed speaker embeddings with one-hot labels and save them.

    Reads the embeddings JSON (keyed by wav path), collects the embedding
    and label for every row of ``df``, shuffles, and writes:
    ``{out_array_path}_data.npy``, ``{out_array_path}_labels.npy`` and a
    ``{out_array_path}-weights.pkl`` dict of inverse-frequency class weights.

    NOTE(review): post-commit 4-argument signature — the unused ``split``
    parameter from the old version was removed (matches the new call site).
    """
    with open(embeddings_json) as f:
        embs = json.load(f)
    e = []
    l = []
    for i in tqdm(df.path):
        # Embedding JSON keys use .wav paths while the CV metadata uses .mp3.
        id_ = i.split(".mp3")[0] + ".wav"
        e.append(embs[id_]["embedding"])
        l.append(df[df["path"] == i][label].item())
    values = np.array(l)
    label_encoder = LabelEncoder()
    integer_encoded = label_encoder.fit_transform(values)
    print(np.unique(values, return_counts=True), np.unique(integer_encoded))
    # NOTE(review): `sparse=` was renamed `sparse_output=` in sklearn >= 1.2 —
    # confirm the pinned sklearn version.
    onehot_encoder = OneHotEncoder(sparse=False)
    integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
    onehot = onehot_encoder.fit_transform(integer_encoded)
    # Shuffle embedding/label pairs together so they stay aligned.
    d = list(zip(e, onehot))
    random.shuffle(d)
    data, labels = zip(*d)
    data_name = f"{out_array_path}_data.npy"
    label_name = f"{out_array_path}_labels.npy"
    np.save(data_name, data)
    np.save(label_name, labels)
    # Inverse-frequency class weights: (1 / count) * (total / 2).
    _, counts = np.unique(values, return_counts=True)
    weight = {}
    for i in np.unique(integer_encoded):
        weight[i] = (1 / counts[i]) * (len(values) / 2.0)
    print(weight)
    with open(f"{out_array_path}-weights.pkl", "wb") as f:
        pickle.dump(weight, f)
    print(f"Data: {np.array(data).shape} ,{data_name} \n Labels: {np.array(labels).shape} , {label_name}")
def main():
    """Prepare Common Voice data for speaker-embedding attribute classification.

    Pipeline per split (train/test): convert mp3s to wav + TTS metadata,
    compute speaker embeddings, then compose .npy datasets and class weights.
    """
    parser = argparse.ArgumentParser(
        # BUG FIX: "scirpt" -> "script" in the help text.
        description="A script to prepare CV data for speaker embedding classification.\n"
        "Example runs:\n"
        "python cv_data_processing.py --data /datasets/cv/8.0/en/train.tsv --attribute age --out_dir result --num_rec 100 --tts_root_dir /mount-storage/TTS/TTS --spkr_emb_model models/model_se.pth.tar --spkr_emb_config models/config_se.json",
        formatter_class=RawTextHelpFormatter,
    )
    parser.add_argument("--data", help="Full path of CV data in tsv format", required=True)
    parser.add_argument(
        "--num_rec", help="Number of records to use out of --data. Supply All to use all of the records", required=True
    )
    parser.add_argument("--attribute", help="Speaker attribute to sample from", required=True)
    parser.add_argument("--out_dir", required=True)
    parser.add_argument("--spkr_emb_model", required=True)
    parser.add_argument("--spkr_emb_config", required=True)
    parser.add_argument("--tts_root_dir", required=True)
    args = parser.parse_args()
    # The clips live in a "clips" directory beside the tsv file.
    abs_path = "/".join(args.data.split("/")[:-1])
    data_path = os.path.join(abs_path, "clips")
    # NOTE(review): the directory is created at the relative path but then
    # out_dir is rebased under abs_path — looks inconsistent; confirm intent.
    if not os.path.exists(args.out_dir):
        os.makedirs(args.out_dir)
    args.out_dir = os.path.join(abs_path, args.out_dir)
    df = load_df(args.data, args.num_rec)
    print(f"Data header: {list(df)}")
    assert args.attribute in list(df)
    label_dict = analyze_df(df, args.attribute)
    train_df, test_df = train_test_split(df, args.attribute, label_dict)
    for split in ["train", "test"]:
        df_subset = train_df if split == "train" else test_df
        tts_csv = os.path.join(args.out_dir, f"{args.attribute}_{split}_tts.csv")
        config_dataset = os.path.join(args.out_dir, f"{args.attribute}_{split}_config_dataset.json")
        mp3_to_wav(df_subset["path"], data_path, tts_csv, config_dataset)
        out_emb_json = os.path.join(args.out_dir, f"{args.attribute}_{split}_spkr_embs.json")
        compute_speaker_emb(args.tts_root_dir, args.spkr_emb_model, args.spkr_emb_config, config_dataset, out_emb_json)
        out_array_path = os.path.join(args.out_dir, f"{args.attribute}_{split}")
        compose_dataset(out_emb_json, df_subset, args.attribute, out_array_path)
    print("Done.")


if __name__ == "__main__":
    main()