mirror of https://github.com/coqui-ai/TTS.git
fixed bugs, added model in torch
This commit is contained in:
parent
1187d4df5f
commit
134ed5ece6
|
@ -0,0 +1,116 @@
|
|||
import argparse
|
||||
import os
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
from clearml import Task
|
||||
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
|
||||
|
||||
|
||||
class baseline_model(nn.Module):
    """Simple MLP classifier over fixed-size speaker embeddings.

    Args:
        class_num: number of output classes.
        emb_dim: dimensionality of the input embedding vectors.
    """

    def __init__(self, class_num, emb_dim):
        super().__init__()
        # Funnel 1024 -> 512 -> 128 -> 64 -> class_num with dropout on the
        # two widest layers only.
        self.fc1 = nn.Linear(emb_dim, 1024)
        self.fc2 = nn.Linear(1024, 512)
        self.fc3 = nn.Linear(512, 128)
        self.fc4 = nn.Linear(128, 64)
        self.fc5 = nn.Linear(64, class_num)
        self.dropout1 = nn.Dropout(0.2)
        self.dropout2 = nn.Dropout(0.5)

    def forward(self, x):
        """Return class probabilities of shape (batch, class_num).

        Args:
            x: float tensor of shape (batch, emb_dim).
        """
        output = self.dropout1(F.relu(self.fc1(x)))
        output = self.dropout2(F.relu(self.fc2(output)))
        output = F.relu(self.fc3(output))
        output = F.relu(self.fc4(output))
        # BUG FIX: softmax must normalize over the class dimension (last dim).
        # The original used dim=0, which normalized across the *batch*.
        # NOTE(review): the training loop feeds this output to
        # nn.CrossEntropyLoss, which expects raw logits — confirm whether the
        # softmax here is intended at all.
        output = F.softmax(self.fc5(output), dim=-1)
        return output
|
||||
|
||||
|
||||
def load_data(train, labels):
    """Load feature/label .npy files into a paired dataset.

    Args:
        train: path to the .npy file holding the feature array.
        labels: path to the .npy file holding the label array.

    Returns:
        torch.utils.data.TensorDataset pairing each feature row with its label.
    """
    features = torch.Tensor(np.load(train))
    targets = torch.Tensor(np.load(labels))
    return torch.utils.data.TensorDataset(features, targets)
|
||||
|
||||
|
||||
def compute_metrics(y_pred, y_true):
    """Print accuracy, confusion matrix and weighted F1 for one-hot arrays.

    Args:
        y_pred: predicted scores/probabilities, shape (n_samples, n_classes).
        y_true: one-hot ground-truth labels, same shape as y_pred.
    """
    # Collapse one-hot rows to class ids once, instead of per metric.
    pred_ids = y_pred.argmax(axis=1)
    true_ids = y_true.argmax(axis=1)
    acc = accuracy_score(true_ids, pred_ids)
    print(f"Model Accuracy on untrained data: {acc}")
    conf_matrix = confusion_matrix(true_ids, pred_ids)
    print(f"Confusion matrix on {y_true.shape[-1]} classes: \n {conf_matrix}")
    f1 = f1_score(true_ids, pred_ids, average="weighted")
    print(f"Weighted f1 score: {f1}")
|
||||
|
||||
|
||||
def test_model(test, test_labels, model):
    """Evaluate `model` on held-out data and print classification metrics.

    Args:
        test: path to the .npy file with test features.
        test_labels: path to the .npy file with one-hot test labels.
        model: a trained torch.nn.Module mapping features to class scores.
    """
    # BUG FIX: load_data returns a TensorDataset (not an (x, y) pair), and
    # torch modules have no Keras-style .predict(...) — run a forward pass
    # in eval mode instead.
    dataset = load_data(test, test_labels)
    x, y_true = dataset.tensors
    model.eval()
    with torch.no_grad():
        y_pred = model(x)
    compute_metrics(y_pred.numpy(), y_true.numpy())
|
||||
|
||||
|
||||
def main():
    """Train the baseline speaker-attribute classifier on precomputed embeddings.

    Reads feature/label .npy paths from the CLI, logs the run to ClearML,
    trains for --epoch epochs with Adam, and saves the final state dict to
    checkpoint/checkpoint.pth.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--data", required=True)
    parser.add_argument("--labels", required=True)
    # NOTE(review): --mode is parsed but never used below — confirm intent.
    parser.add_argument("--mode", help="train or evaluate", required=True)
    # BUG FIX: numeric options were parsed as strings when supplied on the
    # command line; nn.Linear / DataLoader / range would then fail.
    parser.add_argument("--embedding_dim", type=int, default=512)
    parser.add_argument(
        "--load_ckpt",
    )
    parser.add_argument(
        "--class_weights",
    )
    parser.add_argument("--epoch", type=int, default=20)
    parser.add_argument("--batch", type=int, default=50)
    parser.add_argument(
        "--clearml_project",
        default="YourTTS-sprint2",
    )
    parser.add_argument(
        "--clearml_task",
        default="attribute-classifier",
    )

    args = parser.parse_args()
    Task.init(project_name=args.clearml_project, task_name=f"{args.clearml_task}-{args.labels}")
    # Load data
    dataset = load_data(args.data, args.labels)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=args.batch, shuffle=True, num_workers=2)

    # Initialize model: one class per unique one-hot row in the label file.
    class_num = len(np.unique(np.load(args.labels), axis=0))
    model = baseline_model(class_num, args.embedding_dim)
    # NOTE(review): CrossEntropyLoss expects raw logits, but baseline_model
    # applies softmax in forward() — confirm the double normalization.
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

    # Train
    for epoch in range(args.epoch):

        for i, data in enumerate(dataloader, 0):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # print statistics
            print(f"[Epoch:{epoch + 1}, step:{i + 1:5d}] loss: {loss.item() :.3f}")

    # Persist the final weights.
    checkpoint_dir = "checkpoint"
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    checkpoint_filepath = f"{checkpoint_dir}/checkpoint.pth"
    torch.save(model.state_dict(), checkpoint_filepath)


if __name__ == "__main__":
    main()
|
|
@ -4,49 +4,52 @@ import os
|
|||
import pickle
|
||||
import random
|
||||
import subprocess
|
||||
from collections import Counter
|
||||
from argparse import RawTextHelpFormatter
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from colorama import Back, Fore, Style
|
||||
from pydub import AudioSegment
|
||||
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
|
||||
from sklearn.utils import shuffle
|
||||
from tqdm import tqdm
|
||||
|
||||
|
||||
def load_df(filename, n):
    """Read a tab-separated CommonVoice metadata file.

    Args:
        filename: path to the .tsv file.
        n: the string "All" to keep every row, otherwise the number of rows
           to keep from a shuffled copy.

    Returns:
        pandas.DataFrame with the requested rows.
    """
    frame = pd.read_csv(filename, sep="\t")
    if n == "All":
        return frame
    # Shuffle first so the sampled head is a random subset.
    return shuffle(frame).head(n=int(n))
|
||||
|
||||
def analyze_df(df, label):
    """Map each attribute value to its speakers, keeping well-populated values.

    Args:
        df: CommonVoice metadata frame with `client_id` and `label` columns.
        label: name of the attribute column to analyze.

    Returns:
        dict mapping attribute value -> array of unique client_ids, restricted
        to values (excluding "other") with at least 50 distinct speakers.
    """
    label_dict = {}
    # Drop rows with missing attribute values, then the catch-all "other".
    usable = df[df[label].notnull() & df[label].notna()]
    candidate_values = usable[usable[label] != "other"][label]
    for value in candidate_values.unique():
        spkr_ids = df[df[label] == value]["client_id"].unique()
        n_spkrs = len(spkr_ids)
        if n_spkrs < 50:
            # Too few speakers for a meaningful per-speaker split.
            continue
        label_dict[value] = spkr_ids
        print(f'"{value}" unique speakers no.: {n_spkrs}')
    return label_dict
|
||||
|
||||
|
||||
def train_test_split(df, label, label_dict, split=0.1):
    """Split rows into train/test by speaker, so no speaker spans both sets.

    Args:
        df: full metadata frame with a `client_id` column.
        label: attribute column name; rows labeled "other" are dropped.
        label_dict: mapping of attribute value -> sequence of client_ids.
        split: fraction of each value's speakers reserved for the test set.

    Returns:
        (train_df, test_df) tuple of DataFrames.
    """
    print(len(label_dict.keys()), label_dict.keys())
    # Seed with an empty frame so concat works even for an empty label_dict.
    train_parts = [pd.DataFrame()]
    test_parts = [pd.DataFrame()]
    for spkrs in label_dict.values():
        cut = int(len(spkrs) * (1 - split))
        train_parts.append(df[df.client_id.isin(spkrs[:cut])])
        test_parts.append(df[df.client_id.isin(spkrs[cut:])])
    train = pd.concat(train_parts)
    test = pd.concat(test_parts)
    # Defensive re-filter: "other" rows may ride along with kept speakers.
    train = train[train[label] != "other"]
    test = test[test[label] != "other"]
    return train, test
|
||||
|
||||
def mp3_to_wav(mp3_list,data_path,data_split_path,json_file):
|
||||
|
||||
def mp3_to_wav(mp3_list, data_path, data_split_path, json_file):
|
||||
waves = []
|
||||
for i in tqdm(mp3_list):
|
||||
sound = AudioSegment.from_mp3(f"{data_path}/{i}")
|
||||
|
@ -54,133 +57,128 @@ def mp3_to_wav(mp3_list,data_path,data_split_path,json_file):
|
|||
waves.append(wav)
|
||||
sound.export(wav, format="wav")
|
||||
|
||||
ff = open(f"{data_split_path}",'w')
|
||||
ff.write("wav_filename|gender|text|speaker_name\n")
|
||||
for i,j in enumerate(waves):
|
||||
ff.write(f"{j}|m|blabla|ID_{i}\n")
|
||||
ff.close()
|
||||
write_config_dataset(data_path,data_split_path,json_file)
|
||||
with open(f"{data_split_path}", "w") as f:
|
||||
f.write("wav_filename|gender|text|speaker_name\n")
|
||||
for i, j in enumerate(waves):
|
||||
f.write(f"{j}|m|blabla|ID_{i}\n")
|
||||
write_config_dataset(data_path, data_split_path, json_file)
|
||||
|
||||
def write_config_dataset(data_path, data_split_path, json_path):
    """Write a minimal VITS dataset config JSON pointing at the prepared split.

    Args:
        data_path: directory containing the audio clips.
        data_split_path: path (relative to cwd) of the metadata csv.
        json_path: destination for the config JSON.
    """
    # Anchor the training metadata file at the current working directory.
    meta_train = os.path.join(os.getcwd(), data_split_path)
    dataset_entry = {
        "name": "brspeech",
        "path": data_path,
        "meta_file_train": meta_train,
        "language": "en",
        "meta_file_val": "null",
        "meta_file_attn_mask": "",
    }
    config = {"model": "vits", "datasets": [dataset_entry]}
    with open(json_path, "w") as outfile:
        json.dump(config, outfile)
||||
|
||||
def compute_speaker_emb(tts_root_dir, spkr_emb_model, spkr_emb_config, config_dataset, out_emb_json):
    """Run TTS's compute_embeddings.py for the dataset, writing an embeddings JSON.

    Args:
        tts_root_dir: root of the TTS checkout containing TTS/bin/.
        spkr_emb_model: path to the speaker-encoder checkpoint.
        spkr_emb_config: path to the speaker-encoder config.
        config_dataset: dataset config JSON produced by write_config_dataset.
        out_emb_json: destination path for the computed embeddings.
    """
    script = f"{tts_root_dir}/TTS/bin/compute_embeddings.py"
    cmd = ["python", script, "--no_eval", "True"]
    cmd += [spkr_emb_model, spkr_emb_config, config_dataset]
    cmd += ["--output_path", out_emb_json]
    # Echo the exact command, then its captured stdout, for traceability.
    print(" ".join(cmd))
    print(subprocess.check_output(cmd).decode("utf-8"))
|
||||
|
||||
def compose_dataset(embeddings_json, df, label, out_array_path):
    """Pair speaker embeddings with one-hot labels and save shuffled .npy arrays.

    Also writes an inverse-frequency class-weight dict as a pickle.

    Args:
        embeddings_json: JSON mapping wav path -> {"embedding": [...]}.
        df: metadata frame with a `path` column of .mp3 filenames.
        label: attribute column used as the classification target.
        out_array_path: prefix for the emitted _data.npy/_labels.npy/-weights.pkl.
    """
    with open(embeddings_json) as f:
        embs = json.load(f)
    feats = []
    raw_labels = []
    for mp3_path in tqdm(df.path):
        # Embeddings are keyed by the converted .wav name.
        wav_key = mp3_path.split(".mp3")[0] + ".wav"
        feats.append(embs[wav_key]["embedding"])
        raw_labels.append(df[df["path"] == mp3_path][label].item())
    values = np.array(raw_labels)
    integer_encoded = LabelEncoder().fit_transform(values)
    print(np.unique(values, return_counts=True), np.unique(integer_encoded))
    # NOTE(review): `sparse=False` is the pre-1.2 scikit-learn spelling
    # (renamed to `sparse_output` later) — confirm the pinned sklearn version.
    onehot_encoder = OneHotEncoder(sparse=False)
    onehot = onehot_encoder.fit_transform(integer_encoded.reshape(len(integer_encoded), 1))

    # Shuffle embedding/label pairs together before saving.
    paired = list(zip(feats, onehot))
    random.shuffle(paired)
    data, labels = zip(*paired)
    data_name = f"{out_array_path}_data.npy"
    label_name = f"{out_array_path}_labels.npy"
    np.save(data_name, data)
    np.save(label_name, labels)
    # Inverse-frequency class weights: rarer classes get larger weights.
    _, counts = np.unique(values, return_counts=True)
    weight = {}
    for cls in np.unique(integer_encoded):
        weight[cls] = (1 / counts[cls]) * (len(values) / 2.0)
    print(weight)
    with open(f"{out_array_path}-weights.pkl", "wb") as f:
        pickle.dump(weight, f)
    print(f"Data: {np.array(data).shape} ,{data_name} \n Labels: {np.array(labels).shape} , {label_name}")
|
||||
|
||||
|
||||
def main():
    """Prepare CommonVoice data for speaker-attribute classification.

    Pipeline: load the tsv, keep attribute values with enough speakers,
    split train/test by speaker, convert mp3 to wav, compute speaker
    embeddings with TTS, and compose .npy training arrays per split.
    """
    parser = argparse.ArgumentParser(
        # BUG FIX: "scirpt" -> "script" in the user-facing help text.
        description="A script to prepare CV data for speaker embedding classification.\n"
        "Example runs:\n"
        "python cv_data_processing.py --data /datasets/cv/8.0/en/train.tsv --attribute age --out_dir result --num_rec 100 --tts_root_dir /mount-storage/TTS/TTS --spkr_emb_model models/model_se.pth.tar --spkr_emb_config models/config_se.json",
        formatter_class=RawTextHelpFormatter,
    )
    parser.add_argument("--data", help="Full path of CV data in tsv format", required=True)
    parser.add_argument(
        "--num_rec", help="Number of records to use out of --data. Supply All to use all of the records", required=True
    )
    parser.add_argument("--attribute", help="Speaker attribute to sample from", required=True)
    parser.add_argument("--out_dir", required=True)

    parser.add_argument("--spkr_emb_model", required=True)
    parser.add_argument("--spkr_emb_config", required=True)
    parser.add_argument("--tts_root_dir", required=True)

    args = parser.parse_args()
    abs_path = "/".join(args.data.split("/")[:-1])
    data_path = os.path.join(abs_path, "clips")
    # BUG FIX: the directory was previously created at the bare --out_dir path
    # *before* out_dir was re-anchored under the dataset directory, so the
    # directory the script actually writes into was never created. Join the
    # path first, then create it.
    args.out_dir = os.path.join(abs_path, args.out_dir)
    if not os.path.exists(args.out_dir):
        os.makedirs(args.out_dir)
    df = load_df(args.data, args.num_rec)

    print(f"Data header: {list(df)}")
    assert args.attribute in list(df)
    label_dict = analyze_df(df, args.attribute)
    train_df, test_df = train_test_split(df, args.attribute, label_dict)
    for split in ["train", "test"]:
        df_subset = train_df if split == "train" else test_df
        tts_csv = os.path.join(args.out_dir, f"{args.attribute}_{split}_tts.csv")
        config_dataset = os.path.join(args.out_dir, f"{args.attribute}_{split}_config_dataset.json")
        mp3_to_wav(df_subset["path"], data_path, tts_csv, config_dataset)
        out_emb_json = os.path.join(args.out_dir, f"{args.attribute}_{split}_spkr_embs.json")
        compute_speaker_emb(args.tts_root_dir, args.spkr_emb_model, args.spkr_emb_config, config_dataset, out_emb_json)
        out_array_path = os.path.join(args.out_dir, f"{args.attribute}_{split}")
        compose_dataset(out_emb_json, df_subset, args.attribute, out_array_path)

    print("Done.")


if __name__ == "__main__":
    main()
|
||||
|
|
Loading…
Reference in New Issue