mirror of https://github.com/coqui-ai/TTS.git
fixed bugs, added classifier model in PyTorch
This commit is contained in:
parent 1187d4df5f
commit 134ed5ece6
@@ -0,0 +1,116 @@
import argparse
import os

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from clearml import Task
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score


class baseline_model(nn.Module):
    def __init__(self, class_num, emb_dim):
        super().__init__()
        self.fc1 = nn.Linear(emb_dim, 1024)
        self.fc2 = nn.Linear(1024, 512)
        self.fc3 = nn.Linear(512, 128)
        self.fc4 = nn.Linear(128, 64)
        self.fc5 = nn.Linear(64, class_num)
        self.dropout1 = nn.Dropout(0.2)
        self.dropout2 = nn.Dropout(0.5)

    def forward(self, x):
        output = self.dropout1(F.relu(self.fc1(x)))
        output = self.dropout2(F.relu(self.fc2(output)))
        output = F.relu(self.fc3(output))
        output = F.relu(self.fc4(output))
        # Return raw logits: nn.CrossEntropyLoss applies log-softmax internally,
        # so an explicit softmax (especially over the batch dimension) would be wrong here.
        output = self.fc5(output)
        return output


def load_data(train, labels):
    # Both arrays are stored as .npy files by the data-preparation script.
    t = torch.Tensor(np.load(train))
    l = torch.Tensor(np.load(labels))
    dataset = torch.utils.data.TensorDataset(t, l)
    return dataset


def compute_metrics(y_pred, y_true):
    acc = accuracy_score(y_true.argmax(axis=1), y_pred.argmax(axis=1))
    print(f"Model accuracy on held-out data: {acc}")
    conf_matrix = confusion_matrix(y_true.argmax(axis=1), y_pred.argmax(axis=1))
    print(f"Confusion matrix on {y_true.shape[-1]} classes: \n {conf_matrix}")
    f1 = f1_score(y_true.argmax(axis=1), y_pred.argmax(axis=1), average="weighted")
    print(f"Weighted f1 score: {f1}")


def test_model(test, test_labels, model):
    # load_data returns a TensorDataset; unpack the underlying tensors directly.
    inputs, y_true = load_data(test, test_labels).tensors
    model.eval()
    with torch.no_grad():
        y_pred = model(inputs)
    compute_metrics(y_pred.numpy(), y_true.numpy())


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--data", required=True)
    parser.add_argument("--labels", required=True)
    parser.add_argument("--mode", help="train or evaluate", required=True)
    parser.add_argument("--embedding_dim", type=int, default=512)
    parser.add_argument("--load_ckpt")
    parser.add_argument("--class_weights")
    parser.add_argument("--epoch", type=int, default=20)
    parser.add_argument("--batch", type=int, default=50)
    parser.add_argument("--clearml_project", default="YourTTS-sprint2")
    parser.add_argument("--clearml_task", default="attribute-classifier")

    args = parser.parse_args()
    Task.init(project_name=args.clearml_project, task_name=f"{args.clearml_task}-{args.labels}")

    # Load data
    dataset = load_data(args.data, args.labels)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=args.batch, shuffle=True, num_workers=2)

    # Initialize model
    class_num = len(np.unique(np.load(args.labels), axis=0))
    model = baseline_model(class_num, args.embedding_dim)
    # Labels are one-hot float vectors; CrossEntropyLoss accepts class-probability targets.
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

    # Train
    for epoch in range(args.epoch):
        for i, data in enumerate(dataloader, 0):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # print statistics
            print(f"[Epoch:{epoch + 1}, step:{i + 1:5d}] loss: {loss.item():.3f}")

    checkpoint_dir = "checkpoint"
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    checkpoint_filepath = f"{checkpoint_dir}/checkpoint.pth"
    torch.save(model.state_dict(), checkpoint_filepath)


if __name__ == "__main__":
    main()
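
A minimal usage sketch for the script above (not part of the commit): the .npy paths assume the example --out_dir result and attribute age from the data-preparation script below, and the default --embedding_dim of 512.

# Hypothetical invocation of the training script (its file name is not shown in this commit):
#   python train_classifier.py --data result/age_train_data.npy --labels result/age_train_labels.npy --mode train
#
# Evaluation sketch reusing baseline_model and test_model defined above:
import numpy as np
import torch

test_data = "result/age_test_data.npy"      # written by compose_dataset (assumed path)
test_labels = "result/age_test_labels.npy"  # written by compose_dataset (assumed path)

class_num = len(np.unique(np.load(test_labels), axis=0))
model = baseline_model(class_num, emb_dim=512)
model.load_state_dict(torch.load("checkpoint/checkpoint.pth"))
test_model(test_data, test_labels, model)
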
@@ -4,49 +4,52 @@ import os
import pickle
import random
import subprocess
from argparse import RawTextHelpFormatter

import numpy as np
import pandas as pd
from pydub import AudioSegment
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.utils import shuffle
from tqdm import tqdm


def load_df(filename, n):
    # Read the Common Voice .tsv; n == "All" keeps every record, otherwise sample n rows.
    if n == "All":
        df = pd.read_csv(filename, sep="\t")
    else:
        df = shuffle(pd.read_csv(filename, sep="\t")).head(n=int(n))
    return df


def analyze_df(df, label):
    # Collect the unique speakers for every non-empty, non-"other" value of the attribute.
    label_dict = {}
    df_filtered = df[df[label].notnull() & df[label].notna()]
    df_final = df_filtered[df_filtered[label] != "other"][label]
    for ac in df_final.unique():
        speakers = df[df[label] == ac]["client_id"].unique()
        no_speakers = len(speakers)
        label_dict[ac] = speakers
        print(f'"{ac}" unique speakers no.: {no_speakers}')
    return label_dict


def train_test_split(df, label, label_dict, split=0.1):
    # Split by speaker so that no client_id appears in both train and test.
    print(len(label_dict.keys()), label_dict.keys())
    train = pd.DataFrame()
    test = pd.DataFrame()
    for l in label_dict.keys():
        spkrs = label_dict[l]
        train_spkrs = spkrs[: int(len(spkrs) * (1 - split))]
        test_spkrs = spkrs[int(len(spkrs) * (1 - split)) :]
        train = pd.concat([train, df[df.client_id.isin(train_spkrs)]])
        test = pd.concat([test, df[df.client_id.isin(test_spkrs)]])
    train = train[train[label] != "other"]
    test = test[test[label] != "other"]
    return train, test


def mp3_to_wav(mp3_list, data_path, data_split_path, json_file):
    waves = []
    for i in tqdm(mp3_list):
        sound = AudioSegment.from_mp3(f"{data_path}/{i}")
@@ -54,133 +57,128 @@ def mp3_to_wav(mp3_list,data_path,data_split_path,json_file):
        waves.append(wav)
        sound.export(wav, format="wav")

    # Write the pipe-separated metadata file referenced by the dataset config.
    with open(f"{data_split_path}", "w") as f:
        f.write("wav_filename|gender|text|speaker_name\n")
        for i, j in enumerate(waves):
            f.write(f"{j}|m|blabla|ID_{i}\n")
    write_config_dataset(data_path, data_split_path, json_file)


def write_config_dataset(data_path, data_split_path, json_path):
    # Emit a minimal VITS dataset config pointing at the generated metadata file.
    cwd = os.getcwd()
    data_split_full_path = os.path.join(cwd, data_split_path)
    data = {
        "model": "vits",
        "datasets": [
            {
                "name": "brspeech",
                "path": data_path,
                "meta_file_train": data_split_full_path,
                "language": "en",
                "meta_file_val": "null",
                "meta_file_attn_mask": "",
            }
        ],
    }
    with open(json_path, "w") as outfile:
        json.dump(data, outfile)


def compute_speaker_emb(tts_root_dir, spkr_emb_model, spkr_emb_config, config_dataset, out_emb_json):
    # Run TTS/bin/compute_embeddings.py as a subprocess to extract speaker embeddings
    # for every wav referenced by the dataset config.
    cmd = [
        "python",
        f"{tts_root_dir}/TTS/bin/compute_embeddings.py",
        "--no_eval",
        "True",
        spkr_emb_model,
        spkr_emb_config,
        config_dataset,
        "--output_path",
        out_emb_json,
    ]
    print(" ".join(cmd))
    print(subprocess.check_output(cmd).decode("utf-8"))


def compose_dataset(embeddings_json, df, label, out_array_path):
    # Pair each speaker embedding with its one-hot encoded attribute label and save
    # both as .npy arrays, plus inverse-frequency class weights as a pickle.
    with open(embeddings_json) as f:
        embs = json.load(f)
    e = []
    l = []
    for i in tqdm(df.path):
        id_ = i.split(".mp3")[0] + ".wav"
        e.append(embs[id_]["embedding"])
        l.append(df[df["path"] == i][label].item())

    values = np.array(l)
    label_encoder = LabelEncoder()
    integer_encoded = label_encoder.fit_transform(values)
    print(np.unique(values, return_counts=True), np.unique(integer_encoded))
    onehot_encoder = OneHotEncoder(sparse=False)
    integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
    onehot = onehot_encoder.fit_transform(integer_encoded)

    d = list(zip(e, onehot))
    random.shuffle(d)
    data, labels = zip(*d)
    data_name = f"{out_array_path}_data.npy"
    label_name = f"{out_array_path}_labels.npy"
    np.save(data_name, data)
    np.save(label_name, labels)
    _, counts = np.unique(values, return_counts=True)
    weight = {}
    for i in np.unique(integer_encoded):
        weight[i] = (1 / counts[i]) * (len(values) / 2.0)
    print(weight)
    with open(f"{out_array_path}-weights.pkl", "wb") as f:
        pickle.dump(weight, f)
    print(f"Data: {np.array(data).shape}, {data_name} \n Labels: {np.array(labels).shape}, {label_name}")
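
The classifier script above parses a --class_weights argument that this commit does not wire up; a possible sketch (assumed output path, not part of the commit) for feeding these pickled weights into the loss:

# Sketch only: using compose_dataset's inverse-frequency weights in the loss (assumed path).
import pickle

import torch
import torch.nn as nn

with open("result/age_train-weights.pkl", "rb") as f:  # "{out_array_path}-weights.pkl"
    weight = pickle.load(f)  # dict: integer class index -> weight
weight_tensor = torch.tensor([weight[i] for i in sorted(weight)], dtype=torch.float32)
criterion = nn.CrossEntropyLoss(weight=weight_tensor)
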


def main():
    parser = argparse.ArgumentParser(
        description="A script to prepare CV data for speaker embedding classification.\n"
        "Example runs:\n"
        "python cv_data_processing.py --data /datasets/cv/8.0/en/train.tsv --attribute age --out_dir result --num_rec 100 --tts_root_dir /mount-storage/TTS/TTS --spkr_emb_model models/model_se.pth.tar --spkr_emb_config models/config_se.json",
        formatter_class=RawTextHelpFormatter,
    )
    parser.add_argument("--data", help="Full path of CV data in tsv format", required=True)
    parser.add_argument(
        "--num_rec", help="Number of records to use out of --data. Supply All to use all of the records", required=True
    )
    parser.add_argument("--attribute", help="Speaker attribute to sample from", required=True)
    parser.add_argument("--out_dir", required=True)

    parser.add_argument("--spkr_emb_model", required=True)
    parser.add_argument("--spkr_emb_config", required=True)
    parser.add_argument("--tts_root_dir", required=True)

    args = parser.parse_args()

    abs_path = "/".join(args.data.split("/")[:-1])
    data_path = os.path.join(abs_path, "clips")
    if not os.path.exists(args.out_dir):
        os.makedirs(args.out_dir)
    df = load_df(args.data, args.num_rec)

    print(f"Data header: {list(df)}")
    assert args.attribute in list(df)
    label_dict = analyze_df(df, args.attribute)
    train_df, test_df = train_test_split(df, args.attribute, label_dict)

    for split in ["train", "test"]:
        if split == "train":
            df_subset = train_df
        else:
            df_subset = test_df
        tts_csv = os.path.join(args.out_dir, f"{args.attribute}_{split}_tts.csv")
        config_dataset = os.path.join(args.out_dir, f"{args.attribute}_{split}_config_dataset.json")
        mp3_to_wav(df_subset["path"], data_path, tts_csv, config_dataset)
        out_emb_json = os.path.join(args.out_dir, f"{args.attribute}_{split}_spkr_embs.json")
        compute_speaker_emb(args.tts_root_dir, args.spkr_emb_model, args.spkr_emb_config, config_dataset, out_emb_json)
        out_array_path = os.path.join(args.out_dir, f"{args.attribute}_{split}")
        compose_dataset(out_emb_json, df_subset, args.attribute, out_array_path)

    print("Done.")


if __name__ == "__main__":
    main()