mirror of https://github.com/coqui-ai/TTS.git
add: Configurable encoder dataset storage to reduce disk I/O
add: Averaged time for data loader to console and Tensorboard output
parent 95d2906307
commit 1511076fde
@@ -44,6 +44,8 @@ def setup_loader(ap, is_val=False, verbose=False):
                         voice_len=1.6,
                         num_utter_per_speaker=10,
                         skip_speakers=False,
+                        storage_size=c.storage["storage_size"],
+                        sample_from_storage_p=c.storage["sample_from_storage_p"],
                         verbose=verbose)
     # sampler = DistributedSampler(dataset) if num_gpus > 1 else None
     loader = DataLoader(dataset,
@@ -60,6 +62,7 @@ def train(model, criterion, optimizer, scheduler, ap, global_step):
     epoch_time = 0
     best_loss = float('inf')
     avg_loss = 0
+    avg_loader_time = 0
     end_time = time.time()
     for _, data in enumerate(data_loader):
         start_time = time.time()
@@ -93,8 +96,12 @@ def train(model, criterion, optimizer, scheduler, ap, global_step):
         step_time = time.time() - start_time
         epoch_time += step_time

-        avg_loss = 0.01 * loss.item(
-        ) + 0.99 * avg_loss if avg_loss != 0 else loss.item()
+        # Averaged Loss and Averaged Loader Time
+        dataset_number_prefetched = 2 * c.num_loader_workers # this is hardcoded in pytorch
+        avg_loss = 0.01 * loss.item() \
+            + 0.99 * avg_loss if avg_loss != 0 else loss.item()
+        avg_loader_time = 1/dataset_number_prefetched * loader_time \
+            + (dataset_number_prefetched-1) / dataset_number_prefetched * avg_loader_time if avg_loader_time != 0 else loader_time
         current_lr = optimizer.param_groups[0]['lr']

         if global_step % c.steps_plot_stats == 0:
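
Both running averages in the hunk above are exponential moving averages, just with different weights on the newest sample: avg_loss keeps 99% of its previous value, while avg_loader_time weights the newest loader_time by 1/dataset_number_prefetched, since the PyTorch DataLoader keeps roughly 2 * num_workers batches in flight by default, so a single timing reading only reflects part of the real loading cost. A minimal sketch of the same update rule, with illustrative names and made-up timings that are not part of the commit:

def running_average(prev_avg, new_value, new_weight):
    """Exponential moving average; the first sample (prev_avg == 0) is taken as-is."""
    if prev_avg == 0:
        return new_value
    return new_weight * new_value + (1.0 - new_weight) * prev_avg

num_loader_workers = 4                        # assumed config value
prefetched = 2 * num_loader_workers           # default DataLoader prefetch depth
avg_loader_time = 0
for loader_time in (0.81, 0.05, 0.07, 0.92):  # made-up per-step timings
    avg_loader_time = running_average(avg_loader_time, loader_time, 1.0 / prefetched)
print(f"smoothed loader time: {avg_loader_time:.3f}s")
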
@@ -103,7 +110,8 @@ def train(model, criterion, optimizer, scheduler, ap, global_step):
                 "loss": avg_loss,
                 "lr": current_lr,
                 "grad_norm": grad_norm,
-                "step_time": step_time
+                "step_time": step_time,
+                "loader_time": loader_time
             }
             tb_logger.tb_train_epoch_stats(global_step, train_stats)
             figures = {
@@ -116,9 +124,9 @@ def train(model, criterion, optimizer, scheduler, ap, global_step):
         if global_step % c.print_step == 0:
             print(
                 " | > Step:{} Loss:{:.5f} AvgLoss:{:.5f} GradNorm:{:.5f} "
-                "StepTime:{:.2f} LoaderTime:{:.2f} LR:{:.6f}".format(
+                "StepTime:{:.2f} LoaderTime:{:.2f} AvGLoaderTime:{:.2f} LR:{:.6f}".format(
                     global_step, loss.item(), avg_loss, grad_norm, step_time,
-                    loader_time, current_lr),
+                    loader_time, avg_loader_time, current_lr),
                 flush=True)

         # save best model
@@ -23,7 +23,7 @@
         "clip_norm": true, // clip normalized values into the range.
         "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
         "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!!
-        "do_trim_silence": false, // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
+        "do_trim_silence": true, // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
         "trim_db": 60 // threshold for timming silence. Set this according to your dataset.
     },
     "reinit_layers": [],
@@ -45,53 +45,57 @@
     "model": {
         "input_dim": 40,
         "proj_dim": 256,
-        "lstm_dim": 256,
+        "lstm_dim": 768,
         "num_lstm_layers": 3,
-        "use_lstm_with_projection": false
+        "use_lstm_with_projection": true
+    },
+    "storage": {
+        "sample_from_storage_p": 0.42, // the probability with which we'll sample from the DataSet in-memory storage
+        "storage_size": 5 // the size of the in-memory storage with respect to a single batch
     },
     "datasets":
         [
             {
-                "name": "common_voice_wav",
-                "path": "../../audio-datasets/en/MozillaCommonVoice",
-                "meta_file_train": "train.tsv",
-                "meta_file_val": "test.tsv"
-            },
-            {
-                "name": "voxceleb1",
-                "path": "../../audio-datasets/en/voxceleb1/",
-                "meta_file_train": null,
-                "meta_file_val": null
-            },
-            {
-                "name": "voxceleb2",
-                "path": "../../audio-datasets/en/voxceleb2/",
-                "meta_file_train": null,
-                "meta_file_val": null
-            },
-            {
-                "name": "vctk",
+                "name": "vctk_slim",
                 "path": "../../audio-datasets/en/VCTK-Corpus/",
                 "meta_file_train": null,
                 "meta_file_val": null
-            },
-            {
-                "name": "libri_tts",
-                "path": "../../audio-datasets/en/LibriTTS/train-clean-100",
-                "meta_file_train": null,
-                "meta_file_val": null
-            },
-            {
-                "name": "libri_tts",
-                "path": "../../audio-datasets/en/LibriTTS/train-clean-360",
-                "meta_file_train": null,
-                "meta_file_val": null
-            },
-            {
-                "name": "libri_tts",
-                "path": "../../audio-datasets/en/LibriTTS/train-other-500",
-                "meta_file_train": null,
-                "meta_file_val": null
             }
+            // {
+            //     "name": "libri_tts",
+            //     "path": "../../audio-datasets/en/LibriTTS/train-clean-100",
+            //     "meta_file_train": null,
+            //     "meta_file_val": null
+            // },
+            // {
+            //     "name": "libri_tts",
+            //     "path": "../../audio-datasets/en/LibriTTS/train-clean-360",
+            //     "meta_file_train": null,
+            //     "meta_file_val": null
+            // },
+            // {
+            //     "name": "libri_tts",
+            //     "path": "../../audio-datasets/en/LibriTTS/train-other-500",
+            //     "meta_file_train": null,
+            //     "meta_file_val": null
+            // },
+            // {
+            //     "name": "voxceleb1",
+            //     "path": "../../audio-datasets/en/voxceleb1/",
+            //     "meta_file_train": null,
+            //     "meta_file_val": null
+            // },
+            // {
+            //     "name": "voxceleb2",
+            //     "path": "../../audio-datasets/en/voxceleb2/",
+            //     "meta_file_train": null,
+            //     "meta_file_val": null
+            // },
+            // {
+            //     "name": "common_voice_wav",
+            //     "path": "../../audio-datasets/en/MozillaCommonVoice",
+            //     "meta_file_train": "train.tsv",
+            //     "meta_file_val": "test.tsv"
+            // }
         ]
 }
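
The config file uses // line comments, so it is not plain JSON; the training script reads it through the project's own config loader and then forwards the two new "storage" keys into the dataset constructor (see the setup_loader hunk above). A rough, self-contained sketch of that wiring, assuming a naive comment-stripping loader rather than the repository's actual helper:

import json
import re

def load_jsonc(path):
    """Read a JSON file that may contain // line comments (naive: assumes no '//' inside string values)."""
    with open(path, encoding="utf-8") as f:
        text = f.read()
    return json.loads(re.sub(r"//.*", "", text))

c = load_jsonc("config.json")                        # path is a placeholder
storage_size = c["storage"]["storage_size"]          # 5 batches worth of speakers
sample_p = c["storage"]["sample_from_storage_p"]     # 0.42
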
@@ -1,4 +1,5 @@
 import numpy as np
+import queue
 import torch
 import random
 from torch.utils.data import Dataset
@@ -7,6 +8,7 @@ from tqdm import tqdm

 class MyDataset(Dataset):
     def __init__(self, ap, meta_data, voice_len=1.6, num_speakers_in_batch=64,
+                 storage_size=1, sample_from_storage_p=0.5,
                  num_utter_per_speaker=10, skip_speakers=False, verbose=False):
         """
         Args:
@@ -25,8 +27,12 @@ class MyDataset(Dataset):
         self.ap = ap
         self.verbose = verbose
         self.__parse_items()
+        self.storage = queue.Queue(maxsize=storage_size*num_speakers_in_batch)
+        self.sample_from_storage_p = float(sample_from_storage_p)
         if self.verbose:
             print("\n > DataLoader initialization")
+            print(f" | > Storage Size: {self.storage.maxsize} speakers, each with {num_utter_per_speaker} utters")
+            print(f" | > Sample_from_storage_p : {self.sample_from_storage_p}")
             print(f" | > Number of instances : {len(self.items)}")
             print(f" | > Sequence length: {self.seq_len}")
             print(f" | > Num speakers: {len(self.speakers)}")
@@ -134,7 +140,17 @@ class MyDataset(Dataset):
         labels = []
         feats = []
         for speaker in batch:
+            if random.random() < self.sample_from_storage_p and self.storage.full():
+                # sample from storage (if full), ignoring the speaker
+                feats_, labels_ = random.choice(self.storage.queue)
+            else:
+                # don't sample from storage, but from HDD
                 feats_, labels_ = self.__sample_speaker_utterances(speaker)
+                # if storage is full, remove an item
+                if self.storage.full():
+                    _ = self.storage.get_nowait()
+                # put the newly loaded item into storage
+                self.storage.put_nowait((feats_, labels_))
             labels.append(labels_)
             feats.extend(feats_)
         feats = torch.stack(feats)
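
The collate changes above trade disk I/O for memory: with probability sample_from_storage_p the loader reuses a cached (feats, labels) pair from a bounded queue.Queue, ignoring which speaker was requested; otherwise it reads from disk, evicts the oldest cached item if the queue is full, and caches the fresh one. A stripped-down sketch of that caching pattern, with the disk read stubbed out and sizes chosen only for illustration:

import queue
import random

storage = queue.Queue(maxsize=5 * 64)    # storage_size * num_speakers_in_batch
sample_from_storage_p = 0.42

def load_from_disk(speaker):
    # stand-in for MyDataset.__sample_speaker_utterances(speaker)
    return [f"feat_of_{speaker}"], [speaker]

def fetch(speaker):
    if random.random() < sample_from_storage_p and storage.full():
        # reuse a random cached item; the requested speaker is ignored in this branch
        return random.choice(list(storage.queue))
    feats, labels = load_from_disk(speaker)
    if storage.full():
        storage.get_nowait()              # drop the oldest cached item
    storage.put_nowait((feats, labels))   # cache the freshly loaded one
    return feats, labels

batch = [fetch(speaker) for speaker in ("p225", "p226", "p227")]
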
@@ -23,7 +23,7 @@ def save_checkpoint(model, optimizer, model_loss, out_path,

 def save_best_model(model, optimizer, model_loss, best_loss, out_path,
                     current_step):
-    if model_loss < best_loss:
+    if model_loss < best_loss and current_step > 1000:
         new_state_dict = model.state_dict()
         state = {
             'model': new_state_dict,
@@ -35,7 +35,7 @@ def save_best_model(model, optimizer, model_loss, best_loss, out_path,
         best_loss = model_loss
         bestmodel_path = 'best_model.pth.tar'
         bestmodel_path = os.path.join(out_path, bestmodel_path)
-        print("\n > BEST MODEL ({0:.5f}) : {1:}".format(
-            model_loss, bestmodel_path))
+        print("\n > NEW BEST MODEL ({0:.5f}) : {1:}".format(
+            model_loss, os.path.abspath(bestmodel_path)))
         torch.save(state, bestmodel_path)
     return best_loss
@@ -17,10 +17,10 @@ def load_meta_data(datasets):
         root_path = dataset['path']
         meta_file_train = dataset['meta_file_train']
         meta_file_val = dataset['meta_file_val']
+        print(f" | > Preprocessing {name}")
         preprocessor = get_preprocessor_by_name(name)

         meta_data_train = preprocessor(root_path, meta_file_train)
-        print(f"Found {len(meta_data_train)} files in {Path(root_path).absolute()}")
+        print(f" | > Found {len(meta_data_train)} files in {Path(root_path).resolve()}")
         if meta_file_val is None:
             meta_data_eval, meta_data_train = split_dataset(meta_data_train)
         else:
@@ -257,6 +257,25 @@ def vctk(root_path, meta_files=None, wavs_path='wav48'):
     return items


+def vctk_slim(root_path, meta_files=None, wavs_path='wav48'):
+    test_speakers = meta_files
+    """homepages.inf.ed.ac.uk/jyamagis/release/VCTK-Corpus.tar.gz"""
+    items = []
+    meta_files = glob(f"{os.path.join(root_path,'txt')}/**/*.txt", recursive=True)
+    for meta_file in meta_files:
+        _, speaker_id, txt_file = os.path.relpath(meta_file,
+                                                  root_path).split(os.sep)
+        file_id = txt_file.split('.')[0]
+        if isinstance(test_speakers, list):  # if is list ignore this speakers ids
+            if speaker_id in test_speakers:
+                continue
+        wav_file = os.path.join(root_path, wavs_path, speaker_id,
+                                file_id + '.wav')
+        items.append([None, wav_file, 'VCTK_' + speaker_id])
+
+    return items
+
+
 # ======================================== VOX CELEB ===========================================
 def voxceleb2(root_path, meta_file):
     """
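
vctk_slim walks the VCTK txt/ tree only to enumerate utterance IDs and returns [text, wav_path, speaker_name] entries with the transcript left as None, which is all the speaker encoder needs. A hypothetical usage sketch, assuming the function above is importable from the preprocess module and a local VCTK layout (paths and speaker IDs are placeholders):

# Assumes vctk_slim() from the hunk above; speakers passed via meta_files are skipped as test speakers.
items = vctk_slim("../../audio-datasets/en/VCTK-Corpus/", meta_files=["p225", "p226"])
# Each entry looks roughly like:
#   [None, "../../audio-datasets/en/VCTK-Corpus/wav48/p227/p227_001.wav", "VCTK_p227"]
print(len(items), "utterances from", len({spk for _, _, spk in items}), "speakers")
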