mirror of https://github.com/coqui-ai/TTS.git
add: add random noise to dataset
This commit is contained in:
parent
e36a3067e4
commit
a273b1a210
|
@ -46,6 +46,7 @@ def setup_loader(ap, is_val=False, verbose=False):
|
||||||
skip_speakers=False,
|
skip_speakers=False,
|
||||||
storage_size=c.storage["storage_size"],
|
storage_size=c.storage["storage_size"],
|
||||||
sample_from_storage_p=c.storage["sample_from_storage_p"],
|
sample_from_storage_p=c.storage["sample_from_storage_p"],
|
||||||
|
additive_noise=c.storage["additive_noise"],
|
||||||
verbose=verbose)
|
verbose=verbose)
|
||||||
# sampler = DistributedSampler(dataset) if num_gpus > 1 else None
|
# sampler = DistributedSampler(dataset) if num_gpus > 1 else None
|
||||||
loader = DataLoader(dataset,
|
loader = DataLoader(dataset,
|
||||||
|
|
|
@ -27,7 +27,7 @@
|
||||||
"trim_db": 60 // threshold for trimming silence. Set this according to your dataset.
|
"trim_db": 60 // threshold for trimming silence. Set this according to your dataset.
|
||||||
},
|
},
|
||||||
"reinit_layers": [],
|
"reinit_layers": [],
|
||||||
"loss": "ge2e", // "ge2e" to use Generalized End-to-End loss and "angleproto" to use Angular Prototypical loss (new SOTA)
|
"loss": "angleproto", // "ge2e" to use Generalized End-to-End loss and "angleproto" to use Angular Prototypical loss (new SOTA)
|
||||||
"grad_clip": 3.0, // upper limit for gradients for clipping.
|
"grad_clip": 3.0, // upper limit for gradients for clipping.
|
||||||
"epochs": 1000, // total number of epochs to train.
|
"epochs": 1000, // total number of epochs to train.
|
||||||
"lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate.
|
"lr": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate.
|
||||||
|
@ -35,12 +35,12 @@
|
||||||
"warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr"
|
"warmup_steps": 4000, // Noam decay steps to increase the learning rate from 0 to "lr"
|
||||||
"tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
|
"tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
|
||||||
"steps_plot_stats": 10, // number of steps to plot embeddings.
|
"steps_plot_stats": 10, // number of steps to plot embeddings.
|
||||||
"num_speakers_in_batch": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
|
"num_speakers_in_batch": 64, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
|
||||||
"num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values.
|
"num_loader_workers": 4, // number of training data loader processes. Don't set it too big. 4-8 are good values.
|
||||||
"wd": 0.000001, // Weight decay weight.
|
"wd": 0.000001, // Weight decay weight.
|
||||||
"checkpoint": true, // If true, it saves checkpoints per "save_step"
|
"checkpoint": true, // If true, it saves checkpoints per "save_step"
|
||||||
"save_step": 1000, // Number of training steps between saving training stats and checkpoints.
|
"save_step": 1000, // Number of training steps between saving training stats and checkpoints.
|
||||||
"print_step": 1, // Number of steps between logging training stats to the console.
|
"print_step": 20, // Number of steps between logging training stats to the console.
|
||||||
"output_path": "../../MozillaTTSOutput/checkpoints/voxceleb_librispeech/speaker_encoder/", // DATASET-RELATED: output path for all training outputs.
|
"output_path": "../../MozillaTTSOutput/checkpoints/voxceleb_librispeech/speaker_encoder/", // DATASET-RELATED: output path for all training outputs.
|
||||||
"model": {
|
"model": {
|
||||||
"input_dim": 40,
|
"input_dim": 40,
|
||||||
|
@ -51,7 +51,8 @@
|
||||||
},
|
},
|
||||||
"storage": {
|
"storage": {
|
||||||
"sample_from_storage_p": 0.42, // the probability with which we'll sample from the DataSet in-memory storage
|
"sample_from_storage_p": 0.42, // the probability with which we'll sample from the DataSet in-memory storage
|
||||||
"storage_size": 5 // the size of the in-memory storage with respect to a single batch
|
"storage_size": 5, // the size of the in-memory storage with respect to a single batch
|
||||||
|
"additive_noise": 1e-5 // standard deviation of the zero-mean Gaussian noise added to each waveform in order to increase robustness; 0 disables it
|
||||||
},
|
},
|
||||||
"datasets":
|
"datasets":
|
||||||
[
|
[
|
||||||
|
@ -60,42 +61,42 @@
|
||||||
"path": "../../audio-datasets/en/VCTK-Corpus/",
|
"path": "../../audio-datasets/en/VCTK-Corpus/",
|
||||||
"meta_file_train": null,
|
"meta_file_train": null,
|
||||||
"meta_file_val": null
|
"meta_file_val": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "libri_tts",
|
||||||
|
"path": "../../audio-datasets/en/LibriTTS/train-clean-100",
|
||||||
|
"meta_file_train": null,
|
||||||
|
"meta_file_val": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "libri_tts",
|
||||||
|
"path": "../../audio-datasets/en/LibriTTS/train-clean-360",
|
||||||
|
"meta_file_train": null,
|
||||||
|
"meta_file_val": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "libri_tts",
|
||||||
|
"path": "../../audio-datasets/en/LibriTTS/train-other-500",
|
||||||
|
"meta_file_train": null,
|
||||||
|
"meta_file_val": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "voxceleb1",
|
||||||
|
"path": "../../audio-datasets/en/voxceleb1/",
|
||||||
|
"meta_file_train": null,
|
||||||
|
"meta_file_val": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "voxceleb2",
|
||||||
|
"path": "../../audio-datasets/en/voxceleb2/",
|
||||||
|
"meta_file_train": null,
|
||||||
|
"meta_file_val": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "common_voice_wav",
|
||||||
|
"path": "../../audio-datasets/en/MozillaCommonVoice",
|
||||||
|
"meta_file_train": "train.tsv",
|
||||||
|
"meta_file_val": "test.tsv"
|
||||||
}
|
}
|
||||||
// {
|
|
||||||
// "name": "libri_tts",
|
|
||||||
// "path": "../../audio-datasets/en/LibriTTS/train-clean-100",
|
|
||||||
// "meta_file_train": null,
|
|
||||||
// "meta_file_val": null
|
|
||||||
// },
|
|
||||||
// {
|
|
||||||
// "name": "libri_tts",
|
|
||||||
// "path": "../../audio-datasets/en/LibriTTS/train-clean-360",
|
|
||||||
// "meta_file_train": null,
|
|
||||||
// "meta_file_val": null
|
|
||||||
// },
|
|
||||||
// {
|
|
||||||
// "name": "libri_tts",
|
|
||||||
// "path": "../../audio-datasets/en/LibriTTS/train-other-500",
|
|
||||||
// "meta_file_train": null,
|
|
||||||
// "meta_file_val": null
|
|
||||||
// },
|
|
||||||
// {
|
|
||||||
// "name": "voxceleb1",
|
|
||||||
// "path": "../../audio-datasets/en/voxceleb1/",
|
|
||||||
// "meta_file_train": null,
|
|
||||||
// "meta_file_val": null
|
|
||||||
// },
|
|
||||||
// {
|
|
||||||
// "name": "voxceleb2",
|
|
||||||
// "path": "../../audio-datasets/en/voxceleb2/",
|
|
||||||
// "meta_file_train": null,
|
|
||||||
// "meta_file_val": null
|
|
||||||
// },
|
|
||||||
// {
|
|
||||||
// "name": "common_voice_wav",
|
|
||||||
// "path": "../../audio-datasets/en/MozillaCommonVoice",
|
|
||||||
// "meta_file_train": "train.tsv",
|
|
||||||
// "meta_file_val": "test.tsv"
|
|
||||||
// }
|
|
||||||
]
|
]
|
||||||
}
|
}
|
|
@ -1,3 +1,4 @@
|
||||||
|
import numpy
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import queue
|
import queue
|
||||||
import torch
|
import torch
|
||||||
|
@ -8,7 +9,7 @@ from tqdm import tqdm
|
||||||
|
|
||||||
class MyDataset(Dataset):
|
class MyDataset(Dataset):
|
||||||
def __init__(self, ap, meta_data, voice_len=1.6, num_speakers_in_batch=64,
|
def __init__(self, ap, meta_data, voice_len=1.6, num_speakers_in_batch=64,
|
||||||
storage_size=1, sample_from_storage_p=0.5,
|
storage_size=1, sample_from_storage_p=0.5, additive_noise=0,
|
||||||
num_utter_per_speaker=10, skip_speakers=False, verbose=False):
|
num_utter_per_speaker=10, skip_speakers=False, verbose=False):
|
||||||
"""
|
"""
|
||||||
Args:
|
Args:
|
||||||
|
@ -29,6 +30,7 @@ class MyDataset(Dataset):
|
||||||
self.__parse_items()
|
self.__parse_items()
|
||||||
self.storage = queue.Queue(maxsize=storage_size*num_speakers_in_batch)
|
self.storage = queue.Queue(maxsize=storage_size*num_speakers_in_batch)
|
||||||
self.sample_from_storage_p = float(sample_from_storage_p)
|
self.sample_from_storage_p = float(sample_from_storage_p)
|
||||||
|
self.additive_noise = float(additive_noise)
|
||||||
if self.verbose:
|
if self.verbose:
|
||||||
print("\n > DataLoader initialization")
|
print("\n > DataLoader initialization")
|
||||||
print(f" | > Storage Size: {self.storage.maxsize} speakers, each with {num_utter_per_speaker} utters")
|
print(f" | > Storage Size: {self.storage.maxsize} speakers, each with {num_utter_per_speaker} utters")
|
||||||
|
@ -150,6 +152,11 @@ class MyDataset(Dataset):
|
||||||
# put the newly loaded item into storage
|
# put the newly loaded item into storage
|
||||||
self.storage.put_nowait((wavs_, labels_))
|
self.storage.put_nowait((wavs_, labels_))
|
||||||
|
|
||||||
|
# add random gaussian noise
|
||||||
|
if self.additive_noise > 0:
|
||||||
|
noises_ = [numpy.random.normal(0, self.additive_noise, size=len(w)) for w in wavs_]
|
||||||
|
wavs_ = [wavs_[i] + noises_[i] for i in range(len(wavs_))]
|
||||||
|
|
||||||
# get a random seq_len-long segment of each of the wavs and convert it to a mel spectrogram.
|
# get a random seq_len-long segment of each of the wavs and convert it to a mel spectrogram.
|
||||||
offsets_ = [random.randint(0, wav.shape[0] - self.seq_len) for wav in wavs_]
|
offsets_ = [random.randint(0, wav.shape[0] - self.seq_len) for wav in wavs_]
|
||||||
mels_ = [self.ap.melspectrogram(wavs_[i][offsets_[i]: offsets_[i] + self.seq_len]) for i in range(len(wavs_))]
|
mels_ = [self.ap.melspectrogram(wavs_[i][offsets_[i]: offsets_[i] + self.seq_len]) for i in range(len(wavs_))]
|
||||||
|
|
Loading…
Reference in New Issue