Add audio data augmentation: additive noise and RIR

This commit is contained in:
Edresson 2021-05-11 00:59:57 -03:00
parent 77d85c6cc5
commit 85ccad7e0a
5 changed files with 217 additions and 20 deletions

View File

@ -51,8 +51,8 @@ def setup_loader(ap: AudioProcessor, is_val: bool = False, verbose: bool = False
skip_speakers=getattr(c, "skip_speakers", False),
storage_size=c.storage["storage_size"],
sample_from_storage_p=c.storage["sample_from_storage_p"],
additive_noise=c.storage["additive_noise"],
verbose=verbose,
augmentation_config=getattr(c, "audio_augmentation", None)
)
# sampler = DistributedSampler(dataset) if num_gpus > 1 else None

View File

@ -53,6 +53,16 @@
"num_lstm_layers": 3,
"use_lstm_with_projection": true
},
"audio_augmentation": {
"p": 0,
// add Gaussian noise to the data in order to increase robustness
"gaussian":{ // Gaussian noise is cheap to compute, so it is added after the wav file is loaded; this way even audio reused from the cache receives noise
"p": 1, // probability of applying this method; 0 disables it
"min_amplitude": 0.0,
"max_amplitude": 1e-5
}
},
"storage": {
"sample_from_storage_p": 0.66, // the probability with which we'll sample from the DataSet in-memory storage
"storage_size": 15, // the size of the in-memory storage with respect to a single batch

View File

@ -25,7 +25,7 @@
"mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
"mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!!
"spec_gain": 20.0,
"do_trim_silence": false, // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
"do_trim_silence": true, // enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
"trim_db": 60, // threshold for timming silence. Set this according to your dataset.
"stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored
},
@ -41,7 +41,7 @@
"steps_plot_stats": 10, // number of steps to plot embeddings.
// Speakers config
"num_speakers_in_batch": 108, // Batch size for training.
"num_speakers_in_batch": 2, // Batch size for training.
"num_utters_per_speaker": 2, //
"skip_speakers": true, // skip speakers with samples less than "num_utters_per_speaker"
@ -54,6 +54,41 @@
"print_step": 20, // Number of steps to log traning on console.
"output_path": "../../../checkpoints/speaker_encoder/", // DATASET-RELATED: output path for all training outputs.
"audio_augmentation": {
"p": 0.75, // propability of apply this method, 0 is disable rir and additive noise augmentation
"rir":{
"rir_path": "/workspace/store/ecasanova/ComParE/RIRS_NOISES/simulated_rirs/",
"conv_mode": "full"
},
"additive":{
"sounds_path": "/workspace/store/ecasanova/ComParE/musan/",
// list of each of the directories in your data augmentation, if a directory is in "sounds_path" but is not listed here it will be ignored
"speech":{
"min_snr_in_db": 13,
"max_snr_in_db": 20,
"min_num_noises": 3,
"max_num_noises": 7
},
"noise":{
"min_snr_in_db": 0,
"max_snr_in_db": 15,
"min_num_noises": 1,
"max_num_noises": 1
},
"music":{
"min_snr_in_db": 5,
"max_snr_in_db": 15,
"min_num_noises": 1,
"max_num_noises": 1
}
},
// add Gaussian noise to the data in order to increase robustness
"gaussian":{ // Gaussian noise is cheap to compute, so it is added after the wav file is loaded; this way even audio reused from the cache receives noise
"p": 1, // probability of applying this method; 0 disables it
"min_amplitude": 0.0,
"max_amplitude": 1e-5
}
},
"model": {
"input_dim": 80,
"proj_dim": 512,
@ -63,11 +98,17 @@
},
"storage": {
"sample_from_storage_p": 0.66, // the probability with which we'll sample from the DataSet in-memory storage
"storage_size": 15, // the size of the in-memory storage with respect to a single batch
"additive_noise": 1e-5 // add very small gaussian noise to the data in order to increase robustness
"storage_size": 1 // the size of the in-memory storage with respect to a single batch
},
"datasets":
[
{
"name": "common_voice",
"path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ab/",
"meta_file_train": "train.tsv",
"meta_file_val": "test.tsv"
},
{
"name": "vctk",
"path": "/workspace/store/ecasanova/datasets/VCTK-Corpus-removed-silence/",

View File

@ -4,7 +4,7 @@ import random
import numpy as np
import torch
from torch.utils.data import Dataset
from TTS.speaker_encoder.utils.generic_utils import AugmentWAV
class MyDataset(Dataset):
def __init__(
@ -15,10 +15,11 @@ class MyDataset(Dataset):
num_speakers_in_batch=64,
storage_size=1,
sample_from_storage_p=0.5,
additive_noise=0,
additive_noise= 1e-5,
num_utter_per_speaker=10,
skip_speakers=False,
verbose=False,
augmentation_config=None
):
"""
Args:
@ -39,18 +40,27 @@ class MyDataset(Dataset):
self.__parse_items()
self.storage = queue.Queue(maxsize=storage_size * num_speakers_in_batch)
self.sample_from_storage_p = float(sample_from_storage_p)
self.additive_noise = float(additive_noise)
speakers_aux = list(self.speakers)
speakers_aux.sort()
self.speakerid_to_classid = {key : i for i, key in enumerate(speakers_aux)}
# Augmentation
self.augmentator = None
self.gaussian_augmentation_config = None
if augmentation_config:
self.data_augmentation_p = augmentation_config['p']
if self.data_augmentation_p and ('additive' in augmentation_config or 'rir' in augmentation_config):
self.augmentator = AugmentWAV(ap, augmentation_config)
if 'gaussian' in augmentation_config.keys():
self.gaussian_augmentation_config = augmentation_config['gaussian']
if self.verbose:
print("\n > DataLoader initialization")
print(f" | > Speakers per Batch: {num_speakers_in_batch}")
print(f" | > Storage Size: {self.storage.maxsize} instances, each with {num_utter_per_speaker} utters")
print(f" | > Sample_from_storage_p : {self.sample_from_storage_p}")
print(f" | > Noise added : {self.additive_noise}")
print(f" | > Number of instances : {len(self.items)}")
print(f" | > Sequence length: {self.seq_len}")
print(f" | > Num speakers: {len(self.speakers)}")
@ -151,6 +161,10 @@ class MyDataset(Dataset):
break
self.speaker_to_utters[speaker].remove(utter)
if self.augmentator is not None and self.data_augmentation_p:
if random.random() < self.data_augmentation_p:
wav = self.augmentator.apply_one(wav)
wavs.append(wav)
labels.append(self.speakerid_to_classid[speaker])
return wavs, labels
@ -201,20 +215,21 @@ class MyDataset(Dataset):
# put the newly loaded item into storage
self.storage.put_nowait((wavs_, labels_))
# add random gaussian noise
if self.additive_noise > 0:
noises_ = [np.random.normal(0, self.additive_noise, size=len(w)) for w in wavs_]
wavs_ = [wavs_[i] + noises_[i] for i in range(len(wavs_))]
# get a random subset of each of the wavs and extract mel spectrograms.
offsets_ = [random.randint(0, wav.shape[0] - self.seq_len) for wav in wavs_]
mels_ = [
self.ap.melspectrogram(wavs_[i][offsets_[i] : offsets_[i] + self.seq_len]) for i in range(len(wavs_))
]
feats_ = [torch.FloatTensor(mel) for mel in mels_]
feats_ = []
for wav in wavs_:
offset = random.randint(0, wav.shape[0] - self.seq_len)
wav = wav[offset : offset + self.seq_len]
# add random gaussian noise
if self.gaussian_augmentation_config and self.gaussian_augmentation_config['p']:
if random.random() < self.gaussian_augmentation_config['p']:
wav += np.random.normal(self.gaussian_augmentation_config['min_amplitude'], self.gaussian_augmentation_config['max_amplitude'], size=len(wav))
mel = self.ap.melspectrogram(wav)
feats_.append(torch.FloatTensor(mel))
labels.append(torch.LongTensor(labels_))
feats.extend(feats_)
feats = torch.stack(feats)
labels = torch.stack(labels)
return feats.transpose(1, 2), labels

View File

@ -2,11 +2,143 @@ import datetime
import os
import re
import numpy as np
import torch
import glob
import random
from scipy import signal
from TTS.speaker_encoder.model import SpeakerEncoder
from TTS.utils.generic_utils import check_argument
class AugmentWAV(object):
    """Augment waveforms with additive noise and/or room impulse responses (RIR).

    ``augmentation_config`` is a dict shaped like the example below. The
    ``rir`` and ``additive`` sections are both optional, and the top-level
    ``"p"`` key is not read by this class::

        {
            "p": 1,
            "rir": {
                "rir_path": "rir_path/",
                "conv_mode": "full"
            },
            "additive": {
                "sounds_path": "musan/",
                # one dict per sub-directory of sounds_path that should be used
                "speech": {
                    "min_snr_in_db": 13,
                    "max_snr_in_db": 20,
                    "min_num_noises": 3,
                    "max_num_noises": 7
                },
                "noise": {
                    "min_snr_in_db": 0,
                    "max_snr_in_db": 15,
                    "min_num_noises": 1,
                    "max_num_noises": 1
                },
                "music": {
                    "min_snr_in_db": 5,
                    "max_snr_in_db": 15,
                    "min_num_noises": 1,
                    "max_num_noises": 1
                }
            }
        }
    """

    # How many times ``additive_noise`` re-samples noise files when every
    # sampled file is shorter than the input, before giving up.
    MAX_NOISE_ATTEMPTS = 10

    def __init__(self, ap, augmentation_config):
        """Index the noise and RIR files referenced by ``augmentation_config``.

        Args:
            ap: object exposing ``load_wav(path, sr=...)`` and ``sample_rate``.
            augmentation_config (dict): see the class docstring.
        """
        self.ap = ap

        # Defaults so the attributes exist even when a section is absent.
        self.additive_noise_types = []
        self.noise_list = {}
        self.rir_files = []

        self.use_additive_noise = False
        if 'additive' in augmentation_config:
            self.additive_noise_config = augmentation_config['additive']
            additive_path = self.additive_noise_config['sounds_path']
            if additive_path:
                self.use_additive_noise = True
                # every dict-valued key of the section names a noise type
                self.additive_noise_types = [
                    key for key, value in self.additive_noise_config.items() if isinstance(value, dict)
                ]

                additive_files = glob.glob(os.path.join(additive_path, '**/*.wav'), recursive=True)

                for wav_file in additive_files:
                    # relpath (instead of str.replace) yields the first
                    # sub-directory whether or not sounds_path ends with a
                    # path separator
                    noise_dir = os.path.relpath(wav_file, additive_path).split(os.sep)[0]
                    # ignore directories that are not listed in the config
                    if noise_dir not in self.additive_noise_types:
                        continue
                    self.noise_list.setdefault(noise_dir, []).append(wav_file)

                print(f" | > Using Additive Noise Augmentation: with {len(additive_files)} audios instances from {self.additive_noise_types}")

        self.use_rir = False
        if 'rir' in augmentation_config:
            self.rir_config = augmentation_config['rir']
            if self.rir_config['rir_path']:
                self.rir_files = glob.glob(os.path.join(self.rir_config['rir_path'], '**/*.wav'), recursive=True)
                self.use_rir = True

                print(f" | > Using RIR Noise Augmentation: with {len(self.rir_files)} audios instances")

        self.create_augmentation_global_list()

    def create_augmentation_global_list(self):
        """Build ``global_noise_list``: the augmentations ``apply_one`` picks from."""
        # Copy the list: appending "RIR_AUG" must not leak into
        # ``additive_noise_types``.
        if self.use_additive_noise:
            self.global_noise_list = list(self.additive_noise_types)
        else:
            self.global_noise_list = []
        if self.use_rir:
            self.global_noise_list.append("RIR_AUG")

    def additive_noise(self, noise_type, audio):
        """Mix randomly chosen noise files of ``noise_type`` into ``audio``.

        The number of files and the SNR of each one are drawn from the ranges
        configured for ``noise_type``. Noise files shorter than ``audio`` are
        skipped.

        Args:
            noise_type (str): key of ``additive_noise_config`` / ``noise_list``.
            audio (np.ndarray): waveform; only ``shape[0]`` is used as length.

        Returns:
            np.ndarray: ``audio`` plus the scaled noises. ``audio`` is returned
            unchanged when no file exists for ``noise_type`` or when no
            sufficiently long noise is found in ``MAX_NOISE_ATTEMPTS`` draws.
        """
        type_config = self.additive_noise_config[noise_type]
        candidates = self.noise_list.get(noise_type, [])
        if not candidates:
            return audio

        clean_db = 10 * np.log10(np.mean(audio ** 2) + 1e-4)
        audio_len = audio.shape[0]

        # Bounded retry loop instead of unbounded recursion.
        for _ in range(self.MAX_NOISE_ATTEMPTS):
            num_noises = random.randint(type_config['min_num_noises'], type_config['max_num_noises'])
            # random.sample raises if asked for more items than available
            num_noises = min(num_noises, len(candidates))

            noises_wav = None
            for noise in random.sample(candidates, num_noises):
                noiseaudio = self.ap.load_wav(noise, sr=self.ap.sample_rate)[:audio_len]

                if noiseaudio.shape[0] < audio_len:
                    continue

                noise_snr = random.uniform(type_config['min_snr_in_db'], type_config['max_snr_in_db'])
                noise_db = 10 * np.log10(np.mean(noiseaudio ** 2) + 1e-4)
                # scale the noise so that clean_db - scaled_noise_db == noise_snr
                noise_wav = np.sqrt(10 ** ((clean_db - noise_db - noise_snr) / 10)) * noiseaudio

                if noises_wav is None:
                    noises_wav = noise_wav
                else:
                    noises_wav += noise_wav

            if noises_wav is not None:
                return audio + noises_wav

            # every sampled file was shorter than the audio: draw other files
            print("audio ignorado")

        return audio

    def reverberate(self, audio):
        """Convolve ``audio`` with a random, energy-normalised RIR.

        Args:
            audio (np.ndarray): waveform; only ``shape[0]`` is used as length.

        Returns:
            np.ndarray: the convolution truncated to the length of ``audio``,
            or ``audio`` unchanged when the chosen RIR has zero energy.
        """
        audio_len = audio.shape[0]

        rir_file = random.choice(self.rir_files)
        rir = self.ap.load_wav(rir_file, sr=self.ap.sample_rate)

        energy = np.sum(rir ** 2)
        if energy == 0:
            # normalising a silent RIR would divide by zero and yield NaNs
            return audio

        rir = rir / np.sqrt(energy)
        return signal.convolve(audio, rir, mode=self.rir_config['conv_mode'])[:audio_len]

    def apply_one(self, audio):
        """Apply one augmentation chosen at random from ``global_noise_list``.

        Returns ``audio`` unchanged when no augmentation is available.
        """
        if not self.global_noise_list:
            return audio

        noise_type = random.choice(self.global_noise_list)
        if noise_type == "RIR_AUG":
            return self.reverberate(audio)

        return self.additive_noise(noise_type, audio)
def to_camel(text):
text = text.capitalize()
@ -112,7 +244,6 @@ def check_config_speaker_encoder(c):
check_argument("storage", c, restricted=True, val_type=dict)
check_argument("sample_from_storage_p", c["storage"], restricted=True, val_type=float, min_val=0.0, max_val=1.0)
check_argument("storage_size", c["storage"], restricted=True, val_type=int, min_val=1, max_val=100)
check_argument("additive_noise", c["storage"], restricted=True, val_type=float, min_val=0.0, max_val=1.0)
# datasets - checking only the first entry
check_argument("datasets", c, restricted=True, val_type=list)