mirror of https://github.com/coqui-ai/TTS.git
add Audio data augmentation: Additive and RIR
parent 77d85c6cc5
commit 85ccad7e0a
@@ -51,8 +51,8 @@ def setup_loader(ap: AudioProcessor, is_val: bool = False, verbose: bool = False
            skip_speakers=getattr(c, "skip_speakers", False),
            storage_size=c.storage["storage_size"],
            sample_from_storage_p=c.storage["sample_from_storage_p"],
-           additive_noise=c.storage["additive_noise"],
            verbose=verbose,
+           augmentation_config=getattr(c, "audio_augmentation", None)
        )

        # sampler = DistributedSampler(dataset) if num_gpus > 1 else None
@@ -53,6 +53,16 @@
        "num_lstm_layers": 3,
        "use_lstm_with_projection": true
    },
+
+    "audio_augmentation": {
+        "p": 0,
+        // add Gaussian noise to the data in order to increase robustness
+        "gaussian":{ // since Gaussian noise is cheap to compute, it is added right after the wav file is loaded, so even audio reused from the cache can receive this noise
+            "p": 1, // probability of applying this method, 0 disables it
+            "min_amplitude": 0.0,
+            "max_amplitude": 1e-5
+        }
+    },
    "storage": {
        "sample_from_storage_p": 0.66, // the probability with which we'll sample from the DataSet in-memory storage
        "storage_size": 15, // the size of the in-memory storage with respect to a single batch
@@ -25,7 +25,7 @@
        "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
        "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!!
        "spec_gain": 20.0,
-       "do_trim_silence": false, // enable trimming of silence in the audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
+       "do_trim_silence": true, // enable trimming of silence in the audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
        "trim_db": 60, // threshold for trimming silence. Set this according to your dataset.
        "stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based normalization is used and other normalization params are ignored
    },
@@ -41,7 +41,7 @@
    "steps_plot_stats": 10, // number of steps to plot embeddings.

    // Speakers config
-   "num_speakers_in_batch": 108, // Batch size for training.
+   "num_speakers_in_batch": 2, // Batch size for training.
    "num_utters_per_speaker": 2, //
    "skip_speakers": true, // skip speakers with fewer samples than "num_utters_per_speaker"

@@ -54,6 +54,41 @@
    "print_step": 20, // Number of steps to log training on console.
    "output_path": "../../../checkpoints/speaker_encoder/", // DATASET-RELATED: output path for all training outputs.

+   "audio_augmentation": {
+       "p": 0.75, // probability of applying augmentation; 0 disables RIR and additive noise augmentation
+       "rir":{
+           "rir_path": "/workspace/store/ecasanova/ComParE/RIRS_NOISES/simulated_rirs/",
+           "conv_mode": "full"
+       },
+       "additive":{
+           "sounds_path": "/workspace/store/ecasanova/ComParE/musan/",
+           // list of the directories used for data augmentation; a directory inside "sounds_path" that is not listed here will be ignored
+           "speech":{
+               "min_snr_in_db": 13,
+               "max_snr_in_db": 20,
+               "min_num_noises": 3,
+               "max_num_noises": 7
+           },
+           "noise":{
+               "min_snr_in_db": 0,
+               "max_snr_in_db": 15,
+               "min_num_noises": 1,
+               "max_num_noises": 1
+           },
+           "music":{
+               "min_snr_in_db": 5,
+               "max_snr_in_db": 15,
+               "min_num_noises": 1,
+               "max_num_noises": 1
+           }
+       },
+       // add Gaussian noise to the data in order to increase robustness
+       "gaussian":{ // since Gaussian noise is cheap to compute, it is added right after the wav file is loaded, so even audio reused from the cache can receive this noise
+           "p": 1, // probability of applying this method, 0 disables it
+           "min_amplitude": 0.0,
+           "max_amplitude": 1e-5
+       }
+   },
    "model": {
        "input_dim": 80,
        "proj_dim": 512,
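Read together with the dataset changes below: the outer "p" gates the RIR / additive-noise path, which is applied on the raw wav through AugmentWAV.apply_one, while "gaussian.p" is checked separately when the fixed-length segment is cut in the collate function. A minimal sketch of that decision flow, using a hypothetical helper name and an already-built augmentator; this is not code from the commit:

import random

def maybe_augment_wav(wav, augmentator, augmentation_config):
    # outer "p": chance of applying either reverberation or additive noise
    if augmentator is not None and random.random() < augmentation_config["p"]:
        wav = augmentator.apply_one(wav)
    # "gaussian" is handled later, on the cropped segment, with its own "p"
    return wav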
@@ -63,11 +98,17 @@
    },
    "storage": {
        "sample_from_storage_p": 0.66, // the probability with which we'll sample from the DataSet in-memory storage
-       "storage_size": 15, // the size of the in-memory storage with respect to a single batch
-       "additive_noise": 1e-5 // add very small gaussian noise to the data in order to increase robustness
+       "storage_size": 1 // the size of the in-memory storage with respect to a single batch
    },
    "datasets":
        [
+
+       {
+           "name": "common_voice",
+           "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ab/",
+           "meta_file_train": "train.tsv",
+           "meta_file_val": "test.tsv"
+       },
        {
            "name": "vctk",
            "path": "/workspace/store/ecasanova/datasets/VCTK-Corpus-removed-silence/",
@@ -4,7 +4,7 @@ import random
import numpy as np
import torch
from torch.utils.data import Dataset

+from TTS.speaker_encoder.utils.generic_utils import AugmentWAV

class MyDataset(Dataset):
    def __init__(
@@ -15,10 +15,11 @@ class MyDataset(Dataset):
        num_speakers_in_batch=64,
        storage_size=1,
        sample_from_storage_p=0.5,
-       additive_noise=0,
+       additive_noise=1e-5,
        num_utter_per_speaker=10,
        skip_speakers=False,
        verbose=False,
+       augmentation_config=None
    ):
        """
        Args:
@@ -39,18 +40,27 @@ class MyDataset(Dataset):
        self.__parse_items()
        self.storage = queue.Queue(maxsize=storage_size * num_speakers_in_batch)
        self.sample_from_storage_p = float(sample_from_storage_p)
        self.additive_noise = float(additive_noise)

        speakers_aux = list(self.speakers)
        speakers_aux.sort()
        self.speakerid_to_classid = {key: i for i, key in enumerate(speakers_aux)}

+       # Augmentation
+       self.augmentator = None
+       self.gaussian_augmentation_config = None
+       if augmentation_config:
+           self.data_augmentation_p = augmentation_config['p']
+           if self.data_augmentation_p and ('additive' in augmentation_config or 'rir' in augmentation_config):
+               self.augmentator = AugmentWAV(ap, augmentation_config)
+
+           if 'gaussian' in augmentation_config.keys():
+               self.gaussian_augmentation_config = augmentation_config['gaussian']
+
        if self.verbose:
            print("\n > DataLoader initialization")
            print(f" | > Speakers per Batch: {num_speakers_in_batch}")
            print(f" | > Storage Size: {self.storage.maxsize} instances, each with {num_utter_per_speaker} utters")
            print(f" | > Sample_from_storage_p : {self.sample_from_storage_p}")
            print(f" | > Noise added : {self.additive_noise}")
            print(f" | > Number of instances : {len(self.items)}")
            print(f" | > Sequence length: {self.seq_len}")
            print(f" | > Num speakers: {len(self.speakers)}")
@@ -151,6 +161,10 @@ class MyDataset(Dataset):
                    break
                self.speaker_to_utters[speaker].remove(utter)

+           if self.augmentator is not None and self.data_augmentation_p:
+               if random.random() < self.data_augmentation_p:
+                   wav = self.augmentator.apply_one(wav)
+
            wavs.append(wav)
            labels.append(self.speakerid_to_classid[speaker])
        return wavs, labels
@@ -201,20 +215,21 @@ class MyDataset(Dataset):
                # put the newly loaded item into storage
                self.storage.put_nowait((wavs_, labels_))

-           # add random gaussian noise
-           if self.additive_noise > 0:
-               noises_ = [np.random.normal(0, self.additive_noise, size=len(w)) for w in wavs_]
-               wavs_ = [wavs_[i] + noises_[i] for i in range(len(wavs_))]
-
-           # get a random subset of each of the wavs and extract mel spectrograms.
-           offsets_ = [random.randint(0, wav.shape[0] - self.seq_len) for wav in wavs_]
-           mels_ = [
-               self.ap.melspectrogram(wavs_[i][offsets_[i] : offsets_[i] + self.seq_len]) for i in range(len(wavs_))
-           ]
-           feats_ = [torch.FloatTensor(mel) for mel in mels_]
+           feats_ = []
+           for wav in wavs_:
+               offset = random.randint(0, wav.shape[0] - self.seq_len)
+               wav = wav[offset : offset + self.seq_len]
+               # add random gaussian noise
+               if self.gaussian_augmentation_config and self.gaussian_augmentation_config['p']:
+                   if random.random() < self.gaussian_augmentation_config['p']:
+                       wav += np.random.normal(self.gaussian_augmentation_config['min_amplitude'], self.gaussian_augmentation_config['max_amplitude'], size=len(wav))
+               mel = self.ap.melspectrogram(wav)
+               feats_.append(torch.FloatTensor(mel))

            labels.append(torch.LongTensor(labels_))
            feats.extend(feats_)
        feats = torch.stack(feats)
        labels = torch.stack(labels)

        return feats.transpose(1, 2), labels
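As a standalone reference, the Gaussian step above boils down to the sketch below; note that "min_amplitude" and "max_amplitude" are passed to np.random.normal as mean and standard deviation, so with the defaults the added noise is zero-mean with a standard deviation of 1e-5. The function name is illustrative, not part of the commit:

import numpy as np

def add_gaussian_noise(wav, min_amplitude=0.0, max_amplitude=1e-5):
    # mirrors the collate code: min_amplitude is used as the mean, max_amplitude as the std
    noise = np.random.normal(min_amplitude, max_amplitude, size=len(wav))
    return wav + noise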
@@ -2,11 +2,143 @@ import datetime
import os
import re

import numpy as np
import torch
+import glob
+import random

+from scipy import signal
from TTS.speaker_encoder.model import SpeakerEncoder
from TTS.utils.generic_utils import check_argument

+class AugmentWAV(object):
+
+    def __init__(self, ap, augmentation_config):
+
+        self.ap = ap
+
+        '''augmentation_config = {
+            "p": 1,
+            "rir":{
+                "rir_path": "rir_path/",
+                "conv_mode": "full"
+            },
+            "additive":{
+                "sounds_path": "musan/",
+                # directories in sounds_path
+                "speech":{
+                    "min_snr_in_db": 13,
+                    "max_snr_in_db": 20,
+                    "min_num_noises": 3,
+                    "max_num_noises": 7
+                },
+                "noise":{
+                    "min_snr_in_db": 0,
+                    "max_snr_in_db": 15,
+                    "min_num_noises": 1,
+                    "max_num_noises": 1
+                },
+                "music":{
+                    "min_snr_in_db": 5,
+                    "max_snr_in_db": 15,
+                    "min_num_noises": 1,
+                    "max_num_noises": 1
+                }
+            }
+        }'''
+
+        self.use_additive_noise = False
+        if 'additive' in augmentation_config.keys():
+            self.additive_noise_config = augmentation_config['additive']
+            additive_path = self.additive_noise_config['sounds_path']
+            if additive_path:
+                self.use_additive_noise = True
+                # get noise types
+                self.additive_noise_types = []
+                for key in self.additive_noise_config.keys():
+                    if isinstance(self.additive_noise_config[key], dict):
+                        self.additive_noise_types.append(key)
+
+                additive_files = glob.glob(os.path.join(additive_path, '**/*.wav'), recursive=True)
+
+                self.noise_list = {}
+
+                for wav_file in additive_files:
+                    noise_dir = wav_file.replace(additive_path, '').split(os.sep)[0]
+                    # ignore directories that are not listed in the config
+                    if noise_dir not in self.additive_noise_types:
+                        continue
+                    if noise_dir not in self.noise_list:
+                        self.noise_list[noise_dir] = []
+                    self.noise_list[noise_dir].append(wav_file)
+
+                print(f" | > Using Additive Noise Augmentation: with {len(additive_files)} audio instances from {self.additive_noise_types}")
+
+        self.use_rir = False
+        if 'rir' in augmentation_config.keys():
+            self.rir_config = augmentation_config['rir']
+            if self.rir_config['rir_path']:
+                self.rir_files = glob.glob(os.path.join(self.rir_config['rir_path'], '**/*.wav'), recursive=True)
+                self.use_rir = True
+
+                print(f" | > Using RIR Noise Augmentation: with {len(self.rir_files)} audio instances")
+
+        self.create_augmentation_global_list()
+
+    def create_augmentation_global_list(self):
+        if self.use_additive_noise:
+            self.global_noise_list = self.additive_noise_types
+        else:
+            self.global_noise_list = []
+        if self.use_rir:
+            self.global_noise_list.append("RIR_AUG")
+
+    def additive_noise(self, noise_type, audio):
+
+        clean_db = 10 * np.log10(np.mean(audio**2) + 1e-4)
+
+        noise_list = random.sample(self.noise_list[noise_type], random.randint(self.additive_noise_config[noise_type]['min_num_noises'], self.additive_noise_config[noise_type]['max_num_noises']))
+
+        audio_len = audio.shape[0]
+        noises_wav = None
+        for noise in noise_list:
+            noiseaudio = self.ap.load_wav(noise, sr=self.ap.sample_rate)[:audio_len]
+
+            if noiseaudio.shape[0] < audio_len:
+                continue
+
+            noise_snr = random.uniform(self.additive_noise_config[noise_type]['min_snr_in_db'], self.additive_noise_config[noise_type]['max_snr_in_db'])
+            noise_db = 10 * np.log10(np.mean(noiseaudio ** 2) + 1e-4)
+            noise_wav = np.sqrt(10 ** ((clean_db - noise_db - noise_snr) / 10)) * noiseaudio
+
+            if noises_wav is None:
+                noises_wav = noise_wav
+            else:
+                noises_wav += noise_wav
+
+        # if all sampled noise files were shorter than the audio, sample again
+        if noises_wav is None:
+            print("audio ignored")
+            return self.additive_noise(noise_type, audio)
+
+        return audio + noises_wav
+
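The scaling in additive_noise follows the usual SNR relation: with clean and noise powers measured in dB, multiplying the noise by sqrt(10 ** ((clean_db - noise_db - snr) / 10)) puts the mixture at roughly the sampled SNR. A self-contained sketch of the same computation, outside the class; the function name is illustrative:

import numpy as np

def scale_noise_to_snr(clean, noise, snr_db):
    # power estimates in dB; the 1e-4 floor avoids log10(0), as in the method above
    clean_db = 10 * np.log10(np.mean(clean ** 2) + 1e-4)
    noise_db = 10 * np.log10(np.mean(noise ** 2) + 1e-4)
    gain = np.sqrt(10 ** ((clean_db - noise_db - snr_db) / 10))
    return gain * noise  # clean + gain * noise then sits roughly at snr_db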
+    def reverberate(self, audio):
+        audio_len = audio.shape[0]
+
+        rir_file = random.choice(self.rir_files)
+
+        rir = self.ap.load_wav(rir_file, sr=self.ap.sample_rate)
+        rir = rir / np.sqrt(np.sum(rir ** 2))
+        return signal.convolve(audio, rir, mode=self.rir_config['conv_mode'])[:audio_len]
+
+    def apply_one(self, audio):
+        noise_type = random.choice(self.global_noise_list)
+        if noise_type == "RIR_AUG":
+            return self.reverberate(audio)
+        else:
+            return self.additive_noise(noise_type, audio)
+

def to_camel(text):
    text = text.capitalize()

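Once the class is in place, a call site looks roughly like the following; the paths, the AudioProcessor construction, and the config values are assumptions for illustration, only the AugmentWAV and load_wav calls mirror the code in this commit:

from TTS.utils.audio import AudioProcessor
from TTS.speaker_encoder.utils.generic_utils import AugmentWAV

ap = AudioProcessor(sample_rate=16000)  # assumed minimal AudioProcessor setup
augmentator = AugmentWAV(ap, {
    "p": 0.75,
    "rir": {"rir_path": "/data/RIRS_NOISES/simulated_rirs/", "conv_mode": "full"},
    "additive": {
        "sounds_path": "/data/musan/",
        "noise": {"min_snr_in_db": 0, "max_snr_in_db": 15, "min_num_noises": 1, "max_num_noises": 1},
    },
})

wav = ap.load_wav("/data/example.wav", sr=ap.sample_rate)
augmented = augmentator.apply_one(wav)  # picks RIR reverberation or one additive noise type at random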
@@ -112,7 +244,6 @@ def check_config_speaker_encoder(c):
    check_argument("storage", c, restricted=True, val_type=dict)
    check_argument("sample_from_storage_p", c["storage"], restricted=True, val_type=float, min_val=0.0, max_val=1.0)
    check_argument("storage_size", c["storage"], restricted=True, val_type=int, min_val=1, max_val=100)
-   check_argument("additive_noise", c["storage"], restricted=True, val_type=float, min_val=0.0, max_val=1.0)

    # datasets - checking only the first entry
    check_argument("datasets", c, restricted=True, val_type=list)
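The old storage-level "additive_noise" check is removed, and no check is added for the new "audio_augmentation" block, which stays optional thanks to the getattr fallback in setup_loader. If one wanted to validate at least its top-level probability with the same helper, a sketch in the style of the surrounding checks could look like this; it is not part of the commit:

check_argument("audio_augmentation", c, restricted=False, val_type=dict)
if "audio_augmentation" in c:
    check_argument("p", c["audio_augmentation"], restricted=True, val_type=float, min_val=0.0, max_val=1.0)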