mirror of https://github.com/coqui-ai/TTS.git
add Audio data augmentation: Additive and RIR
parent 77d85c6cc5
commit 85ccad7e0a
@@ -51,8 +51,8 @@ def setup_loader(ap: AudioProcessor, is_val: bool = False, verbose: bool = False
            skip_speakers=getattr(c, "skip_speakers", False),
            storage_size=c.storage["storage_size"],
            sample_from_storage_p=c.storage["sample_from_storage_p"],
-           additive_noise=c.storage["additive_noise"],
            verbose=verbose,
+           augmentation_config=getattr(c, "audio_augmentation", None)
        )

        # sampler = DistributedSampler(dataset) if num_gpus > 1 else None
@@ -53,6 +53,16 @@
        "num_lstm_layers": 3,
        "use_lstm_with_projection": true
    },
+
+    "audio_augmentation": {
+        "p": 0,
+        // add Gaussian noise to the data in order to increase robustness
+        "gaussian":{ // since Gaussian noise is cheap to compute, it is added right after the wav file is loaded, so even audio reused from the cache can receive this noise
+            "p": 1, // probability of applying this method, 0 disables it
+            "min_amplitude": 0.0,
+            "max_amplitude": 1e-5
+        }
+    },
    "storage": {
        "sample_from_storage_p": 0.66, // the probability with which we'll sample from the DataSet in-memory storage
        "storage_size": 15, // the size of the in-memory storage with respect to a single batch
@@ -25,7 +25,7 @@
        "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
        "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!!
        "spec_gain": 20.0,
-       "do_trim_silence": false, // enable trimming of silence in the audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
+       "do_trim_silence": true, // enable trimming of silence in the audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
        "trim_db": 60, // threshold for trimming silence. Set this according to your dataset.
        "stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based normalization is used and other normalization params are ignored
    },
@@ -41,7 +41,7 @@
    "steps_plot_stats": 10, // number of steps to plot embeddings.

    // Speakers config
-   "num_speakers_in_batch": 108, // Batch size for training.
+   "num_speakers_in_batch": 2, // Batch size for training.
    "num_utters_per_speaker": 2, //
    "skip_speakers": true, // skip speakers with fewer samples than "num_utters_per_speaker"

@@ -54,6 +54,41 @@
    "print_step": 20, // Number of steps to log training on console.
    "output_path": "../../../checkpoints/speaker_encoder/", // DATASET-RELATED: output path for all training outputs.

+   "audio_augmentation": {
+       "p": 0.75, // probability of applying augmentation; 0 disables RIR and additive noise augmentation
+       "rir":{
+           "rir_path": "/workspace/store/ecasanova/ComParE/RIRS_NOISES/simulated_rirs/",
+           "conv_mode": "full"
+       },
+       "additive":{
+           "sounds_path": "/workspace/store/ecasanova/ComParE/musan/",
+           // list of the directories used for data augmentation; a directory inside "sounds_path" that is not listed here will be ignored
+           "speech":{
+               "min_snr_in_db": 13,
+               "max_snr_in_db": 20,
+               "min_num_noises": 3,
+               "max_num_noises": 7
+           },
+           "noise":{
+               "min_snr_in_db": 0,
+               "max_snr_in_db": 15,
+               "min_num_noises": 1,
+               "max_num_noises": 1
+           },
+           "music":{
+               "min_snr_in_db": 5,
+               "max_snr_in_db": 15,
+               "min_num_noises": 1,
+               "max_num_noises": 1
+           }
+       },
+       // add Gaussian noise to the data in order to increase robustness
+       "gaussian":{ // since Gaussian noise is cheap to compute, it is added right after the wav file is loaded, so even audio reused from the cache can receive this noise
+           "p": 1, // probability of applying this method, 0 disables it
+           "min_amplitude": 0.0,
+           "max_amplitude": 1e-5
+       }
+   },
    "model": {
        "input_dim": 80,
        "proj_dim": 512,
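Read together with the dataset changes below: the outer "p" gates the RIR / additive-noise path, which is applied on the raw wav through AugmentWAV.apply_one, while "gaussian.p" is checked separately when the fixed-length segment is cut in the collate function. A minimal sketch of that decision flow, using a hypothetical helper name and an already-built augmentator; this is not code from the commit:

import random

def maybe_augment_wav(wav, augmentator, augmentation_config):
    # outer "p": chance of applying either reverberation or additive noise
    if augmentator is not None and random.random() < augmentation_config["p"]:
        wav = augmentator.apply_one(wav)
    # "gaussian" is handled later, on the cropped segment, with its own "p"
    return wav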
@@ -63,11 +98,17 @@
    },
    "storage": {
        "sample_from_storage_p": 0.66, // the probability with which we'll sample from the DataSet in-memory storage
-       "storage_size": 15, // the size of the in-memory storage with respect to a single batch
-       "additive_noise": 1e-5 // add very small gaussian noise to the data in order to increase robustness
+       "storage_size": 1 // the size of the in-memory storage with respect to a single batch
    },
    "datasets":
        [
+
+       {
+           "name": "common_voice",
+           "path": "/workspace/scratch/ecasanova/datasets/common-voice/cv-corpus-6.1-2020-12-11_16khz/ab/",
+           "meta_file_train": "train.tsv",
+           "meta_file_val": "test.tsv"
+       },
        {
            "name": "vctk",
            "path": "/workspace/store/ecasanova/datasets/VCTK-Corpus-removed-silence/",
@@ -4,7 +4,7 @@ import random
import numpy as np
import torch
from torch.utils.data import Dataset

+from TTS.speaker_encoder.utils.generic_utils import AugmentWAV

class MyDataset(Dataset):
    def __init__(
@@ -15,10 +15,11 @@ class MyDataset(Dataset):
        num_speakers_in_batch=64,
        storage_size=1,
        sample_from_storage_p=0.5,
-       additive_noise=0,
+       additive_noise=1e-5,
        num_utter_per_speaker=10,
        skip_speakers=False,
        verbose=False,
+       augmentation_config=None
    ):
        """
        Args:
@@ -39,18 +40,27 @@ class MyDataset(Dataset):
        self.__parse_items()
        self.storage = queue.Queue(maxsize=storage_size * num_speakers_in_batch)
        self.sample_from_storage_p = float(sample_from_storage_p)
        self.additive_noise = float(additive_noise)

        speakers_aux = list(self.speakers)
        speakers_aux.sort()
        self.speakerid_to_classid = {key: i for i, key in enumerate(speakers_aux)}

+       # Augmentation
+       self.augmentator = None
+       self.gaussian_augmentation_config = None
+       if augmentation_config:
+           self.data_augmentation_p = augmentation_config['p']
+           if self.data_augmentation_p and ('additive' in augmentation_config or 'rir' in augmentation_config):
+               self.augmentator = AugmentWAV(ap, augmentation_config)
+
+           if 'gaussian' in augmentation_config.keys():
+               self.gaussian_augmentation_config = augmentation_config['gaussian']
+
        if self.verbose:
            print("\n > DataLoader initialization")
            print(f" | > Speakers per Batch: {num_speakers_in_batch}")
            print(f" | > Storage Size: {self.storage.maxsize} instances, each with {num_utter_per_speaker} utters")
            print(f" | > Sample_from_storage_p : {self.sample_from_storage_p}")
            print(f" | > Noise added : {self.additive_noise}")
            print(f" | > Number of instances : {len(self.items)}")
            print(f" | > Sequence length: {self.seq_len}")
            print(f" | > Num speakers: {len(self.speakers)}")
@@ -151,6 +161,10 @@ class MyDataset(Dataset):
                    break
                self.speaker_to_utters[speaker].remove(utter)

+           if self.augmentator is not None and self.data_augmentation_p:
+               if random.random() < self.data_augmentation_p:
+                   wav = self.augmentator.apply_one(wav)
+
            wavs.append(wav)
            labels.append(self.speakerid_to_classid[speaker])
        return wavs, labels
@@ -201,20 +215,21 @@ class MyDataset(Dataset):
                # put the newly loaded item into storage
                self.storage.put_nowait((wavs_, labels_))

-           # add random gaussian noise
-           if self.additive_noise > 0:
-               noises_ = [np.random.normal(0, self.additive_noise, size=len(w)) for w in wavs_]
-               wavs_ = [wavs_[i] + noises_[i] for i in range(len(wavs_))]
-
-           # get a random subset of each of the wavs and extract mel spectrograms.
-           offsets_ = [random.randint(0, wav.shape[0] - self.seq_len) for wav in wavs_]
-           mels_ = [
-               self.ap.melspectrogram(wavs_[i][offsets_[i] : offsets_[i] + self.seq_len]) for i in range(len(wavs_))
-           ]
-           feats_ = [torch.FloatTensor(mel) for mel in mels_]
+           feats_ = []
+           for wav in wavs_:
+               offset = random.randint(0, wav.shape[0] - self.seq_len)
+               wav = wav[offset : offset + self.seq_len]
+               # add random gaussian noise
+               if self.gaussian_augmentation_config and self.gaussian_augmentation_config['p']:
+                   if random.random() < self.gaussian_augmentation_config['p']:
+                       wav += np.random.normal(self.gaussian_augmentation_config['min_amplitude'], self.gaussian_augmentation_config['max_amplitude'], size=len(wav))
+               mel = self.ap.melspectrogram(wav)
+               feats_.append(torch.FloatTensor(mel))

            labels.append(torch.LongTensor(labels_))
            feats.extend(feats_)
        feats = torch.stack(feats)
        labels = torch.stack(labels)

        return feats.transpose(1, 2), labels
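As a standalone reference, the Gaussian step above boils down to the sketch below; note that "min_amplitude" and "max_amplitude" are passed to np.random.normal as mean and standard deviation, so with the defaults the added noise is zero-mean with a standard deviation of 1e-5. The function name is illustrative, not part of the commit:

import numpy as np

def add_gaussian_noise(wav, min_amplitude=0.0, max_amplitude=1e-5):
    # mirrors the collate code: min_amplitude is used as the mean, max_amplitude as the std
    noise = np.random.normal(min_amplitude, max_amplitude, size=len(wav))
    return wav + noise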
@@ -2,11 +2,143 @@ import datetime
import os
import re

import numpy as np
import torch
+import glob
+import random

+from scipy import signal
from TTS.speaker_encoder.model import SpeakerEncoder
from TTS.utils.generic_utils import check_argument

+class AugmentWAV(object):
+
+    def __init__(self, ap, augmentation_config):
+
+        self.ap = ap
+
+        '''augmentation_config = {
+            "p": 1,
+            "rir":{
+                "rir_path": "rir_path/",
+                "conv_mode": "full"
+            },
+            "additive":{
+                "sounds_path": "musan/",
+                # directories in sounds_path
+                "speech":{
+                    "min_snr_in_db": 13,
+                    "max_snr_in_db": 20,
+                    "min_num_noises": 3,
+                    "max_num_noises": 7
+                },
+                "noise":{
+                    "min_snr_in_db": 0,
+                    "max_snr_in_db": 15,
+                    "min_num_noises": 1,
+                    "max_num_noises": 1
+                },
+                "music":{
+                    "min_snr_in_db": 5,
+                    "max_snr_in_db": 15,
+                    "min_num_noises": 1,
+                    "max_num_noises": 1
+                }
+            }
+        }'''
+
+        self.use_additive_noise = False
+        if 'additive' in augmentation_config.keys():
+            self.additive_noise_config = augmentation_config['additive']
+            additive_path = self.additive_noise_config['sounds_path']
+            if additive_path:
+                self.use_additive_noise = True
+                # get noise types
+                self.additive_noise_types = []
+                for key in self.additive_noise_config.keys():
+                    if isinstance(self.additive_noise_config[key], dict):
+                        self.additive_noise_types.append(key)
+
+                additive_files = glob.glob(os.path.join(additive_path, '**/*.wav'), recursive=True)
+
+                self.noise_list = {}
+
+                for wav_file in additive_files:
+                    noise_dir = wav_file.replace(additive_path, '').split(os.sep)[0]
+                    # ignore directories that are not listed in the config
+                    if noise_dir not in self.additive_noise_types:
+                        continue
+                    if noise_dir not in self.noise_list:
+                        self.noise_list[noise_dir] = []
+                    self.noise_list[noise_dir].append(wav_file)
+
+                print(f" | > Using Additive Noise Augmentation: with {len(additive_files)} audio instances from {self.additive_noise_types}")
+
+        self.use_rir = False
+        if 'rir' in augmentation_config.keys():
+            self.rir_config = augmentation_config['rir']
+            if self.rir_config['rir_path']:
+                self.rir_files = glob.glob(os.path.join(self.rir_config['rir_path'], '**/*.wav'), recursive=True)
+                self.use_rir = True
+
+                print(f" | > Using RIR Noise Augmentation: with {len(self.rir_files)} audio instances")
+
+        self.create_augmentation_global_list()
+
+    def create_augmentation_global_list(self):
+        if self.use_additive_noise:
+            self.global_noise_list = self.additive_noise_types
+        else:
+            self.global_noise_list = []
+        if self.use_rir:
+            self.global_noise_list.append("RIR_AUG")
+
+    def additive_noise(self, noise_type, audio):
+
+        clean_db = 10 * np.log10(np.mean(audio**2) + 1e-4)
+
+        noise_list = random.sample(self.noise_list[noise_type], random.randint(self.additive_noise_config[noise_type]['min_num_noises'], self.additive_noise_config[noise_type]['max_num_noises']))
+
+        audio_len = audio.shape[0]
+        noises_wav = None
+        for noise in noise_list:
+            noiseaudio = self.ap.load_wav(noise, sr=self.ap.sample_rate)[:audio_len]
+
+            if noiseaudio.shape[0] < audio_len:
+                continue
+
+            noise_snr = random.uniform(self.additive_noise_config[noise_type]['min_snr_in_db'], self.additive_noise_config[noise_type]['max_snr_in_db'])
+            noise_db = 10 * np.log10(np.mean(noiseaudio ** 2) + 1e-4)
+            noise_wav = np.sqrt(10 ** ((clean_db - noise_db - noise_snr) / 10)) * noiseaudio
+
+            if noises_wav is None:
+                noises_wav = noise_wav
+            else:
+                noises_wav += noise_wav
+
+        # if all sampled noise files were shorter than the audio, sample again
+        if noises_wav is None:
+            print("audio ignored")
+            return self.additive_noise(noise_type, audio)
+
+        return audio + noises_wav
+
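The scaling in additive_noise follows the usual SNR relation: with clean and noise powers measured in dB, multiplying the noise by sqrt(10 ** ((clean_db - noise_db - snr) / 10)) puts the mixture at roughly the sampled SNR. A self-contained sketch of the same computation, outside the class; the function name is illustrative:

import numpy as np

def scale_noise_to_snr(clean, noise, snr_db):
    # power estimates in dB; the 1e-4 floor avoids log10(0), as in the method above
    clean_db = 10 * np.log10(np.mean(clean ** 2) + 1e-4)
    noise_db = 10 * np.log10(np.mean(noise ** 2) + 1e-4)
    gain = np.sqrt(10 ** ((clean_db - noise_db - snr_db) / 10))
    return gain * noise  # clean + gain * noise then sits roughly at snr_db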
+    def reverberate(self, audio):
+        audio_len = audio.shape[0]
+
+        rir_file = random.choice(self.rir_files)
+
+        rir = self.ap.load_wav(rir_file, sr=self.ap.sample_rate)
+        rir = rir / np.sqrt(np.sum(rir ** 2))
+        return signal.convolve(audio, rir, mode=self.rir_config['conv_mode'])[:audio_len]
+
+    def apply_one(self, audio):
+        noise_type = random.choice(self.global_noise_list)
+        if noise_type == "RIR_AUG":
+            return self.reverberate(audio)
+        else:
+            return self.additive_noise(noise_type, audio)
+

def to_camel(text):
    text = text.capitalize()

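Once the class is in place, a call site looks roughly like the following; the paths, the AudioProcessor construction, and the config values are assumptions for illustration, only the AugmentWAV and load_wav calls mirror the code in this commit:

from TTS.utils.audio import AudioProcessor
from TTS.speaker_encoder.utils.generic_utils import AugmentWAV

ap = AudioProcessor(sample_rate=16000)  # assumed minimal AudioProcessor setup
augmentator = AugmentWAV(ap, {
    "p": 0.75,
    "rir": {"rir_path": "/data/RIRS_NOISES/simulated_rirs/", "conv_mode": "full"},
    "additive": {
        "sounds_path": "/data/musan/",
        "noise": {"min_snr_in_db": 0, "max_snr_in_db": 15, "min_num_noises": 1, "max_num_noises": 1},
    },
})

wav = ap.load_wav("/data/example.wav", sr=ap.sample_rate)
augmented = augmentator.apply_one(wav)  # picks RIR reverberation or one additive noise type at random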
@@ -112,7 +244,6 @@ def check_config_speaker_encoder(c):
    check_argument("storage", c, restricted=True, val_type=dict)
    check_argument("sample_from_storage_p", c["storage"], restricted=True, val_type=float, min_val=0.0, max_val=1.0)
    check_argument("storage_size", c["storage"], restricted=True, val_type=int, min_val=1, max_val=100)
-   check_argument("additive_noise", c["storage"], restricted=True, val_type=float, min_val=0.0, max_val=1.0)

    # datasets - checking only the first entry
    check_argument("datasets", c, restricted=True, val_type=list)
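The old storage-level "additive_noise" check is removed, and no check is added for the new "audio_augmentation" block, which stays optional thanks to the getattr fallback in setup_loader. If one wanted to validate at least its top-level probability with the same helper, a sketch in the style of the surrounding checks could look like this; it is not part of the commit:

check_argument("audio_augmentation", c, restricted=False, val_type=dict)
if "audio_augmentation" in c:
    check_argument("p", c["audio_augmentation"], restricted=True, val_type=float, min_val=0.0, max_val=1.0)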