From df180148e9fd7a8cfe428179bc6e7d0533cdd56a Mon Sep 17 00:00:00 2001 From: erogol Date: Wed, 9 Dec 2020 15:46:25 +0100 Subject: [PATCH] use noise augmentation in TTSDataset --- TTS/bin/train_glow_tts.py | 1 + TTS/tts/datasets/TTSDataset.py | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/TTS/bin/train_glow_tts.py b/TTS/bin/train_glow_tts.py index 70d0506a..f56dfb5e 100644 --- a/TTS/bin/train_glow_tts.py +++ b/TTS/bin/train_glow_tts.py @@ -57,6 +57,7 @@ def setup_loader(ap, r, is_val=False, verbose=False): use_phonemes=c.use_phonemes, phoneme_language=c.phoneme_language, enable_eos_bos=c.enable_eos_bos_chars, + use_noise_augment=not is_val, verbose=verbose, speaker_mapping=speaker_mapping if c.use_speaker_embedding and c.use_external_speaker_embedding_file else None) diff --git a/TTS/tts/datasets/TTSDataset.py b/TTS/tts/datasets/TTSDataset.py index 88545d45..38dd2890 100644 --- a/TTS/tts/datasets/TTSDataset.py +++ b/TTS/tts/datasets/TTSDataset.py @@ -30,6 +30,7 @@ class MyDataset(Dataset): phoneme_language="en-us", enable_eos_bos=False, speaker_mapping=None, + use_noise_augment=False, verbose=False): """ Args: @@ -48,6 +49,7 @@ class MyDataset(Dataset): phoneme_language (str): one the languages from https://github.com/bootphon/phonemizer#languages enable_eos_bos (bool): enable end of sentence and beginning of sentences characters. + use_noise_augment (bool): enable adding random noise to wav for augmentation. verbose (bool): print diagnostic information. """ self.batch_group_size = batch_group_size @@ -66,6 +68,7 @@ class MyDataset(Dataset): self.phoneme_language = phoneme_language self.enable_eos_bos = enable_eos_bos self.speaker_mapping = speaker_mapping + self.use_noise_augment = use_noise_augment self.verbose = verbose self.input_seq_computed = False if use_phonemes and not os.path.isdir(phoneme_cache_path): @@ -134,6 +137,10 @@ class MyDataset(Dataset): wav = np.asarray(self.load_wav(wav_file), dtype=np.float32) + # apply noise for augmentation + if self.use_noise_augment: + wav = wav + (1.0 / 32768.0) * np.random.rand(*wav.shape) + if not self.input_seq_computed: if self.use_phonemes: text = self._load_or_generate_phoneme_sequence(wav_file, text, self.phoneme_cache_path, self.enable_eos_bos, self.cleaners, self.phoneme_language, self.tp, self.add_blank)