use noise augmentation in TTSDataset

This commit is contained in:
erogol 2020-12-09 15:46:25 +01:00
parent a141a717e4
commit df180148e9
2 changed files with 8 additions and 0 deletions

View File

@ -57,6 +57,7 @@ def setup_loader(ap, r, is_val=False, verbose=False):
use_phonemes=c.use_phonemes,
phoneme_language=c.phoneme_language,
enable_eos_bos=c.enable_eos_bos_chars,
use_noise_augment=not is_val,
verbose=verbose,
speaker_mapping=speaker_mapping if c.use_speaker_embedding and c.use_external_speaker_embedding_file else None)

View File

@ -30,6 +30,7 @@ class MyDataset(Dataset):
phoneme_language="en-us",
enable_eos_bos=False,
speaker_mapping=None,
use_noise_augment=False,
verbose=False):
"""
Args:
@ -48,6 +49,7 @@ class MyDataset(Dataset):
phoneme_language (str): one of the languages from
https://github.com/bootphon/phonemizer#languages
enable_eos_bos (bool): enable end-of-sentence and beginning-of-sentence characters.
use_noise_augment (bool): enable adding random noise to wav for augmentation.
verbose (bool): print diagnostic information.
"""
self.batch_group_size = batch_group_size
@ -66,6 +68,7 @@ class MyDataset(Dataset):
self.phoneme_language = phoneme_language
self.enable_eos_bos = enable_eos_bos
self.speaker_mapping = speaker_mapping
self.use_noise_augment = use_noise_augment
self.verbose = verbose
self.input_seq_computed = False
if use_phonemes and not os.path.isdir(phoneme_cache_path):
@ -134,6 +137,10 @@ class MyDataset(Dataset):
wav = np.asarray(self.load_wav(wav_file), dtype=np.float32)
# add low-amplitude uniform random noise (scaled by 1/32768) to the waveform for augmentation
if self.use_noise_augment:
wav = wav + (1.0 / 32768.0) * np.random.rand(*wav.shape)
if not self.input_seq_computed:
if self.use_phonemes:
text = self._load_or_generate_phoneme_sequence(wav_file, text, self.phoneme_cache_path, self.enable_eos_bos, self.cleaners, self.phoneme_language, self.tp, self.add_blank)