sound normalization while reading, adapting get_speakers for multiple datasets

Eren Golge 2019-10-04 18:20:30 +02:00
parent 8dec2a9e95
commit 0849e3c42f
3 changed files with 11 additions and 8 deletions


@@ -43,10 +43,6 @@ print(" > Number of GPUs: ", num_gpus)
def setup_loader(ap, is_val=False, verbose=False):
global meta_data_train
global meta_data_eval
if "meta_data_train" not in globals():
meta_data_train, meta_data_eval = load_meta_data(c.datasets)
if is_val and not c.run_eval:
loader = None
else:
@@ -470,6 +466,7 @@ def evaluate(model, criterion, criterion_st, ap, global_step, epoch):
# FIXME: move args definition/parsing inside of main?
def main(args): # pylint: disable=redefined-outer-name
global meta_data_train, meta_data_eval
# Audio processor
ap = AudioProcessor(**c.audio)
@@ -479,8 +476,12 @@ def main(args): # pylint: disable=redefined-outer-name
c.distributed["backend"], c.distributed["url"])
num_chars = len(phonemes) if c.use_phonemes else len(symbols)
# load data instances
meta_data_train, meta_data_eval = load_meta_data(c.datasets)
# parse speakers
if c.use_speaker_embedding:
speakers = get_speakers(c.data_path, c.meta_file_train, c.dataset)
speakers = get_speakers(meta_data_train)
if args.restore_path:
prev_out_path = os.path.dirname(args.restore_path)
speaker_mapping = load_speaker_mapping(prev_out_path)
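
Taken together, the two hunks above move metadata loading out of setup_loader() and into main(): the datasets are parsed once from c.datasets and the resulting items are handed straight to get_speakers(). A rough sketch of the new flow, not verbatim repo code, with the import paths and the mapping/save step as assumptions:

# sketch only -- import paths are assumptions based on the functions used above
from TTS.datasets.preprocess import load_meta_data
from TTS.utils.speakers import get_speakers, save_speaker_mapping

# parse every configured dataset once; each item is assumed to be
# (text, wav_path, speaker_name), matching the e[2] lookup in get_speakers
meta_data_train, meta_data_eval = load_meta_data(c.datasets)

if c.use_speaker_embedding:
    speakers = get_speakers(meta_data_train)         # sorted unique speaker names
    speaker_mapping = {name: i for i, name in enumerate(speakers)}
    save_speaker_mapping(OUT_PATH, speaker_mapping)  # OUT_PATH is hypothetical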


@@ -24,6 +24,7 @@ class AudioProcessor(object):
clip_norm=True,
griffin_lim_iters=None,
do_trim_silence=False,
sound_norm=False,
**_):
print(" > Setting up Audio Processor...")
@@ -45,6 +46,7 @@ class AudioProcessor(object):
self.max_norm = 1.0 if max_norm is None else float(max_norm)
self.clip_norm = clip_norm
self.do_trim_silence = do_trim_silence
self.sound_norm = sound_norm
self.n_fft, self.hop_length, self.win_length = self._stft_parameters()
members = vars(self)
for key, value in members.items():
@@ -243,6 +245,8 @@ class AudioProcessor(object):
except ValueError:
print(f' [!] File cannot be trimmed for silence - {filename}')
assert self.sample_rate == sr, "%s vs %s"%(self.sample_rate, sr)
if self.sound_norm:
x = x / x.max() * 0.9
return x
@staticmethod
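
The new sound_norm flag adds peak normalization at load time. Assuming the hunk above sits inside the waveform-loading method (load_wav) and that the processor is built from the audio config as in main() above, enabling it would look roughly like this (the file path is made up):

# sketch only -- enables the new option through the audio config dict,
# mirroring ap = AudioProcessor(**c.audio) in the training script
c.audio["sound_norm"] = True
ap = AudioProcessor(**c.audio)
wav = ap.load_wav("/path/to/sample.wav")  # hypothetical path
# with sound_norm=True the loaded signal is rescaled as x / x.max() * 0.9,
# so its peak sits at 0.9 of full scale regardless of the recording level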


@@ -25,9 +25,7 @@ def save_speaker_mapping(out_path, speaker_mapping):
json.dump(speaker_mapping, f, indent=4)
def get_speakers(data_root, meta_file, dataset_type):
def get_speakers(items):
"""Returns a sorted, unique list of speakers in a given dataset."""
preprocessor = get_preprocessor_by_name(dataset_type)
items = preprocessor(data_root, meta_file)
speakers = {e[2] for e in items}
return sorted(speakers)
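
A small usage sketch of the reworked helper; the item layout (speaker name at index 2) follows the e[2] lookup above, while the concrete values are made up:

# sketch only -- items follow the (text, wav_path, speaker_name) layout
# implied by get_speakers(); all values below are invented for illustration
items = [
    ["hello world", "wavs/a_0001.wav", "speaker_a"],
    ["good morning", "wavs/b_0001.wav", "speaker_b"],
    ["good evening", "wavs/a_0002.wav", "speaker_a"],
]
print(get_speakers(items))  # ['speaker_a', 'speaker_b']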