mirror of https://github.com/coqui-ai/TTS.git
sound normalization while reading, adapting get_speakers for multiple datasets
This commit is contained in:
parent 8dec2a9e95
commit 0849e3c42f

train.py | 11
|
@@ -43,10 +43,6 @@ print(" > Number of GPUs: ", num_gpus)
 
 
 def setup_loader(ap, is_val=False, verbose=False):
-    global meta_data_train
-    global meta_data_eval
-    if "meta_data_train" not in globals():
-        meta_data_train, meta_data_eval = load_meta_data(c.datasets)
     if is_val and not c.run_eval:
         loader = None
     else:
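The lazy metadata loading removed here is not dropped: the same load_meta_data call reappears in main() in the hunks below, so the train/eval lists are built once per run instead of on the first setup_loader call.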
@@ -470,6 +466,7 @@ def evaluate(model, criterion, criterion_st, ap, global_step, epoch):
 
 
 # FIXME: move args definition/parsing inside of main?
 def main(args): # pylint: disable=redefined-outer-name
+    global meta_data_train, meta_data_eval
     # Audio processor
     ap = AudioProcessor(**c.audio)
@@ -479,8 +476,12 @@ def main(args): # pylint: disable=redefined-outer-name
                             c.distributed["backend"], c.distributed["url"])
     num_chars = len(phonemes) if c.use_phonemes else len(symbols)
+
+    # load data instances
+    meta_data_train, meta_data_eval = load_meta_data(c.datasets)
+
     # parse speakers
     if c.use_speaker_embedding:
-        speakers = get_speakers(c.data_path, c.meta_file_train, c.dataset)
+        speakers = get_speakers(meta_data_train)
         if args.restore_path:
             prev_out_path = os.path.dirname(args.restore_path)
             speaker_mapping = load_speaker_mapping(prev_out_path)

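Net effect of the train.py hunks: main() now owns dataset loading and publishes the metadata through module-level globals that setup_loader only reads. A minimal sketch of the resulting shape — c, load_meta_data, AudioProcessor, and get_speakers are the script's own names, while make_loader is a hypothetical stand-in for the real DataLoader construction:

meta_data_train = None  # populated once by main()
meta_data_eval = None

def setup_loader(ap, is_val=False, verbose=False):
    # reads the globals; no lazy load_meta_data call anymore
    if is_val and not c.run_eval:
        return None
    items = meta_data_eval if is_val else meta_data_train
    return make_loader(items, ap, verbose=verbose)  # hypothetical helper

def main(args):
    global meta_data_train, meta_data_eval
    ap = AudioProcessor(**c.audio)
    # load once, up front, for every dataset listed in the config
    meta_data_train, meta_data_eval = load_meta_data(c.datasets)
    if c.use_speaker_embedding:
        speakers = get_speakers(meta_data_train)
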
@@ -24,6 +24,7 @@ class AudioProcessor(object):
                  clip_norm=True,
                  griffin_lim_iters=None,
                  do_trim_silence=False,
+                 sound_norm=False,
                  **_):
 
         print(" > Setting up Audio Processor...")
@@ -45,6 +46,7 @@
         self.max_norm = 1.0 if max_norm is None else float(max_norm)
         self.clip_norm = clip_norm
         self.do_trim_silence = do_trim_silence
+        self.sound_norm = sound_norm
         self.n_fft, self.hop_length, self.win_length = self._stft_parameters()
         members = vars(self)
         for key, value in members.items():
@@ -243,6 +245,8 @@
             except ValueError:
                 print(f' [!] File cannot be trimmed for silence - {filename}')
         assert self.sample_rate == sr, "%s vs %s"%(self.sample_rate, sr)
+        if self.sound_norm:
+            x = x / x.max() * 0.9
         return x
 
     @staticmethod

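The new sound_norm flag rescales every loaded waveform so its positive peak sits at 0.9. The same arithmetic in isolation (standalone numpy sketch, values made up):

import numpy as np

x = np.array([0.05, 0.40, -0.20, 0.10], dtype=np.float32)  # quiet recording
y = x / x.max() * 0.9  # scale so the largest sample becomes 0.9
print(y.max())         # 0.9

Note the scale factor comes from the positive peak (x.max()) rather than the absolute peak (np.abs(x).max()), so a waveform whose largest excursion is negative can still leave [-0.9, 0.9] after normalization.
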
@@ -25,9 +25,7 @@ def save_speaker_mapping(out_path, speaker_mapping):
         json.dump(speaker_mapping, f, indent=4)
 
 
-def get_speakers(data_root, meta_file, dataset_type):
+def get_speakers(items):
     """Returns a sorted, unique list of speakers in a given dataset."""
-    preprocessor = get_preprocessor_by_name(dataset_type)
-    items = preprocessor(data_root, meta_file)
     speakers = {e[2] for e in items}
     return sorted(speakers)

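With the new signature, get_speakers no longer re-runs a dataset preprocessor; it consumes the already-loaded metadata items and relies only on each item's third field (e[2]) being the speaker name, which is what lets one call span multiple datasets. A small usage sketch (item layout inferred from the e[2] access above; values made up):

items = [
    ("hello world", "wavs/a.wav", "speaker_1"),
    ("good morning", "wavs/b.wav", "speaker_2"),
    ("goodbye", "wavs/c.wav", "speaker_1"),
]

speakers = sorted({e[2] for e in items})
print(speakers)  # ['speaker_1', 'speaker_2']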