Make phoneme training configurable through config.json

This commit is contained in:
Eren Golge 2019-01-16 13:07:03 +01:00
parent 8d87791470
commit b241104778
3 changed files with 47 additions and 22 deletions

View File

@ -51,5 +51,8 @@
"max_seq_len": 300, // DATASET-RELATED: maximum text length "max_seq_len": 300, // DATASET-RELATED: maximum text length
"output_path": "/media/erogol/data_ssd/Data/models/en_UK/", // DATASET-RELATED: output path for all training outputs. "output_path": "/media/erogol/data_ssd/Data/models/en_UK/", // DATASET-RELATED: output path for all training outputs.
"num_loader_workers": 8, // number of training data loader processes. Don't set it too big. 4-8 are good values. "num_loader_workers": 8, // number of training data loader processes. Don't set it too big. 4-8 are good values.
"num_val_loader_workers": 4 // number of evaluation data loader processes. "num_val_loader_workers": 4, // number of evaluation data loader processes.
"phoneme_cache_path": "tmp_en_uk", // phoneme computation is slow, therefore, it caches results in the given folder.
"use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronunciation.
"phoneme_language": "en-us" // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages
} }

View File

@ -23,7 +23,9 @@ class MyDataset(Dataset):
min_seq_len=0, min_seq_len=0,
max_seq_len=float("inf"), max_seq_len=float("inf"),
cached=False, cached=False,
phoneme_cache_path=None): use_phonemes=True,
phoneme_cache_path=None,
phoneme_language="en-us"):
""" """
Args: Args:
root_path (str): root path for the data folder. root_path (str): root path for the data folder.
@ -41,7 +43,10 @@ class MyDataset(Dataset):
max_seq_len (int): (float("inf")) maximum sequence length. max_seq_len (int): (float("inf")) maximum sequence length.
cached (bool): (false) true if the given data path is created cached (bool): (false) true if the given data path is created
by extract_features.py. by extract_features.py.
use_phonemes (bool): (true) if true, text converted to phonemes.
phoneme_cache_path (str): path to cache phoneme features. phoneme_cache_path (str): path to cache phoneme features.
phoneme_language (str): one of the languages from
https://github.com/bootphon/phonemizer#languages
""" """
self.root_path = root_path self.root_path = root_path
self.batch_group_size = batch_group_size self.batch_group_size = batch_group_size
@ -53,9 +58,16 @@ class MyDataset(Dataset):
self.max_seq_len = max_seq_len self.max_seq_len = max_seq_len
self.ap = ap self.ap = ap
self.cached = cached self.cached = cached
self.use_phonemes = use_phonemes
self.phoneme_cache_path = phoneme_cache_path self.phoneme_cache_path = phoneme_cache_path
self.phoneme_language = phoneme_language
if not os.path.isdir(phoneme_cache_path):
os.makedirs(phoneme_cache_path)
print(" > DataLoader initialization") print(" > DataLoader initialization")
print(" | > Data path: {}".format(root_path)) print(" | > Data path: {}".format(root_path))
print(" | > Use phonemes: {}".format(self.use_phonemes))
if use_phonemes:
print(" | > phoneme language: {}".format(phoneme_language))
print(" | > Cached dataset: {}".format(self.cached)) print(" | > Cached dataset: {}".format(self.cached))
print(" | > Number of instances : {}".format(len(self.items))) print(" | > Number of instances : {}".format(len(self.items)))
@ -72,33 +84,42 @@ class MyDataset(Dataset):
data = np.load(filename).astype('float32') data = np.load(filename).astype('float32')
return data return data
def load_data(self, idx): def load_phoneme_sequence(self, wav_file, text):
if self.cached:
wav_name = self.items[idx][1]
mel_name = self.items[idx][2]
linear_name = self.items[idx][3]
text = self.items[idx][0]
text = np.asarray(
text_to_sequence(text, [self.cleaners]), dtype=np.int32)
if wav_name.split('.')[-1] == 'npy':
wav = self.load_np(wav_name)
else:
wav = np.asarray(self.load_wav(wav_name), dtype=np.float32)
mel = self.load_np(mel_name)
linear = self.load_np(linear_name)
sample = {'text': text, 'wav': wav, 'item_idx': self.items[idx][1], 'mel':mel, 'linear': linear}
else:
text, wav_file = self.items[idx]
file_name = os.path.basename(wav_file).split('.')[0] file_name = os.path.basename(wav_file).split('.')[0]
tmp_path = os.path.join(self.phoneme_cache_path, file_name+'_phoneme.npy') tmp_path = os.path.join(self.phoneme_cache_path, file_name+'_phoneme.npy')
if os.path.isfile(tmp_path): if os.path.isfile(tmp_path):
text = np.load(tmp_path) text = np.load(tmp_path)
else: else:
text = np.asarray( text = np.asarray(
phoneme_to_sequence(text, [self.cleaners]), dtype=np.int32) phoneme_to_sequence(text, [self.cleaners], language=self.phoneme_language), dtype=np.int32)
np.save(tmp_path, text) np.save(tmp_path, text)
return text
def load_data(self, idx):
if self.cached:
wav_name = self.items[idx][1]
mel_name = self.items[idx][2]
linear_name = self.items[idx][3]
text = self.items[idx][0]
if wav_name.split('.')[-1] == 'npy':
wav = self.load_np(wav_name)
else:
wav = np.asarray(self.load_wav(wav_name), dtype=np.float32)
mel = self.load_np(mel_name)
linear = self.load_np(linear_name)
else:
text, wav_file = self.items[idx]
wav = np.asarray(self.load_wav(wav_file), dtype=np.float32) wav = np.asarray(self.load_wav(wav_file), dtype=np.float32)
sample = {'text': text, 'wav': wav, 'item_idx': self.items[idx][1]} mel = None
linear = None
if self.use_phonemes:
text = self.load_phoneme_sequence(wav_file, text)
else:
text = np.asarray(
text_to_sequence(text, [self.cleaners]), dtype=np.int32)
sample = {'text': text, 'wav': wav, 'item_idx': self.items[idx][1], 'mel':mel, 'linear': linear}
return sample return sample
def sort_items(self): def sort_items(self):
@ -157,12 +178,13 @@ class MyDataset(Dataset):
text_lenghts = np.array([len(x) for x in text]) text_lenghts = np.array([len(x) for x in text])
max_text_len = np.max(text_lenghts) max_text_len = np.max(text_lenghts)
if self.cached: # if specs are not computed, compute them.
mel = [d['mel'] for d in batch] if batch[0]['mel'] is None and batch[0]['linear'] is None:
linear = [d['linear'] for d in batch]
else:
mel = [self.ap.melspectrogram(w).astype('float32') for w in wav] mel = [self.ap.melspectrogram(w).astype('float32') for w in wav]
linear = [self.ap.spectrogram(w).astype('float32') for w in wav] linear = [self.ap.spectrogram(w).astype('float32') for w in wav]
else:
mel = [d['mel'] for d in batch]
linear = [d['linear'] for d in batch]
mel_lengths = [m.shape[1] + 1 for m in mel] # +1 for zero-frame mel_lengths = [m.shape[1] + 1 for m in mel] # +1 for zero-frame
# compute 'stop token' targets # compute 'stop token' targets

View File

@ -68,14 +68,14 @@ def sequence_to_phoneme(sequence):
return result.replace('}{', ' ') return result.replace('}{', ' ')
def text2phone(text): def text2phone(text, language):
''' '''
Convert graphemes to phonemes. Convert graphemes to phonemes.
''' '''
seperator = phonemizer.separator.Separator(' ', '', '|') seperator = phonemizer.separator.Separator(' ', '', '|')
#try: #try:
punctuations = re.findall(pat, text) punctuations = re.findall(pat, text)
ph = phonemizer.phonemize(text, separator=seperator, strip=False, njobs=1, backend='espeak', language='en-us') ph = phonemize(text, separator=seperator, strip=False, njobs=1, backend='espeak', language=language)
# Replace \n with matching punctuations. # Replace \n with matching punctuations.
if len(punctuations) > 0: if len(punctuations) > 0:
for punct in punctuations[:-1]: for punct in punctuations[:-1]:
@ -87,13 +87,13 @@ def text2phone(text):
return ph return ph
def phoneme_to_sequence(text, cleaner_names): def phoneme_to_sequence(text, cleaner_names, language):
''' '''
TODO: This ignores punctuations TODO: This ignores punctuations
''' '''
sequence = [] sequence = []
clean_text = _clean_text(text, cleaner_names) clean_text = _clean_text(text, cleaner_names)
phonemes = text2phone(clean_text) phonemes = text2phone(clean_text, language)
# print(phonemes.replace('|', '')) # print(phonemes.replace('|', ''))
if phonemes is None: if phonemes is None:
print("!! After phoneme conversion the result is None. -- {} ".format(clean_text)) print("!! After phoneme conversion the result is None. -- {} ".format(clean_text))