add blank token in sequence to improve GlowTTS results

Edresson 2020-10-25 15:08:28 -03:00
parent fbea058c59
commit d9540a5857
7 changed files with 26 additions and 7 deletions

View File

@@ -47,6 +47,7 @@ def setup_loader(ap, r, is_val=False, verbose=False, speaker_mapping=None):
         meta_data=meta_data_eval if is_val else meta_data_train,
         ap=ap,
         tp=c.characters if 'characters' in c.keys() else None,
+        add_blank=c['add_blank'] if 'add_blank' in c.keys() else False,
         batch_group_size=0 if is_val else c.batch_group_size *
         c.batch_size,
         min_seq_len=c.min_seq_len,

View File

@@ -51,6 +51,7 @@ def setup_loader(ap, r, is_val=False, verbose=False, speaker_mapping=None):
         meta_data=meta_data_eval if is_val else meta_data_train,
         ap=ap,
         tp=c.characters if 'characters' in c.keys() else None,
+        add_blank=c['add_blank'] if 'add_blank' in c.keys() else False,
         batch_group_size=0 if is_val else c.batch_group_size *
         c.batch_size,
         min_seq_len=c.min_seq_len,
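
Both training scripts read the new option with a guarded lookup so that older configs without the key keep their previous behavior. A minimal sketch of the pattern, assuming for illustration that c is a plain dict (in the scripts it is the parsed config object):

    # Hypothetical legacy config that predates the "add_blank" key
    c = {"batch_size": 32, "min_seq_len": 3}

    # Same guarded lookup as in the diff: a missing key falls back to False
    add_blank = c['add_blank'] if 'add_blank' in c.keys() else False
    print(add_blank)  # False -> legacy configs train exactly as before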

View File

@@ -51,6 +51,8 @@
     //     "phonemes":"iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻʘɓǀɗǃʄǂɠǁʛpbtdʈɖcɟkɡʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟˈˌːˑʍwɥʜʢʡɕʑɺɧɚ˞ɫ"
     // },
+
+    "add_blank": false, // if true, add a blank token after each token of the sentence. This increases the size of the input sequence, but considerably improves the prosody of the GlowTTS model.
     // DISTRIBUTED TRAINING
     "distributed":{
         "backend": "nccl",

View File

@@ -51,6 +51,8 @@
     //     "phonemes":"iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻʘɓǀɗǃʄǂɠǁʛpbtdʈɖcɟkɡʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟˈˌːˑʍwɥʜʢʡɕʑɺɧɚ˞ɫ"
     // },
+
+    "add_blank": false, // if true, add a blank token after each token of the sentence. This increases the size of the input sequence, but considerably improves the prosody of the GlowTTS model.
     // DISTRIBUTED TRAINING
     "distributed":{
         "backend": "nccl",

View File

@@ -17,6 +17,7 @@ class MyDataset(Dataset):
                  ap,
                  meta_data,
                  tp=None,
+                 add_blank=False,
                  batch_group_size=0,
                  min_seq_len=0,
                  max_seq_len=float("inf"),
@@ -55,6 +56,7 @@ class MyDataset(Dataset):
         self.max_seq_len = max_seq_len
         self.ap = ap
         self.tp = tp
+        self.add_blank = add_blank
         self.use_phonemes = use_phonemes
         self.phoneme_cache_path = phoneme_cache_path
         self.phoneme_language = phoneme_language
@@ -88,7 +90,7 @@ class MyDataset(Dataset):
         phonemes = phoneme_to_sequence(text, [self.cleaners],
                                        language=self.phoneme_language,
                                        enable_eos_bos=False,
-                                       tp=self.tp)
+                                       tp=self.tp, add_blank=self.add_blank)
         phonemes = np.asarray(phonemes, dtype=np.int32)
         np.save(cache_path, phonemes)
         return phonemes
@@ -127,7 +129,7 @@ class MyDataset(Dataset):
             text = self._load_or_generate_phoneme_sequence(wav_file, text)
         else:
             text = np.asarray(text_to_sequence(text, [self.cleaners],
-                                               tp=self.tp),
+                                               tp=self.tp, add_blank=self.add_blank),
                               dtype=np.int32)
         assert text.size > 0, self.items[idx][1]
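
A caveat that follows from the caching above: phoneme sequences are computed once, saved with np.save, and reloaded on subsequent runs, so sequences cached with a different add_blank setting will not match the current flag. Presumably the phoneme cache directory has to be cleared whenever the flag is toggled.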

View File

@@ -14,10 +14,13 @@ def text_to_seqvec(text, CONFIG):
         seq = np.asarray(
             phoneme_to_sequence(text, text_cleaner, CONFIG.phoneme_language,
                                 CONFIG.enable_eos_bos_chars,
-                                tp=CONFIG.characters if 'characters' in CONFIG.keys() else None),
+                                tp=CONFIG.characters if 'characters' in CONFIG.keys() else None,
+                                add_blank=CONFIG['add_blank'] if 'add_blank' in CONFIG.keys() else False),
             dtype=np.int32)
     else:
-        seq = np.asarray(text_to_sequence(text, text_cleaner, tp=CONFIG.characters if 'characters' in CONFIG.keys() else None), dtype=np.int32)
+        seq = np.asarray(
+            text_to_sequence(text, text_cleaner, tp=CONFIG.characters if 'characters' in CONFIG.keys() else None,
+                             add_blank=CONFIG['add_blank'] if 'add_blank' in CONFIG.keys() else False), dtype=np.int32)
     return seq
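
Synthesis applies the same guarded lookup, which keeps old configs working but also means the setting silently falls back to False if the key is missing. A checkpoint trained with add_blank enabled therefore has to be synthesized from a config that still carries the flag; otherwise the model receives sequences without blanks, roughly half the length it saw during training, and alignment quality will presumably suffer.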

View File

@@ -57,6 +57,10 @@ def text2phone(text, language):
     return ph

+def intersperse(sequence, token):
+    result = [token] * (len(sequence) * 2 + 1)
+    result[1::2] = sequence
+    return result

 def pad_with_eos_bos(phoneme_sequence, tp=None):
     # pylint: disable=global-statement
@@ -69,8 +73,7 @@ def pad_with_eos_bos(phoneme_sequence, tp=None):
     return [_phonemes_to_id[_bos]] + list(phoneme_sequence) + [_phonemes_to_id[_eos]]

-
-def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False, tp=None):
+def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False, tp=None, add_blank=False):
     # pylint: disable=global-statement
     global _phonemes_to_id
     if tp:
@@ -88,6 +91,8 @@ def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False, tp=
     # Append EOS char
     if enable_eos_bos:
         sequence = pad_with_eos_bos(sequence, tp=tp)
+    if add_blank:
+        sequence = intersperse(sequence, len(_phonemes))  # add a blank token whose id is len(_phonemes)
     return sequence
@@ -107,7 +112,7 @@ def sequence_to_phoneme(sequence, tp=None):
     return result.replace('}{', ' ')


-def text_to_sequence(text, cleaner_names, tp=None):
+def text_to_sequence(text, cleaner_names, tp=None, add_blank=False):
     '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.

     The text can optionally have ARPAbet sequences enclosed in curly braces embedded
@@ -137,6 +142,9 @@ def text_to_sequence(text, cleaner_names, tp=None):
                              _clean_text(m.group(1), cleaner_names))
         sequence += _arpabet_to_sequence(m.group(2))
         text = m.group(3)
+
+    if add_blank:
+        sequence = intersperse(sequence, len(_symbols))  # add a blank token whose id is len(_symbols)
     return sequence
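
The intersperse helper is the heart of the commit, so a standalone sketch with a worked example may help; the ids below are toy values, not repo data:

    # Standalone copy of the helper added above, plus a worked example.
    def intersperse(sequence, token):
        # Allocate 2*len(sequence) + 1 slots filled with the blank token,
        # then overwrite the odd positions with the original ids, leaving
        # a blank before, between, and after every real token.
        result = [token] * (len(sequence) * 2 + 1)
        result[1::2] = sequence
        return result

    print(intersperse([5, 3, 9], 130))  # [130, 5, 130, 3, 130, 9, 130]

Since the blank id is len(_phonemes) (or len(_symbols) for character input), it lies one past the largest existing symbol id, so the model's embedding table presumably needs one extra entry whenever the flag is enabled.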