mirror of https://github.com/coqui-ai/TTS.git

add blank token in sequence to improve GlowTTS results

This commit is contained in:
parent fbea058c59
commit d9540a5857
@@ -47,6 +47,7 @@ def setup_loader(ap, r, is_val=False, verbose=False, speaker_mapping=None):
         meta_data=meta_data_eval if is_val else meta_data_train,
         ap=ap,
         tp=c.characters if 'characters' in c.keys() else None,
+        add_blank=c['add_blank'] if 'add_blank' in c.keys() else False,
         batch_group_size=0 if is_val else c.batch_group_size *
         c.batch_size,
         min_seq_len=c.min_seq_len,
@@ -51,6 +51,7 @@ def setup_loader(ap, r, is_val=False, verbose=False, speaker_mapping=None):
         meta_data=meta_data_eval if is_val else meta_data_train,
         ap=ap,
         tp=c.characters if 'characters' in c.keys() else None,
+        add_blank=c['add_blank'] if 'add_blank' in c.keys() else False,
         batch_group_size=0 if is_val else c.batch_group_size *
         c.batch_size,
         min_seq_len=c.min_seq_len,
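
Both setup_loader hunks read the new option defensively, so configs that predate this commit keep working. A minimal sketch of that fallback pattern, with c standing in for a loaded config dict (hypothetical contents):

    # Sketch of the defensive lookup used in the two hunks above.
    c = {"batch_size": 32}  # hypothetical older config without an "add_blank" key
    add_blank = c['add_blank'] if 'add_blank' in c.keys() else False
    print(add_blank)  # False -> old configs keep the previous behavior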
@@ -51,6 +51,8 @@
     // "phonemes":"iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻʘɓǀɗǃʄǂɠǁʛpbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟˈˌːˑʍwɥʜʢʡɕʑɺɧɚ˞ɫ"
     // },
 
+    "add_blank": false, // if true, add a blank token after each token of the sentence. This increases the length of the input sequence, but considerably improves the prosody of the GlowTTS model.
+
     // DISTRIBUTED TRAINING
     "distributed":{
         "backend": "nccl",
@@ -51,6 +51,8 @@
     // "phonemes":"iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻʘɓǀɗǃʄǂɠǁʛpbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟˈˌːˑʍwɥʜʢʡɕʑɺɧɚ˞ɫ"
     // },
 
+    "add_blank": false, // if true, add a blank token after each token of the sentence. This increases the length of the input sequence, but considerably improves the prosody of the GlowTTS model.
+
     // DISTRIBUTED TRAINING
     "distributed":{
         "backend": "nccl",
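
For scale, the growth mentioned in the new config comment is easy to quantify: interspersing a blank turns an N-token input into 2N + 1 tokens. A toy computation (the sentence length is an arbitrary assumption):

    # A 50-token sentence becomes 101 model inputs once blanks are interspersed.
    n_tokens = 50
    print(2 * n_tokens + 1)  # 101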
@@ -17,6 +17,7 @@ class MyDataset(Dataset):
                  ap,
                  meta_data,
                  tp=None,
+                 add_blank=False,
                  batch_group_size=0,
                  min_seq_len=0,
                  max_seq_len=float("inf"),
@@ -55,6 +56,7 @@ class MyDataset(Dataset):
         self.max_seq_len = max_seq_len
         self.ap = ap
         self.tp = tp
+        self.add_blank = add_blank
         self.use_phonemes = use_phonemes
         self.phoneme_cache_path = phoneme_cache_path
         self.phoneme_language = phoneme_language
@@ -88,7 +90,7 @@ class MyDataset(Dataset):
             phonemes = phoneme_to_sequence(text, [self.cleaners],
                                            language=self.phoneme_language,
                                            enable_eos_bos=False,
-                                           tp=self.tp)
+                                           tp=self.tp, add_blank=self.add_blank)
             phonemes = np.asarray(phonemes, dtype=np.int32)
             np.save(cache_path, phonemes)
         return phonemes
@@ -127,7 +129,7 @@ class MyDataset(Dataset):
             text = self._load_or_generate_phoneme_sequence(wav_file, text)
         else:
             text = np.asarray(text_to_sequence(text, [self.cleaners],
-                                               tp=self.tp),
+                                               tp=self.tp, add_blank=self.add_blank),
                               dtype=np.int32)
 
         assert text.size > 0, self.items[idx][1]
@@ -14,10 +14,13 @@ def text_to_seqvec(text, CONFIG):
         seq = np.asarray(
             phoneme_to_sequence(text, text_cleaner, CONFIG.phoneme_language,
                                 CONFIG.enable_eos_bos_chars,
-                                tp=CONFIG.characters if 'characters' in CONFIG.keys() else None),
+                                tp=CONFIG.characters if 'characters' in CONFIG.keys() else None,
+                                add_blank=CONFIG['add_blank'] if 'add_blank' in CONFIG.keys() else False),
             dtype=np.int32)
     else:
-        seq = np.asarray(text_to_sequence(text, text_cleaner, tp=CONFIG.characters if 'characters' in CONFIG.keys() else None), dtype=np.int32)
+        seq = np.asarray(
+            text_to_sequence(text, text_cleaner, tp=CONFIG.characters if 'characters' in CONFIG.keys() else None,
+                             add_blank=CONFIG['add_blank'] if 'add_blank' in CONFIG.keys() else False), dtype=np.int32)
     return seq
 
 
@@ -57,6 +57,10 @@ def text2phone(text, language):
 
     return ph
 
+def intersperse(sequence, token):
+    result = [token] * (len(sequence) * 2 + 1)
+    result[1::2] = sequence
+    return result
 
 def pad_with_eos_bos(phoneme_sequence, tp=None):
     # pylint: disable=global-statement
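
The new intersperse helper places the blank token before, between, and after the original tokens, so a length-N sequence becomes length 2N + 1. A standalone sketch of its behavior:

    def intersperse(sequence, token):
        # Pre-fill every slot with the blank token, then write the original
        # tokens into the odd positions: [t, s0, t, s1, ..., t, s(N-1), t].
        result = [token] * (len(sequence) * 2 + 1)
        result[1::2] = sequence
        return result

    print(intersperse([7, 8, 9], 0))  # [0, 7, 0, 8, 0, 9, 0]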
@@ -69,8 +73,7 @@ def pad_with_eos_bos(phoneme_sequence, tp=None):
 
     return [_phonemes_to_id[_bos]] + list(phoneme_sequence) + [_phonemes_to_id[_eos]]
 
-
-def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False, tp=None):
+def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False, tp=None, add_blank=False):
     # pylint: disable=global-statement
     global _phonemes_to_id
     if tp:
@@ -88,6 +91,8 @@ def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False, tp=
     # Append EOS char
     if enable_eos_bos:
         sequence = pad_with_eos_bos(sequence, tp=tp)
+    if add_blank:
+        sequence = intersperse(sequence, len(_phonemes))  # add a new blank token; its id is len(_phonemes)
     return sequence
 
 
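
Note that the blank id is len(_phonemes), one past the largest real phoneme id, so any embedding table consuming these sequences must hold len(_phonemes) + 1 entries once add_blank is enabled. A toy illustration with a hypothetical three-symbol vocabulary:

    def intersperse(seq, tok):  # as defined in the earlier hunk
        out = [tok] * (len(seq) * 2 + 1)
        out[1::2] = seq
        return out

    _phonemes = ['a', 'b', 'c']              # hypothetical miniature phoneme set
    blank_id = len(_phonemes)                # 3: an id no real phoneme uses
    print(intersperse([0, 1, 2], blank_id))  # [3, 0, 3, 1, 3, 2, 3]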
@@ -107,7 +112,7 @@ def sequence_to_phoneme(sequence, tp=None):
     return result.replace('}{', ' ')
 
 
-def text_to_sequence(text, cleaner_names, tp=None):
+def text_to_sequence(text, cleaner_names, tp=None, add_blank=False):
     '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
 
     The text can optionally have ARPAbet sequences enclosed in curly braces embedded
@@ -137,6 +142,9 @@ def text_to_sequence(text, cleaner_names, tp=None):
                 _clean_text(m.group(1), cleaner_names))
         sequence += _arpabet_to_sequence(m.group(2))
         text = m.group(3)
 
+    if add_blank:
+        sequence = intersperse(sequence, len(_symbols))  # add a new blank token; its id is len(_symbols)
+
     return sequence
 