mirror of https://github.com/coqui-ai/TTS.git
76 lines
2.6 KiB
Python
76 lines
2.6 KiB
Python
# -*- coding: utf-8 -*-
"""
Defines the set of symbols used in text input to the model.

The default is a set of ASCII characters that works well for English or text that has been run
through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details.
"""
|
||
|
||
|
||
def make_symbols(
    characters,
    phonemes=None,
    punctuations="!'(),-.:;? ",
    pad="_",
    eos="~",
    bos="^",
    unique=True,
):  # pylint: disable=redefined-outer-name
    """Build the character and phoneme symbol lists.

    Args:
        characters: Iterable of characters making up the grapheme inventory.
        phonemes: Optional iterable of phoneme characters. When ``None`` no
            phoneme list is built.
        punctuations: Characters appended to the end of the phoneme list.
        pad: Padding symbol.
        eos: End-of-sequence symbol.
        bos: Beginning-of-sequence symbol.
        unique: When true, duplicate phonemes are removed before sorting;
            when false, duplicates are kept (retained so that previously
            trained models stay compatible).

    Returns:
        A ``(symbols, phonemes)`` tuple. ``symbols`` is a list starting with
        whichever of ``pad``, ``eos``, ``bos`` are set (in that order)
        followed by ``characters``. ``phonemes`` is ``None`` when no phonemes
        were given, otherwise ``[pad, eos, bos]`` + sorted phonemes +
        ``punctuations``.

    TODO: create phonemes_to_id and symbols_to_id dicts here.
    """
    _symbols = list(characters)
    # Prepend in reverse order so the final layout is pad, eos, bos, <chars>.
    # Each special symbol is tested on its own, and the None test comes first
    # so that passing None never reaches len().
    for special in (bos, eos, pad):
        if special is not None and len(special) > 0:
            _symbols = [special] + _symbols

    _phonemes = None
    if phonemes is not None:
        # Sorting gives a deterministic ordering of the phoneme inventory.
        _phonemes_sorted = sorted(set(phonemes)) if unique else sorted(phonemes)
        # The phoneme list always carries the three special symbols as given,
        # unlike the character list above, so existing phoneme IDs do not move.
        _phonemes = [pad, eos, bos] + _phonemes_sorted + list(punctuations)
    return _symbols, _phonemes
|
||
|
||
|
||
# Special symbols: padding, end-of-sequence and beginning-of-sequence.
_pad = "_"
_eos = "~"
_bos = "^"

# Default grapheme inventory (ASCII letters plus punctuation and space).
_characters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),-.:;? "
_punctuations = "!'(),-.:;? "

# Phonemes definition (All IPA characters)
_vowels = "iyɨʉɯuɪʏʊeøɘəɵɤoɛœɜɞʌɔæɐaɶɑɒᵻ"
_non_pulmonic_consonants = "ʘɓǀɗǃʄǂɠǁʛ"
_pulmonic_consonants = "pbtdʈɖcɟkɡqɢʔɴŋɲɳnɱmʙrʀⱱɾɽɸβfvθðszʃʒʂʐçʝxɣχʁħʕhɦɬɮʋɹɻjɰlɭʎʟ"
_suprasegmentals = "ˈˌːˑ"
_other_symbols = "ʍwɥʜʢʡɕʑɺɧʲ"
# NOTE: the name is misspelled ("diacritics"); kept as-is because it is a
# module-level name that other code may reference.
_diacrilics = "ɚ˞ɫ"

# Full phoneme inventory: every IPA group above, concatenated in order.
_phonemes = "".join(
    (
        _vowels,
        _non_pulmonic_consonants,
        _pulmonic_consonants,
        _suprasegmentals,
        _other_symbols,
        _diacrilics,
    )
)

# Default symbol tables built from the inventories above.
symbols, phonemes = make_symbols(
    _characters,
    _phonemes,
    punctuations=_punctuations,
    pad=_pad,
    eos=_eos,
    bos=_bos,
)
|
||
|
||
# Generate ALIEN language
|
||
# from random import shuffle
|
||
# shuffle(phonemes)
|
||
|
||
|
||
def parse_symbols():
    """Return the module's default symbol settings as a dictionary.

    Returns:
        A dict with the keys ``pad``, ``eos``, ``bos``, ``characters``,
        ``punctuations`` and ``phonemes``, each mapped to the corresponding
        module-level default string.
    """
    symbol_config = dict(
        pad=_pad,
        eos=_eos,
        bos=_bos,
        characters=_characters,
        punctuations=_punctuations,
        phonemes=_phonemes,
    )
    return symbol_config
|
||
|
||
|
||
if __name__ == "__main__":
    # Print the size and contents of both default symbol tables.
    symbol_count = len(symbols)
    print(" > TTS symbols {}".format(symbol_count))
    print(symbols)

    phoneme_count = len(phonemes)
    print(" > TTS phonemes {}".format(phoneme_count))
    ordered_phonemes = sorted(phonemes)
    print("".join(ordered_phonemes))
|