Add `unique` param to keep scglow models compatible (there are duplicate symbols in the character set)

This commit is contained in:
Eren Gölge 2021-04-16 18:29:52 +02:00
parent c955a12428
commit 99dc07a7dd
1 changed files with 4 additions and 3 deletions

View File

@ -8,16 +8,17 @@ through Unidecode. For other data, you can modify _characters. See TRAINING_DATA
def make_symbols(
characters, phonemes=None, punctuations="!'(),-.:;? ", pad="_", eos="~", bos="^"
characters, phonemes=None, punctuations="!'(),-.:;? ", pad="_", eos="~", bos="^", unique=True,
): # pylint: disable=redefined-outer-name
""" Function to create symbols and phonemes """
""" Function to create symbols and phonemes
TODO: create phonemes_to_id and symbols_to_id dicts here."""
_symbols = list(characters)
_symbols = [bos] + _symbols if len(bos) > 0 and bos is not None else _symbols
_symbols = [eos] + _symbols if len(bos) > 0 and eos is not None else _symbols
_symbols = [pad] + _symbols if len(bos) > 0 and pad is not None else _symbols
_phonemes = None
if phonemes is not None:
_phonemes_sorted = sorted(list(set(phonemes)))
_phonemes_sorted = sorted(list(set(phonemes))) if unique else sorted(list(phonemes)) # this is to keep previous models compatible.
# Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters):
_arpabet = ["@" + s for s in _phonemes_sorted]
# Export all symbols: