Allow None pad and blank tokens

This commit is contained in:
Eren Gölge 2021-12-07 12:52:45 +00:00
parent c9972e6f14
commit 8649d4fd36
1 changed file with 33 additions and 15 deletions

View File

@ -57,8 +57,8 @@ class TTSTokenizer:
@characters.setter
def characters(self, new_characters):
    """Replace the character set and refresh the cached pad/blank token IDs.

    Args:
        new_characters: Character-set object exposing ``pad``, ``blank`` and
            ``char_to_id``. Stored as ``self._characters``.
    """
    self._characters = new_characters
    # A falsy ``pad`` / ``blank`` (e.g. None) means the character set defines no
    # such token; keep the ID as None instead of looking up a missing symbol.
    self.pad_id = self.characters.char_to_id(self.characters.pad) if self.characters.pad else None
    self.blank_id = self.characters.char_to_id(self.characters.blank) if self.characters.blank else None
def encode(self, text: str) -> List[int]: def encode(self, text: str) -> List[int]:
"""Encodes a string of text as a sequence of IDs.""" """Encodes a string of text as a sequence of IDs."""
@ -82,7 +82,7 @@ class TTSTokenizer:
text += self.characters.id_to_char(token_id) text += self.characters.id_to_char(token_id)
return text return text
def text_to_ids(self, text: str, language: str = None) -> List[int]: def text_to_ids(self, text: str, language: str = None) -> List[int]: # pylint: disable=unused-argument
"""Converts a string of text to a sequence of token IDs. """Converts a string of text to a sequence of token IDs.
Args: Args:
@ -137,32 +137,50 @@ class TTSTokenizer:
print(f"{indent}| > {char}") print(f"{indent}| > {char}")
@staticmethod
def init_from_config(config: "Coqpit", characters: "BaseCharacters" = None):
    """Init Tokenizer object from config

    Args:
        config (Coqpit): Coqpit model config.
        characters (BaseCharacters): Defines the model character set. If not set, use the default options based on
            the config values. Defaults to None.

    Returns:
        tuple: ``(tokenizer, new_config)`` where ``tokenizer`` is the constructed ``TTSTokenizer`` and
            ``new_config`` is the second value returned by the character set's ``init_from_config``.

    Raises:
        ValueError: If ``config.use_phonemes`` is set, no phonemizer is named in the config, and resolving
            the default phonemizer for ``config.phoneme_language`` raises ``KeyError``.
    """
    # init cleaners
    # Bind a default first: without it, any ``config.text_cleaner`` that is not a
    # str/list (e.g. None) left the name unbound and the constructor call below
    # raised UnboundLocalError.
    # NOTE(review): assumes TTSTokenizer accepts ``None`` as its cleaner - confirm.
    text_cleaner = None
    if isinstance(config.text_cleaner, (str, list)):
        # NOTE(review): ``getattr`` requires a str attribute name, so a list value
        # would raise TypeError here - confirm whether lists are really expected.
        text_cleaner = getattr(cleaners, config.text_cleaner)

    # init characters
    if characters is None:
        if config.use_phonemes:
            # init phoneme set
            characters, new_config = IPAPhonemes().init_from_config(config)
        else:
            # init character set
            characters, new_config = Graphemes().init_from_config(config)
    else:
        characters, new_config = characters.init_from_config(config)

    # init phonemizer
    phonemizer = None
    if config.use_phonemes:
        phonemizer_kwargs = {"language": config.phoneme_language}

        if "phonemizer" in config and config.phonemizer:
            # An explicitly configured phonemizer takes precedence over the language default.
            phonemizer = get_phonemizer_by_name(config.phonemizer, **phonemizer_kwargs)
        else:
            try:
                phonemizer = get_phonemizer_by_name(
                    DEF_LANG_TO_PHONEMIZER[config.phoneme_language], **phonemizer_kwargs
                )
            except KeyError as e:
                # Surface a readable error instead of a bare KeyError from the lookup.
                raise ValueError(
                    f"No phonemizer found for language {config.phoneme_language}.\n"
                    "You may need to install a third party library for this language."
                ) from e

    return (
        TTSTokenizer(
            config.use_phonemes, text_cleaner, characters, phonemizer, config.add_blank, config.enable_eos_bos_chars
        ),
        new_config,
    )