Implement TTSTokenizer

2021-11-16 13:27:42 +01:00 · 2021-11-16 13:27:42 +01:00 · 0344645e90
parent 2fb1f70503
commit 0344645e90
1 changed files with 120 additions and 0 deletions
--- a/TTS/tts/utils/text/tokenizer.py
+++ b/TTS/tts/utils/text/tokenizer.py
@ -0,0 +1,120 @@
 from typing import Callable, Dict, List, Union
 from TTS.tts.utils.text import cleaners
 from TTS.tts.utils.text.phonemizers import DEF_LANG_TO_PHONEMIZER, get_phonemizer_by_name
 from TTS.tts.utils.text.symbols import Graphemes, IPAPhonemes
 class TTSTokenizer:
    """🐸TTS tokenizer to convert input characters to token IDs and back.

    Args:
        use_phonemes (bool):
            Whether to use phonemes instead of characters. Defaults to False.

        text_cleaner (callable):
            A function to pre-process the text before tokenization and phonemization. When None, the text is
            passed through unchanged. Defaults to None.

        characters (BaseCharacters):
            An object providing the character-to-ID and ID-to-character mappings (``char_to_id``, ``id_to_char``)
            and the special symbols (``pad``, ``blank_char``, ``bos``, ``eos``). Defaults to None.

        phonemizer (Phonemizer):
            A phonemizer object or a dict that maps language codes to phonemizer objects. Defaults to None.

        add_blank (bool):
            Whether to put the blank character between every token and at both ends of the sequence.
            Defaults to False.

        use_eos_bos (bool):
            Whether to wrap the sequence with the BOS and EOS characters. Defaults to False.
    """

    def __init__(
        self,
        use_phonemes=False,
        text_cleaner: Callable = None,
        characters: "BaseCharacters" = None,
        phonemizer: Union["Phonemizer", Dict] = None,
        add_blank: bool = False,
        use_eos_bos=False,
    ):
        # Fall back to an identity function so `text_to_ids` can call the cleaner unconditionally.
        self.text_cleaner = text_cleaner or (lambda x: x)
        self.use_phonemes = use_phonemes
        self.add_blank = add_blank
        self.use_eos_bos = use_eos_bos
        self.characters = characters
        self.phonemizer = phonemizer

    def encode(self, text: str) -> List[int]:
        """Encodes a string (or any sequence of characters) as a sequence of IDs."""
        return [self.characters.char_to_id(char) for char in text]

    def decode(self, token_ids: List[int]) -> str:
        """Decodes a sequence of IDs to a string of text."""
        return "".join(self.characters.id_to_char(token_id) for token_id in token_ids)

    def text_to_ids(self, text: str, language: str = None) -> List[int]:
        """Converts a string of text to a sequence of token IDs.

        Args:
            text(str):
                The text to convert to token IDs.

            language(str):
                The language code of the text. Currently not used. Defaults to None.

        Raises:
            ValueError: If ``use_phonemes`` is True but no phonemizer was provided.

        1. Text normalization
        2. Phonemization (if use_phonemes is True)
        3. Add blank char between characters
        4. Add BOS and EOS characters
        5. Text to token IDs
        """
        # TODO: text cleaner should pick the right routine based on the language
        text = self.text_cleaner(text)
        if self.use_phonemes:
            if self.phonemizer is None:
                raise ValueError(" [!] `use_phonemes` is True but no phonemizer is set.")
            text = self.phonemizer.phonemize(text, separator="")
        # From here on `text` may be a list of characters instead of a str; `encode` accepts both.
        if self.add_blank:
            text = self.intersperse_blank_char(text, True)
        if self.use_eos_bos:
            text = self.pad_with_bos_eos(text)
        return self.encode(text)

    def ids_to_text(self, id_sequence: List[int]) -> str:
        """Converts a sequence of token IDs to a string of text."""
        return self.decode(id_sequence)

    def pad_with_bos_eos(self, char_sequence: List[str]):
        """Pads a sequence with the special BOS and EOS characters."""
        return [self.characters.bos, *char_sequence, self.characters.eos]

    def intersperse_blank_char(self, char_sequence: List[str], use_blank_char: bool = False):
        """Intersperses a filler character between the items of a sequence and at both ends.

        Args:
            char_sequence (List[str]):
                The characters to intersperse. A plain string works as well.

            use_blank_char (bool):
                If True the filler is ``characters.blank_char``, otherwise ``characters.pad``. Defaults to False.

        Returns:
            A list of length ``2 * len(char_sequence) + 1`` with the input items at the odd positions.
        """
        char_to_use = self.characters.blank_char if use_blank_char else self.characters.pad
        result = [char_to_use] * (len(char_sequence) * 2 + 1)
        result[1::2] = char_sequence
        return result

    def print_logs(self, level: int = 1):
        """Prints the tokenizer settings, indented by ``level`` tabs.

        When a phonemizer is set, its own ``print_logs`` is called with ``level + 1``.
        """
        indent = "\t" * level
        print(f"{indent}| > add_blank: {self.add_blank}")
        print(f"{indent}| > use_eos_bos: {self.use_eos_bos}")
        print(f"{indent}| > use_phonemes: {self.use_phonemes}")
        if self.phonemizer is not None:
            # The phonemizer prints its own lines, so call it instead of formatting its return value.
            print(f"{indent}| > phonemizer:")
            self.phonemizer.print_logs(level + 1)

    @staticmethod
    def init_from_config(config: "Coqpit"):
        """Init Tokenizer object from the config.

        Args:
            config (Coqpit): Coqpit model config.
        """
        # Bind both optional components up front so every config path reaches the constructor call.
        text_cleaner = None
        phonemizer = None
        if isinstance(config.text_cleaner, (str, list)):
            # NOTE(review): `getattr` needs a string name, so a list value raises TypeError here.
            text_cleaner = getattr(cleaners, config.text_cleaner)
        if config.use_phonemes:
            characters = IPAPhonemes().init_from_config(config)
            phonemizer_kwargs = {"language": config.phoneme_language}
            phonemizer = get_phonemizer_by_name(DEF_LANG_TO_PHONEMIZER[config.phoneme_language], **phonemizer_kwargs)
        else:
            characters = Graphemes().init_from_config(config)
        return TTSTokenizer(
            config.use_phonemes, text_cleaner, characters, phonemizer, config.add_blank, config.enable_eos_bos_chars
        )