mirror of https://github.com/coqui-ai/TTS.git
Implement BaseCharacters, IPAPhonemes, Graphemes
This commit is contained in:
parent
1bee40af40
commit
2fb1f70503
|
@ -7,21 +7,33 @@ through Unidecode. For other data, you can modify _characters. See TRAINING_DATA
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
def parse_symbols():
    """Bundle the module-level default symbol definitions into one dictionary.

    Returns:
        dict: The values of `_pad`, `_eos`, `_bos`, `_characters`, `_punctuations`
            and `_phonemes`, stored under the keys "pad", "eos", "bos",
            "characters", "punctuations" and "phonemes" (in that order).
    """
    names = ("pad", "eos", "bos", "characters", "punctuations", "phonemes")
    values = (_pad, _eos, _bos, _characters, _punctuations, _phonemes)
    return dict(zip(names, values))
|
||||||
|
|
||||||
|
|
||||||
def make_symbols(
|
def make_symbols(
|
||||||
characters,
|
characters,
|
||||||
phonemes=None,
|
phonemes=None,
|
||||||
punctuations="!'(),-.:;? ",
|
punctuations="!'(),-.:;? ",
|
||||||
pad="_",
|
pad="<PAD>",
|
||||||
eos="~",
|
eos="<EOS>",
|
||||||
bos="^",
|
bos="<BOS>",
|
||||||
|
blank="<BLNK>",
|
||||||
unique=True,
|
unique=True,
|
||||||
): # pylint: disable=redefined-outer-name
|
): # pylint: disable=redefined-outer-name
|
||||||
"""Function to create symbols and phonemes
|
"""Function to create default characters and phonemes"""
|
||||||
TODO: create phonemes_to_id and symbols_to_id dicts here."""
|
|
||||||
_symbols = list(characters)
|
_symbols = list(characters)
|
||||||
_symbols = [bos] + _symbols if len(bos) > 0 and bos is not None else _symbols
|
_symbols = [bos] + _symbols if len(bos) > 0 and bos is not None else _symbols
|
||||||
_symbols = [eos] + _symbols if len(bos) > 0 and eos is not None else _symbols
|
_symbols = [eos] + _symbols if len(bos) > 0 and eos is not None else _symbols
|
||||||
_symbols = [pad] + _symbols if len(bos) > 0 and pad is not None else _symbols
|
_symbols = [pad] + _symbols if len(bos) > 0 and pad is not None else _symbols
|
||||||
|
_symbols = [blank] + _symbols if len(bos) > 0 and blank is not None else _symbols
|
||||||
_phonemes = None
|
_phonemes = None
|
||||||
if phonemes is not None:
|
if phonemes is not None:
|
||||||
_phonemes_sorted = (
|
_phonemes_sorted = (
|
||||||
|
@ -35,9 +47,10 @@ def make_symbols(
|
||||||
return _symbols, _phonemes
|
return _symbols, _phonemes
|
||||||
|
|
||||||
|
|
||||||
_pad = "_"
|
_pad = "<PAD>"
|
||||||
_eos = "~"
|
_eos = "<EOS>"
|
||||||
_bos = "^"
|
_bos = "<BOS>"
|
||||||
|
_blank = "<BLNK>" # TODO: check if we need this alongside with PAD
|
||||||
_characters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),-.:;? "
|
_characters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),-.:;? "
|
||||||
_punctuations = "!'(),-.:;? "
|
_punctuations = "!'(),-.:;? "
|
||||||
|
|
||||||
|
@ -52,24 +65,252 @@ _phonemes = _vowels + _non_pulmonic_consonants + _pulmonic_consonants + _suprase
|
||||||
|
|
||||||
symbols, phonemes = make_symbols(_characters, _phonemes, _punctuations, _pad, _eos, _bos)
|
symbols, phonemes = make_symbols(_characters, _phonemes, _punctuations, _pad, _eos, _bos)
|
||||||
|
|
||||||
# Generate ALIEN language
|
|
||||||
# from random import shuffle
|
class BaseCharacters:
    """🐸BaseCharacters class

    Every vocabulary class should inherit from this class.

    The vocabulary is laid out as ``[pad, eos, bos] + characters + punctuations``.
    Special tokens that are ``None`` or empty are left out, and a punctuation mark
    that is already part of the vocabulary is not appended a second time. The
    vocabulary and both lookup tables are rebuilt whenever one of the properties
    below is assigned.

    Args:
        characters (str):
            Main set of characters to be used in the vocabulary.

        punctuations (str):
            Characters to be treated as punctuation.

        pad (str):
            Special padding character that would be ignored by the model.

        eos (str):
            End of the sentence character.

        bos (str):
            Beginning of the sentence character.

        blank (str):
            Optional character used between characters by some models for better prosody.
            It is stored on the instance but is not inserted into the vocabulary.

        is_unique (bool):
            Remove duplicates from the provided characters. Defaults to True.

        is_sorted (bool):
            Sort the characters in alphabetical order. Defaults to True.

    Raises:
        ValueError: If the resulting vocabulary contains the same entry more than once,
            because the character <-> id mapping would then be ambiguous.
    """

    def __init__(
        self,
        characters: str,
        punctuations: str,
        pad: str,
        eos: str,
        bos: str,
        blank: str,
        is_unique: bool = True,
        is_sorted: bool = True,
    ) -> None:
        self._characters = characters
        self._punctuations = punctuations
        self._pad = pad
        self._eos = eos
        self._bos = bos
        self._blank = blank
        self.is_unique = is_unique
        self.is_sorted = is_sorted
        self._create_vocab()

    @property
    def characters(self):
        return self._characters

    @characters.setter
    def characters(self, characters):
        self._characters = characters
        self._create_vocab()

    @property
    def punctuations(self):
        return self._punctuations

    @punctuations.setter
    def punctuations(self, punctuations):
        self._punctuations = punctuations
        self._create_vocab()

    @property
    def pad(self):
        return self._pad

    @pad.setter
    def pad(self, pad):
        self._pad = pad
        self._create_vocab()

    @property
    def eos(self):
        return self._eos

    @eos.setter
    def eos(self, eos):
        self._eos = eos
        self._create_vocab()

    @property
    def bos(self):
        return self._bos

    @bos.setter
    def bos(self, bos):
        self._bos = bos
        self._create_vocab()

    @property
    def blank(self):
        return self._blank

    @blank.setter
    def blank(self, blank):
        self._blank = blank
        self._create_vocab()

    @property
    def vocab(self):
        return self._vocab

    @property
    def num_chars(self):
        return len(self._vocab)

    def _create_vocab(self) -> None:
        """Rebuild ``_vocab``, ``_char_to_id`` and ``_id_to_char`` from the current settings."""
        chars = list(self._characters or "")
        if self.is_unique:
            # dict.fromkeys keeps first-seen order, so ids stay reproducible across
            # runs even when is_sorted is False (set() order depends on string hashing).
            chars = list(dict.fromkeys(chars))
        if self.is_sorted:
            chars = sorted(chars)

        # Each special token is judged on its own value; None and "" are both skipped.
        specials = [token for token in (self._pad, self._eos, self._bos) if token]
        vocab = specials + chars

        # Punctuation may already be listed among the characters; add only what is missing.
        seen = set(vocab)
        for mark in self._punctuations or "":
            if mark not in seen:
                seen.add(mark)
                vocab.append(mark)

        char_to_id = {char: idx for idx, char in enumerate(vocab)}
        if len(char_to_id) != len(vocab):
            duplicates = sorted({entry for entry in vocab if vocab.count(entry) > 1})
            raise ValueError(f"Duplicate entries in the vocabulary: {duplicates}")

        self._vocab = vocab
        self._char_to_id = char_to_id
        self._id_to_char = dict(enumerate(vocab))

    def char_to_id(self, char: str) -> int:
        """Return the integer id of ``char``; raises ``KeyError`` if it is not in the vocabulary."""
        return self._char_to_id[char]

    def id_to_char(self, idx: int) -> str:
        """Return the vocabulary entry stored at ``idx``; raises ``KeyError`` for an unknown id."""
        return self._id_to_char[idx]

    @staticmethod
    def init_from_config(config: "Coqpit"):
        """Build an instance from ``config.characters``, unpacked as keyword arguments.

        When ``config.characters`` is ``None`` no arguments are passed.
        """
        kwargs = config.characters if config.characters is not None else {}
        return BaseCharacters(**kwargs)
|
||||||
|
|
||||||
|
|
||||||
def parse_symbols():
|
class IPAPhonemes(BaseCharacters):
    """🐸IPAPhonemes class to manage `TTS.tts` model vocabulary

    Intended to be used with models using IPAPhonemes as input.
    It uses system defaults for the undefined class arguments.

    Args:
        characters (str):
            Main set of case-sensitive characters to be used in the vocabulary. Defaults to `_phonemes`.

        punctuations (str):
            Characters to be treated as punctuation. Defaults to `_punctuations`.

        pad (str):
            Special padding character that would be ignored by the model. Defaults to `_pad`.

        eos (str):
            End of the sentence character. Defaults to `_eos`.

        bos (str):
            Beginning of the sentence character. Defaults to `_bos`.

        is_unique (bool):
            Remove duplicates from the provided characters. Defaults to True.

        is_sorted (bool):
            Sort the characters in alphabetical order. Defaults to True.

        blank (str):
            Optional character used between characters by some models for better prosody.
            Defaults to `_blank`. Declared last so existing positional calls keep their meaning.
    """

    def __init__(
        self,
        characters: str = _phonemes,
        punctuations: str = _punctuations,
        pad: str = _pad,
        eos: str = _eos,
        bos: str = _bos,
        is_unique: bool = True,
        is_sorted: bool = True,
        blank: str = _blank,
    ) -> None:
        # Keyword arguments keep every value in its intended slot: the base class
        # takes `blank` between `bos` and `is_unique`, so a positional call without
        # it would shift `is_unique` into `blank` and `is_sorted` into `is_unique`.
        super().__init__(
            characters=characters,
            punctuations=punctuations,
            pad=pad,
            eos=eos,
            bos=bos,
            blank=blank,
            is_unique=is_unique,
            is_sorted=is_sorted,
        )

    @staticmethod
    def init_from_config(config: "Coqpit"):
        """Build an instance from ``config.characters``, unpacked as keyword arguments.

        When ``config.characters`` is ``None`` the class defaults are used.
        """
        kwargs = config.characters if config.characters is not None else {}
        return IPAPhonemes(**kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
class Graphemes(BaseCharacters):
    """🐸Graphemes class to manage `TTS.tts` model vocabulary

    Intended to be used with models using graphemes as input.
    It uses system defaults for the undefined class arguments.

    Args:
        characters (str):
            Main set of case-sensitive characters to be used in the vocabulary. Defaults to `_characters`.

        punctuations (str):
            Characters to be treated as punctuation. Defaults to `_punctuations`.

        pad (str):
            Special padding character that would be ignored by the model. Defaults to `_pad`.

        eos (str):
            End of the sentence character. Defaults to `_eos`.

        bos (str):
            Beginning of the sentence character. Defaults to `_bos`.

        is_unique (bool):
            Remove duplicates from the provided characters. Defaults to True.

        is_sorted (bool):
            Sort the characters in alphabetical order. Defaults to True.

        blank (str):
            Optional character used between characters by some models for better prosody.
            Defaults to `_blank`. Declared last so existing positional calls keep their meaning.
    """

    def __init__(
        self,
        characters: str = _characters,
        punctuations: str = _punctuations,
        pad: str = _pad,
        eos: str = _eos,
        bos: str = _bos,
        is_unique: bool = True,
        is_sorted: bool = True,
        blank: str = _blank,
    ) -> None:
        # Keyword arguments keep every value in its intended slot: the base class
        # takes `blank` between `bos` and `is_unique`, so a positional call without
        # it would shift `is_unique` into `blank` and `is_sorted` into `is_unique`.
        super().__init__(
            characters=characters,
            punctuations=punctuations,
            pad=pad,
            eos=eos,
            bos=bos,
            blank=blank,
            is_unique=is_unique,
            is_sorted=is_sorted,
        )

    @staticmethod
    def init_from_config(config: "Coqpit"):
        """Build an instance from ``config.characters``, unpacked as keyword arguments.

        When ``config.characters`` is ``None`` the class defaults are used.
        """
        kwargs = config.characters if config.characters is not None else {}
        return Graphemes(**kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    # Smoke check: build both default vocabularies and show their contents.
    graphemes = Graphemes()
    ipa_phonemes = IPAPhonemes()

    for table in (graphemes, ipa_phonemes):
        print(table.vocab)

    print(graphemes.num_chars)
    # Looking a character up by its id must give the same character back.
    round_trip = graphemes.id_to_char(graphemes.char_to_id("a"))
    assert "a" == round_trip
|
||||||
|
|
Loading…
Reference in New Issue