mirror of https://github.com/coqui-ai/TTS.git
Implement Punctuation class
This commit is contained in:
parent
1aca58afaf
commit
8d85af84cd
|
@ -0,0 +1,156 @@
|
|||
import collections
|
||||
import re
|
||||
from enum import Enum
|
||||
|
||||
import six
|
||||
|
||||
_DEF_PUNCS = ';:,.!?¡¿—…"«»“”'
|
||||
|
||||
_PUNC_IDX = collections.namedtuple("_punc_index", ["punc", "position"])
|
||||
|
||||
|
||||
class PuncPosition(Enum):
|
||||
"""Enum for the punctuations positions"""
|
||||
|
||||
BEGIN = 0
|
||||
END = 1
|
||||
MIDDLE = 2
|
||||
ALONE = 3
|
||||
|
||||
|
||||
class Punctuation:
|
||||
"""Handle punctuations characters in text.
|
||||
|
||||
Just strip punctuations from text or strip and restore them later.
|
||||
|
||||
Args:
|
||||
puncs (str): The punctuations to be processed. Defaults to `_DEF_PUNCS`.
|
||||
"""
|
||||
|
||||
def __init__(self, puncs: str = _DEF_PUNCS):
|
||||
self.puncs = puncs
|
||||
|
||||
@staticmethod
|
||||
def default_puncs():
|
||||
"""Return default set of punctuations."""
|
||||
return _DEF_PUNCS
|
||||
|
||||
@property
|
||||
def puncs(self):
|
||||
return self._puncs
|
||||
|
||||
@puncs.setter
|
||||
def puncs(self, value):
|
||||
if not isinstance(value, six.string_types):
|
||||
raise ValueError("[!] Punctuations must be of type str.")
|
||||
self._puncs = "".join(set(value))
|
||||
self.puncs_regular_exp = re.compile(fr"(\s*[{re.escape(self._puncs)}]+\s*)+")
|
||||
|
||||
def strip(self, text):
|
||||
"""Remove all the punctuations by replacing with `space`.
|
||||
|
||||
Args:
|
||||
text (str): The text to be processed.
|
||||
|
||||
Example::
|
||||
|
||||
"This is. example !" -> "This is example "
|
||||
"""
|
||||
return re.sub(self.puncs_regular_exp, " ", text).strip()
|
||||
|
||||
def strip_to_restore(self, text):
|
||||
"""Remove punctuations from text to restore them later.
|
||||
|
||||
Args:
|
||||
text (str): The text to be processed.
|
||||
|
||||
Examples ::
|
||||
|
||||
"This is. example !" -> [["This is", "example"], [".", "!"]]
|
||||
|
||||
"""
|
||||
text, puncs = self._strip_to_restore(text)
|
||||
return text, puncs
|
||||
|
||||
def _strip_to_restore(self, text):
|
||||
"""Auxiliary method for Punctuation.preserve()"""
|
||||
matches = list(re.finditer(self.puncs_regular_exp, text))
|
||||
if not matches:
|
||||
return [text], []
|
||||
# the text is only punctuations
|
||||
if len(matches) == 1 and matches[0].group() == text:
|
||||
return [], [_PUNC_IDX(text, PuncPosition.ALONE)]
|
||||
# build a punctuation map to be used later to restore punctuations
|
||||
puncs = []
|
||||
for match in matches:
|
||||
position = PuncPosition.MIDDLE
|
||||
if match == matches[0] and text.startswith(match.group()):
|
||||
position = PuncPosition.BEGIN
|
||||
elif match == matches[-1] and text.endswith(match.group()):
|
||||
position = PuncPosition.END
|
||||
puncs.append(_PUNC_IDX(match.group(), position))
|
||||
# convert str text to a List[str], each item is separated by a punctuation
|
||||
splitted_text = []
|
||||
for punc in puncs:
|
||||
split = text.split(punc.punc)
|
||||
prefix, suffix = split[0], punc.punc.join(split[1:])
|
||||
splitted_text.append(prefix)
|
||||
text = suffix
|
||||
return splitted_text, puncs
|
||||
|
||||
@classmethod
|
||||
def restore(cls, text, puncs):
|
||||
"""Restore punctuation in a text.
|
||||
|
||||
Args:
|
||||
text (str): The text to be processed.
|
||||
puncs (List[str]): The list of punctuations map to be used for restoring.
|
||||
|
||||
Examples ::
|
||||
|
||||
['This is', 'example'], ['.', '!'] -> "This is. example!"
|
||||
|
||||
"""
|
||||
return cls._restore(text, puncs, 0)
|
||||
|
||||
@classmethod
|
||||
def _restore(cls, text, puncs, num):
|
||||
"""Auxiliary method for Punctuation.restore()"""
|
||||
if not puncs:
|
||||
return text
|
||||
|
||||
# nothing have been phonemized, returns the puncs alone
|
||||
if not text:
|
||||
return ["".join(m.mark for m in puncs)]
|
||||
|
||||
current = puncs[0]
|
||||
|
||||
if current.position == PuncPosition.BEGIN:
|
||||
return cls._restore([current.mark + text[0]] + text[1:], puncs[1:], num)
|
||||
|
||||
if current.position == PuncPosition.END:
|
||||
return [text[0] + current.punc] + cls._restore(text[1:], puncs[1:], num + 1)
|
||||
|
||||
if current.position == PuncPosition.ALONE:
|
||||
return [current.mark] + cls._restore(text, puncs[1:], num + 1)
|
||||
|
||||
# POSITION == MIDDLE
|
||||
if len(text) == 1: # pragma: nocover
|
||||
# a corner case where the final part of an intermediate
|
||||
# mark (I) has not been phonemized
|
||||
return cls._restore([text[0] + current.punc], puncs[1:], num)
|
||||
|
||||
return cls._restore([text[0] + current.punc + text[1]] + text[2:], puncs[1:], num)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
punc = Punctuation()
|
||||
text = "This is. This is, example!"
|
||||
|
||||
print(punc.strip(text))
|
||||
|
||||
split_text, puncs = punc.strip_to_restore(text)
|
||||
print(split_text, " ---- ", puncs)
|
||||
|
||||
restored_text = punc.restore(split_text, puncs)
|
||||
print(restored_text)
|
Loading…
Reference in New Issue