fix(punctuation): correctly handle initial punctuation

Stripping and restoring initial punctuation didn't work correctly because the
string-splitting caused an additional empty string to be inserted in the text
list (because `".A".split(".")` => `["", "A"]`). Now, an initial empty string is
skipped and relevant test cases are added.

Fixes #3333
This commit is contained in:
Enno Hermann 2023-11-29 22:48:48 +01:00
parent 87974f917a
commit 45200d15c9
2 changed files with 7 additions and 1 deletion

View File

@@ -106,11 +106,14 @@ class Punctuation:
for idx, punc in enumerate(puncs): for idx, punc in enumerate(puncs):
split = text.split(punc.punc) split = text.split(punc.punc)
prefix, suffix = split[0], punc.punc.join(split[1:]) prefix, suffix = split[0], punc.punc.join(split[1:])
text = suffix
if prefix == "":
# We don't want to insert an empty string in case of initial punctuation
continue
splitted_text.append(prefix) splitted_text.append(prefix)
# if the text does not end with a punctuation, add it to the last item # if the text does not end with a punctuation, add it to the last item
if idx == len(puncs) - 1 and len(suffix) > 0: if idx == len(puncs) - 1 and len(suffix) > 0:
splitted_text.append(suffix) splitted_text.append(suffix)
text = suffix
return splitted_text, puncs return splitted_text, puncs
@classmethod @classmethod

View File

@@ -13,6 +13,9 @@ class PunctuationTest(unittest.TestCase):
("This, is my text to be striped from text", "This is my text to be striped from text"), ("This, is my text to be striped from text", "This is my text to be striped from text"),
(".", ""), (".", ""),
(" . ", ""), (" . ", ""),
("!!! Attention !!!", "Attention"),
("!!! Attention !!! This is just a ... test.", "Attention This is just a test"),
("!!! Attention! This is just a ... test.", "Attention This is just a test"),
] ]
def test_get_set_puncs(self): def test_get_set_puncs(self):