fix(punctuation): correctly handle initial punctuation

Stripping and restoring initial punctuation didn't work correctly because the
string-splitting caused an additional empty string to be inserted in the text
list (because `".A".split(".")` => `["", "A"]`). Now, an initial empty string is
skipped and relevant test cases are added.

Fixes #3333
This commit is contained in:
Enno Hermann 2023-11-29 22:48:48 +01:00
parent 87974f917a
commit 45200d15c9
2 changed files with 7 additions and 1 deletions

View File

@@ -106,11 +106,14 @@ class Punctuation:
for idx, punc in enumerate(puncs):
split = text.split(punc.punc)
prefix, suffix = split[0], punc.punc.join(split[1:])
text = suffix
if prefix == "":
# We don't want to insert an empty string in case of initial punctuation
continue
splitted_text.append(prefix)
# if the text does not end with a punctuation, add it to the last item
if idx == len(puncs) - 1 and len(suffix) > 0:
splitted_text.append(suffix)
text = suffix
return splitted_text, puncs
@classmethod

View File

@@ -13,6 +13,9 @@ class PunctuationTest(unittest.TestCase):
("This, is my text to be striped from text", "This is my text to be striped from text"),
(".", ""),
(" . ", ""),
("!!! Attention !!!", "Attention"),
("!!! Attention !!! This is just a ... test.", "Attention This is just a test"),
("!!! Attention! This is just a ... test.", "Attention This is just a test"),
]
def test_get_set_puncs(self):