Add Catalan text cleaners for Catalan support (#2295)

This commit is contained in:
Gerard Sant Muniesa 2023-01-23 11:56:30 +01:00 committed by GitHub
parent d83ee8fe45
commit c59b3f75b8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed files with 21 additions and 1 deletions

View File

@ -44,8 +44,25 @@ def remove_aux_symbols(text):
def replace_symbols(text, lang="en"):
"""Replace symbols based on the language tag.
Args:
text:
Input text.
lang:
Language identifier, e.g. "en", "fr", "pt", "ca".
Returns:
The modified text
example:
input args:
text: "si l'avi cau, diguem-ho"
lang: "ca"
Output:
text: "si lavi cau, diguemho"
"""
text = text.replace(";", ",")
text = text.replace("-", " ")
text = text.replace("-", " ") if lang != "ca" else text.replace("-", "")
text = text.replace(":", ",")
if lang == "en":
text = text.replace("&", " and ")
@ -53,6 +70,9 @@ def replace_symbols(text, lang="en"):
text = text.replace("&", " et ")
elif lang == "pt":
text = text.replace("&", " e ")
elif lang == "ca":
text = text.replace("&", " i ")
text = text.replace("'", "")
return text