add Portuguese Cleaner

This commit is contained in:
Edresson 2020-07-27 17:20:51 -03:00 committed by erogol
parent 93a9cc4683
commit b750452782
1 changed file with 14 additions and 5 deletions

View File

@ -67,15 +67,16 @@ def remove_aux_symbols(text):
text = re.sub(r'[\<\>\(\)\[\]\"]+', '', text)
return text
def replace_symbols(text, lang='en'):
    '''Replace punctuation symbols with plainer, language-appropriate text.

    ';' becomes ',', while '-' and ':' each become a single space. The
    ampersand is spelled out according to `lang`: 'and' for English ('en')
    and ' e ' for Portuguese ('pt'). For any other `lang` value the
    ampersand is left untouched.

    Args:
        text: input string to process.
        lang: language code selecting the ampersand replacement.

    Returns:
        The string with the symbols replaced.
    '''
    text = text.replace(';', ',')
    text = text.replace('-', ' ')
    text = text.replace(':', ' ')
    # The ampersand must only be rewritten inside the language branch;
    # replacing it unconditionally first would leave nothing for 'pt' to match.
    if lang == 'en':
        text = text.replace('&', 'and')
    elif lang == 'pt':
        text = text.replace('&', ' e ')
    return text
def basic_cleaners(text):
'''Basic pipeline that lowercases and collapses whitespace without transliteration.'''
text = lowercase(text)
@ -118,6 +119,14 @@ def english_cleaners(text):
text = collapse_whitespace(text)
return text
def portuguese_cleaners(text):
    '''Basic cleaning pipeline for Portuguese text.

    Steps, in order: lowercase, replace symbols using the Portuguese rules,
    remove auxiliary symbols, collapse whitespace. Abbreviations and numbers
    are not expanded here; per the original author's note, the phonemizer
    already handles that.
    '''
    cleaned = replace_symbols(lowercase(text), lang='pt')
    cleaned = remove_aux_symbols(cleaned)
    return collapse_whitespace(cleaned)
def phoneme_cleaners(text):
'''Pipeline for phonemes mode, including number and abbreviation expansion.'''