Merge pull request #531 from WeberJulian/french-cleaners

Adding support for french cleaners
This commit is contained in:
Eren Gölge 2020-09-30 15:30:50 +02:00 committed by GitHub
commit 4873601694
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 80 additions and 26 deletions

View File

@ -0,0 +1,61 @@
import re
# English abbreviations as compiled (pattern, expansion) pairs.
# A pattern matches the abbreviation at a word start, case-insensitively, and
# only when it is immediately followed by a literal dot ("Mr." but not "Mr").
abbreviations_en = [
    (re.compile(r'\b%s\.' % short_form, re.IGNORECASE), spoken_form)
    for short_form, spoken_form in (
        ('mrs', 'misess'),
        ('mr', 'mister'),
        ('dr', 'doctor'),
        ('st', 'saint'),
        ('co', 'company'),
        ('jr', 'junior'),
        ('maj', 'major'),
        ('gen', 'general'),
        ('drs', 'doctors'),
        ('rev', 'reverend'),
        ('lt', 'lieutenant'),
        ('hon', 'honorable'),
        ('sgt', 'sergeant'),
        ('capt', 'captain'),
        ('esq', 'esquire'),
        ('ltd', 'limited'),
        ('col', 'colonel'),
        ('ft', 'fort'),
    )
]
# French abbreviations as compiled (pattern, expansion) pairs.
#
# How a pattern is built:
#   * the abbreviation is passed through re.escape, so the dots and dashes in
#     keys such as 'N.B' or 'av. J.-C' are matched literally;
#   * `\b` before and `(?!\w)` after make the abbreviation match only as a
#     whole token -- without the trailing guard, 'M' would rewrite the first
#     letter of every word starting with "m" (e.g. "madame");
#   * the trailing dot is optional and is consumed when present.
#
# Entries are applied longest-first so that a compound abbreviation
# ('av. J.-C') is expanded before a shorter one it contains ('av').
#
# NOTE(review): because the dot is optional, keys that are also ordinary French
# words or unit symbols ('art', 'col', 'min', 'max', 'm', ...) are expanded even
# when written without a dot -- confirm this is the desired trade-off.
abbreviations_fr = [
    (re.compile(r'\b%s(?!\w)\.?' % re.escape(abbr), re.IGNORECASE), expansion)
    for abbr, expansion in sorted(
        [
            ('M', 'monsieur'),
            ('Mlle', 'mademoiselle'),
            ('Mlles', 'mesdemoiselles'),
            ('Mme', 'Madame'),
            ('Mmes', 'Mesdames'),
            ('N.B', 'nota bene'),
            ('p.c.q', 'parce que'),
            ('Pr', 'professeur'),
            ('qqch', 'quelque chose'),
            ('rdv', 'rendez-vous'),
            ('max', 'maximum'),
            ('min', 'minimum'),
            ('no', 'numéro'),
            ('adr', 'adresse'),
            ('dr', 'docteur'),
            ('st', 'saint'),
            ('co', 'compagnie'),
            ('jr', 'junior'),
            ('sgt', 'sergent'),
            ('capt', 'capitaine'),
            ('col', 'colonel'),
            ('av', 'avenue'),
            ('av. J.-C', 'avant Jésus-Christ'),
            ('apr. J.-C', 'après Jésus-Christ'),
            ('art', 'article'),
            ('boul', 'boulevard'),
            ('c.-à-d', "c'est-à-dire"),
            ('etc', 'et cetera'),
            ('ex', 'exemple'),
            ('excl', 'exclusivement'),
        ],
        # sorted() is stable, so entries of equal length keep their listed order.
        key=lambda pair: len(pair[0]),
        reverse=True,
    )
]

View File

@ -13,35 +13,17 @@ hyperparameter. Some cleaners are English-specific. You'll typically want to use
import re import re
from unidecode import unidecode from unidecode import unidecode
from .number_norm import normalize_numbers from .number_norm import normalize_numbers
from .abbreviations import abbreviations_en, abbreviations_fr
# Regular expression matching whitespace: # Regular expression matching whitespace:
_whitespace_re = re.compile(r'\s+') _whitespace_re = re.compile(r'\s+')
# List of (regular expression, replacement) pairs for abbreviations:
_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1])
for x in [
('mrs', 'misess'),
('mr', 'mister'),
('dr', 'doctor'),
('st', 'saint'),
('co', 'company'),
('jr', 'junior'),
('maj', 'major'),
('gen', 'general'),
('drs', 'doctors'),
('rev', 'reverend'),
('lt', 'lieutenant'),
('hon', 'honorable'),
('sgt', 'sergeant'),
('capt', 'captain'),
('esq', 'esquire'),
('ltd', 'limited'),
('col', 'colonel'),
('ft', 'fort'),
]]
def expand_abbreviations(text, lang='en'):
    '''Expand abbreviations in `text` using the table selected by `lang`.

    Every (regex, replacement) pair of the selected table is applied to the
    text in order and the resulting string is returned.

    `lang` must be 'en' or 'fr'. Any other value raises ValueError; without
    this check no table would be bound and the loop below would fail with an
    UnboundLocalError.
    '''
    if lang == 'en':
        _abbreviations = abbreviations_en
    elif lang == 'fr':
        _abbreviations = abbreviations_fr
    else:
        raise ValueError(
            "expand_abbreviations: unsupported lang %r (expected 'en' or 'fr')" % (lang,))
    for regex, replacement in _abbreviations:
        text = re.sub(regex, replacement, text)
    return text
@ -70,9 +52,11 @@ def remove_aux_symbols(text):
def replace_symbols(text, lang='en'): def replace_symbols(text, lang='en'):
text = text.replace(';', ',') text = text.replace(';', ',')
text = text.replace('-', ' ') text = text.replace('-', ' ')
text = text.replace(':', ' ') text = text.replace(':', ',')
if lang == 'en': if lang == 'en':
text = text.replace('&', 'and') text = text.replace('&', ' and ')
elif lang == 'fr':
text = text.replace('&', ' et ')
elif lang == 'pt': elif lang == 'pt':
text = text.replace('&', ' e ') text = text.replace('&', ' e ')
return text return text
@ -118,6 +102,15 @@ def english_cleaners(text):
text = collapse_whitespace(text) text = collapse_whitespace(text)
return text return text
def french_cleaners(text):
    '''Cleaning pipeline for French text.

    Steps, in order: lowercase, expand French abbreviations, replace symbols
    (French variant), remove auxiliary symbols, collapse whitespace.
    Numbers are not expanded here; the phonemizer already does that.
    '''
    stages = (
        lowercase,
        lambda s: expand_abbreviations(s, lang='fr'),
        lambda s: replace_symbols(s, lang='fr'),
        remove_aux_symbols,
        collapse_whitespace,
    )
    for stage in stages:
        text = stage(text)
    return text
def portuguese_cleaners(text): def portuguese_cleaners(text):
'''Basic pipeline for Portuguese text. There is no need to expand abbreviation and '''Basic pipeline for Portuguese text. There is no need to expand abbreviation and
numbers, phonemizer already does that''' numbers, phonemizer already does that'''