Merge pull request #531 from WeberJulian/french-cleaners

Adding support for French cleaners
This commit is contained in:
Eren Gölge 2020-09-30 15:30:50 +02:00 committed by GitHub
commit 4873601694
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 80 additions and 26 deletions

View File

@ -0,0 +1,61 @@
import re
# English abbreviation table: (compiled pattern, spoken form) pairs.
# Every pattern is case-insensitive, anchored on a leading word boundary and
# requires the abbreviation's trailing period, which it consumes.
abbreviations_en = [
    (re.compile(r'\b{}\.'.format(short), re.IGNORECASE), spoken)
    for short, spoken in (
        ('mrs', 'misess'),
        ('mr', 'mister'),
        ('dr', 'doctor'),
        ('st', 'saint'),
        ('co', 'company'),
        ('jr', 'junior'),
        ('maj', 'major'),
        ('gen', 'general'),
        ('drs', 'doctors'),
        ('rev', 'reverend'),
        ('lt', 'lieutenant'),
        ('hon', 'honorable'),
        ('sgt', 'sergeant'),
        ('capt', 'captain'),
        ('esq', 'esquire'),
        ('ltd', 'limited'),
        ('col', 'colonel'),
        ('ft', 'fort'),
    )
]
# List of (regular expression, replacement) pairs for abbreviations in French.
#
# Every abbreviation goes through re.escape so the literal dots in entries
# such as 'N.B' or 'c.-à-d' are not interpreted as regex wildcards.
#
# The first group holds titles and shorthands that are commonly written
# without a final period and are not French words themselves: they match with
# or without the period, but must end on a word boundary so that a longer word
# sharing the same prefix is never rewritten.
#
# Every other entry requires its trailing period (like the English table);
# otherwise ordinary words such as "maison", "nous" or "comment" would have
# their first letters replaced.
#
# Pairs are applied in list order, so 'av. J.-C' must stay ahead of 'av'.
abbreviations_fr = [
    (re.compile(r'\b%s(?:\.|\b)' % re.escape(abbr), re.IGNORECASE), expansion)
    for abbr, expansion in [
        ('Mlles', 'mesdemoiselles'),
        ('Mlle', 'mademoiselle'),
        ('Mmes', 'Mesdames'),
        ('Mme', 'Madame'),
        ('Pr', 'professeur'),
        ('dr', 'docteur'),
        ('qqch', 'quelque chose'),
        ('rdv', 'rendez-vous'),
    ]
] + [
    (re.compile(r'\b%s\.' % re.escape(abbr), re.IGNORECASE), expansion)
    for abbr, expansion in [
        ('M', 'monsieur'),
        ('N.B', 'nota bene'),
        ('p.c.q', 'parce que'),
        ('max', 'maximum'),
        ('min', 'minimum'),
        ('no', 'numéro'),
        ('adr', 'adresse'),
        ('st', 'saint'),
        ('co', 'compagnie'),
        ('jr', 'junior'),
        ('sgt', 'sergent'),
        ('capt', 'capitaine'),
        ('col', 'colonel'),
        ('av. J.-C', 'avant Jésus-Christ'),
        ('apr. J.-C', 'après Jésus-Christ'),
        ('av', 'avenue'),
        ('art', 'article'),
        ('boul', 'boulevard'),
        ('c.-à-d', "c'est-à-dire"),
        ('etc', 'et cetera'),
        ('ex', 'exemple'),
        ('excl', 'exclusivement'),
    ]
]

View File

@ -13,35 +13,17 @@ hyperparameter. Some cleaners are English-specific. You'll typically want to use
import re
from unidecode import unidecode
from .number_norm import normalize_numbers
from .abbreviations import abbreviations_en, abbreviations_fr
# Regular expression matching a run of one or more whitespace characters:
_whitespace_re = re.compile(r'\s+')
# (compiled pattern, expansion) pairs for English abbreviations. Each pattern
# ignores case, starts on a word boundary and needs the trailing period.
_abbreviations = [
    (re.compile(r'\b' + abbr + r'\.', re.IGNORECASE), expansion)
    for abbr, expansion in (
        ('mrs', 'misess'),
        ('mr', 'mister'),
        ('dr', 'doctor'),
        ('st', 'saint'),
        ('co', 'company'),
        ('jr', 'junior'),
        ('maj', 'major'),
        ('gen', 'general'),
        ('drs', 'doctors'),
        ('rev', 'reverend'),
        ('lt', 'lieutenant'),
        ('hon', 'honorable'),
        ('sgt', 'sergeant'),
        ('capt', 'captain'),
        ('esq', 'esquire'),
        ('ltd', 'limited'),
        ('col', 'colonel'),
        ('ft', 'fort'),
    )
]
def expand_abbreviations(text, lang='en'):
    '''Replace known abbreviations in `text` with their expansions.

    Args:
        text: string to process.
        lang: 'en' selects `abbreviations_en`, 'fr' selects
            `abbreviations_fr`. Defaults to 'en'.

    Returns:
        `text` after every (pattern, replacement) pair of the selected table
        has been applied, in table order.

    Raises:
        ValueError: if `lang` is neither 'en' nor 'fr'.
    '''
    if lang == 'en':
        table = abbreviations_en
    elif lang == 'fr':
        table = abbreviations_fr
    else:
        # Fail with a clear message instead of an UnboundLocalError further down.
        raise ValueError(
            "expand_abbreviations: unsupported language %r (expected 'en' or 'fr')" % (lang,))
    for regex, replacement in table:
        text = regex.sub(replacement, text)
    return text
@ -70,9 +52,11 @@ def remove_aux_symbols(text):
def replace_symbols(text, lang='en'):
    '''Replace punctuation symbols with a comma, a space or a spoken word.

    ';' and ':' become ',', '-' becomes a space, and '&' becomes the
    language's word for "and", padded with spaces so it does not fuse with
    the neighbouring words.

    Args:
        text: string to process.
        lang: 'en', 'fr' or 'pt'. For any other value '&' is left untouched.

    Returns:
        The string with the symbols replaced.
    '''
    text = text.replace(';', ',')
    text = text.replace('-', ' ')
    text = text.replace(':', ',')
    # Spoken form of '&' per language; languages not listed keep the symbol.
    conjunction = {'en': ' and ', 'fr': ' et ', 'pt': ' e '}.get(lang)
    if conjunction is not None:
        text = text.replace('&', conjunction)
    return text
@ -118,6 +102,15 @@ def english_cleaners(text):
text = collapse_whitespace(text)
return text
def french_cleaners(text):
    '''Cleaning pipeline for French text.

    Steps, in order: lowercase, expand French abbreviations, replace symbols
    using the French rules, remove auxiliary symbols, collapse whitespace.
    Numbers are not expanded here because the phonemizer already does that.
    '''
    lowered = lowercase(text)
    expanded = expand_abbreviations(lowered, lang='fr')
    with_words = replace_symbols(expanded, lang='fr')
    without_aux = remove_aux_symbols(with_words)
    return collapse_whitespace(without_aux)
def portuguese_cleaners(text):
'''Basic pipeline for Portuguese text. There is no need to expand abbreviation and
numbers, phonemizer already does that'''