fix french_cleaners: expand abbreviations before lowercasing

This commit is contained in:
WeberJulian 2021-03-05 19:56:50 +01:00 committed by Eren Gölge
parent b94373afb8
commit 1574d8dd39
2 changed files with 42 additions and 38 deletions

View File

@ -24,7 +24,7 @@ abbreviations_en = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1])
]] ]]
# List of (regular expression, replacement) pairs for abbreviations in french: # List of (regular expression, replacement) pairs for abbreviations in french:
abbreviations_fr = [(re.compile('\\b%s\\.?' % x[0], re.IGNORECASE), x[1]) abbreviations_fr = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1])
for x in [ for x in [
('M', 'monsieur'), ('M', 'monsieur'),
('Mlle', 'mademoiselle'), ('Mlle', 'mademoiselle'),
@ -58,4 +58,10 @@ abbreviations_fr = [(re.compile('\\b%s\\.?' % x[0], re.IGNORECASE), x[1])
('ex', 'exemple'), ('ex', 'exemple'),
('excl', 'exclusivement'), ('excl', 'exclusivement'),
('boul', 'boulevard'), ('boul', 'boulevard'),
]] + [(re.compile('\\b%s' % x[0]), x[1])
for x in [
('Mlle', 'mademoiselle'),
('Mlles', 'mesdemoiselles'),
('Mme', 'Madame'),
('Mmes', 'Mesdames'),
]] ]]

View File

@ -108,8 +108,8 @@ def english_cleaners(text):
def french_cleaners(text): def french_cleaners(text):
'''Pipeline for French text. There is no need to expand numbers, phonemizer already does that''' '''Pipeline for French text. There is no need to expand numbers, phonemizer already does that'''
text = lowercase(text)
text = expand_abbreviations(text, lang='fr') text = expand_abbreviations(text, lang='fr')
text = lowercase(text)
text = replace_symbols(text, lang='fr') text = replace_symbols(text, lang='fr')
text = remove_aux_symbols(text) text = remove_aux_symbols(text)
text = collapse_whitespace(text) text = collapse_whitespace(text)
@ -129,8 +129,6 @@ def chinese_mandarin_cleaners(text: str) -> str:
text = replace_numbers_to_characters_in_text(text) text = replace_numbers_to_characters_in_text(text)
return text return text
def phoneme_cleaners(text): def phoneme_cleaners(text):
'''Pipeline for phonemes mode, including number and abbreviation expansion.''' '''Pipeline for phonemes mode, including number and abbreviation expansion.'''
text = expand_numbers(text) text = expand_numbers(text)