mirror of https://github.com/coqui-ai/TTS.git
Merge pull request #531 from WeberJulian/french-cleaners
Adding support for french cleaners
This commit is contained in:
commit
4873601694
|
@ -0,0 +1,61 @@
|
||||||
|
import re
|
||||||
|
|
||||||
|
# List of (regular expression, replacement) pairs for abbreviations in English:
|
||||||
|
# Each pattern matches the abbreviation at the start of a word, followed by a
# literal period, ignoring case.
abbreviations_en = [
    (re.compile(r'\b%s\.' % abbreviation, re.IGNORECASE), expansion)
    for abbreviation, expansion in (
        ('mrs', 'misess'),
        ('mr', 'mister'),
        ('dr', 'doctor'),
        ('st', 'saint'),
        ('co', 'company'),
        ('jr', 'junior'),
        ('maj', 'major'),
        ('gen', 'general'),
        ('drs', 'doctors'),
        ('rev', 'reverend'),
        ('lt', 'lieutenant'),
        ('hon', 'honorable'),
        ('sgt', 'sergeant'),
        ('capt', 'captain'),
        ('esq', 'esquire'),
        ('ltd', 'limited'),
        ('col', 'colonel'),
        ('ft', 'fort'),
    )
]
|
||||||
|
|
||||||
|
# List of (regular expression, replacement) pairs for abbreviations in French:
|
||||||
|
# French abbreviations are split into two groups so that ordinary words are
# never rewritten.
#
# Group 1: tokens that cannot be mistaken for a French word. The trailing
# period is optional, but the abbreviation must end at a word boundary, so
# "mme dupont" expands while "mmes" is left to its own entry. Multi-part forms
# ("av. J.-C") live here and are tried before the short "av" of group 2.
_fr_period_optional = [
    ('av. J.-C', 'avant jésus-christ'),
    ('apr. J.-C', 'après jésus-christ'),
    ('c.-à-d', 'c’est-à-dire'),
    ('p.c.q', 'parce que'),
    ('N.B', 'nota bene'),
    ('Mlles', 'mesdemoiselles'),
    ('Mlle', 'mademoiselle'),
    ('Mmes', 'mesdames'),
    ('Mme', 'madame'),
    ('Pr', 'professeur'),
    ('dr', 'docteur'),
    ('st', 'saint'),
    ('jr', 'junior'),
    ('sgt', 'sergent'),
    ('capt', 'capitaine'),
    ('qqch', 'quelque chose'),
    ('rdv', 'rendez-vous'),
    ('adr', 'adresse'),
    ('boul', 'boulevard'),
    ('etc', 'et cetera'),
    ('excl', 'exclusivement'),
]

# Group 2: tokens that are also ordinary words, prefixes or elided letters
# ("art", "col", "ex-", "co-", "m'a"). They expand only when followed by a
# period.
_fr_period_required = [
    ('M', 'monsieur'),
    ('no', 'numéro'),
    ('max', 'maximum'),
    ('min', 'minimum'),
    ('co', 'compagnie'),
    ('col', 'colonel'),
    ('av', 'avenue'),
    ('art', 'article'),
    ('ex', 'exemple'),
]

# List of (compiled pattern, expansion) pairs, matched case-insensitively.
# Abbreviations go through re.escape so that their dots and hyphens are
# literal. Expansions are lowercase because french_cleaners lowercases the
# text before expanding abbreviations.
abbreviations_fr = [
    (re.compile(r'\b%s\b\.?' % re.escape(abbreviation), re.IGNORECASE), expansion)
    for abbreviation, expansion in _fr_period_optional
] + [
    (re.compile(r'\b%s\.' % re.escape(abbreviation), re.IGNORECASE), expansion)
    for abbreviation, expansion in _fr_period_required
]
|
|
@ -13,35 +13,17 @@ hyperparameter. Some cleaners are English-specific. You'll typically want to use
|
||||||
import re
|
import re
|
||||||
from unidecode import unidecode
|
from unidecode import unidecode
|
||||||
from .number_norm import normalize_numbers
|
from .number_norm import normalize_numbers
|
||||||
|
from .abbreviations import abbreviations_en, abbreviations_fr
|
||||||
|
|
||||||
# Regular expression matching whitespace:
|
# Regular expression matching whitespace:
|
||||||
_whitespace_re = re.compile(r'\s+')
|
_whitespace_re = re.compile(r'\s+')
|
||||||
|
|
||||||
# List of (regular expression, replacement) pairs for abbreviations:
|
|
||||||
# Each pattern matches the abbreviation at the start of a word, followed by a
# literal period, ignoring case.
_abbreviations = [
    (re.compile(r'\b' + short_form + r'\.', re.IGNORECASE), long_form)
    for short_form, long_form in (
        ('mrs', 'misess'),
        ('mr', 'mister'),
        ('dr', 'doctor'),
        ('st', 'saint'),
        ('co', 'company'),
        ('jr', 'junior'),
        ('maj', 'major'),
        ('gen', 'general'),
        ('drs', 'doctors'),
        ('rev', 'reverend'),
        ('lt', 'lieutenant'),
        ('hon', 'honorable'),
        ('sgt', 'sergeant'),
        ('capt', 'captain'),
        ('esq', 'esquire'),
        ('ltd', 'limited'),
        ('col', 'colonel'),
        ('ft', 'fort'),
    )
]
|
|
||||||
|
|
||||||
|
def expand_abbreviations(text, lang='en'):
    '''Replace the known abbreviations of ``lang`` in ``text`` by their expansion.

    Args:
        text: String to normalise.
        lang: Language code selecting the abbreviation table. 'en' selects
            ``abbreviations_en`` and 'fr' selects ``abbreviations_fr``.

    Returns:
        ``text`` with every match of the selected table substituted, applied in
        table order. For any other ``lang`` the text is returned unchanged.
    '''
    if lang == 'en':
        table = abbreviations_en
    elif lang == 'fr':
        table = abbreviations_fr
    else:
        # No table for this language. Falling through to the loop would raise
        # UnboundLocalError because no table was ever assigned.
        return text
    for regex, replacement in table:
        text = regex.sub(replacement, text)
    return text
|
||||||
|
@ -70,9 +52,11 @@ def remove_aux_symbols(text):
|
||||||
def replace_symbols(text, lang='en'):
    '''Swap punctuation symbols for plain separators or words.

    ';' and ':' become ',', '-' becomes a space, and '&' becomes the word for
    "and" in ``lang`` ('en', 'fr' or 'pt'), padded with spaces. For any other
    ``lang`` the '&' is left untouched.
    '''
    for symbol, substitute in ((';', ','), ('-', ' '), (':', ',')):
        text = text.replace(symbol, substitute)

    ampersand_words = {'en': ' and ', 'fr': ' et ', 'pt': ' e '}
    if lang in ampersand_words:
        text = text.replace('&', ampersand_words[lang])
    return text
|
||||||
|
@ -118,6 +102,15 @@ def english_cleaners(text):
|
||||||
text = collapse_whitespace(text)
|
text = collapse_whitespace(text)
|
||||||
return text
|
return text
|
||||||
|
|
||||||
|
def french_cleaners(text):
    '''Pipeline for French text. Numbers are not expanded here because the
    phonemizer already does that.'''
    lowered = lowercase(text)
    expanded = expand_abbreviations(lowered, lang='fr')
    without_symbols = replace_symbols(expanded, lang='fr')
    without_aux = remove_aux_symbols(without_symbols)
    return collapse_whitespace(without_aux)
|
||||||
|
|
||||||
def portuguese_cleaners(text):
|
def portuguese_cleaners(text):
|
||||||
'''Basic pipeline for Portuguese text. There is no need to expand abbreviation and
|
'''Basic pipeline for Portuguese text. There is no need to expand abbreviation and
|
||||||
numbers, phonemizer already does that'''
|
numbers, phonemizer already does that'''
|
||||||
|
|
Loading…
Reference in New Issue