Merge pull request #531 from WeberJulian/french-cleaners

Adding support for french cleaners
2020-09-30 15:30:50 +02:00 · 2020-09-30 15:30:50 +02:00 · 4873601694
parent 592bb668fd ea7c2e15c0
commit 4873601694
2 changed files with 80 additions and 26 deletions
--- a/TTS/tts/utils/text/abbreviations.py
+++ b/TTS/tts/utils/text/abbreviations.py
@ -0,0 +1,61 @@
 import re
 # English abbreviations as (compiled regular expression, replacement) pairs.
 # Every pattern matches the abbreviation at a word boundary, followed by a
 # literal period, without regard to case.
 abbreviations_en = [
     (re.compile('\\b%s\\.' % short_form, re.IGNORECASE), spoken_form)
     for short_form, spoken_form in (
         ('mrs', 'misess'),
         ('mr', 'mister'),
         ('dr', 'doctor'),
         ('st', 'saint'),
         ('co', 'company'),
         ('jr', 'junior'),
         ('maj', 'major'),
         ('gen', 'general'),
         ('drs', 'doctors'),
         ('rev', 'reverend'),
         ('lt', 'lieutenant'),
         ('hon', 'honorable'),
         ('sgt', 'sergeant'),
         ('capt', 'captain'),
         ('esq', 'esquire'),
         ('ltd', 'limited'),
         ('col', 'colonel'),
         ('ft', 'fort'),
     )
 ]
 # French abbreviations as (compiled regular expression, replacement) pairs.
 #
 # Every pattern is case-insensitive, anchored at a word boundary, built from the
 # regex-escaped abbreviation (so the dots in 'N.B' or 'av. J.-C' are literal),
 # and ends with a `(?!\w)` lookahead so it can never rewrite the beginning of a
 # longer word ('maison', 'nous', 'minute', ...).
 #
 # The trailing period is optional only for forms that cannot be mistaken for an
 # ordinary word: titles conventionally written without one, consonant-only
 # shorthands, and abbreviations that already contain an internal period.
 # Everything else requires the period, like the English table, so that words
 # such as "art" or "col" and the elided pronoun in "m'appelle" are left alone.
 #
 # Order matters: the pairs are applied in sequence, so 'av. J.-C' has to be
 # handled before the shorter 'av'.
 abbreviations_fr = [
     (re.compile(r'\b%s%s(?!\w)' % (re.escape(abbr), period), re.IGNORECASE), expansion)
     for period, entries in (
         # Trailing period optional.
         (r'\.?', [
             ('Mlles', 'mesdemoiselles'),
             ('Mlle', 'mademoiselle'),
             ('Mmes', 'Mesdames'),
             ('Mme', 'Madame'),
             ('qqch', 'quelque chose'),
             ('rdv', 'rendez-vous'),
             ('N.B', 'nota bene'),
             ('p.c.q', 'parce que'),
             ('av. J.-C', 'avant Jésus-Christ'),
             ('apr. J.-C', 'après Jésus-Christ'),
             ('c.-à-d', 'c’est-à-dire'),
         ]),
         # Trailing period required.
         (r'\.', [
             ('M', 'monsieur'),
             ('Pr', 'professeur'),
             ('max', 'maximum'),
             ('min', 'minimum'),
             ('no', 'numéro'),
             ('adr', 'adresse'),
             ('dr', 'docteur'),
             ('st', 'saint'),
             ('co', 'compagnie'),
             ('jr', 'junior'),
             ('sgt', 'sergent'),
             ('capt', 'capitaine'),
             ('col', 'colonel'),
             ('av', 'avenue'),
             ('art', 'article'),
             ('boul', 'boulevard'),
             ('etc', 'et cetera'),
             ('ex', 'exemple'),
             ('excl', 'exclusivement'),
         ]),
     )
     for abbr, expansion in entries
 ]
--- a/TTS/tts/utils/text/cleaners.py
+++ b/TTS/tts/utils/text/cleaners.py
@ -13,35 +13,17 @@ hyperparameter. Some cleaners are English-specific. You'll typically want to use
 import re
 from unidecode import unidecode
 from .number_norm import normalize_numbers
 from .abbreviations import abbreviations_en, abbreviations_fr
 # Compiled pattern matching a run of one or more whitespace characters
 # (spaces, tabs, newlines, ...).
 _whitespace_re = re.compile(r'\s+')
 # (compiled regular expression, replacement) pairs for English abbreviations.
 # Each pattern is case-insensitive and matches the abbreviation at a word
 # boundary followed by a literal period.
 _abbreviations = [
     (re.compile('\\b%s\\.' % abbreviated, re.IGNORECASE), expanded)
     for abbreviated, expanded in {
         'mrs': 'misess',
         'mr': 'mister',
         'dr': 'doctor',
         'st': 'saint',
         'co': 'company',
         'jr': 'junior',
         'maj': 'major',
         'gen': 'general',
         'drs': 'doctors',
         'rev': 'reverend',
         'lt': 'lieutenant',
         'hon': 'honorable',
         'sgt': 'sergeant',
         'capt': 'captain',
         'esq': 'esquire',
         'ltd': 'limited',
         'col': 'colonel',
         'ft': 'fort',
     }.items()
 ]
-
def expand_abbreviations(text, lang='en'):
    '''Expand the abbreviations of language `lang` that occur in `text`.

    Args:
        text: the string to process.
        lang: 'en' selects `abbreviations_en`, 'fr' selects `abbreviations_fr`.
            Any other value has no abbreviation table and leaves the text
            untouched.

    Returns:
        `text` with every match of each (regex, replacement) pair substituted,
        the pairs being applied in table order.
    '''
    if lang == 'en':
        _abbreviations = abbreviations_en
    elif lang == 'fr':
        _abbreviations = abbreviations_fr
    else:
        # No table for this language: return the input as is instead of failing
        # on an unbound local further down.
        return text
    for regex, replacement in _abbreviations:
        # The table holds compiled patterns, so substitute through the pattern.
        text = regex.sub(replacement, text)
    return text
@ -70,9 +52,11 @@ def remove_aux_symbols(text):
 def replace_symbols(text, lang='en'):
     '''Replace punctuation symbols with pause marks or spoken words.

     ';' and ':' become ',', '-' becomes a space, and '&' becomes the
     space-padded conjunction of `lang` ('en', 'fr' or 'pt'). For any other
     language '&' is left as it is.

     Args:
         text: the string to process.
         lang: language code selecting the word used for '&'.

     Returns:
         The text with the symbols replaced.
     '''
     text = text.replace(';', ',')
     text = text.replace('-', ' ')
     text = text.replace(':', ',')
     # Padded with spaces so the word does not fuse with its neighbours
     # ('R&D' -> 'R and D').
     ampersand_words = {'en': ' and ', 'fr': ' et ', 'pt': ' e '}
     if lang in ampersand_words:
         text = text.replace('&', ampersand_words[lang])
     return text
@ -118,6 +102,15 @@ def english_cleaners(text):
    text = collapse_whitespace(text)
    return text
 def french_cleaners(text):
     '''Cleaning pipeline for French text.

     Applies, in order: `lowercase`, `expand_abbreviations` (French table),
     `replace_symbols` (French), `remove_aux_symbols` and `collapse_whitespace`.
     Numbers are not expanded here because the phonemizer already does that.
     '''
     lowered = lowercase(text)
     expanded = expand_abbreviations(lowered, lang='fr')
     symbols_replaced = replace_symbols(expanded, lang='fr')
     return collapse_whitespace(remove_aux_symbols(symbols_replaced))
 def portuguese_cleaners(text):
    '''Basic pipeline for Portuguese text. There is no need to expand abbreviation and
        numbers, phonemizer already does that'''