Merge pull request #531 from WeberJulian/french-cleaners

Adding support for French cleaners
2020-09-30 15:30:50 +02:00 · 2020-09-30 15:30:50 +02:00 · 4873601694
parent 592bb668fd ea7c2e15c0
commit 4873601694
2 changed files with 80 additions and 26 deletions
--- a/TTS/tts/utils/text/abbreviations.py
+++ b/TTS/tts/utils/text/abbreviations.py
@ -0,0 +1,61 @@
+import re
+
+# List of (regular expression, replacement) pairs for abbreviations in English:
+abbreviations_en = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1])
+                  for x in [
+                      ('mrs', 'misess'),
+                      ('mr', 'mister'),
+                      ('dr', 'doctor'),
+                      ('st', 'saint'),
+                      ('co', 'company'),
+                      ('jr', 'junior'),
+                      ('maj', 'major'),
+                      ('gen', 'general'),
+                      ('drs', 'doctors'),
+                      ('rev', 'reverend'),
+                      ('lt', 'lieutenant'),
+                      ('hon', 'honorable'),
+                      ('sgt', 'sergeant'),
+                      ('capt', 'captain'),
+                      ('esq', 'esquire'),
+                      ('ltd', 'limited'),
+                      ('col', 'colonel'),
+                      ('ft', 'fort'),
+                  ]]
+
+# List of (regular expression, replacement) pairs for abbreviations in French:
+abbreviations_fr = [(re.compile('\\b%s\\.?' % x[0], re.IGNORECASE), x[1])
+                  for x in [
+                      ('M', 'monsieur'),
+                      ('Mlle', 'mademoiselle'),
+                      ('Mlles', 'mesdemoiselles'),
+                      ('Mme', 'Madame'),
+                      ('Mmes', 'Mesdames'),
+                      ('N.B', 'nota bene'),
+                      ('M', 'monsieur'),
+                      ('p.c.q', 'parce que'),
+                      ('Pr', 'professeur'),
+                      ('qqch', 'quelque chose'),
+                      ('rdv', 'rendez-vous'),
+                      ('max', 'maximum'),
+                      ('min', 'minimum'),
+                      ('no', 'numéro'),
+                      ('adr', 'adresse'),
+                      ('dr', 'docteur'),
+                      ('st', 'saint'),
+                      ('co', 'companie'),
+                      ('jr', 'junior'),
+                      ('sgt', 'sergent'),
+                      ('capt', 'capitain'),
+                      ('col', 'colonel'),
+                      ('av', 'avenue'),
+                      ('av. J.-C', 'avant Jésus-Christ'),
+                      ('apr. J.-C', 'après Jésus-Christ'),
+                      ('art', 'article'),
+                      ('boul', 'boulevard'),
+                      ('c.-à-d', 'c’est-à-dire'),
+                      ('etc', 'et cetera'),
+                      ('ex', 'exemple'),
+                      ('excl', 'exclusivement'),
+                      ('boul', 'boulevard'),
+                  ]]
--- a/TTS/tts/utils/text/cleaners.py
+++ b/TTS/tts/utils/text/cleaners.py
@ -13,35 +13,17 @@ hyperparameter. Some cleaners are English-specific. You'll typically want to use
 import re
 from unidecode import unidecode
 from .number_norm import normalize_numbers
+from .abbreviations import abbreviations_en, abbreviations_fr

 # Regular expression matching whitespace:
 _whitespace_re = re.compile(r'\s+')

-# List of (regular expression, replacement) pairs for abbreviations:
-_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1])
-                  for x in [
-                      ('mrs', 'misess'),
-                      ('mr', 'mister'),
-                      ('dr', 'doctor'),
-                      ('st', 'saint'),
-                      ('co', 'company'),
-                      ('jr', 'junior'),
-                      ('maj', 'major'),
-                      ('gen', 'general'),
-                      ('drs', 'doctors'),
-                      ('rev', 'reverend'),
-                      ('lt', 'lieutenant'),
-                      ('hon', 'honorable'),
-                      ('sgt', 'sergeant'),
-                      ('capt', 'captain'),
-                      ('esq', 'esquire'),
-                      ('ltd', 'limited'),
-                      ('col', 'colonel'),
-                      ('ft', 'fort'),
-                  ]]

-
-def expand_abbreviations(text):
+def expand_abbreviations(text, lang='en'):
+    if lang == 'en':
+        _abbreviations = abbreviations_en
+    elif lang == 'fr':
+        _abbreviations = abbreviations_fr
    for regex, replacement in _abbreviations:
        text = re.sub(regex, replacement, text)
    return text
@ -70,9 +52,11 @@ def remove_aux_symbols(text):
 def replace_symbols(text, lang='en'):
    text = text.replace(';', ',')
    text = text.replace('-', ' ')
-    text = text.replace(':', ' ')
+    text = text.replace(':', ',')
    if lang == 'en':
-        text = text.replace('&', 'and')
+        text = text.replace('&', ' and ')
+    elif lang == 'fr':
+        text = text.replace('&', ' et ')
    elif lang == 'pt':
        text = text.replace('&', ' e ')
    return text
@ -118,6 +102,15 @@ def english_cleaners(text):
    text = collapse_whitespace(text)
    return text

+def french_cleaners(text):
+    '''Pipeline for French text. There is no need to expand numbers; phonemizer already does that.'''
+    text = lowercase(text)
+    text = expand_abbreviations(text, lang='fr')
+    text = replace_symbols(text, lang='fr')
+    text = remove_aux_symbols(text)
+    text = collapse_whitespace(text)
+    return text
+
 def portuguese_cleaners(text):
    '''Basic pipeline for Portuguese text. There is no need to expand abbreviation and
        numbers, phonemizer already does that'''