From 4d8426fa0a26a5724f562941dfdfe2da1f0e7ee9 Mon Sep 17 00:00:00 2001 From: Michael Hansen Date: Tue, 15 Jun 2021 15:57:08 -0400 Subject: [PATCH] Use eSpeak IPA lexicons by default for phoneme models --- TTS/tts/configs/shared_configs.py | 3 + TTS/tts/utils/synthesis.py | 1 + TTS/tts/utils/text/__init__.py | 69 ++++-------- requirements.txt | 2 +- tests/test_text_processing.py | 176 ++++++++++++------------------ 5 files changed, 97 insertions(+), 154 deletions(-) diff --git a/TTS/tts/configs/shared_configs.py b/TTS/tts/configs/shared_configs.py index 4690e76f..a501a880 100644 --- a/TTS/tts/configs/shared_configs.py +++ b/TTS/tts/configs/shared_configs.py @@ -97,6 +97,8 @@ class BaseTTSConfig(BaseTrainingConfig): Audio processor config object instance. use_phonemes (bool): enable / disable phoneme use. + use_espeak_phonemes (bool): + enable / disable eSpeak-compatible phonemes (only if use_phonemes = `True`). compute_input_seq_cache (bool): enable / disable precomputation of the phoneme sequences. 
At the expense of some delay at the beginning of the training, It allows faster data loader time and precise limitation with `max_seq_len` and @@ -136,6 +138,7 @@ class BaseTTSConfig(BaseTrainingConfig): audio: BaseAudioConfig = field(default_factory=BaseAudioConfig) # phoneme settings use_phonemes: bool = False + use_espeak_phonemes: bool = True phoneme_language: str = None compute_input_seq_cache: bool = False text_cleaner: str = MISSING diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py index 9f417a1d..da50f1ca 100644 --- a/TTS/tts/utils/synthesis.py +++ b/TTS/tts/utils/synthesis.py @@ -25,6 +25,7 @@ def text_to_seqvec(text, CONFIG): CONFIG.enable_eos_bos_chars, tp=CONFIG.characters, add_blank=CONFIG.add_blank, + use_espeak_phonemes=CONFIG.use_espeak_phonemes ), dtype=np.int32, ) diff --git a/TTS/tts/utils/text/__init__.py b/TTS/tts/utils/text/__init__.py index 73bd829c..350e5934 100644 --- a/TTS/tts/utils/text/__init__.py +++ b/TTS/tts/utils/text/__init__.py @@ -31,46 +31,7 @@ PHONEME_PUNCTUATION_PATTERN = r"[" + _punctuations.replace(" ", "") + "]+" GRUUT_TRANS_TABLE = str.maketrans("g", "ɡ") -def clean_gruut_phonemes(ph_list): - """Decompose, substitute, and clean gruut phonemes for TTS. - - gruut phonemes may contain any IPA characters (e.g., "ẽ" for the nasalized - "e"), and may be composed of multiple characters (e.g., "aɪ" in the English - "r[i]ce"). - - TTS phonemes come from a fixed set of symbols, and do not include every - possible variation of every vowel/consonant. Here, we decompose dipthongs, - etc. into single characters and then filter out Unicode combining characters - such as ties. This ensures that (most) phonemes will exist in the TTS symbol - table. 
- - Args: - ph_list (list[str]): list of phonemes from gruut - - Returns: - clean_list (list[str]): decomposed/clean list of phonemes for TTS - """ - cleaned_phonemes = [] - - for phoneme_text in ph_list: - phoneme_text = unicodedata.normalize("NFC", phoneme_text) - if phoneme_text in phonemes: - cleaned_phonemes.append(phoneme_text) - continue - - # Decompose into codepoints (ã -> ["a", "\u0303"]) - phoneme_text = unicodedata.normalize("NFD", phoneme_text) - for codepoint in phoneme_text.translate(GRUUT_TRANS_TABLE): - if unicodedata.combining(codepoint) > 0: - # Skip combining characters like ties - continue - - cleaned_phonemes.append(codepoint) - - return cleaned_phonemes - - -def text2phone(text, language): +def text2phone(text, language, use_espeak_phonemes=False): """Convert graphemes to phonemes. Parameters: text (str): text to phonemize @@ -93,21 +54,32 @@ def text2phone(text, language): if gruut.is_language_supported(language): # Use gruut for phonemization + phonemizer_args={ + "remove_stress": True, + "ipa_minor_breaks": False, # don't replace commas/semi-colons with IPA | + "ipa_major_breaks": False, # don't replace periods with IPA ‖ + } + + if use_espeak_phonemes: + # Use a lexicon/g2p model trained on eSpeak IPA instead of gruut IPA. + # This is intended for backwards compatibility with TTS<=v0.0.13 + # pre-trained models. + phonemizer_args["model_prefix"] = "espeak" + ph_list = gruut.text_to_phonemes( text, lang=language, return_format="word_phonemes", - phonemizer_args={ - "remove_accents": True, # remove accute/grave accents (Swedish) - "ipa_minor_breaks": False, # don't replace commas/semi-colons with IPA | - "ipa_major_breaks": False, # don't replace periods with IPA ‖ - }, + phonemizer_args=phonemizer_args, ) # Join and re-split to break apart dipthongs, suprasegmentals, etc. 
- ph_words = ["|".join(clean_gruut_phonemes(word_phonemes)) for word_phonemes in ph_list] + ph_words = ["|".join(word_phonemes) for word_phonemes in ph_list] ph = "| ".join(ph_words) + # Fix a few phonemes + ph = ph.translate(GRUUT_TRANS_TABLE) + print(" > Phonemes: {}".format(ph)) return ph @@ -132,7 +104,7 @@ def pad_with_eos_bos(phoneme_sequence, tp=None): return [_phonemes_to_id[_bos]] + list(phoneme_sequence) + [_phonemes_to_id[_eos]] -def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False, tp=None, add_blank=False): +def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False, tp=None, add_blank=False, use_espeak_phonemes=False): # pylint: disable=global-statement global _phonemes_to_id, _phonemes if tp: @@ -141,7 +113,7 @@ def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False, tp= sequence = [] clean_text = _clean_text(text, cleaner_names) - to_phonemes = text2phone(clean_text, language) + to_phonemes = text2phone(clean_text, language, use_espeak_phonemes=use_espeak_phonemes) if to_phonemes is None: print("!! After phoneme conversion the result is None. -- {} ".format(clean_text)) # iterate by skipping empty strings - NOTE: might be useful to keep it to have a better intonation. 
@@ -152,6 +124,7 @@ def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False, tp= sequence = pad_with_eos_bos(sequence, tp=tp) if add_blank: sequence = intersperse(sequence, len(_phonemes)) # add a blank token (new), whose id number is len(_phonemes) + return sequence diff --git a/requirements.txt b/requirements.txt index cb304693..046139d0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -23,4 +23,4 @@ coqpit mecab-python3==1.0.3 unidic-lite==1.0.8 # gruut+supported langs -gruut[cs,de,es,fr,it,nl,pt,ru,sv]~=1.1.0 +gruut[cs,de,es,fr,it,nl,pt,ru,sv]~=1.2.0 diff --git a/tests/test_text_processing.py b/tests/test_text_processing.py index f4938ca0..17ee755e 100644 --- a/tests/test_text_processing.py +++ b/tests/test_text_processing.py @@ -1,54 +1,15 @@ """Tests for text to phoneme converstion""" import unittest -import gruut -from gruut_ipa import IPA, Phonemes - -from TTS.tts.utils.text import clean_gruut_phonemes, phoneme_to_sequence -from TTS.tts.utils.text import phonemes as all_phonemes -from TTS.tts.utils.text import sequence_to_phoneme +from TTS.tts.utils.text import phoneme_to_sequence, sequence_to_phoneme, text2phone # ----------------------------------------------------------------------------- +LANG = "en-us" + EXAMPLE_TEXT = "Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase, the grey matter in the parts of the brain responsible for emotional regulation and learning!" -# Raw phonemes from run of gruut with example text (en-us). -# This includes IPA ties, etc. 
-EXAMPLE_PHONEMES = [ - ["ɹ", "ˈi", "s", "ə", "n", "t"], - ["ɹ", "i", "s", "ˈɚ", "t͡ʃ"], - ["ˈæ", "t"], - ["h", "ˈɑ", "ɹ", "v", "ɚ", "d"], - ["h", "ˈæ", "z"], - ["ʃ", "ˈoʊ", "n"], - ["m", "ˈɛ", "d", "ɪ", "t", "ˌeɪ", "t", "ɪ", "ŋ"], - ["f", "ɚ"], - ["ˈæ", "z"], - ["l", "ˈɪ", "t", "ə", "l"], - ["ˈæ", "z"], - ["ˈeɪ", "t"], - ["w", "ˈi", "k", "s"], - ["k", "ə", "n"], - ["ˈæ", "k", "t͡ʃ", "ə", "l", "i"], - ["ɪ", "ŋ", "k", "ɹ", "ˈi", "s"], - [","], - ["ð", "ə"], - ["ɡ", "ɹ", "ˈeɪ"], - ["m", "ˈæ", "t", "ɚ"], - ["ˈɪ", "n"], - ["ð", "ə"], - ["p", "ˈɑ", "ɹ", "t", "s"], - ["ə", "v"], - ["ð", "ə"], - ["b", "ɹ", "ˈeɪ", "n"], - ["ɹ", "i", "s", "p", "ˈɑ", "n", "s", "ɪ", "b", "ə", "l"], - ["f", "ɚ"], - ["ɪ", "m", "ˈoʊ", "ʃ", "ə", "n", "ə", "l"], - ["ɹ", "ˌɛ", "ɡ", "j", "ə", "l", "ˈeɪ", "ʃ", "ə", "n"], - ["ˈæ", "n", "d"], - ["l", "ˈɚ", "n", "ɪ", "ŋ"], - ["!"], -] +EXPECTED_PHONEMES = "ɹ|iː|s|ə|n|t| ɹ|ᵻ|s|ɜː|tʃ| æ|t| h|ɑːɹ|v|ɚ|d| h|æ|z| ʃ|oʊ|n| m|ɛ|d|ᵻ|t|eɪ|ɾ|ɪ|ŋ| f|ɔːɹ| æ|z| l|ɪ|ɾ|əl| æ|z| eɪ|t| w|iː|k|s| k|æ|n| æ|k|tʃ|uː|əl|i| ɪ|ŋ|k|ɹ|iː|s| ,| ð|ə| ɡ|ɹ|eɪ| m|æ|ɾ|ɚ| ɪ|n| ð|ə| p|ɑːɹ|t|s| ʌ|v| ð|ə| b|ɹ|eɪ|n| ɹ|ᵻ|s|p|ɑː|n|s|ᵻ|b|əl| f|ɔːɹ| ɪ|m|oʊ|ʃ|ə|n|əl| ɹ|ɛ|ɡ|j|ʊ|l|eɪ|ʃ|ə|n| æ|n|d| l|ɜː|n|ɪ|ŋ| !" 
# ----------------------------------------------------------------------------- @@ -56,79 +17,84 @@ EXAMPLE_PHONEMES = [ class TextProcessingTextCase(unittest.TestCase): """Tests for text to phoneme conversion""" - def test_all_phonemes_in_tts(self): - """Ensure that all phonemes from gruut are present in TTS phonemes""" - tts_phonemes = set(all_phonemes) - - # Check stress characters - for suprasegmental in [IPA.STRESS_PRIMARY, IPA.STRESS_SECONDARY]: - self.assertIn(suprasegmental, tts_phonemes) - - # Check that gruut's phonemes are a subset of TTS phonemes - for lang in gruut.get_supported_languages(): - for phoneme in Phonemes.from_language(lang): - for codepoint in clean_gruut_phonemes(phoneme.text): - - self.assertIn(codepoint, tts_phonemes) - def test_phoneme_to_sequence(self): - """Verify example (text -> sequence -> phoneme string) pipeline""" - lang = "en-us" - expected_phoneme_str = " ".join( - "".join(clean_gruut_phonemes(word_phonemes)) for word_phonemes in EXAMPLE_PHONEMES - ) - - # Ensure that TTS produces same phoneme string - text_cleaner = ["phoneme_cleaners"] - actual_sequence = phoneme_to_sequence(EXAMPLE_TEXT, text_cleaner, lang) - actual_phoneme_str = sequence_to_phoneme(actual_sequence) - - self.assertEqual(actual_phoneme_str, expected_phoneme_str) + """Verify en-us sentence phonemes without blank token""" + self._test_phoneme_to_sequence(add_blank=False) def test_phoneme_to_sequence_with_blank_token(self): - """Verify example (text -> sequence -> phoneme string) pipeline with blank token""" - lang = "en-us" + """Verify en-us sentence phonemes with blank token""" + self._test_phoneme_to_sequence(add_blank=True) + + def _test_phoneme_to_sequence(self, add_blank): text_cleaner = ["phoneme_cleaners"] + sequence = phoneme_to_sequence(EXAMPLE_TEXT, text_cleaner, LANG, add_blank=add_blank, use_espeak_phonemes=True) + text_hat = sequence_to_phoneme(sequence) + text_hat_with_params = sequence_to_phoneme(sequence) + gt = EXPECTED_PHONEMES.replace("|", 
"") + self.assertEqual(text_hat, text_hat_with_params) + self.assertEqual(text_hat, gt) - # Create with/without blank sequences - sequence_without_blank = phoneme_to_sequence(EXAMPLE_TEXT, text_cleaner, lang, add_blank=False) - sequence_with_blank = phoneme_to_sequence(EXAMPLE_TEXT, text_cleaner, lang, add_blank=True) + # multiple punctuations + text = "Be a voice, not an! echo?" + sequence = phoneme_to_sequence(text, text_cleaner, LANG, add_blank=add_blank, use_espeak_phonemes=True) + text_hat = sequence_to_phoneme(sequence) + text_hat_with_params = sequence_to_phoneme(sequence) + gt = "biː ɐ vɔɪs , nɑːt ɐn ! ɛkoʊ ?" + print(text_hat) + print(len(sequence)) + self.assertEqual(text_hat, text_hat_with_params) + self.assertEqual(text_hat, gt) - # With blank sequence should be bigger - self.assertGreater(len(sequence_with_blank), len(sequence_without_blank)) + # not ending with punctuation + text = "Be a voice, not an! echo" + sequence = phoneme_to_sequence(text, text_cleaner, LANG, add_blank=add_blank, use_espeak_phonemes=True) + text_hat = sequence_to_phoneme(sequence) + text_hat_with_params = sequence_to_phoneme(sequence) + gt = "biː ɐ vɔɪs , nɑːt ɐn ! ɛkoʊ" + print(text_hat) + print(len(sequence)) + self.assertEqual(text_hat, text_hat_with_params) + self.assertEqual(text_hat, gt) - # But phoneme strings should still be identical - phoneme_str_without_blank = sequence_to_phoneme(sequence_without_blank, add_blank=False) - phoneme_str_with_blank = sequence_to_phoneme(sequence_with_blank, add_blank=True) + # original + text = "Be a voice, not an echo!" + sequence = phoneme_to_sequence(text, text_cleaner, LANG, add_blank=add_blank, use_espeak_phonemes=True) + text_hat = sequence_to_phoneme(sequence) + text_hat_with_params = sequence_to_phoneme(sequence) + gt = "biː ɐ vɔɪs , nɑːt ɐn ɛkoʊ !" 
+ print(text_hat) + print(len(sequence)) + self.assertEqual(text_hat, text_hat_with_params) + self.assertEqual(text_hat, gt) - self.assertEqual(phoneme_str_with_blank, phoneme_str_without_blank) + # extra space after the sentence + text = "Be a voice, not an! echo. " + sequence = phoneme_to_sequence(text, text_cleaner, LANG, add_blank=add_blank, use_espeak_phonemes=True) + text_hat = sequence_to_phoneme(sequence) + text_hat_with_params = sequence_to_phoneme(sequence) + gt = "biː ɐ vɔɪs , nɑːt ɐn ! ɛkoʊ ." + print(text_hat) + print(len(sequence)) + self.assertEqual(text_hat, text_hat_with_params) + self.assertEqual(text_hat, gt) - def test_messy_text(self): - """Verify text with extra punctuation/whitespace/etc. makes it through the pipeline""" - text = '"Be" a! voice, [NOT]? (an eCHo. ' - lang = "en-us" - expected_phonemes = [ - ["b", "ˈi"], - ["ə"], - ["!"], - ["v", "ˈɔɪ", "s"], - [","], - ["n", "ˈɑ", "t"], - ["?"], - ["ə", "n"], - ["ˈɛ", "k", "oʊ"], - ["."], - ] - expected_phoneme_str = " ".join( - "".join(clean_gruut_phonemes(word_phonemes)) for word_phonemes in expected_phonemes + # extra space after the sentence + text = "Be a voice, not an! echo. " + sequence = phoneme_to_sequence( + text, text_cleaner, LANG, enable_eos_bos=True, add_blank=add_blank, use_espeak_phonemes=True ) + text_hat = sequence_to_phoneme(sequence) + text_hat_with_params = sequence_to_phoneme(sequence) + gt = "^biː ɐ vɔɪs , nɑːt ɐn ! 
ɛkoʊ .~" + print(text_hat) + print(len(sequence)) + self.assertEqual(text_hat, text_hat_with_params) + self.assertEqual(text_hat, gt) - # Ensure that TTS produces same phoneme string - text_cleaner = ["phoneme_cleaners"] - actual_sequence = phoneme_to_sequence(text, text_cleaner, lang) - actual_phoneme_str = sequence_to_phoneme(actual_sequence) - - self.assertEqual(actual_phoneme_str, expected_phoneme_str) + def test_text2phone(self): + text = "Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase, the grey matter in the parts of the brain responsible for emotional regulation and learning!" + ph = text2phone(EXAMPLE_TEXT, LANG) + self.assertEqual(ph, EXPECTED_PHONEMES) # -----------------------------------------------------------------------------