From d245b5d48f01cdaf45479b7f924b8a74b6c58b97 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?= <egolge@coqui.ai>
Date: Tue, 8 Jun 2021 09:21:01 +0200
Subject: [PATCH 01/11] bump up v0.0.15.1

---
 TTS/VERSION | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/TTS/VERSION b/TTS/VERSION
index ceddfb28..58c4b6e9 100644
--- a/TTS/VERSION
+++ b/TTS/VERSION
@@ -1 +1 @@
-0.0.15
+0.0.15.1

From 67869e77f9ac4d51f9c972395ca7e0b2916ab14f Mon Sep 17 00:00:00 2001
From: Michael Hansen <michael.hansen.24@us.af.mil>
Date: Wed, 2 Jun 2021 10:35:59 -0400
Subject: [PATCH 02/11] Use gruut for phonemization

---
 TTS/tts/utils/text/__init__.py | 57 ++++++++++++++++++++++++++++++++++
 TTS/utils/manage.py            |  8 ++---
 requirements.txt               |  2 ++
 3 files changed, 63 insertions(+), 4 deletions(-)

diff --git a/TTS/tts/utils/text/__init__.py b/TTS/tts/utils/text/__init__.py
index f9f44167..49e7a08a 100644
--- a/TTS/tts/utils/text/__init__.py
+++ b/TTS/tts/utils/text/__init__.py
@@ -2,6 +2,7 @@
 
 import re
 
+import gruut
 from packaging import version
 
 from TTS.tts.utils.text import cleaners
@@ -25,6 +26,33 @@ _CURLY_RE = re.compile(r"(.*?)\{(.+?)\}(.*)")
 # Regular expression matching punctuations, ignoring empty space
 PHONEME_PUNCTUATION_PATTERN = r"[" + _punctuations.replace(" ", "") + "]+"
 
+# language -> source phoneme -> dest phoneme
+# Used to make gruut's phonemes fit better with eSpeak's.
+GRUUT_PHONEME_MAP = {
+    "en-us": {
+        "i": "iː",
+        "ɑ": "ɑː",
+        "ɚ": "ɜːɹ",
+    },
+    "de": {
+        "ʁ": "ɾ",
+        "g": "ɡ",
+        "ʔ": "",
+    },
+    "nl": {
+        "a": "aː",
+        "e": "eː",
+        "ʏ": "ɵ",
+        "ʋ": "w",
+        "ɹ": "r",
+        "ɔː": "oː",
+    },
+    "es": {
+        "ɾ": "r",
+        "g": "ɣ",
+    },
+}
+
 
 def text2phone(text, language):
     """Convert graphemes to phonemes.
@@ -39,10 +67,39 @@ def text2phone(text, language):
     # TO REVIEW : How to have a good implementation for this?
     if language == "zh-CN":
         ph = chinese_text_to_phonemes(text)
+        print(" > Phonemes: {}".format(ph))
         return ph
 
     if language == "ja-jp":
         ph = japanese_text_to_phonemes(text)
+        print(" > Phonemes: {}".format(ph))
+        return ph
+
+    if gruut.is_language_supported(language):
+        # Use gruut for phonemization
+        ph_list = gruut.text_to_phonemes(
+            text,
+            lang=language,
+            return_format="word_phonemes",
+            phonemizer_args={
+                "remove_stress": True,  # remove primary/secondary stress
+                "ipa_minor_breaks": False,  # don't replace commas/semi-colons with IPA |
+                "ipa_major_breaks": False,  # don't replace periods with IPA ‖
+            },
+        )
+
+        ph_map = GRUUT_PHONEME_MAP.get(language)
+        if ph_map:
+            # Re-map phonemes to fit with eSpeak conventions
+            for word in ph_list:
+                for p_idx, p in enumerate(word):
+                    word[p_idx] = ph_map.get(p, p)
+
+        # Join and re-split to break apart dipthongs, suprasegmentals, etc.
+        ph_words = ["|".join(word_phonemes) for word_phonemes in ph_list]
+        ph = "| ".join(ph_words)
+
+        print(" > Phonemes: {}".format(ph))
         return ph
 
     raise ValueError(f" [!] Language {language} is not supported for phonemization.")
diff --git a/TTS/utils/manage.py b/TTS/utils/manage.py
index cf7df7de..f5165079 100644
--- a/TTS/utils/manage.py
+++ b/TTS/utils/manage.py
@@ -102,10 +102,10 @@ class ModelManager(object):
         output_model_path = os.path.join(output_path, "model_file.pth.tar")
         output_config_path = os.path.join(output_path, "config.json")
         # NOTE : band-aid for removing phoneme support
-        if "needs_phonemizer" in model_item and model_item["needs_phonemizer"]:
-            raise RuntimeError(
-                " [!] Use 🐸TTS <= v0.0.13 for this model. Current version does not support phoneme based models."
-            )
+        # if "needs_phonemizer" in model_item and model_item["needs_phonemizer"]:
+        #     raise RuntimeError(
+        #         " [!] Use 🐸TTS <= v0.0.13 for this model. Current version does not support phoneme based models."
+        #     )
         if os.path.exists(output_path):
             print(f" > {model_name} is already downloaded.")
         else:
diff --git a/requirements.txt b/requirements.txt
index fde48978..a2fb4132 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -22,3 +22,5 @@ coqpit
 # japanese g2p deps
 mecab-python3==1.0.3
 unidic-lite==1.0.8
+# gruut+supported langs
+gruut[cs,de,es,fr,it,nl,pt,ru,sv]~=1.0.0

From 47191f3eccd9ec42c3db16767e6a0be96e0aaa0d Mon Sep 17 00:00:00 2001
From: Michael Hansen <mike@rhasspy.org>
Date: Wed, 9 Jun 2021 11:52:10 -0400
Subject: [PATCH 03/11] Add tests for gruut phonemization

---
 TTS/__init__.py                |   1 -
 TTS/tts/utils/text/__init__.py |  66 ++++++++--------
 tests/test_text_processing.py  | 137 +++++++++++++++++++++++++++++++++
 3 files changed, 168 insertions(+), 36 deletions(-)
 create mode 100644 tests/test_text_processing.py

diff --git a/TTS/__init__.py b/TTS/__init__.py
index da35faf8..5162d4ec 100644
--- a/TTS/__init__.py
+++ b/TTS/__init__.py
@@ -1,6 +1,5 @@
 import os
 
-
 with open(os.path.join(os.path.dirname(__file__), "VERSION")) as f:
     version = f.read().strip()
 
diff --git a/TTS/tts/utils/text/__init__.py b/TTS/tts/utils/text/__init__.py
index 49e7a08a..14319c44 100644
--- a/TTS/tts/utils/text/__init__.py
+++ b/TTS/tts/utils/text/__init__.py
@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 
 import re
+import unicodedata
 
 import gruut
 from packaging import version
@@ -26,32 +27,34 @@ _CURLY_RE = re.compile(r"(.*?)\{(.+?)\}(.*)")
 # Regular expression matching punctuations, ignoring empty space
 PHONEME_PUNCTUATION_PATTERN = r"[" + _punctuations.replace(" ", "") + "]+"
 
-# language -> source phoneme -> dest phoneme
-# Used to make gruut's phonemes fit better with eSpeak's.
-GRUUT_PHONEME_MAP = {
-    "en-us": {
-        "i": "iː",
-        "ɑ": "ɑː",
-        "ɚ": "ɜːɹ",
-    },
-    "de": {
-        "ʁ": "ɾ",
-        "g": "ɡ",
-        "ʔ": "",
-    },
-    "nl": {
-        "a": "aː",
-        "e": "eː",
-        "ʏ": "ɵ",
-        "ʋ": "w",
-        "ɹ": "r",
-        "ɔː": "oː",
-    },
-    "es": {
-        "ɾ": "r",
-        "g": "ɣ",
-    },
-}
+# Table for str.translate to fix gruut/TTS phoneme mismatch
+GRUUT_TRANS_TABLE = str.maketrans("g", "ɡ")
+
+
+def clean_gruut_phonemes(ph_list):
+    """Decompose, substitute, and clean gruut phonemes for TTS.
+
+    Parameters:
+            ph_list (list[str]): list of phonemes from gruut
+
+    Returns:
+            clean_list (list[str]): decomposed/clean list of phonemes for TTS
+                    Dipthongs, etc. are decomposed into single characters
+                    Unicode combining characters are removed (e.g., ties)
+    """
+    cleaned_phonemes = []
+
+    for phoneme_text in ph_list:
+        # Decompose into codepoints (ã -> ["a", "\u0303"])
+        phoneme_text = unicodedata.normalize("NFD", phoneme_text)
+        for codepoint in phoneme_text.translate(GRUUT_TRANS_TABLE):
+            if unicodedata.combining(codepoint) > 0:
+                # Skip combining characters like ties
+                continue
+
+            cleaned_phonemes.append(codepoint)
+
+    return cleaned_phonemes
 
 
 def text2phone(text, language):
@@ -82,21 +85,14 @@ def text2phone(text, language):
             lang=language,
             return_format="word_phonemes",
             phonemizer_args={
-                "remove_stress": True,  # remove primary/secondary stress
+                "remove_accents": True,  # remove accute/grave accents (Swedish)
                 "ipa_minor_breaks": False,  # don't replace commas/semi-colons with IPA |
                 "ipa_major_breaks": False,  # don't replace periods with IPA ‖
             },
         )
 
-        ph_map = GRUUT_PHONEME_MAP.get(language)
-        if ph_map:
-            # Re-map phonemes to fit with eSpeak conventions
-            for word in ph_list:
-                for p_idx, p in enumerate(word):
-                    word[p_idx] = ph_map.get(p, p)
-
         # Join and re-split to break apart dipthongs, suprasegmentals, etc.
-        ph_words = ["|".join(word_phonemes) for word_phonemes in ph_list]
+        ph_words = ["|".join(clean_gruut_phonemes(word_phonemes)) for word_phonemes in ph_list]
         ph = "| ".join(ph_words)
 
         print(" > Phonemes: {}".format(ph))
diff --git a/tests/test_text_processing.py b/tests/test_text_processing.py
new file mode 100644
index 00000000..f4938ca0
--- /dev/null
+++ b/tests/test_text_processing.py
@@ -0,0 +1,137 @@
+"""Tests for text to phoneme converstion"""
+import unittest
+
+import gruut
+from gruut_ipa import IPA, Phonemes
+
+from TTS.tts.utils.text import clean_gruut_phonemes, phoneme_to_sequence
+from TTS.tts.utils.text import phonemes as all_phonemes
+from TTS.tts.utils.text import sequence_to_phoneme
+
+# -----------------------------------------------------------------------------
+
+EXAMPLE_TEXT = "Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase, the grey matter in the parts of the brain responsible for emotional regulation and learning!"
+
+# Raw phonemes from run of gruut with example text (en-us).
+# This includes IPA ties, etc.
+EXAMPLE_PHONEMES = [
+    ["ɹ", "ˈi", "s", "ə", "n", "t"],
+    ["ɹ", "i", "s", "ˈɚ", "t͡ʃ"],
+    ["ˈæ", "t"],
+    ["h", "ˈɑ", "ɹ", "v", "ɚ", "d"],
+    ["h", "ˈæ", "z"],
+    ["ʃ", "ˈoʊ", "n"],
+    ["m", "ˈɛ", "d", "ɪ", "t", "ˌeɪ", "t", "ɪ", "ŋ"],
+    ["f", "ɚ"],
+    ["ˈæ", "z"],
+    ["l", "ˈɪ", "t", "ə", "l"],
+    ["ˈæ", "z"],
+    ["ˈeɪ", "t"],
+    ["w", "ˈi", "k", "s"],
+    ["k", "ə", "n"],
+    ["ˈæ", "k", "t͡ʃ", "ə", "l", "i"],
+    ["ɪ", "ŋ", "k", "ɹ", "ˈi", "s"],
+    [","],
+    ["ð", "ə"],
+    ["ɡ", "ɹ", "ˈeɪ"],
+    ["m", "ˈæ", "t", "ɚ"],
+    ["ˈɪ", "n"],
+    ["ð", "ə"],
+    ["p", "ˈɑ", "ɹ", "t", "s"],
+    ["ə", "v"],
+    ["ð", "ə"],
+    ["b", "ɹ", "ˈeɪ", "n"],
+    ["ɹ", "i", "s", "p", "ˈɑ", "n", "s", "ɪ", "b", "ə", "l"],
+    ["f", "ɚ"],
+    ["ɪ", "m", "ˈoʊ", "ʃ", "ə", "n", "ə", "l"],
+    ["ɹ", "ˌɛ", "ɡ", "j", "ə", "l", "ˈeɪ", "ʃ", "ə", "n"],
+    ["ˈæ", "n", "d"],
+    ["l", "ˈɚ", "n", "ɪ", "ŋ"],
+    ["!"],
+]
+
+# -----------------------------------------------------------------------------
+
+
+class TextProcessingTextCase(unittest.TestCase):
+    """Tests for text to phoneme conversion"""
+
+    def test_all_phonemes_in_tts(self):
+        """Ensure that all phonemes from gruut are present in TTS phonemes"""
+        tts_phonemes = set(all_phonemes)
+
+        # Check stress characters
+        for suprasegmental in [IPA.STRESS_PRIMARY, IPA.STRESS_SECONDARY]:
+            self.assertIn(suprasegmental, tts_phonemes)
+
+        # Check that gruut's phonemes are a subset of TTS phonemes
+        for lang in gruut.get_supported_languages():
+            for phoneme in Phonemes.from_language(lang):
+                for codepoint in clean_gruut_phonemes(phoneme.text):
+
+                    self.assertIn(codepoint, tts_phonemes)
+
+    def test_phoneme_to_sequence(self):
+        """Verify example (text -> sequence -> phoneme string) pipeline"""
+        lang = "en-us"
+        expected_phoneme_str = " ".join(
+            "".join(clean_gruut_phonemes(word_phonemes)) for word_phonemes in EXAMPLE_PHONEMES
+        )
+
+        # Ensure that TTS produces same phoneme string
+        text_cleaner = ["phoneme_cleaners"]
+        actual_sequence = phoneme_to_sequence(EXAMPLE_TEXT, text_cleaner, lang)
+        actual_phoneme_str = sequence_to_phoneme(actual_sequence)
+
+        self.assertEqual(actual_phoneme_str, expected_phoneme_str)
+
+    def test_phoneme_to_sequence_with_blank_token(self):
+        """Verify example (text -> sequence -> phoneme string) pipeline with blank token"""
+        lang = "en-us"
+        text_cleaner = ["phoneme_cleaners"]
+
+        # Create with/without blank sequences
+        sequence_without_blank = phoneme_to_sequence(EXAMPLE_TEXT, text_cleaner, lang, add_blank=False)
+        sequence_with_blank = phoneme_to_sequence(EXAMPLE_TEXT, text_cleaner, lang, add_blank=True)
+
+        # With blank sequence should be bigger
+        self.assertGreater(len(sequence_with_blank), len(sequence_without_blank))
+
+        # But phoneme strings should still be identical
+        phoneme_str_without_blank = sequence_to_phoneme(sequence_without_blank, add_blank=False)
+        phoneme_str_with_blank = sequence_to_phoneme(sequence_with_blank, add_blank=True)
+
+        self.assertEqual(phoneme_str_with_blank, phoneme_str_without_blank)
+
+    def test_messy_text(self):
+        """Verify text with extra punctuation/whitespace/etc. makes it through the pipeline"""
+        text = '"Be" a! voice, [NOT]? (an eCHo.   '
+        lang = "en-us"
+        expected_phonemes = [
+            ["b", "ˈi"],
+            ["ə"],
+            ["!"],
+            ["v", "ˈɔɪ", "s"],
+            [","],
+            ["n", "ˈɑ", "t"],
+            ["?"],
+            ["ə", "n"],
+            ["ˈɛ", "k", "oʊ"],
+            ["."],
+        ]
+        expected_phoneme_str = " ".join(
+            "".join(clean_gruut_phonemes(word_phonemes)) for word_phonemes in expected_phonemes
+        )
+
+        # Ensure that TTS produces same phoneme string
+        text_cleaner = ["phoneme_cleaners"]
+        actual_sequence = phoneme_to_sequence(text, text_cleaner, lang)
+        actual_phoneme_str = sequence_to_phoneme(actual_sequence)
+
+        self.assertEqual(actual_phoneme_str, expected_phoneme_str)
+
+
+# -----------------------------------------------------------------------------
+
+if __name__ == "__main__":
+    unittest.main()

From 07e8ff193a9b549da8960a42128b464cfcef904e Mon Sep 17 00:00:00 2001
From: Michael Hansen <mike@rhasspy.org>
Date: Wed, 9 Jun 2021 13:44:04 -0400
Subject: [PATCH 04/11] Bump version of gruut to 1.1

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index a2fb4132..cb304693 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -23,4 +23,4 @@ coqpit
 mecab-python3==1.0.3
 unidic-lite==1.0.8
 # gruut+supported langs
-gruut[cs,de,es,fr,it,nl,pt,ru,sv]~=1.0.0
+gruut[cs,de,es,fr,it,nl,pt,ru,sv]~=1.1.0

From da6f6a4a01763256341cc55e633f55c47f49ce2b Mon Sep 17 00:00:00 2001
From: Michael Hansen <mike@rhasspy.org>
Date: Thu, 10 Jun 2021 10:08:01 -0400
Subject: [PATCH 05/11] Update docstring for clean_gruut_phonemes

---
 TTS/tts/utils/text/__init__.py | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/TTS/tts/utils/text/__init__.py b/TTS/tts/utils/text/__init__.py
index 14319c44..3d2f5004 100644
--- a/TTS/tts/utils/text/__init__.py
+++ b/TTS/tts/utils/text/__init__.py
@@ -34,13 +34,21 @@ GRUUT_TRANS_TABLE = str.maketrans("g", "ɡ")
 def clean_gruut_phonemes(ph_list):
     """Decompose, substitute, and clean gruut phonemes for TTS.
 
-    Parameters:
-            ph_list (list[str]): list of phonemes from gruut
+    gruut phonemes may contain any IPA characters (e.g., "ẽ" for the nasalized
+    "e"), and may be composed of multiple characters (e.g., "aɪ" in the English
+    "r[i]ce").
+
+    TTS phonemes come from a fixed set of symbols, and do not include every
+    possible variation of every vowel/consonant. Here, we decompose dipthongs,
+    etc. into single characters and then filter out Unicode combining characters
+    such as ties. This ensures that (most) phonemes will exist in the TTS symbol
+    table.
+
+    Args:
+        ph_list (list[str]): list of phonemes from gruut
 
     Returns:
-            clean_list (list[str]): decomposed/clean list of phonemes for TTS
-                    Dipthongs, etc. are decomposed into single characters
-                    Unicode combining characters are removed (e.g., ties)
+        clean_list (list[str]): decomposed/clean list of phonemes for TTS
     """
     cleaned_phonemes = []
 

From 618b509204cf045f20dc972bbb3220ed160ab0ca Mon Sep 17 00:00:00 2001
From: Michael Hansen <mike@rhasspy.org>
Date: Fri, 11 Jun 2021 10:43:52 -0400
Subject: [PATCH 06/11] =?UTF-8?q?Use=20combined=20characters=20available?=
 =?UTF-8?q?=20in=20TTS=20phonemes=20(like=20=C3=A7)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 TTS/tts/utils/text/__init__.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/TTS/tts/utils/text/__init__.py b/TTS/tts/utils/text/__init__.py
index 3d2f5004..73bd829c 100644
--- a/TTS/tts/utils/text/__init__.py
+++ b/TTS/tts/utils/text/__init__.py
@@ -53,6 +53,11 @@ def clean_gruut_phonemes(ph_list):
     cleaned_phonemes = []
 
     for phoneme_text in ph_list:
+        phoneme_text = unicodedata.normalize("NFC", phoneme_text)
+        if phoneme_text in phonemes:
+            cleaned_phonemes.append(phoneme_text)
+            continue
+
         # Decompose into codepoints (ã -> ["a", "\u0303"])
         phoneme_text = unicodedata.normalize("NFD", phoneme_text)
         for codepoint in phoneme_text.translate(GRUUT_TRANS_TABLE):

From 4d8426fa0a26a5724f562941dfdfe2da1f0e7ee9 Mon Sep 17 00:00:00 2001
From: Michael Hansen <mike@rhasspy.org>
Date: Tue, 15 Jun 2021 15:57:08 -0400
Subject: [PATCH 07/11] Use eSpeak IPA lexicons by default for phoneme models

---
 TTS/tts/configs/shared_configs.py |   3 +
 TTS/tts/utils/synthesis.py        |   1 +
 TTS/tts/utils/text/__init__.py    |  69 ++++--------
 requirements.txt                  |   2 +-
 tests/test_text_processing.py     | 176 ++++++++++++------------------
 5 files changed, 97 insertions(+), 154 deletions(-)

diff --git a/TTS/tts/configs/shared_configs.py b/TTS/tts/configs/shared_configs.py
index 4690e76f..a501a880 100644
--- a/TTS/tts/configs/shared_configs.py
+++ b/TTS/tts/configs/shared_configs.py
@@ -97,6 +97,8 @@ class BaseTTSConfig(BaseTrainingConfig):
             Audio processor config object instance.
         use_phonemes (bool):
             enable / disable phoneme use.
+        use_espeak_phonemes (bool):
+            enable / disable eSpeak-compatible phonemes (only if use_phonemes = `True`).
         compute_input_seq_cache (bool):
             enable / disable precomputation of the phoneme sequences. At the expense of some delay at the beginning of
             the training, It allows faster data loader time and precise limitation with `max_seq_len` and
@@ -136,6 +138,7 @@ class BaseTTSConfig(BaseTrainingConfig):
     audio: BaseAudioConfig = field(default_factory=BaseAudioConfig)
     # phoneme settings
     use_phonemes: bool = False
+    use_espeak_phonemes: bool = True
     phoneme_language: str = None
     compute_input_seq_cache: bool = False
     text_cleaner: str = MISSING
diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py
index 9f417a1d..da50f1ca 100644
--- a/TTS/tts/utils/synthesis.py
+++ b/TTS/tts/utils/synthesis.py
@@ -25,6 +25,7 @@ def text_to_seqvec(text, CONFIG):
                 CONFIG.enable_eos_bos_chars,
                 tp=CONFIG.characters,
                 add_blank=CONFIG.add_blank,
+                use_espeak_phonemes=CONFIG.use_espeak_phonemes
             ),
             dtype=np.int32,
         )
diff --git a/TTS/tts/utils/text/__init__.py b/TTS/tts/utils/text/__init__.py
index 73bd829c..350e5934 100644
--- a/TTS/tts/utils/text/__init__.py
+++ b/TTS/tts/utils/text/__init__.py
@@ -31,46 +31,7 @@ PHONEME_PUNCTUATION_PATTERN = r"[" + _punctuations.replace(" ", "") + "]+"
 GRUUT_TRANS_TABLE = str.maketrans("g", "ɡ")
 
 
-def clean_gruut_phonemes(ph_list):
-    """Decompose, substitute, and clean gruut phonemes for TTS.
-
-    gruut phonemes may contain any IPA characters (e.g., "ẽ" for the nasalized
-    "e"), and may be composed of multiple characters (e.g., "aɪ" in the English
-    "r[i]ce").
-
-    TTS phonemes come from a fixed set of symbols, and do not include every
-    possible variation of every vowel/consonant. Here, we decompose dipthongs,
-    etc. into single characters and then filter out Unicode combining characters
-    such as ties. This ensures that (most) phonemes will exist in the TTS symbol
-    table.
-
-    Args:
-        ph_list (list[str]): list of phonemes from gruut
-
-    Returns:
-        clean_list (list[str]): decomposed/clean list of phonemes for TTS
-    """
-    cleaned_phonemes = []
-
-    for phoneme_text in ph_list:
-        phoneme_text = unicodedata.normalize("NFC", phoneme_text)
-        if phoneme_text in phonemes:
-            cleaned_phonemes.append(phoneme_text)
-            continue
-
-        # Decompose into codepoints (ã -> ["a", "\u0303"])
-        phoneme_text = unicodedata.normalize("NFD", phoneme_text)
-        for codepoint in phoneme_text.translate(GRUUT_TRANS_TABLE):
-            if unicodedata.combining(codepoint) > 0:
-                # Skip combining characters like ties
-                continue
-
-            cleaned_phonemes.append(codepoint)
-
-    return cleaned_phonemes
-
-
-def text2phone(text, language):
+def text2phone(text, language, use_espeak_phonemes=False):
     """Convert graphemes to phonemes.
     Parameters:
             text (str): text to phonemize
@@ -93,21 +54,32 @@ def text2phone(text, language):
 
     if gruut.is_language_supported(language):
         # Use gruut for phonemization
+        phonemizer_args={
+            "remove_stress": True,
+            "ipa_minor_breaks": False,  # don't replace commas/semi-colons with IPA |
+            "ipa_major_breaks": False,  # don't replace periods with IPA ‖
+        }
+
+        if use_espeak_phonemes:
+            # Use a lexicon/g2p model trained on eSpeak IPA instead of gruut IPA.
+            # This is intended for backwards compatibility with TTS<=v0.0.13
+            # pre-trained models.
+            phonemizer_args["model_prefix"] = "espeak"
+
         ph_list = gruut.text_to_phonemes(
             text,
             lang=language,
             return_format="word_phonemes",
-            phonemizer_args={
-                "remove_accents": True,  # remove accute/grave accents (Swedish)
-                "ipa_minor_breaks": False,  # don't replace commas/semi-colons with IPA |
-                "ipa_major_breaks": False,  # don't replace periods with IPA ‖
-            },
+            phonemizer_args=phonemizer_args,
         )
 
         # Join and re-split to break apart dipthongs, suprasegmentals, etc.
-        ph_words = ["|".join(clean_gruut_phonemes(word_phonemes)) for word_phonemes in ph_list]
+        ph_words = ["|".join(word_phonemes) for word_phonemes in ph_list]
         ph = "| ".join(ph_words)
 
+        # Fix a few phonemes
+        ph = ph.translate(GRUUT_TRANS_TABLE)
+
         print(" > Phonemes: {}".format(ph))
         return ph
 
@@ -132,7 +104,7 @@ def pad_with_eos_bos(phoneme_sequence, tp=None):
     return [_phonemes_to_id[_bos]] + list(phoneme_sequence) + [_phonemes_to_id[_eos]]
 
 
-def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False, tp=None, add_blank=False):
+def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False, tp=None, add_blank=False, use_espeak_phonemes=False):
     # pylint: disable=global-statement
     global _phonemes_to_id, _phonemes
     if tp:
@@ -141,7 +113,7 @@ def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False, tp=
 
     sequence = []
     clean_text = _clean_text(text, cleaner_names)
-    to_phonemes = text2phone(clean_text, language)
+    to_phonemes = text2phone(clean_text, language, use_espeak_phonemes=use_espeak_phonemes)
     if to_phonemes is None:
         print("!! After phoneme conversion the result is None. -- {} ".format(clean_text))
     # iterate by skipping empty strings - NOTE: might be useful to keep it to have a better intonation.
@@ -152,6 +124,7 @@ def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False, tp=
         sequence = pad_with_eos_bos(sequence, tp=tp)
     if add_blank:
         sequence = intersperse(sequence, len(_phonemes))  # add a blank token (new), whose id number is len(_phonemes)
+
     return sequence
 
 
diff --git a/requirements.txt b/requirements.txt
index cb304693..046139d0 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -23,4 +23,4 @@ coqpit
 mecab-python3==1.0.3
 unidic-lite==1.0.8
 # gruut+supported langs
-gruut[cs,de,es,fr,it,nl,pt,ru,sv]~=1.1.0
+gruut[cs,de,es,fr,it,nl,pt,ru,sv]~=1.2.0
diff --git a/tests/test_text_processing.py b/tests/test_text_processing.py
index f4938ca0..17ee755e 100644
--- a/tests/test_text_processing.py
+++ b/tests/test_text_processing.py
@@ -1,54 +1,15 @@
 """Tests for text to phoneme converstion"""
 import unittest
 
-import gruut
-from gruut_ipa import IPA, Phonemes
-
-from TTS.tts.utils.text import clean_gruut_phonemes, phoneme_to_sequence
-from TTS.tts.utils.text import phonemes as all_phonemes
-from TTS.tts.utils.text import sequence_to_phoneme
+from TTS.tts.utils.text import phoneme_to_sequence, sequence_to_phoneme, text2phone
 
 # -----------------------------------------------------------------------------
 
+LANG = "en-us"
+
 EXAMPLE_TEXT = "Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase, the grey matter in the parts of the brain responsible for emotional regulation and learning!"
 
-# Raw phonemes from run of gruut with example text (en-us).
-# This includes IPA ties, etc.
-EXAMPLE_PHONEMES = [
-    ["ɹ", "ˈi", "s", "ə", "n", "t"],
-    ["ɹ", "i", "s", "ˈɚ", "t͡ʃ"],
-    ["ˈæ", "t"],
-    ["h", "ˈɑ", "ɹ", "v", "ɚ", "d"],
-    ["h", "ˈæ", "z"],
-    ["ʃ", "ˈoʊ", "n"],
-    ["m", "ˈɛ", "d", "ɪ", "t", "ˌeɪ", "t", "ɪ", "ŋ"],
-    ["f", "ɚ"],
-    ["ˈæ", "z"],
-    ["l", "ˈɪ", "t", "ə", "l"],
-    ["ˈæ", "z"],
-    ["ˈeɪ", "t"],
-    ["w", "ˈi", "k", "s"],
-    ["k", "ə", "n"],
-    ["ˈæ", "k", "t͡ʃ", "ə", "l", "i"],
-    ["ɪ", "ŋ", "k", "ɹ", "ˈi", "s"],
-    [","],
-    ["ð", "ə"],
-    ["ɡ", "ɹ", "ˈeɪ"],
-    ["m", "ˈæ", "t", "ɚ"],
-    ["ˈɪ", "n"],
-    ["ð", "ə"],
-    ["p", "ˈɑ", "ɹ", "t", "s"],
-    ["ə", "v"],
-    ["ð", "ə"],
-    ["b", "ɹ", "ˈeɪ", "n"],
-    ["ɹ", "i", "s", "p", "ˈɑ", "n", "s", "ɪ", "b", "ə", "l"],
-    ["f", "ɚ"],
-    ["ɪ", "m", "ˈoʊ", "ʃ", "ə", "n", "ə", "l"],
-    ["ɹ", "ˌɛ", "ɡ", "j", "ə", "l", "ˈeɪ", "ʃ", "ə", "n"],
-    ["ˈæ", "n", "d"],
-    ["l", "ˈɚ", "n", "ɪ", "ŋ"],
-    ["!"],
-]
+EXPECTED_PHONEMES = "ɹ|iː|s|ə|n|t| ɹ|ᵻ|s|ɜː|tʃ| æ|t| h|ɑːɹ|v|ɚ|d| h|æ|z| ʃ|oʊ|n| m|ɛ|d|ᵻ|t|eɪ|ɾ|ɪ|ŋ| f|ɔːɹ| æ|z| l|ɪ|ɾ|əl| æ|z| eɪ|t| w|iː|k|s| k|æ|n| æ|k|tʃ|uː|əl|i| ɪ|ŋ|k|ɹ|iː|s| ,| ð|ə| ɡ|ɹ|eɪ| m|æ|ɾ|ɚ| ɪ|n| ð|ə| p|ɑːɹ|t|s| ʌ|v| ð|ə| b|ɹ|eɪ|n| ɹ|ᵻ|s|p|ɑː|n|s|ᵻ|b|əl| f|ɔːɹ| ɪ|m|oʊ|ʃ|ə|n|əl| ɹ|ɛ|ɡ|j|ʊ|l|eɪ|ʃ|ə|n| æ|n|d| l|ɜː|n|ɪ|ŋ| !"
 
 # -----------------------------------------------------------------------------
 
@@ -56,79 +17,84 @@ EXAMPLE_PHONEMES = [
 class TextProcessingTextCase(unittest.TestCase):
     """Tests for text to phoneme conversion"""
 
-    def test_all_phonemes_in_tts(self):
-        """Ensure that all phonemes from gruut are present in TTS phonemes"""
-        tts_phonemes = set(all_phonemes)
-
-        # Check stress characters
-        for suprasegmental in [IPA.STRESS_PRIMARY, IPA.STRESS_SECONDARY]:
-            self.assertIn(suprasegmental, tts_phonemes)
-
-        # Check that gruut's phonemes are a subset of TTS phonemes
-        for lang in gruut.get_supported_languages():
-            for phoneme in Phonemes.from_language(lang):
-                for codepoint in clean_gruut_phonemes(phoneme.text):
-
-                    self.assertIn(codepoint, tts_phonemes)
-
     def test_phoneme_to_sequence(self):
-        """Verify example (text -> sequence -> phoneme string) pipeline"""
-        lang = "en-us"
-        expected_phoneme_str = " ".join(
-            "".join(clean_gruut_phonemes(word_phonemes)) for word_phonemes in EXAMPLE_PHONEMES
-        )
-
-        # Ensure that TTS produces same phoneme string
-        text_cleaner = ["phoneme_cleaners"]
-        actual_sequence = phoneme_to_sequence(EXAMPLE_TEXT, text_cleaner, lang)
-        actual_phoneme_str = sequence_to_phoneme(actual_sequence)
-
-        self.assertEqual(actual_phoneme_str, expected_phoneme_str)
+        """Verify en-us sentence phonemes without blank token"""
+        self._test_phoneme_to_sequence(add_blank=False)
 
     def test_phoneme_to_sequence_with_blank_token(self):
-        """Verify example (text -> sequence -> phoneme string) pipeline with blank token"""
-        lang = "en-us"
+        """Verify en-us sentence phonemes with blank token"""
+        self._test_phoneme_to_sequence(add_blank=True)
+
+    def _test_phoneme_to_sequence(self, add_blank):
         text_cleaner = ["phoneme_cleaners"]
+        sequence = phoneme_to_sequence(EXAMPLE_TEXT, text_cleaner, LANG, add_blank=add_blank, use_espeak_phonemes=True)
+        text_hat = sequence_to_phoneme(sequence)
+        text_hat_with_params = sequence_to_phoneme(sequence)
+        gt = EXPECTED_PHONEMES.replace("|", "")
+        self.assertEqual(text_hat, text_hat_with_params)
+        self.assertEqual(text_hat, gt)
 
-        # Create with/without blank sequences
-        sequence_without_blank = phoneme_to_sequence(EXAMPLE_TEXT, text_cleaner, lang, add_blank=False)
-        sequence_with_blank = phoneme_to_sequence(EXAMPLE_TEXT, text_cleaner, lang, add_blank=True)
+        # multiple punctuations
+        text = "Be a voice, not an! echo?"
+        sequence = phoneme_to_sequence(text, text_cleaner, LANG, add_blank=add_blank, use_espeak_phonemes=True)
+        text_hat = sequence_to_phoneme(sequence)
+        text_hat_with_params = sequence_to_phoneme(sequence)
+        gt = "biː ɐ vɔɪs , nɑːt ɐn ! ɛkoʊ ?"
+        print(text_hat)
+        print(len(sequence))
+        self.assertEqual(text_hat, text_hat_with_params)
+        self.assertEqual(text_hat, gt)
 
-        # With blank sequence should be bigger
-        self.assertGreater(len(sequence_with_blank), len(sequence_without_blank))
+        # not ending with punctuation
+        text = "Be a voice, not an! echo"
+        sequence = phoneme_to_sequence(text, text_cleaner, LANG, add_blank=add_blank, use_espeak_phonemes=True)
+        text_hat = sequence_to_phoneme(sequence)
+        text_hat_with_params = sequence_to_phoneme(sequence)
+        gt = "biː ɐ vɔɪs , nɑːt ɐn ! ɛkoʊ"
+        print(text_hat)
+        print(len(sequence))
+        self.assertEqual(text_hat, text_hat_with_params)
+        self.assertEqual(text_hat, gt)
 
-        # But phoneme strings should still be identical
-        phoneme_str_without_blank = sequence_to_phoneme(sequence_without_blank, add_blank=False)
-        phoneme_str_with_blank = sequence_to_phoneme(sequence_with_blank, add_blank=True)
+        # original
+        text = "Be a voice, not an echo!"
+        sequence = phoneme_to_sequence(text, text_cleaner, LANG, add_blank=add_blank, use_espeak_phonemes=True)
+        text_hat = sequence_to_phoneme(sequence)
+        text_hat_with_params = sequence_to_phoneme(sequence)
+        gt = "biː ɐ vɔɪs , nɑːt ɐn ɛkoʊ !"
+        print(text_hat)
+        print(len(sequence))
+        self.assertEqual(text_hat, text_hat_with_params)
+        self.assertEqual(text_hat, gt)
 
-        self.assertEqual(phoneme_str_with_blank, phoneme_str_without_blank)
+        # extra space after the sentence
+        text = "Be a voice, not an! echo.  "
+        sequence = phoneme_to_sequence(text, text_cleaner, LANG, add_blank=add_blank, use_espeak_phonemes=True)
+        text_hat = sequence_to_phoneme(sequence)
+        text_hat_with_params = sequence_to_phoneme(sequence)
+        gt = "biː ɐ vɔɪs , nɑːt ɐn ! ɛkoʊ ."
+        print(text_hat)
+        print(len(sequence))
+        self.assertEqual(text_hat, text_hat_with_params)
+        self.assertEqual(text_hat, gt)
 
-    def test_messy_text(self):
-        """Verify text with extra punctuation/whitespace/etc. makes it through the pipeline"""
-        text = '"Be" a! voice, [NOT]? (an eCHo.   '
-        lang = "en-us"
-        expected_phonemes = [
-            ["b", "ˈi"],
-            ["ə"],
-            ["!"],
-            ["v", "ˈɔɪ", "s"],
-            [","],
-            ["n", "ˈɑ", "t"],
-            ["?"],
-            ["ə", "n"],
-            ["ˈɛ", "k", "oʊ"],
-            ["."],
-        ]
-        expected_phoneme_str = " ".join(
-            "".join(clean_gruut_phonemes(word_phonemes)) for word_phonemes in expected_phonemes
+        # extra space after the sentence, with BOS/EOS characters enabled
+        text = "Be a voice, not an! echo.  "
+        sequence = phoneme_to_sequence(
+            text, text_cleaner, LANG, enable_eos_bos=True, add_blank=add_blank, use_espeak_phonemes=True
         )
+        text_hat = sequence_to_phoneme(sequence)
+        text_hat_with_params = sequence_to_phoneme(sequence)
+        gt = "^biː ɐ vɔɪs , nɑːt ɐn ! ɛkoʊ .~"
+        print(text_hat)
+        print(len(sequence))
+        self.assertEqual(text_hat, text_hat_with_params)
+        self.assertEqual(text_hat, gt)
 
-        # Ensure that TTS produces same phoneme string
-        text_cleaner = ["phoneme_cleaners"]
-        actual_sequence = phoneme_to_sequence(text, text_cleaner, lang)
-        actual_phoneme_str = sequence_to_phoneme(actual_sequence)
-
-        self.assertEqual(actual_phoneme_str, expected_phoneme_str)
+    def test_text2phone(self):
+        text = "Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase, the grey matter in the parts of the brain responsible for emotional regulation and learning!"
+        ph = text2phone(EXAMPLE_TEXT, LANG)
+        self.assertEqual(ph, EXPECTED_PHONEMES)
 
 
 # -----------------------------------------------------------------------------

From 3f172b84d850b2f1e7716dfb4428843037669b3b Mon Sep 17 00:00:00 2001
From: Michael Hansen <mike@rhasspy.org>
Date: Wed, 16 Jun 2021 15:26:36 -0400
Subject: [PATCH 08/11] Fix linting issues

---
 TTS/bin/extract_tts_spectrograms.py | 1 +
 TTS/tts/utils/synthesis.py          | 2 +-
 TTS/tts/utils/text/__init__.py      | 6 ++++--
 tests/test_text_processing.py       | 3 ++-
 4 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/TTS/bin/extract_tts_spectrograms.py b/TTS/bin/extract_tts_spectrograms.py
index ace7464a..fb3a8321 100755
--- a/TTS/bin/extract_tts_spectrograms.py
+++ b/TTS/bin/extract_tts_spectrograms.py
@@ -299,4 +299,5 @@ if __name__ == "__main__":
     args = parser.parse_args()
 
     c = load_config(args.config_path)
+    c.audio["do_trim_silence"] = False  # IMPORTANT: disable silence trimming to align mel
     main(args)
diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py
index da50f1ca..0ddf7ebe 100644
--- a/TTS/tts/utils/synthesis.py
+++ b/TTS/tts/utils/synthesis.py
@@ -25,7 +25,7 @@ def text_to_seqvec(text, CONFIG):
                 CONFIG.enable_eos_bos_chars,
                 tp=CONFIG.characters,
                 add_blank=CONFIG.add_blank,
-                use_espeak_phonemes=CONFIG.use_espeak_phonemes
+                use_espeak_phonemes=CONFIG.use_espeak_phonemes,
             ),
             dtype=np.int32,
         )
diff --git a/TTS/tts/utils/text/__init__.py b/TTS/tts/utils/text/__init__.py
index 350e5934..787394b5 100644
--- a/TTS/tts/utils/text/__init__.py
+++ b/TTS/tts/utils/text/__init__.py
@@ -54,7 +54,7 @@ def text2phone(text, language, use_espeak_phonemes=False):
 
     if gruut.is_language_supported(language):
         # Use gruut for phonemization
-        phonemizer_args={
+        phonemizer_args = {
             "remove_stress": True,
             "ipa_minor_breaks": False,  # don't replace commas/semi-colons with IPA |
             "ipa_major_breaks": False,  # don't replace periods with IPA ‖
@@ -104,7 +104,9 @@ def pad_with_eos_bos(phoneme_sequence, tp=None):
     return [_phonemes_to_id[_bos]] + list(phoneme_sequence) + [_phonemes_to_id[_eos]]
 
 
-def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False, tp=None, add_blank=False, use_espeak_phonemes=False):
+def phoneme_to_sequence(
+    text, cleaner_names, language, enable_eos_bos=False, tp=None, add_blank=False, use_espeak_phonemes=False
+):
     # pylint: disable=global-statement
     global _phonemes_to_id, _phonemes
     if tp:
diff --git a/tests/test_text_processing.py b/tests/test_text_processing.py
index 17ee755e..4a1ba64f 100644
--- a/tests/test_text_processing.py
+++ b/tests/test_text_processing.py
@@ -26,6 +26,7 @@ class TextProcessingTextCase(unittest.TestCase):
         self._test_phoneme_to_sequence(add_blank=True)
 
     def _test_phoneme_to_sequence(self, add_blank):
+        """Verify en-us sentence phonemes"""
         text_cleaner = ["phoneme_cleaners"]
         sequence = phoneme_to_sequence(EXAMPLE_TEXT, text_cleaner, LANG, add_blank=add_blank, use_espeak_phonemes=True)
         text_hat = sequence_to_phoneme(sequence)
@@ -92,7 +93,7 @@ class TextProcessingTextCase(unittest.TestCase):
         self.assertEqual(text_hat, gt)
 
     def test_text2phone(self):
-        text = "Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase, the grey matter in the parts of the brain responsible for emotional regulation and learning!"
+        """Verify phones directly (with |)"""
         ph = text2phone(EXAMPLE_TEXT, LANG)
         self.assertEqual(ph, EXPECTED_PHONEMES)
 

From a41f53fe725241f3f53f3474631c9f8b11d59012 Mon Sep 17 00:00:00 2001
From: Michael Hansen <hansen.mike@gmail.com>
Date: Wed, 16 Jun 2021 18:10:51 -0400
Subject: [PATCH 09/11] Fix silly error in tests

---
 tests/test_text_processing.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_text_processing.py b/tests/test_text_processing.py
index 4a1ba64f..3c424a15 100644
--- a/tests/test_text_processing.py
+++ b/tests/test_text_processing.py
@@ -94,7 +94,7 @@ class TextProcessingTextCase(unittest.TestCase):
 
     def test_text2phone(self):
         """Verify phones directly (with |)"""
-        ph = text2phone(EXAMPLE_TEXT, LANG)
+        ph = text2phone(EXAMPLE_TEXT, LANG, use_espeak_phonemes=True)
         self.assertEqual(ph, EXPECTED_PHONEMES)
 
 

From 987cf1178b3dd1894cfd4c0b32a56ae4d2af1620 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?= <egolge@coqui.ai>
Date: Fri, 25 Jun 2021 14:44:33 +0200
Subject: [PATCH 10/11] Bump up to v0.0.16

---
 TTS/VERSION | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/TTS/VERSION b/TTS/VERSION
index 58c4b6e9..e3b86dd9 100644
--- a/TTS/VERSION
+++ b/TTS/VERSION
@@ -1 +1 @@
-0.0.15.1
+0.0.16

From 6c7bbcaef04a911c3f49c24eb50a55f691ea7194 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?= <egolge@coqui.ai>
Date: Fri, 25 Jun 2021 16:52:17 +0200
Subject: [PATCH 11/11] Use `en-us` for testing phoneme models

---
 tests/tts_tests/test_glow_tts_train.py             | 3 ++-
 tests/tts_tests/test_speedy_speech_train.py        | 2 +-
 tests/vocoder_tests/test_multiband_melgan_train.py | 1 +
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/tests/tts_tests/test_glow_tts_train.py b/tests/tts_tests/test_glow_tts_train.py
index 2e675d13..e44f6365 100644
--- a/tests/tts_tests/test_glow_tts_train.py
+++ b/tests/tts_tests/test_glow_tts_train.py
@@ -16,7 +16,8 @@ config = GlowTTSConfig(
     num_val_loader_workers=0,
     text_cleaner="english_cleaners",
     use_phonemes=True,
-    phoneme_language="zh-CN",
+    use_espeak_phonemes=True,
+    phoneme_language="en-us",
     phoneme_cache_path="tests/data/ljspeech/phoneme_cache/",
     run_eval=True,
     test_delay_epochs=-1,
diff --git a/tests/tts_tests/test_speedy_speech_train.py b/tests/tts_tests/test_speedy_speech_train.py
index 3f508117..9dcf0ad8 100644
--- a/tests/tts_tests/test_speedy_speech_train.py
+++ b/tests/tts_tests/test_speedy_speech_train.py
@@ -16,7 +16,7 @@ config = SpeedySpeechConfig(
     num_val_loader_workers=0,
     text_cleaner="english_cleaners",
     use_phonemes=True,
-    phoneme_language="zh-CN",
+    phoneme_language="en-us",
     phoneme_cache_path="tests/data/ljspeech/phoneme_cache/",
     run_eval=True,
     test_delay_epochs=-1,
diff --git a/tests/vocoder_tests/test_multiband_melgan_train.py b/tests/vocoder_tests/test_multiband_melgan_train.py
index 081fb40e..ef362414 100644
--- a/tests/vocoder_tests/test_multiband_melgan_train.py
+++ b/tests/vocoder_tests/test_multiband_melgan_train.py
@@ -20,6 +20,7 @@ config = MultibandMelganConfig(
     eval_split_size=1,
     print_step=1,
     print_eval=True,
+    discriminator_model_params={"base_channels": 16, "max_channels": 128, "downsample_factors": [4, 4, 4]},
     data_path="tests/data/ljspeech",
     output_path=output_path,
 )