Add tests for gruut phonemization

2021-06-09 11:52:10 -04:00 · 2021-06-09 11:52:10 -04:00 · 47191f3ecc
parent 67869e77f9
commit 47191f3ecc
3 changed files with 168 additions and 36 deletions
--- a/TTS/__init__.py
+++ b/TTS/__init__.py
@@ -1,6 +1,5 @@
 import os

-
 with open(os.path.join(os.path.dirname(__file__), "VERSION")) as f:
    version = f.read().strip()

--- a/TTS/tts/utils/text/__init__.py
+++ b/TTS/tts/utils/text/__init__.py
@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-

 import re
+import unicodedata

 import gruut
 from packaging import version
@@ -26,32 +27,34 @@ _CURLY_RE = re.compile(r"(.*?)\{(.+?)\}(.*)")
 # Regular expression matching punctuations, ignoring empty space
 PHONEME_PUNCTUATION_PATTERN = r"[" + _punctuations.replace(" ", "") + "]+"

-# language -> source phoneme -> dest phoneme
-# Used to make gruut's phonemes fit better with eSpeak's.
-GRUUT_PHONEME_MAP = {
-    "en-us": {
-        "i": "iː",
-        "ɑ": "ɑː",
-        "ɚ": "ɜːɹ",
-    },
-    "de": {
-        "ʁ": "ɾ",
-        "g": "ɡ",
-        "ʔ": "",
-    },
-    "nl": {
-        "a": "aː",
-        "e": "eː",
-        "ʏ": "ɵ",
-        "ʋ": "w",
-        "ɹ": "r",
-        "ɔː": "oː",
-    },
-    "es": {
-        "ɾ": "r",
-        "g": "ɣ",
-    },
-}
+# Table for str.translate to fix gruut/TTS phoneme mismatch
+GRUUT_TRANS_TABLE = str.maketrans("g", "ɡ")
+
+
+def clean_gruut_phonemes(ph_list):
+    """Decompose, substitute, and clean gruut phonemes for TTS.
+
+    Parameters:
+            ph_list (list[str]): list of phonemes from gruut
+
+    Returns:
+            clean_list (list[str]): decomposed/clean list of phonemes for TTS
+                    Diphthongs, etc. are decomposed into single characters
+                    Unicode combining characters are removed (e.g., ties)
+    """
+    cleaned_phonemes = []
+
+    for phoneme_text in ph_list:
+        # Decompose into codepoints (ã -> ["a", "\u0303"])
+        phoneme_text = unicodedata.normalize("NFD", phoneme_text)
+        for codepoint in phoneme_text.translate(GRUUT_TRANS_TABLE):
+            if unicodedata.combining(codepoint) > 0:
+                # Skip combining characters like ties
+                continue
+
+            cleaned_phonemes.append(codepoint)
+
+    return cleaned_phonemes


 def text2phone(text, language):
@@ -82,21 +85,14 @@ def text2phone(text, language):
            lang=language,
            return_format="word_phonemes",
            phonemizer_args={
-                "remove_stress": True,  # remove primary/secondary stress
+                "remove_accents": True,  # remove acute/grave accents (Swedish)
                "ipa_minor_breaks": False,  # don't replace commas/semi-colons with IPA |
                "ipa_major_breaks": False,  # don't replace periods with IPA ‖
            },
        )

-        ph_map = GRUUT_PHONEME_MAP.get(language)
-        if ph_map:
-            # Re-map phonemes to fit with eSpeak conventions
-            for word in ph_list:
-                for p_idx, p in enumerate(word):
-                    word[p_idx] = ph_map.get(p, p)
-
        # Join and re-split to break apart dipthongs, suprasegmentals, etc.
-        ph_words = ["|".join(word_phonemes) for word_phonemes in ph_list]
+        ph_words = ["|".join(clean_gruut_phonemes(word_phonemes)) for word_phonemes in ph_list]
        ph = "| ".join(ph_words)

        print(" > Phonemes: {}".format(ph))
--- a/tests/test_text_processing.py
+++ b/tests/test_text_processing.py
@@ -0,0 +1,137 @@
+"""Tests for text to phoneme conversion"""
+import unittest
+
+import gruut
+from gruut_ipa import IPA, Phonemes
+
+from TTS.tts.utils.text import clean_gruut_phonemes, phoneme_to_sequence
+from TTS.tts.utils.text import phonemes as all_phonemes
+from TTS.tts.utils.text import sequence_to_phoneme
+
+# -----------------------------------------------------------------------------
+
+EXAMPLE_TEXT = "Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase, the grey matter in the parts of the brain responsible for emotional regulation and learning!"
+
+# Raw phonemes from run of gruut with example text (en-us).
+# This includes IPA ties, etc.
+EXAMPLE_PHONEMES = [
+    ["ɹ", "ˈi", "s", "ə", "n", "t"],
+    ["ɹ", "i", "s", "ˈɚ", "t͡ʃ"],
+    ["ˈæ", "t"],
+    ["h", "ˈɑ", "ɹ", "v", "ɚ", "d"],
+    ["h", "ˈæ", "z"],
+    ["ʃ", "ˈoʊ", "n"],
+    ["m", "ˈɛ", "d", "ɪ", "t", "ˌeɪ", "t", "ɪ", "ŋ"],
+    ["f", "ɚ"],
+    ["ˈæ", "z"],
+    ["l", "ˈɪ", "t", "ə", "l"],
+    ["ˈæ", "z"],
+    ["ˈeɪ", "t"],
+    ["w", "ˈi", "k", "s"],
+    ["k", "ə", "n"],
+    ["ˈæ", "k", "t͡ʃ", "ə", "l", "i"],
+    ["ɪ", "ŋ", "k", "ɹ", "ˈi", "s"],
+    [","],
+    ["ð", "ə"],
+    ["ɡ", "ɹ", "ˈeɪ"],
+    ["m", "ˈæ", "t", "ɚ"],
+    ["ˈɪ", "n"],
+    ["ð", "ə"],
+    ["p", "ˈɑ", "ɹ", "t", "s"],
+    ["ə", "v"],
+    ["ð", "ə"],
+    ["b", "ɹ", "ˈeɪ", "n"],
+    ["ɹ", "i", "s", "p", "ˈɑ", "n", "s", "ɪ", "b", "ə", "l"],
+    ["f", "ɚ"],
+    ["ɪ", "m", "ˈoʊ", "ʃ", "ə", "n", "ə", "l"],
+    ["ɹ", "ˌɛ", "ɡ", "j", "ə", "l", "ˈeɪ", "ʃ", "ə", "n"],
+    ["ˈæ", "n", "d"],
+    ["l", "ˈɚ", "n", "ɪ", "ŋ"],
+    ["!"],
+]
+
+# -----------------------------------------------------------------------------
+
+
+class TextProcessingTextCase(unittest.TestCase):
+    """Tests for text to phoneme conversion"""
+
+    def test_all_phonemes_in_tts(self):
+        """Ensure that all phonemes from gruut are present in TTS phonemes"""
+        tts_phonemes = set(all_phonemes)
+
+        # Check stress characters
+        for suprasegmental in [IPA.STRESS_PRIMARY, IPA.STRESS_SECONDARY]:
+            self.assertIn(suprasegmental, tts_phonemes)
+
+        # Check that gruut's phonemes are a subset of TTS phonemes
+        for lang in gruut.get_supported_languages():
+            for phoneme in Phonemes.from_language(lang):
+                for codepoint in clean_gruut_phonemes(phoneme.text):
+
+                    self.assertIn(codepoint, tts_phonemes)
+
+    def test_phoneme_to_sequence(self):
+        """Verify example (text -> sequence -> phoneme string) pipeline"""
+        lang = "en-us"
+        expected_phoneme_str = " ".join(
+            "".join(clean_gruut_phonemes(word_phonemes)) for word_phonemes in EXAMPLE_PHONEMES
+        )
+
+        # Ensure that TTS produces same phoneme string
+        text_cleaner = ["phoneme_cleaners"]
+        actual_sequence = phoneme_to_sequence(EXAMPLE_TEXT, text_cleaner, lang)
+        actual_phoneme_str = sequence_to_phoneme(actual_sequence)
+
+        self.assertEqual(actual_phoneme_str, expected_phoneme_str)
+
+    def test_phoneme_to_sequence_with_blank_token(self):
+        """Verify example (text -> sequence -> phoneme string) pipeline with blank token"""
+        lang = "en-us"
+        text_cleaner = ["phoneme_cleaners"]
+
+        # Create with/without blank sequences
+        sequence_without_blank = phoneme_to_sequence(EXAMPLE_TEXT, text_cleaner, lang, add_blank=False)
+        sequence_with_blank = phoneme_to_sequence(EXAMPLE_TEXT, text_cleaner, lang, add_blank=True)
+
+        # With blank sequence should be bigger
+        self.assertGreater(len(sequence_with_blank), len(sequence_without_blank))
+
+        # But phoneme strings should still be identical
+        phoneme_str_without_blank = sequence_to_phoneme(sequence_without_blank, add_blank=False)
+        phoneme_str_with_blank = sequence_to_phoneme(sequence_with_blank, add_blank=True)
+
+        self.assertEqual(phoneme_str_with_blank, phoneme_str_without_blank)
+
+    def test_messy_text(self):
+        """Verify text with extra punctuation/whitespace/etc. makes it through the pipeline"""
+        text = '"Be" a! voice, [NOT]? (an eCHo.   '
+        lang = "en-us"
+        expected_phonemes = [
+            ["b", "ˈi"],
+            ["ə"],
+            ["!"],
+            ["v", "ˈɔɪ", "s"],
+            [","],
+            ["n", "ˈɑ", "t"],
+            ["?"],
+            ["ə", "n"],
+            ["ˈɛ", "k", "oʊ"],
+            ["."],
+        ]
+        expected_phoneme_str = " ".join(
+            "".join(clean_gruut_phonemes(word_phonemes)) for word_phonemes in expected_phonemes
+        )
+
+        # Ensure that TTS produces same phoneme string
+        text_cleaner = ["phoneme_cleaners"]
+        actual_sequence = phoneme_to_sequence(text, text_cleaner, lang)
+        actual_phoneme_str = sequence_to_phoneme(actual_sequence)
+
+        self.assertEqual(actual_phoneme_str, expected_phoneme_str)
+
+
+# -----------------------------------------------------------------------------
+
+if __name__ == "__main__":
+    unittest.main()