Use eSpeak IPA lexicons by default for phoneme models

Michael Hansen 2021-06-15 15:57:08 -04:00 committed by Eren Gölge
parent 618b509204
commit 4d8426fa0a
5 changed files with 97 additions and 154 deletions

View File

@@ -97,6 +97,8 @@ class BaseTTSConfig(BaseTrainingConfig):
             Audio processor config object instance.
         use_phonemes (bool):
             enable / disable phoneme use.
+        use_espeak_phonemes (bool):
+            enable / disable eSpeak-compatible phonemes (only if use_phonemes = `True`).
         compute_input_seq_cache (bool):
             enable / disable precomputation of the phoneme sequences. At the expense of some delay at the beginning of
             the training, It allows faster data loader time and precise limitation with `max_seq_len` and
@@ -136,6 +138,7 @@
     audio: BaseAudioConfig = field(default_factory=BaseAudioConfig)
     # phoneme settings
     use_phonemes: bool = False
+    use_espeak_phonemes: bool = True
     phoneme_language: str = None
     compute_input_seq_cache: bool = False
     text_cleaner: str = MISSING
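For reference, the new flag defaults to `True` and only has an effect when phonemes are enabled. A minimal sketch of a config that opts in (the import path and the surrounding values are assumptions, not part of this diff):

```python
from TTS.tts.configs.shared_configs import BaseTTSConfig  # import path assumed

# Illustrative values only; field names match the dataclass above.
config = BaseTTSConfig(
    use_phonemes=True,          # phonemize input text
    use_espeak_phonemes=True,   # default: eSpeak-compatible IPA lexicons in gruut
    phoneme_language="en-us",
    text_cleaner="phoneme_cleaners",
)
```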

View File

@@ -25,6 +25,7 @@ def text_to_seqvec(text, CONFIG):
             CONFIG.enable_eos_bos_chars,
             tp=CONFIG.characters,
             add_blank=CONFIG.add_blank,
+            use_espeak_phonemes=CONFIG.use_espeak_phonemes
         ),
         dtype=np.int32,
     )
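Callers of the synthesis helpers do not change; the flag is read off the config and forwarded into the phoneme pipeline. A hedged usage sketch (the module path of `text_to_seqvec` is an assumption):

```python
from TTS.tts.utils.synthesis import text_to_seqvec  # module path assumed

# `config` is a phoneme-enabled TTS config such as the one sketched earlier;
# text_to_seqvec returns an np.int32 array of phoneme IDs.
seq = text_to_seqvec("Be a voice, not an echo.", config)
```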

View File

@@ -31,46 +31,7 @@ PHONEME_PUNCTUATION_PATTERN = r"[" + _punctuations.replace(" ", "") + "]+"
 GRUUT_TRANS_TABLE = str.maketrans("g", "ɡ")
-def clean_gruut_phonemes(ph_list):
-    """Decompose, substitute, and clean gruut phonemes for TTS.
-    gruut phonemes may contain any IPA characters (e.g., "ẽ" for the nasalized
-    "e"), and may be composed of multiple characters (e.g., "aɪ" in the English
-    "r[i]ce").
-    TTS phonemes come from a fixed set of symbols, and do not include every
-    possible variation of every vowel/consonant. Here, we decompose diphthongs,
-    etc. into single characters and then filter out Unicode combining characters
-    such as ties. This ensures that (most) phonemes will exist in the TTS symbol
-    table.
-    Args:
-        ph_list (list[str]): list of phonemes from gruut
-    Returns:
-        clean_list (list[str]): decomposed/clean list of phonemes for TTS
-    """
-    cleaned_phonemes = []
-    for phoneme_text in ph_list:
-        phoneme_text = unicodedata.normalize("NFC", phoneme_text)
-        if phoneme_text in phonemes:
-            cleaned_phonemes.append(phoneme_text)
-            continue
-        # Decompose into codepoints (ã -> ["a", "\u0303"])
-        phoneme_text = unicodedata.normalize("NFD", phoneme_text)
-        for codepoint in phoneme_text.translate(GRUUT_TRANS_TABLE):
-            if unicodedata.combining(codepoint) > 0:
-                # Skip combining characters like ties
-                continue
-            cleaned_phonemes.append(codepoint)
-    return cleaned_phonemes
-def text2phone(text, language):
+def text2phone(text, language, use_espeak_phonemes=False):
     """Convert graphemes to phonemes.
     Parameters:
         text (str): text to phonemize
@@ -93,21 +54,32 @@ def text2phone(text, language):
     if gruut.is_language_supported(language):
         # Use gruut for phonemization
+        phonemizer_args = {
+            "remove_stress": True,
+            "ipa_minor_breaks": False,  # don't replace commas/semi-colons with IPA |
+            "ipa_major_breaks": False,  # don't replace periods with IPA ‖
+        }
+        if use_espeak_phonemes:
+            # Use a lexicon/g2p model trained on eSpeak IPA instead of gruut IPA.
+            # This is intended for backwards compatibility with TTS<=v0.0.13
+            # pre-trained models.
+            phonemizer_args["model_prefix"] = "espeak"
         ph_list = gruut.text_to_phonemes(
             text,
             lang=language,
             return_format="word_phonemes",
-            phonemizer_args={
-                "remove_accents": True,  # remove acute/grave accents (Swedish)
-                "ipa_minor_breaks": False,  # don't replace commas/semi-colons with IPA |
-                "ipa_major_breaks": False,  # don't replace periods with IPA ‖
-            },
+            phonemizer_args=phonemizer_args,
         )
         # Join and re-split to break apart diphthongs, suprasegmentals, etc.
-        ph_words = ["|".join(clean_gruut_phonemes(word_phonemes)) for word_phonemes in ph_list]
+        ph_words = ["|".join(word_phonemes) for word_phonemes in ph_list]
         ph = "| ".join(ph_words)
+        # Fix a few phonemes
+        ph = ph.translate(GRUUT_TRANS_TABLE)
         print(" > Phonemes: {}".format(ph))
         return ph
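A sketch of direct use of the updated function (the import is the same one the updated tests use; the sample sentence is taken from those tests):

```python
from TTS.tts.utils.text import text2phone

# With use_espeak_phonemes=True, gruut loads its eSpeak-trained lexicon/g2p model
# ("model_prefix": "espeak"), keeping the IPA compatible with TTS<=v0.0.13 models.
# Phonemes come back "|"-separated within words, with "| " between words.
ph = text2phone("Be a voice, not an echo!", "en-us", use_espeak_phonemes=True)
print(ph)
```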
@@ -132,7 +104,7 @@ def pad_with_eos_bos(phoneme_sequence, tp=None):
     return [_phonemes_to_id[_bos]] + list(phoneme_sequence) + [_phonemes_to_id[_eos]]
-def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False, tp=None, add_blank=False):
+def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False, tp=None, add_blank=False, use_espeak_phonemes=False):
     # pylint: disable=global-statement
     global _phonemes_to_id, _phonemes
     if tp:
@@ -141,7 +113,7 @@ def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False, tp=
     sequence = []
     clean_text = _clean_text(text, cleaner_names)
-    to_phonemes = text2phone(clean_text, language)
+    to_phonemes = text2phone(clean_text, language, use_espeak_phonemes=use_espeak_phonemes)
     if to_phonemes is None:
         print("!! After phoneme conversion the result is None. -- {} ".format(clean_text))
     # iterate by skipping empty strings - NOTE: might be useful to keep it to have a better intonation.
@@ -152,6 +124,7 @@ def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False, tp=
         sequence = pad_with_eos_bos(sequence, tp=tp)
     if add_blank:
         sequence = intersperse(sequence, len(_phonemes))  # add a blank token (new), whose id number is len(_phonemes)
     return sequence
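A round-trip sketch mirroring the updated tests below; the expected string is the ground truth used there:

```python
from TTS.tts.utils.text import phoneme_to_sequence, sequence_to_phoneme

sequence = phoneme_to_sequence(
    "Be a voice, not an echo!",
    ["phoneme_cleaners"],
    "en-us",
    add_blank=False,
    use_espeak_phonemes=True,
)
# Mapping the IDs back should yield the eSpeak-style IPA string:
# "biː ɐ vɔɪs , nɑːt ɐn ɛkoʊ !"
print(sequence_to_phoneme(sequence))
```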

View File

@@ -23,4 +23,4 @@ coqpit
 mecab-python3==1.0.3
 unidic-lite==1.0.8
 # gruut+supported langs
-gruut[cs,de,es,fr,it,nl,pt,ru,sv]~=1.1.0
+gruut[cs,de,es,fr,it,nl,pt,ru,sv]~=1.2.0

View File

@@ -1,54 +1,15 @@
 """Tests for text to phoneme conversion"""
 import unittest
-import gruut
-from gruut_ipa import IPA, Phonemes
-from TTS.tts.utils.text import clean_gruut_phonemes, phoneme_to_sequence
-from TTS.tts.utils.text import phonemes as all_phonemes
-from TTS.tts.utils.text import sequence_to_phoneme
+from TTS.tts.utils.text import phoneme_to_sequence, sequence_to_phoneme, text2phone
 # -----------------------------------------------------------------------------
+LANG = "en-us"
 EXAMPLE_TEXT = "Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase, the grey matter in the parts of the brain responsible for emotional regulation and learning!"
-# Raw phonemes from run of gruut with example text (en-us).
-# This includes IPA ties, etc.
-EXAMPLE_PHONEMES = [
-    ["ɹ", "ˈi", "s", "ə", "n", "t"],
-    ["ɹ", "i", "s", "ˈɚ", "t͡ʃ"],
-    ["ˈæ", "t"],
-    ["h", "ˈɑ", "ɹ", "v", "ɚ", "d"],
-    ["h", "ˈæ", "z"],
-    ["ʃ", "ˈoʊ", "n"],
-    ["m", "ˈɛ", "d", "ɪ", "t", "ˌeɪ", "t", "ɪ", "ŋ"],
-    ["f", "ɚ"],
-    ["ˈæ", "z"],
-    ["l", "ˈɪ", "t", "ə", "l"],
-    ["ˈæ", "z"],
-    ["ˈeɪ", "t"],
-    ["w", "ˈi", "k", "s"],
-    ["k", "ə", "n"],
-    ["ˈæ", "k", "t͡ʃ", "ə", "l", "i"],
-    ["ɪ", "ŋ", "k", "ɹ", "ˈi", "s"],
-    [","],
-    ["ð", "ə"],
-    ["ɡ", "ɹ", "ˈeɪ"],
-    ["m", "ˈæ", "t", "ɚ"],
-    ["ˈɪ", "n"],
-    ["ð", "ə"],
-    ["p", "ˈɑ", "ɹ", "t", "s"],
-    ["ə", "v"],
-    ["ð", "ə"],
-    ["b", "ɹ", "ˈeɪ", "n"],
-    ["ɹ", "i", "s", "p", "ˈɑ", "n", "s", "ɪ", "b", "ə", "l"],
-    ["f", "ɚ"],
-    ["ɪ", "m", "ˈoʊ", "ʃ", "ə", "n", "ə", "l"],
-    ["ɹ", "ˌɛ", "ɡ", "j", "ə", "l", "ˈeɪ", "ʃ", "ə", "n"],
-    ["ˈæ", "n", "d"],
-    ["l", "ˈɚ", "n", "ɪ", "ŋ"],
-    ["!"],
-]
+EXPECTED_PHONEMES = "ɹ|iː|s|ə|n|t| ɹ|ᵻ|s|ɜː|tʃ| æ|t| h|ɑːɹ|v|ɚ|d| h|æ|z| ʃ|oʊ|n| m|ɛ|d|ᵻ|t|eɪ|ɾ|ɪ|ŋ| f|ɔːɹ| æ|z| l|ɪ|ɾ|əl| æ|z| eɪ|t| w|iː|k|s| k|æ|n| æ|k|tʃ|uː|əl|i| ɪ|ŋ|k|ɹ|iː|s| ,| ð|ə| ɡ|ɹ|eɪ| m|æ|ɾ|ɚ| ɪ|n| ð|ə| p|ɑːɹ|t|s| ʌ|v| ð|ə| b|ɹ|eɪ|n| ɹ|ᵻ|s|p|ɑː|n|s|ᵻ|b|əl| f|ɔːɹ| ɪ|m|oʊ|ʃ|ə|n|əl| ɹ|ɛ|ɡ|j|ʊ|l|eɪ|ʃ|ə|n| æ|n|d| l|ɜː|n|ɪ|ŋ| !"
 # -----------------------------------------------------------------------------
@@ -56,79 +17,84 @@ EXAMPLE_PHONEMES = [
 class TextProcessingTextCase(unittest.TestCase):
     """Tests for text to phoneme conversion"""
-    def test_all_phonemes_in_tts(self):
-        """Ensure that all phonemes from gruut are present in TTS phonemes"""
-        tts_phonemes = set(all_phonemes)
-        # Check stress characters
-        for suprasegmental in [IPA.STRESS_PRIMARY, IPA.STRESS_SECONDARY]:
-            self.assertIn(suprasegmental, tts_phonemes)
-        # Check that gruut's phonemes are a subset of TTS phonemes
-        for lang in gruut.get_supported_languages():
-            for phoneme in Phonemes.from_language(lang):
-                for codepoint in clean_gruut_phonemes(phoneme.text):
-                    self.assertIn(codepoint, tts_phonemes)
     def test_phoneme_to_sequence(self):
-        """Verify example (text -> sequence -> phoneme string) pipeline"""
-        lang = "en-us"
-        expected_phoneme_str = " ".join(
-            "".join(clean_gruut_phonemes(word_phonemes)) for word_phonemes in EXAMPLE_PHONEMES
-        )
-        # Ensure that TTS produces same phoneme string
-        text_cleaner = ["phoneme_cleaners"]
-        actual_sequence = phoneme_to_sequence(EXAMPLE_TEXT, text_cleaner, lang)
-        actual_phoneme_str = sequence_to_phoneme(actual_sequence)
-        self.assertEqual(actual_phoneme_str, expected_phoneme_str)
+        """Verify en-us sentence phonemes without blank token"""
+        self._test_phoneme_to_sequence(add_blank=False)
     def test_phoneme_to_sequence_with_blank_token(self):
-        """Verify example (text -> sequence -> phoneme string) pipeline with blank token"""
-        lang = "en-us"
+        """Verify en-us sentence phonemes with blank token"""
+        self._test_phoneme_to_sequence(add_blank=True)
+    def _test_phoneme_to_sequence(self, add_blank):
         text_cleaner = ["phoneme_cleaners"]
+        sequence = phoneme_to_sequence(EXAMPLE_TEXT, text_cleaner, LANG, add_blank=add_blank, use_espeak_phonemes=True)
+        text_hat = sequence_to_phoneme(sequence)
+        text_hat_with_params = sequence_to_phoneme(sequence)
+        gt = EXPECTED_PHONEMES.replace("|", "")
+        self.assertEqual(text_hat, text_hat_with_params)
+        self.assertEqual(text_hat, gt)
-        # Create with/without blank sequences
-        sequence_without_blank = phoneme_to_sequence(EXAMPLE_TEXT, text_cleaner, lang, add_blank=False)
-        sequence_with_blank = phoneme_to_sequence(EXAMPLE_TEXT, text_cleaner, lang, add_blank=True)
-        # With blank sequence should be bigger
-        self.assertGreater(len(sequence_with_blank), len(sequence_without_blank))
-        # But phoneme strings should still be identical
-        phoneme_str_without_blank = sequence_to_phoneme(sequence_without_blank, add_blank=False)
-        phoneme_str_with_blank = sequence_to_phoneme(sequence_with_blank, add_blank=True)
-        self.assertEqual(phoneme_str_with_blank, phoneme_str_without_blank)
-    def test_messy_text(self):
-        """Verify text with extra punctuation/whitespace/etc. makes it through the pipeline"""
-        text = '"Be" a! voice, [NOT]? (an eCHo. '
-        lang = "en-us"
-        expected_phonemes = [
-            ["b", "ˈi"],
-            ["ə"],
-            ["!"],
-            ["v", "ˈɔɪ", "s"],
-            [","],
-            ["n", "ˈɑ", "t"],
-            ["?"],
-            ["ə", "n"],
-            ["ˈɛ", "k", "oʊ"],
-            ["."],
-        ]
-        expected_phoneme_str = " ".join(
-            "".join(clean_gruut_phonemes(word_phonemes)) for word_phonemes in expected_phonemes
-        )
-        # Ensure that TTS produces same phoneme string
-        text_cleaner = ["phoneme_cleaners"]
-        actual_sequence = phoneme_to_sequence(text, text_cleaner, lang)
-        actual_phoneme_str = sequence_to_phoneme(actual_sequence)
-        self.assertEqual(actual_phoneme_str, expected_phoneme_str)
+        # multiple punctuations
+        text = "Be a voice, not an! echo?"
+        sequence = phoneme_to_sequence(text, text_cleaner, LANG, add_blank=add_blank, use_espeak_phonemes=True)
+        text_hat = sequence_to_phoneme(sequence)
+        text_hat_with_params = sequence_to_phoneme(sequence)
+        gt = "biː ɐ vɔɪs , nɑːt ɐn ! ɛkoʊ ?"
+        print(text_hat)
+        print(len(sequence))
+        self.assertEqual(text_hat, text_hat_with_params)
+        self.assertEqual(text_hat, gt)
+        # not ending with punctuation
+        text = "Be a voice, not an! echo"
+        sequence = phoneme_to_sequence(text, text_cleaner, LANG, add_blank=add_blank, use_espeak_phonemes=True)
+        text_hat = sequence_to_phoneme(sequence)
+        text_hat_with_params = sequence_to_phoneme(sequence)
+        gt = "biː ɐ vɔɪs , nɑːt ɐn ! ɛkoʊ"
+        print(text_hat)
+        print(len(sequence))
+        self.assertEqual(text_hat, text_hat_with_params)
+        self.assertEqual(text_hat, gt)
+        # original
+        text = "Be a voice, not an echo!"
+        sequence = phoneme_to_sequence(text, text_cleaner, LANG, add_blank=add_blank, use_espeak_phonemes=True)
+        text_hat = sequence_to_phoneme(sequence)
+        text_hat_with_params = sequence_to_phoneme(sequence)
+        gt = "biː ɐ vɔɪs , nɑːt ɐn ɛkoʊ !"
+        print(text_hat)
+        print(len(sequence))
+        self.assertEqual(text_hat, text_hat_with_params)
+        self.assertEqual(text_hat, gt)
+        # extra space after the sentence
+        text = "Be a voice, not an! echo. "
+        sequence = phoneme_to_sequence(text, text_cleaner, LANG, add_blank=add_blank, use_espeak_phonemes=True)
+        text_hat = sequence_to_phoneme(sequence)
+        text_hat_with_params = sequence_to_phoneme(sequence)
+        gt = "biː ɐ vɔɪs , nɑːt ɐn ! ɛkoʊ ."
+        print(text_hat)
+        print(len(sequence))
+        self.assertEqual(text_hat, text_hat_with_params)
+        self.assertEqual(text_hat, gt)
+        # extra space after the sentence
+        text = "Be a voice, not an! echo. "
+        sequence = phoneme_to_sequence(
+            text, text_cleaner, LANG, enable_eos_bos=True, add_blank=add_blank, use_espeak_phonemes=True
+        )
+        text_hat = sequence_to_phoneme(sequence)
+        text_hat_with_params = sequence_to_phoneme(sequence)
+        gt = "^biː ɐ vɔɪs , nɑːt ɐn ! ɛkoʊ .~"
+        print(text_hat)
+        print(len(sequence))
+        self.assertEqual(text_hat, text_hat_with_params)
+        self.assertEqual(text_hat, gt)
+    def test_text2phone(self):
+        text = "Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase, the grey matter in the parts of the brain responsible for emotional regulation and learning!"
+        ph = text2phone(EXAMPLE_TEXT, LANG)
+        self.assertEqual(ph, EXPECTED_PHONEMES)
 # -----------------------------------------------------------------------------