From 4d8426fa0a26a5724f562941dfdfe2da1f0e7ee9 Mon Sep 17 00:00:00 2001 From: Michael Hansen Date: Tue, 15 Jun 2021 15:57:08 -0400 Subject: [PATCH] Use eSpeak IPA lexicons by default for phoneme models --- TTS/tts/configs/shared_configs.py | 3 + TTS/tts/utils/synthesis.py | 1 + TTS/tts/utils/text/__init__.py | 69 ++++-------- requirements.txt | 2 +- tests/test_text_processing.py | 176 ++++++++++++------------------ 5 files changed, 97 insertions(+), 154 deletions(-) diff --git a/TTS/tts/configs/shared_configs.py b/TTS/tts/configs/shared_configs.py index 4690e76f..a501a880 100644 --- a/TTS/tts/configs/shared_configs.py +++ b/TTS/tts/configs/shared_configs.py @@ -97,6 +97,8 @@ class BaseTTSConfig(BaseTrainingConfig): Audio processor config object instance. use_phonemes (bool): enable / disable phoneme use. + use_espeak_phonemes (bool): + enable / disable eSpeak-compatible phonemes (only if use_phonemes = `True`). compute_input_seq_cache (bool): enable / disable precomputation of the phoneme sequences. 
At the expense of some delay at the beginning of the training, It allows faster data loader time and precise limitation with `max_seq_len` and @@ -136,6 +138,7 @@ class BaseTTSConfig(BaseTrainingConfig): audio: BaseAudioConfig = field(default_factory=BaseAudioConfig) # phoneme settings use_phonemes: bool = False + use_espeak_phonemes: bool = True phoneme_language: str = None compute_input_seq_cache: bool = False text_cleaner: str = MISSING diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py index 9f417a1d..da50f1ca 100644 --- a/TTS/tts/utils/synthesis.py +++ b/TTS/tts/utils/synthesis.py @@ -25,6 +25,7 @@ def text_to_seqvec(text, CONFIG): CONFIG.enable_eos_bos_chars, tp=CONFIG.characters, add_blank=CONFIG.add_blank, + use_espeak_phonemes=CONFIG.use_espeak_phonemes ), dtype=np.int32, ) diff --git a/TTS/tts/utils/text/__init__.py b/TTS/tts/utils/text/__init__.py index 73bd829c..350e5934 100644 --- a/TTS/tts/utils/text/__init__.py +++ b/TTS/tts/utils/text/__init__.py @@ -31,46 +31,7 @@ PHONEME_PUNCTUATION_PATTERN = r"[" + _punctuations.replace(" ", "") + "]+" GRUUT_TRANS_TABLE = str.maketrans("g", "ɡ") -def clean_gruut_phonemes(ph_list): - """Decompose, substitute, and clean gruut phonemes for TTS. - - gruut phonemes may contain any IPA characters (e.g., "ẽ" for the nasalized - "e"), and may be composed of multiple characters (e.g., "aɪ" in the English - "r[i]ce"). - - TTS phonemes come from a fixed set of symbols, and do not include every - possible variation of every vowel/consonant. Here, we decompose dipthongs, - etc. into single characters and then filter out Unicode combining characters - such as ties. This ensures that (most) phonemes will exist in the TTS symbol - table. 
- - Args: - ph_list (list[str]): list of phonemes from gruut - - Returns: - clean_list (list[str]): decomposed/clean list of phonemes for TTS - """ - cleaned_phonemes = [] - - for phoneme_text in ph_list: - phoneme_text = unicodedata.normalize("NFC", phoneme_text) - if phoneme_text in phonemes: - cleaned_phonemes.append(phoneme_text) - continue - - # Decompose into codepoints (ã -> ["a", "\u0303"]) - phoneme_text = unicodedata.normalize("NFD", phoneme_text) - for codepoint in phoneme_text.translate(GRUUT_TRANS_TABLE): - if unicodedata.combining(codepoint) > 0: - # Skip combining characters like ties - continue - - cleaned_phonemes.append(codepoint) - - return cleaned_phonemes - - -def text2phone(text, language): +def text2phone(text, language, use_espeak_phonemes=False): """Convert graphemes to phonemes. Parameters: text (str): text to phonemize @@ -93,21 +54,32 @@ def text2phone(text, language): if gruut.is_language_supported(language): # Use gruut for phonemization + phonemizer_args={ + "remove_stress": True, + "ipa_minor_breaks": False, # don't replace commas/semi-colons with IPA | + "ipa_major_breaks": False, # don't replace periods with IPA ‖ + } + + if use_espeak_phonemes: + # Use a lexicon/g2p model trained on eSpeak IPA instead of gruut IPA. + # This is intended for backwards compatibility with TTS<=v0.0.13 + # pre-trained models. + phonemizer_args["model_prefix"] = "espeak" + ph_list = gruut.text_to_phonemes( text, lang=language, return_format="word_phonemes", - phonemizer_args={ - "remove_accents": True, # remove accute/grave accents (Swedish) - "ipa_minor_breaks": False, # don't replace commas/semi-colons with IPA | - "ipa_major_breaks": False, # don't replace periods with IPA ‖ - }, + phonemizer_args=phonemizer_args, ) # Join and re-split to break apart dipthongs, suprasegmentals, etc. 
- ph_words = ["|".join(clean_gruut_phonemes(word_phonemes)) for word_phonemes in ph_list] + ph_words = ["|".join(word_phonemes) for word_phonemes in ph_list] ph = "| ".join(ph_words) + # Fix a few phonemes + ph = ph.translate(GRUUT_TRANS_TABLE) + print(" > Phonemes: {}".format(ph)) return ph @@ -132,7 +104,7 @@ def pad_with_eos_bos(phoneme_sequence, tp=None): return [_phonemes_to_id[_bos]] + list(phoneme_sequence) + [_phonemes_to_id[_eos]] -def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False, tp=None, add_blank=False): +def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False, tp=None, add_blank=False, use_espeak_phonemes=False): # pylint: disable=global-statement global _phonemes_to_id, _phonemes if tp: @@ -141,7 +113,7 @@ def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False, tp= sequence = [] clean_text = _clean_text(text, cleaner_names) - to_phonemes = text2phone(clean_text, language) + to_phonemes = text2phone(clean_text, language, use_espeak_phonemes=use_espeak_phonemes) if to_phonemes is None: print("!! After phoneme conversion the result is None. -- {} ".format(clean_text)) # iterate by skipping empty strings - NOTE: might be useful to keep it to have a better intonation. 
@@ -152,6 +124,7 @@ def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False, tp= sequence = pad_with_eos_bos(sequence, tp=tp) if add_blank: sequence = intersperse(sequence, len(_phonemes)) # add a blank token (new), whose id number is len(_phonemes) + return sequence diff --git a/requirements.txt b/requirements.txt index cb304693..046139d0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -23,4 +23,4 @@ coqpit mecab-python3==1.0.3 unidic-lite==1.0.8 # gruut+supported langs -gruut[cs,de,es,fr,it,nl,pt,ru,sv]~=1.1.0 +gruut[cs,de,es,fr,it,nl,pt,ru,sv]~=1.2.0 diff --git a/tests/test_text_processing.py b/tests/test_text_processing.py index f4938ca0..17ee755e 100644 --- a/tests/test_text_processing.py +++ b/tests/test_text_processing.py @@ -1,54 +1,15 @@ """Tests for text to phoneme converstion""" import unittest -import gruut -from gruut_ipa import IPA, Phonemes - -from TTS.tts.utils.text import clean_gruut_phonemes, phoneme_to_sequence -from TTS.tts.utils.text import phonemes as all_phonemes -from TTS.tts.utils.text import sequence_to_phoneme +from TTS.tts.utils.text import phoneme_to_sequence, sequence_to_phoneme, text2phone # ----------------------------------------------------------------------------- +LANG = "en-us" + EXAMPLE_TEXT = "Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase, the grey matter in the parts of the brain responsible for emotional regulation and learning!" -# Raw phonemes from run of gruut with example text (en-us). -# This includes IPA ties, etc. 
-EXAMPLE_PHONEMES = [ - ["ɹ", "ˈi", "s", "ə", "n", "t"], - ["ɹ", "i", "s", "ˈɚ", "t͡ʃ"], - ["ˈæ", "t"], - ["h", "ˈɑ", "ɹ", "v", "ɚ", "d"], - ["h", "ˈæ", "z"], - ["ʃ", "ˈoʊ", "n"], - ["m", "ˈɛ", "d", "ɪ", "t", "ˌeɪ", "t", "ɪ", "ŋ"], - ["f", "ɚ"], - ["ˈæ", "z"], - ["l", "ˈɪ", "t", "ə", "l"], - ["ˈæ", "z"], - ["ˈeɪ", "t"], - ["w", "ˈi", "k", "s"], - ["k", "ə", "n"], - ["ˈæ", "k", "t͡ʃ", "ə", "l", "i"], - ["ɪ", "ŋ", "k", "ɹ", "ˈi", "s"], - [","], - ["ð", "ə"], - ["ɡ", "ɹ", "ˈeɪ"], - ["m", "ˈæ", "t", "ɚ"], - ["ˈɪ", "n"], - ["ð", "ə"], - ["p", "ˈɑ", "ɹ", "t", "s"], - ["ə", "v"], - ["ð", "ə"], - ["b", "ɹ", "ˈeɪ", "n"], - ["ɹ", "i", "s", "p", "ˈɑ", "n", "s", "ɪ", "b", "ə", "l"], - ["f", "ɚ"], - ["ɪ", "m", "ˈoʊ", "ʃ", "ə", "n", "ə", "l"], - ["ɹ", "ˌɛ", "ɡ", "j", "ə", "l", "ˈeɪ", "ʃ", "ə", "n"], - ["ˈæ", "n", "d"], - ["l", "ˈɚ", "n", "ɪ", "ŋ"], - ["!"], -] +EXPECTED_PHONEMES = "ɹ|iː|s|ə|n|t| ɹ|ᵻ|s|ɜː|tʃ| æ|t| h|ɑːɹ|v|ɚ|d| h|æ|z| ʃ|oʊ|n| m|ɛ|d|ᵻ|t|eɪ|ɾ|ɪ|ŋ| f|ɔːɹ| æ|z| l|ɪ|ɾ|əl| æ|z| eɪ|t| w|iː|k|s| k|æ|n| æ|k|tʃ|uː|əl|i| ɪ|ŋ|k|ɹ|iː|s| ,| ð|ə| ɡ|ɹ|eɪ| m|æ|ɾ|ɚ| ɪ|n| ð|ə| p|ɑːɹ|t|s| ʌ|v| ð|ə| b|ɹ|eɪ|n| ɹ|ᵻ|s|p|ɑː|n|s|ᵻ|b|əl| f|ɔːɹ| ɪ|m|oʊ|ʃ|ə|n|əl| ɹ|ɛ|ɡ|j|ʊ|l|eɪ|ʃ|ə|n| æ|n|d| l|ɜː|n|ɪ|ŋ| !" 
# ----------------------------------------------------------------------------- @@ -56,79 +17,84 @@ EXAMPLE_PHONEMES = [ class TextProcessingTextCase(unittest.TestCase): """Tests for text to phoneme conversion""" - def test_all_phonemes_in_tts(self): - """Ensure that all phonemes from gruut are present in TTS phonemes""" - tts_phonemes = set(all_phonemes) - - # Check stress characters - for suprasegmental in [IPA.STRESS_PRIMARY, IPA.STRESS_SECONDARY]: - self.assertIn(suprasegmental, tts_phonemes) - - # Check that gruut's phonemes are a subset of TTS phonemes - for lang in gruut.get_supported_languages(): - for phoneme in Phonemes.from_language(lang): - for codepoint in clean_gruut_phonemes(phoneme.text): - - self.assertIn(codepoint, tts_phonemes) - def test_phoneme_to_sequence(self): - """Verify example (text -> sequence -> phoneme string) pipeline""" - lang = "en-us" - expected_phoneme_str = " ".join( - "".join(clean_gruut_phonemes(word_phonemes)) for word_phonemes in EXAMPLE_PHONEMES - ) - - # Ensure that TTS produces same phoneme string - text_cleaner = ["phoneme_cleaners"] - actual_sequence = phoneme_to_sequence(EXAMPLE_TEXT, text_cleaner, lang) - actual_phoneme_str = sequence_to_phoneme(actual_sequence) - - self.assertEqual(actual_phoneme_str, expected_phoneme_str) + """Verify en-us sentence phonemes without blank token""" + self._test_phoneme_to_sequence(add_blank=False) def test_phoneme_to_sequence_with_blank_token(self): - """Verify example (text -> sequence -> phoneme string) pipeline with blank token""" - lang = "en-us" + """Verify en-us sentence phonemes with blank token""" + self._test_phoneme_to_sequence(add_blank=True) + + def _test_phoneme_to_sequence(self, add_blank): text_cleaner = ["phoneme_cleaners"] + sequence = phoneme_to_sequence(EXAMPLE_TEXT, text_cleaner, LANG, add_blank=add_blank, use_espeak_phonemes=True) + text_hat = sequence_to_phoneme(sequence) + text_hat_with_params = sequence_to_phoneme(sequence) + gt = EXPECTED_PHONEMES.replace("|", 
"") + self.assertEqual(text_hat, text_hat_with_params) + self.assertEqual(text_hat, gt) - # Create with/without blank sequences - sequence_without_blank = phoneme_to_sequence(EXAMPLE_TEXT, text_cleaner, lang, add_blank=False) - sequence_with_blank = phoneme_to_sequence(EXAMPLE_TEXT, text_cleaner, lang, add_blank=True) + # multiple punctuations + text = "Be a voice, not an! echo?" + sequence = phoneme_to_sequence(text, text_cleaner, LANG, add_blank=add_blank, use_espeak_phonemes=True) + text_hat = sequence_to_phoneme(sequence) + text_hat_with_params = sequence_to_phoneme(sequence) + gt = "biː ɐ vɔɪs , nɑːt ɐn ! ɛkoʊ ?" + print(text_hat) + print(len(sequence)) + self.assertEqual(text_hat, text_hat_with_params) + self.assertEqual(text_hat, gt) - # With blank sequence should be bigger - self.assertGreater(len(sequence_with_blank), len(sequence_without_blank)) + # not ending with punctuation + text = "Be a voice, not an! echo" + sequence = phoneme_to_sequence(text, text_cleaner, LANG, add_blank=add_blank, use_espeak_phonemes=True) + text_hat = sequence_to_phoneme(sequence) + text_hat_with_params = sequence_to_phoneme(sequence) + gt = "biː ɐ vɔɪs , nɑːt ɐn ! ɛkoʊ" + print(text_hat) + print(len(sequence)) + self.assertEqual(text_hat, text_hat_with_params) + self.assertEqual(text_hat, gt) - # But phoneme strings should still be identical - phoneme_str_without_blank = sequence_to_phoneme(sequence_without_blank, add_blank=False) - phoneme_str_with_blank = sequence_to_phoneme(sequence_with_blank, add_blank=True) + # original + text = "Be a voice, not an echo!" + sequence = phoneme_to_sequence(text, text_cleaner, LANG, add_blank=add_blank, use_espeak_phonemes=True) + text_hat = sequence_to_phoneme(sequence) + text_hat_with_params = sequence_to_phoneme(sequence) + gt = "biː ɐ vɔɪs , nɑːt ɐn ɛkoʊ !" 
+ print(text_hat) + print(len(sequence)) + self.assertEqual(text_hat, text_hat_with_params) + self.assertEqual(text_hat, gt) - self.assertEqual(phoneme_str_with_blank, phoneme_str_without_blank) + # extra space after the sentence + text = "Be a voice, not an! echo. " + sequence = phoneme_to_sequence(text, text_cleaner, LANG, add_blank=add_blank, use_espeak_phonemes=True) + text_hat = sequence_to_phoneme(sequence) + text_hat_with_params = sequence_to_phoneme(sequence) + gt = "biː ɐ vɔɪs , nɑːt ɐn ! ɛkoʊ ." + print(text_hat) + print(len(sequence)) + self.assertEqual(text_hat, text_hat_with_params) + self.assertEqual(text_hat, gt) - def test_messy_text(self): - """Verify text with extra punctuation/whitespace/etc. makes it through the pipeline""" - text = '"Be" a! voice, [NOT]? (an eCHo. ' - lang = "en-us" - expected_phonemes = [ - ["b", "ˈi"], - ["ə"], - ["!"], - ["v", "ˈɔɪ", "s"], - [","], - ["n", "ˈɑ", "t"], - ["?"], - ["ə", "n"], - ["ˈɛ", "k", "oʊ"], - ["."], - ] - expected_phoneme_str = " ".join( - "".join(clean_gruut_phonemes(word_phonemes)) for word_phonemes in expected_phonemes + # extra space after the sentence + text = "Be a voice, not an! echo. " + sequence = phoneme_to_sequence( + text, text_cleaner, LANG, enable_eos_bos=True, add_blank=add_blank, use_espeak_phonemes=True ) + text_hat = sequence_to_phoneme(sequence) + text_hat_with_params = sequence_to_phoneme(sequence) + gt = "^biː ɐ vɔɪs , nɑːt ɐn ! 
ɛkoʊ .~" + print(text_hat) + print(len(sequence)) + self.assertEqual(text_hat, text_hat_with_params) + self.assertEqual(text_hat, gt) - # Ensure that TTS produces same phoneme string - text_cleaner = ["phoneme_cleaners"] - actual_sequence = phoneme_to_sequence(text, text_cleaner, lang) - actual_phoneme_str = sequence_to_phoneme(actual_sequence) - - self.assertEqual(actual_phoneme_str, expected_phoneme_str) + def test_text2phone(self): + text = "Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase, the grey matter in the parts of the brain responsible for emotional regulation and learning!" + ph = text2phone(EXAMPLE_TEXT, LANG) + self.assertEqual(ph, EXPECTED_PHONEMES) # -----------------------------------------------------------------------------