From 3bc043faebe7d275ddfd9e12097ff9f3acf2bbc1 Mon Sep 17 00:00:00 2001 From: Michael Hansen Date: Sun, 31 Oct 2021 06:41:55 -0400 Subject: [PATCH] Upgrade to gruut 2.0 (#882) --- TTS/tts/utils/text/__init__.py | 61 ++++++++++++++----------- requirements.txt | 4 +- tests/aux_tests/test_text_processing.py | 14 +++--- 3 files changed, 44 insertions(+), 35 deletions(-) diff --git a/TTS/tts/utils/text/__init__.py b/TTS/tts/utils/text/__init__.py index 66f518b4..537d2301 100644 --- a/TTS/tts/utils/text/__init__.py +++ b/TTS/tts/utils/text/__init__.py @@ -5,6 +5,7 @@ import re from typing import Dict, List import gruut +from gruut_ipa import IPA from TTS.tts.utils.text import cleaners from TTS.tts.utils.text.chinese_mandarin.phonemizer import chinese_text_to_phonemes @@ -32,7 +33,7 @@ PHONEME_PUNCTUATION_PATTERN = r"[" + _punctuations.replace(" ", "") + "]+" GRUUT_TRANS_TABLE = str.maketrans("g", "ɡ") -def text2phone(text, language, use_espeak_phonemes=False): +def text2phone(text, language, use_espeak_phonemes=False, keep_stress=False): """Convert graphemes to phonemes. Parameters: text (str): text to phonemize @@ -51,36 +52,44 @@ def text2phone(text, language, use_espeak_phonemes=False): ph = japanese_text_to_phonemes(text) return ph - if gruut.is_language_supported(language): - # Use gruut for phonemization - phonemizer_args = { - "remove_stress": True, - "ipa_minor_breaks": False, # don't replace commas/semi-colons with IPA | - "ipa_major_breaks": False, # don't replace periods with IPA ‖ - } + if not gruut.is_language_supported(language): + raise ValueError(f" [!] Language {language} is not supported for phonemization.") - if use_espeak_phonemes: - # Use a lexicon/g2p model train on eSpeak IPA instead of gruut IPA. - # This is intended for backwards compatibility with TTS<=v0.0.13 - # pre-trained models. - phonemizer_args["model_prefix"] = "espeak" + # Use gruut for phonemization + ph_list = [] + for sentence in gruut.sentences(text, lang=language, espeak=use_espeak_phonemes): + for word in sentence: + if word.is_break: + # Use actual character for break phoneme (e.g., comma) + if ph_list: + # Join with previous word + ph_list[-1].append(word.text) + else: + # First word is punctuation + ph_list.append([word.text]) + elif word.phonemes: + # Add phonemes for word + word_phonemes = [] - ph_list = gruut.text_to_phonemes( - text, - lang=language, - return_format="word_phonemes", - phonemizer_args=phonemizer_args, - ) + for word_phoneme in word.phonemes: + if not keep_stress: + # Remove primary/secondary stress + word_phoneme = IPA.without_stress(word_phoneme) - # Join and re-split to break apart dipthongs, suprasegmentals, etc. - ph_words = ["|".join(word_phonemes) for word_phonemes in ph_list] - ph = "| ".join(ph_words) + word_phoneme = word_phoneme.translate(GRUUT_TRANS_TABLE) - # Fix a few phonemes - ph = ph.translate(GRUUT_TRANS_TABLE) - return ph + if word_phoneme: + # Flatten phonemes + word_phonemes.extend(word_phoneme) - raise ValueError(f" [!] Language {language} is not supported for phonemization.") + if word_phonemes: + ph_list.append(word_phonemes) + + # Join and re-split to break apart dipthongs, suprasegmentals, etc. + ph_words = ["|".join(word_phonemes) for word_phonemes in ph_list] + ph = "| ".join(ph_words) + + return ph def intersperse(sequence, token): diff --git a/requirements.txt b/requirements.txt index a87a3c6f..3ec33ceb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -23,6 +23,6 @@ coqpit mecab-python3==1.0.3 unidic-lite==1.0.8 # gruut+supported langs -gruut[cs,de,es,fr,it,nl,pt,ru,sv]~=1.2.0 +gruut[cs,de,es,fr,it,nl,pt,ru,sv]~=2.0.0 fsspec>=2021.04.0 -pyworld \ No newline at end of file +pyworld diff --git a/tests/aux_tests/test_text_processing.py b/tests/aux_tests/test_text_processing.py index 3c424a15..62d60a42 100644 --- a/tests/aux_tests/test_text_processing.py +++ b/tests/aux_tests/test_text_processing.py @@ -9,12 +9,12 @@ LANG = "en-us" EXAMPLE_TEXT = "Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase, the grey matter in the parts of the brain responsible for emotional regulation and learning!" -EXPECTED_PHONEMES = "ɹ|iː|s|ə|n|t| ɹ|ᵻ|s|ɜː|tʃ| æ|t| h|ɑːɹ|v|ɚ|d| h|æ|z| ʃ|oʊ|n| m|ɛ|d|ᵻ|t|eɪ|ɾ|ɪ|ŋ| f|ɔːɹ| æ|z| l|ɪ|ɾ|əl| æ|z| eɪ|t| w|iː|k|s| k|æ|n| æ|k|tʃ|uː|əl|i| ɪ|ŋ|k|ɹ|iː|s| ,| ð|ə| ɡ|ɹ|eɪ| m|æ|ɾ|ɚ| ɪ|n| ð|ə| p|ɑːɹ|t|s| ʌ|v| ð|ə| b|ɹ|eɪ|n| ɹ|ᵻ|s|p|ɑː|n|s|ᵻ|b|əl| f|ɔːɹ| ɪ|m|oʊ|ʃ|ə|n|əl| ɹ|ɛ|ɡ|j|ʊ|l|eɪ|ʃ|ə|n| æ|n|d| l|ɜː|n|ɪ|ŋ| !" +EXPECTED_PHONEMES = "ɹ|i|ː|s|ə|n|t| ɹ|ᵻ|s|ɜ|ː|t|ʃ| æ|ɾ| h|ɑ|ː|ɹ|v|ɚ|d| h|ɐ|z| ʃ|o|ʊ|n| m|ɛ|d|ᵻ|t|e|ɪ|ɾ|ɪ|ŋ| f|ɔ|ː|ɹ| æ|z| l|ɪ|ɾ|ə|l| æ|z| e|ɪ|t| w|i|ː|k|s| k|æ|ŋ| æ|k|t|ʃ|u|ː|ə|l|i| ɪ|ŋ|k|ɹ|i|ː|s|,| ð|ə| ɡ|ɹ|e|ɪ| m|æ|ɾ|ɚ| ɪ|n| ð|ə| p|ɑ|ː|ɹ|t|s| ʌ|v| ð|ə| b|ɹ|e|ɪ|n| ɹ|ᵻ|s|p|ɑ|ː|n|s|ᵻ|b|ə|l| f|ɔ|ː|ɹ| ɪ|m|o|ʊ|ʃ|ə|n|ə|l| ɹ|ɛ|ɡ|j|ʊ|l|e|ɪ|ʃ|ə|n| æ|n|d| l|ɜ|ː|n|ɪ|ŋ|!" # ----------------------------------------------------------------------------- -class TextProcessingTextCase(unittest.TestCase): +class TextProcessingTestCase(unittest.TestCase): """Tests for text to phoneme conversion""" def test_phoneme_to_sequence(self): @@ -40,7 +40,7 @@ class TextProcessingTextCase(unittest.TestCase): sequence = phoneme_to_sequence(text, text_cleaner, LANG, add_blank=add_blank, use_espeak_phonemes=True) text_hat = sequence_to_phoneme(sequence) text_hat_with_params = sequence_to_phoneme(sequence) - gt = "biː ɐ vɔɪs , nɑːt ɐn ! ɛkoʊ ?" + gt = "biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ?" print(text_hat) print(len(sequence)) self.assertEqual(text_hat, text_hat_with_params) @@ -51,7 +51,7 @@ class TextProcessingTextCase(unittest.TestCase): sequence = phoneme_to_sequence(text, text_cleaner, LANG, add_blank=add_blank, use_espeak_phonemes=True) text_hat = sequence_to_phoneme(sequence) text_hat_with_params = sequence_to_phoneme(sequence) - gt = "biː ɐ vɔɪs , nɑːt ɐn ! ɛkoʊ" + gt = "biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ" print(text_hat) print(len(sequence)) self.assertEqual(text_hat, text_hat_with_params) @@ -62,7 +62,7 @@ class TextProcessingTextCase(unittest.TestCase): sequence = phoneme_to_sequence(text, text_cleaner, LANG, add_blank=add_blank, use_espeak_phonemes=True) text_hat = sequence_to_phoneme(sequence) text_hat_with_params = sequence_to_phoneme(sequence) - gt = "biː ɐ vɔɪs , nɑːt ɐn ɛkoʊ !" + gt = "biː ɐ vɔɪs, nɑːt ɐn ɛkoʊ!" print(text_hat) print(len(sequence)) self.assertEqual(text_hat, text_hat_with_params) @@ -73,7 +73,7 @@ class TextProcessingTextCase(unittest.TestCase): sequence = phoneme_to_sequence(text, text_cleaner, LANG, add_blank=add_blank, use_espeak_phonemes=True) text_hat = sequence_to_phoneme(sequence) text_hat_with_params = sequence_to_phoneme(sequence) - gt = "biː ɐ vɔɪs , nɑːt ɐn ! ɛkoʊ ." + gt = "biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ." print(text_hat) print(len(sequence)) self.assertEqual(text_hat, text_hat_with_params) @@ -86,7 +86,7 @@ class TextProcessingTextCase(unittest.TestCase): ) text_hat = sequence_to_phoneme(sequence) text_hat_with_params = sequence_to_phoneme(sequence) - gt = "^biː ɐ vɔɪs , nɑːt ɐn ! ɛkoʊ .~" + gt = "^biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ.~" print(text_hat) print(len(sequence)) self.assertEqual(text_hat, text_hat_with_params)