diff --git a/TTS/tts/utils/text/__init__.py b/TTS/tts/utils/text/__init__.py index eab7a689..29f4af1d 100644 --- a/TTS/tts/utils/text/__init__.py +++ b/TTS/tts/utils/text/__init__.py @@ -16,6 +16,8 @@ _id_to_symbol = {i: s for i, s in enumerate(symbols)} _phonemes_to_id = {s: i for i, s in enumerate(phonemes)} _id_to_phonemes = {i: s for i, s in enumerate(phonemes)} +_symbols = symbols +_phonemes = phonemes # Regular expression matching text enclosed in curly braces: _CURLY_RE = re.compile(r'(.*?)\{(.+?)\}(.*)') @@ -75,7 +77,7 @@ def pad_with_eos_bos(phoneme_sequence, tp=None): def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False, tp=None, add_blank=False): # pylint: disable=global-statement - global _phonemes_to_id + global _phonemes_to_id, _phonemes if tp: _, _phonemes = make_symbols(**tp) _phonemes_to_id = {s: i for i, s in enumerate(_phonemes)} @@ -96,10 +98,12 @@ def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False, tp= return sequence -def sequence_to_phoneme(sequence, tp=None): +def sequence_to_phoneme(sequence, tp=None, add_blank=False): # pylint: disable=global-statement '''Converts a sequence of IDs back to a string''' - global _id_to_phonemes + global _id_to_phonemes, _phonemes + if add_blank: + sequence = list(filter(lambda x: x != len(_phonemes), sequence)) result = '' if tp: _, _phonemes = make_symbols(**tp) @@ -126,7 +130,7 @@ def text_to_sequence(text, cleaner_names, tp=None, add_blank=False): List of integers corresponding to the symbols in the text ''' # pylint: disable=global-statement - global _symbol_to_id + global _symbol_to_id, _symbols if tp: _symbols, _ = make_symbols(**tp) _symbol_to_id = {s: i for i, s in enumerate(_symbols)} @@ -148,10 +152,13 @@ def text_to_sequence(text, cleaner_names, tp=None, add_blank=False): return sequence -def sequence_to_text(sequence, tp=None): +def sequence_to_text(sequence, tp=None, add_blank=False): '''Converts a sequence of IDs back to a string''' # pylint: disable=global-statement - global _id_to_symbol + global _id_to_symbol, _symbols + if add_blank: + sequence = list(filter(lambda x: x != len(_symbols), sequence)) + if tp: _symbols, _ = make_symbols(**tp) _id_to_symbol = {i: s for i, s in enumerate(_symbols)} diff --git a/tests/test_text_processing.py b/tests/test_text_processing.py index 1eb9f9a8..ae3250a8 100644 --- a/tests/test_text_processing.py +++ b/tests/test_text_processing.py @@ -11,6 +11,7 @@ from TTS.utils.io import load_config conf = load_config(os.path.join(get_tests_input_path(), 'test_config.json')) def test_phoneme_to_sequence(): + text = "Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase, the grey matter in the parts of the brain responsible for emotional regulation and learning!" text_cleaner = ["phoneme_cleaners"] lang = "en-us" @@ -20,7 +21,7 @@ def test_phoneme_to_sequence(): text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters) gt = "ɹiːsənt ɹɪsɜːtʃ æt hɑːɹvɚd hɐz ʃoʊn mɛdᵻteɪɾɪŋ fɔːɹ æz lɪɾəl æz eɪt wiːks kæn æktʃuːəli ɪnkɹiːs, ðə ɡɹeɪ mæɾɚɹ ɪnðə pɑːɹts ʌvðə bɹeɪn ɹɪspɑːnsəbəl fɔːɹ ɪmoʊʃənəl ɹɛɡjuːleɪʃən ænd lɜːnɪŋ!" assert text_hat == text_hat_with_params == gt - + # multiple punctuations text = "Be a voice, not an! echo?" sequence = phoneme_to_sequence(text, text_cleaner, lang) @@ -87,6 +88,84 @@ def test_phoneme_to_sequence(): print(len(sequence)) assert text_hat == text_hat_with_params == gt +def test_phoneme_to_sequence_with_blank_token(): + + text = "Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase, the grey matter in the parts of the brain responsible for emotional regulation and learning!" + text_cleaner = ["phoneme_cleaners"] + lang = "en-us" + sequence = phoneme_to_sequence(text, text_cleaner, lang) + text_hat = sequence_to_phoneme(sequence) + _ = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters, add_blank=True) + text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters, add_blank=True) + gt = "ɹiːsənt ɹɪsɜːtʃ æt hɑːɹvɚd hɐz ʃoʊn mɛdᵻteɪɾɪŋ fɔːɹ æz lɪɾəl æz eɪt wiːks kæn æktʃuːəli ɪnkɹiːs, ðə ɡɹeɪ mæɾɚɹ ɪnðə pɑːɹts ʌvðə bɹeɪn ɹɪspɑːnsəbəl fɔːɹ ɪmoʊʃənəl ɹɛɡjuːleɪʃən ænd lɜːnɪŋ!" + assert text_hat == text_hat_with_params == gt + + # multiple punctuations + text = "Be a voice, not an! echo?" + sequence = phoneme_to_sequence(text, text_cleaner, lang) + text_hat = sequence_to_phoneme(sequence) + _ = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters, add_blank=True) + text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters, add_blank=True) + gt = "biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ?" + print(text_hat) + print(len(sequence)) + assert text_hat == text_hat_with_params == gt + + # not ending with punctuation + text = "Be a voice, not an! echo" + sequence = phoneme_to_sequence(text, text_cleaner, lang) + text_hat = sequence_to_phoneme(sequence) + _ = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters, add_blank=True) + text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters, add_blank=True) + gt = "biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ" + print(text_hat) + print(len(sequence)) + assert text_hat == text_hat_with_params == gt + + # original + text = "Be a voice, not an echo!" + sequence = phoneme_to_sequence(text, text_cleaner, lang) + text_hat = sequence_to_phoneme(sequence) + _ = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters, add_blank=True) + text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters, add_blank=True) + gt = "biː ɐ vɔɪs, nɑːt ɐn ɛkoʊ!" + print(text_hat) + print(len(sequence)) + assert text_hat == text_hat_with_params == gt + + # extra space after the sentence + text = "Be a voice, not an! echo. " + sequence = phoneme_to_sequence(text, text_cleaner, lang) + text_hat = sequence_to_phoneme(sequence) + _ = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters, add_blank=True) + text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters, add_blank=True) + gt = "biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ." + print(text_hat) + print(len(sequence)) + assert text_hat == text_hat_with_params == gt + + # extra space after the sentence + text = "Be a voice, not an! echo. " + sequence = phoneme_to_sequence(text, text_cleaner, lang, True) + text_hat = sequence_to_phoneme(sequence) + _ = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters, add_blank=True) + text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters, add_blank=True) + gt = "^biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ.~" + print(text_hat) + print(len(sequence)) + assert text_hat == text_hat_with_params == gt + + # padding char + text = "_Be a _voice, not an! echo_" + sequence = phoneme_to_sequence(text, text_cleaner, lang) + text_hat = sequence_to_phoneme(sequence) + _ = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters, add_blank=True) + text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters, add_blank=True) + gt = "biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ" + print(text_hat) + print(len(sequence)) + assert text_hat == text_hat_with_params == gt + def test_text2phone(): text = "Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase, the grey matter in the parts of the brain responsible for emotional regulation and learning!" gt = "ɹ|iː|s|ə|n|t| |ɹ|ɪ|s|ɜː|tʃ| |æ|t| |h|ɑːɹ|v|ɚ|d| |h|ɐ|z| |ʃ|oʊ|n| |m|ɛ|d|ᵻ|t|eɪ|ɾ|ɪ|ŋ| |f|ɔː|ɹ| |æ|z| |l|ɪ|ɾ|əl| |æ|z| |eɪ|t| |w|iː|k|s| |k|æ|n| |æ|k|tʃ|uː|əl|i| |ɪ|n|k|ɹ|iː|s|,| |ð|ə| |ɡ|ɹ|eɪ| |m|æ|ɾ|ɚ|ɹ| |ɪ|n|ð|ə| |p|ɑːɹ|t|s| |ʌ|v|ð|ə| |b|ɹ|eɪ|n| |ɹ|ɪ|s|p|ɑː|n|s|ə|b|əl| |f|ɔː|ɹ| |ɪ|m|oʊ|ʃ|ə|n|əl| |ɹ|ɛ|ɡ|j|uː|l|eɪ|ʃ|ə|n| |æ|n|d| |l|ɜː|n|ɪ|ŋ|!"