Add tests for gruut phonemization

This commit is contained in:
Michael Hansen 2021-06-09 11:52:10 -04:00 committed by Eren Gölge
parent 67869e77f9
commit 47191f3ecc
3 changed files with 168 additions and 36 deletions

View File

@ -1,6 +1,5 @@
import os
# The package version is kept in a plain-text VERSION file next to this module.
with open(os.path.join(os.path.dirname(__file__), "VERSION")) as f:
    # Strip surrounding whitespace (e.g. the trailing newline) from the file contents.
    version = f.read().strip()

View File

@ -1,6 +1,7 @@
# -*- coding: utf-8 -*-
import re
import unicodedata
import gruut
from packaging import version
@ -26,32 +27,34 @@ _CURLY_RE = re.compile(r"(.*?)\{(.+?)\}(.*)")
# Regular expression matching punctuations, ignoring empty space
PHONEME_PUNCTUATION_PATTERN = r"[" + _punctuations.replace(" ", "") + "]+"
# Per-language phoneme substitutions, laid out as:
#   language -> source phoneme (from gruut) -> destination phoneme
# Used to make gruut's phonemes fit better with eSpeak's.
GRUUT_PHONEME_MAP = {
    # English (US)
    "en-us": {
        "i": "iː",
        "ɑ": "ɑː",
        "ɚ": "ɜːɹ",
    },
    # German
    "de": {
        "ʁ": "ɾ",
        "g": "ɡ",
        "ʔ": "",  # mapped to the empty string
    },
    # Dutch
    "nl": {
        "a": "aː",
        "e": "eː",
        "ʏ": "ɵ",
        "ʋ": "w",
        "ɹ": "r",
        "ɔː": "oː",
    },
    # Spanish
    "es": {
        "ɾ": "r",
        "g": "ɣ",
    },
}
# Translation table (for str.translate) that fixes a gruut/TTS phoneme mismatch:
# the ASCII letter "g" is replaced with the IPA letter "ɡ".
GRUUT_TRANS_TABLE = str.maketrans("g", "ɡ")


def clean_gruut_phonemes(ph_list):
    """Flatten gruut phonemes into single, non-combining codepoints for TTS.

    Parameters:
        ph_list (list[str]): list of phonemes from gruut

    Returns:
        clean_list (list[str]): decomposed/clean list of phonemes for TTS

    Every phoneme is NFD-normalized and split into its individual codepoints,
    so diphthongs and other multi-character phonemes become several entries.
    Codepoints with a non-zero Unicode combining class (e.g., ties) are dropped.

    NOTE(review): NFD also splits precomposed letters into base + mark
    (e.g., "ç" -> "c" + combining cedilla), so that mark is dropped as well.
    Confirm this is intended for every supported language.
    """
    flattened = []

    for raw_phoneme in ph_list:
        # Decompose first (ã -> "a" + "\u0303"), then apply the character fix-ups
        decomposed = unicodedata.normalize("NFD", raw_phoneme).translate(GRUUT_TRANS_TABLE)

        # Keep only base characters; combining marks have a class greater than zero
        flattened.extend(char for char in decomposed if unicodedata.combining(char) == 0)

    return flattened
def text2phone(text, language):
@ -82,21 +85,14 @@ def text2phone(text, language):
lang=language,
return_format="word_phonemes",
phonemizer_args={
"remove_stress": True, # remove primary/secondary stress
"remove_accents": True, # remove accute/grave accents (Swedish)
"ipa_minor_breaks": False, # don't replace commas/semi-colons with IPA |
"ipa_major_breaks": False, # don't replace periods with IPA ‖
},
)
ph_map = GRUUT_PHONEME_MAP.get(language)
if ph_map:
# Re-map phonemes to fit with eSpeak conventions
for word in ph_list:
for p_idx, p in enumerate(word):
word[p_idx] = ph_map.get(p, p)
# Join and re-split to break apart diphthongs, suprasegmentals, etc.
ph_words = ["|".join(word_phonemes) for word_phonemes in ph_list]
ph_words = ["|".join(clean_gruut_phonemes(word_phonemes)) for word_phonemes in ph_list]
ph = "| ".join(ph_words)
print(" > Phonemes: {}".format(ph))

View File

@ -0,0 +1,137 @@
"""Tests for text to phoneme conversion"""
import unittest
import gruut
from gruut_ipa import IPA, Phonemes
from TTS.tts.utils.text import clean_gruut_phonemes, phoneme_to_sequence
from TTS.tts.utils.text import phonemes as all_phonemes
from TTS.tts.utils.text import sequence_to_phoneme
# -----------------------------------------------------------------------------
EXAMPLE_TEXT = "Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase, the grey matter in the parts of the brain responsible for emotional regulation and learning!"

# Raw phonemes from run of gruut with example text (en-us).
# This includes IPA ties, etc.
# One inner list per word or punctuation mark, in the order they occur in EXAMPLE_TEXT.
# NOTE(review): the stressed vowel of "shown" and "emotional" was missing (a bare
# stress mark); it is restored here as "ˈoʊ" — confirm against a fresh gruut run.
EXAMPLE_PHONEMES = [
    ["ɹ", "ˈi", "s", "ə", "n", "t"],
    ["ɹ", "i", "s", "ˈɚ", "t͡ʃ"],
    ["ˈæ", "t"],
    ["h", "ˈɑ", "ɹ", "v", "ɚ", "d"],
    ["h", "ˈæ", "z"],
    ["ʃ", "ˈoʊ", "n"],
    ["m", "ˈɛ", "d", "ɪ", "t", "ˌeɪ", "t", "ɪ", "ŋ"],
    ["f", "ɚ"],
    ["ˈæ", "z"],
    ["l", "ˈɪ", "t", "ə", "l"],
    ["ˈæ", "z"],
    ["ˈeɪ", "t"],
    ["w", "ˈi", "k", "s"],
    ["k", "ə", "n"],
    ["ˈæ", "k", "t͡ʃ", "ə", "l", "i"],
    ["ɪ", "ŋ", "k", "ɹ", "ˈi", "s"],
    [","],
    ["ð", "ə"],
    ["ɡ", "ɹ", "ˈeɪ"],
    ["m", "ˈæ", "t", "ɚ"],
    ["ˈɪ", "n"],
    ["ð", "ə"],
    ["p", "ˈɑ", "ɹ", "t", "s"],
    ["ə", "v"],
    ["ð", "ə"],
    ["b", "ɹ", "ˈeɪ", "n"],
    ["ɹ", "i", "s", "p", "ˈɑ", "n", "s", "ɪ", "b", "ə", "l"],
    ["f", "ɚ"],
    ["ɪ", "m", "ˈoʊ", "ʃ", "ə", "n", "ə", "l"],
    ["ɹ", "ˌɛ", "ɡ", "j", "ə", "l", "ˈeɪ", "ʃ", "ə", "n"],
    ["ˈæ", "n", "d"],
    ["l", "ˈɚ", "n", "ɪ", "ŋ"],
    ["!"],
]
# -----------------------------------------------------------------------------
class TextProcessingTextCase(unittest.TestCase):
"""Tests for text to phoneme conversion"""
def test_all_phonemes_in_tts(self):
"""Ensure that all phonemes from gruut are present in TTS phonemes"""
tts_phonemes = set(all_phonemes)
# Check stress characters
for suprasegmental in [IPA.STRESS_PRIMARY, IPA.STRESS_SECONDARY]:
self.assertIn(suprasegmental, tts_phonemes)
# Check that gruut's phonemes are a subset of TTS phonemes
for lang in gruut.get_supported_languages():
for phoneme in Phonemes.from_language(lang):
for codepoint in clean_gruut_phonemes(phoneme.text):
self.assertIn(codepoint, tts_phonemes)
def test_phoneme_to_sequence(self):
"""Verify example (text -> sequence -> phoneme string) pipeline"""
lang = "en-us"
expected_phoneme_str = " ".join(
"".join(clean_gruut_phonemes(word_phonemes)) for word_phonemes in EXAMPLE_PHONEMES
)
# Ensure that TTS produces same phoneme string
text_cleaner = ["phoneme_cleaners"]
actual_sequence = phoneme_to_sequence(EXAMPLE_TEXT, text_cleaner, lang)
actual_phoneme_str = sequence_to_phoneme(actual_sequence)
self.assertEqual(actual_phoneme_str, expected_phoneme_str)
def test_phoneme_to_sequence_with_blank_token(self):
"""Verify example (text -> sequence -> phoneme string) pipeline with blank token"""
lang = "en-us"
text_cleaner = ["phoneme_cleaners"]
# Create with/without blank sequences
sequence_without_blank = phoneme_to_sequence(EXAMPLE_TEXT, text_cleaner, lang, add_blank=False)
sequence_with_blank = phoneme_to_sequence(EXAMPLE_TEXT, text_cleaner, lang, add_blank=True)
# With blank sequence should be bigger
self.assertGreater(len(sequence_with_blank), len(sequence_without_blank))
# But phoneme strings should still be identical
phoneme_str_without_blank = sequence_to_phoneme(sequence_without_blank, add_blank=False)
phoneme_str_with_blank = sequence_to_phoneme(sequence_with_blank, add_blank=True)
self.assertEqual(phoneme_str_with_blank, phoneme_str_without_blank)
def test_messy_text(self):
"""Verify text with extra punctuation/whitespace/etc. makes it through the pipeline"""
text = '"Be" a! voice, [NOT]? (an eCHo. '
lang = "en-us"
expected_phonemes = [
["b", "ˈi"],
["ə"],
["!"],
["v", "ˈɔɪ", "s"],
[","],
["n", "ˈɑ", "t"],
["?"],
["ə", "n"],
["ˈɛ", "k", ""],
["."],
]
expected_phoneme_str = " ".join(
"".join(clean_gruut_phonemes(word_phonemes)) for word_phonemes in expected_phonemes
)
# Ensure that TTS produces same phoneme string
text_cleaner = ["phoneme_cleaners"]
actual_sequence = phoneme_to_sequence(text, text_cleaner, lang)
actual_phoneme_str = sequence_to_phoneme(actual_sequence)
self.assertEqual(actual_phoneme_str, expected_phoneme_str)
# -----------------------------------------------------------------------------
# Run the unittest command-line runner when this file is executed as a script.
if __name__ == "__main__":
    unittest.main()