Use eSpeak IPA lexicons by default for phoneme models

This commit is contained in:
Michael Hansen 2021-06-15 15:57:08 -04:00 committed by Eren Gölge
parent 618b509204
commit 4d8426fa0a
5 changed files with 97 additions and 154 deletions

View File

@ -97,6 +97,8 @@ class BaseTTSConfig(BaseTrainingConfig):
Audio processor config object instance.
use_phonemes (bool):
enable / disable phoneme use.
use_espeak_phonemes (bool):
enable / disable eSpeak-compatible phonemes (only if use_phonemes = `True`).
compute_input_seq_cache (bool):
enable / disable precomputation of the phoneme sequences. At the expense of some delay at the beginning of
the training, it allows faster data loader time and precise limitation with `max_seq_len` and
@ -136,6 +138,7 @@ class BaseTTSConfig(BaseTrainingConfig):
audio: BaseAudioConfig = field(default_factory=BaseAudioConfig)
# phoneme settings
use_phonemes: bool = False
use_espeak_phonemes: bool = True
phoneme_language: str = None
compute_input_seq_cache: bool = False
text_cleaner: str = MISSING

View File

@ -25,6 +25,7 @@ def text_to_seqvec(text, CONFIG):
CONFIG.enable_eos_bos_chars,
tp=CONFIG.characters,
add_blank=CONFIG.add_blank,
use_espeak_phonemes=CONFIG.use_espeak_phonemes
),
dtype=np.int32,
)

View File

@ -31,46 +31,7 @@ PHONEME_PUNCTUATION_PATTERN = r"[" + _punctuations.replace(" ", "") + "]+"
GRUUT_TRANS_TABLE = str.maketrans("g", "ɡ")
def clean_gruut_phonemes(ph_list):
    """Reduce gruut phonemes to symbols from the TTS phoneme set.

    gruut phonemes may contain arbitrary IPA characters (e.g., "ẽ" for the
    nasalized "e") and may span several characters (e.g., "aɪ" in the English
    "r[i]ce"), whereas TTS works with a fixed symbol table.

    Each phoneme is first NFC-normalized. If the composed form is already a
    known TTS phoneme it is kept whole. Otherwise it is NFD-decomposed,
    passed through ``GRUUT_TRANS_TABLE``, and emitted one codepoint at a
    time, dropping Unicode combining characters (ties, nasalization marks,
    ...). Diphthongs are therefore split into single characters.

    Args:
        ph_list (list[str]): list of phonemes from gruut

    Returns:
        list[str]: decomposed/clean list of phonemes for TTS
    """
    result = []
    for raw_phoneme in ph_list:
        composed = unicodedata.normalize("NFC", raw_phoneme)
        if composed in phonemes:
            # Already a TTS symbol: keep it as a single unit.
            result.append(composed)
        else:
            # Decompose into codepoints (ã -> ["a", "\u0303"]) and substitute.
            decomposed = unicodedata.normalize("NFD", composed).translate(GRUUT_TRANS_TABLE)
            # combining() is 0 for base characters; anything else is a mark to drop.
            result.extend(char for char in decomposed if not unicodedata.combining(char))
    return result
def text2phone(text, language):
def text2phone(text, language, use_espeak_phonemes=False):
"""Convert graphemes to phonemes.
Parameters:
text (str): text to phonemize
@ -93,21 +54,32 @@ def text2phone(text, language):
if gruut.is_language_supported(language):
# Use gruut for phonemization
phonemizer_args={
"remove_stress": True,
"ipa_minor_breaks": False, # don't replace commas/semi-colons with IPA |
"ipa_major_breaks": False, # don't replace periods with IPA ‖
}
if use_espeak_phonemes:
# Use a lexicon/g2p model trained on eSpeak IPA instead of gruut IPA.
# This is intended for backwards compatibility with TTS<=v0.0.13
# pre-trained models.
phonemizer_args["model_prefix"] = "espeak"
ph_list = gruut.text_to_phonemes(
text,
lang=language,
return_format="word_phonemes",
phonemizer_args={
"remove_accents": True, # remove acute/grave accents (Swedish)
"ipa_minor_breaks": False, # don't replace commas/semi-colons with IPA |
"ipa_major_breaks": False, # don't replace periods with IPA ‖
},
phonemizer_args=phonemizer_args,
)
# Join and re-split to break apart diphthongs, suprasegmentals, etc.
ph_words = ["|".join(clean_gruut_phonemes(word_phonemes)) for word_phonemes in ph_list]
ph_words = ["|".join(word_phonemes) for word_phonemes in ph_list]
ph = "| ".join(ph_words)
# Fix a few phonemes
ph = ph.translate(GRUUT_TRANS_TABLE)
print(" > Phonemes: {}".format(ph))
return ph
@ -132,7 +104,7 @@ def pad_with_eos_bos(phoneme_sequence, tp=None):
return [_phonemes_to_id[_bos]] + list(phoneme_sequence) + [_phonemes_to_id[_eos]]
def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False, tp=None, add_blank=False):
def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False, tp=None, add_blank=False, use_espeak_phonemes=False):
# pylint: disable=global-statement
global _phonemes_to_id, _phonemes
if tp:
@ -141,7 +113,7 @@ def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False, tp=
sequence = []
clean_text = _clean_text(text, cleaner_names)
to_phonemes = text2phone(clean_text, language)
to_phonemes = text2phone(clean_text, language, use_espeak_phonemes=use_espeak_phonemes)
if to_phonemes is None:
print("!! After phoneme conversion the result is None. -- {} ".format(clean_text))
# iterate by skipping empty strings - NOTE: might be useful to keep it to have a better intonation.
@ -152,6 +124,7 @@ def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False, tp=
sequence = pad_with_eos_bos(sequence, tp=tp)
if add_blank:
sequence = intersperse(sequence, len(_phonemes)) # add a blank token (new), whose id number is len(_phonemes)
return sequence

View File

@ -23,4 +23,4 @@ coqpit
mecab-python3==1.0.3
unidic-lite==1.0.8
# gruut+supported langs
gruut[cs,de,es,fr,it,nl,pt,ru,sv]~=1.1.0
gruut[cs,de,es,fr,it,nl,pt,ru,sv]~=1.2.0

View File

@ -1,54 +1,15 @@
"""Tests for text to phoneme conversion"""
import unittest
import gruut
from gruut_ipa import IPA, Phonemes
from TTS.tts.utils.text import clean_gruut_phonemes, phoneme_to_sequence
from TTS.tts.utils.text import phonemes as all_phonemes
from TTS.tts.utils.text import sequence_to_phoneme
from TTS.tts.utils.text import phoneme_to_sequence, sequence_to_phoneme, text2phone
# -----------------------------------------------------------------------------
LANG = "en-us"
EXAMPLE_TEXT = "Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase, the grey matter in the parts of the brain responsible for emotional regulation and learning!"
# Raw phonemes from run of gruut with example text (en-us).
# This includes IPA ties, etc.
EXAMPLE_PHONEMES = [
["ɹ", "ˈi", "s", "ə", "n", "t"],
["ɹ", "i", "s", "ˈɚ", "t͡ʃ"],
["ˈæ", "t"],
["h", "ˈɑ", "ɹ", "v", "ɚ", "d"],
["h", "ˈæ", "z"],
["ʃ", "ˈ", "n"],
["m", "ˈɛ", "d", "ɪ", "t", "ˌeɪ", "t", "ɪ", "ŋ"],
["f", "ɚ"],
["ˈæ", "z"],
["l", "ˈɪ", "t", "ə", "l"],
["ˈæ", "z"],
["ˈeɪ", "t"],
["w", "ˈi", "k", "s"],
["k", "ə", "n"],
["ˈæ", "k", "t͡ʃ", "ə", "l", "i"],
["ɪ", "ŋ", "k", "ɹ", "ˈi", "s"],
[","],
["ð", "ə"],
["ɡ", "ɹ", "ˈeɪ"],
["m", "ˈæ", "t", "ɚ"],
["ˈɪ", "n"],
["ð", "ə"],
["p", "ˈɑ", "ɹ", "t", "s"],
["ə", "v"],
["ð", "ə"],
["b", "ɹ", "ˈeɪ", "n"],
["ɹ", "i", "s", "p", "ˈɑ", "n", "s", "ɪ", "b", "ə", "l"],
["f", "ɚ"],
["ɪ", "m", "ˈ", "ʃ", "ə", "n", "ə", "l"],
["ɹ", "ˌɛ", "ɡ", "j", "ə", "l", "ˈeɪ", "ʃ", "ə", "n"],
["ˈæ", "n", "d"],
["l", "ˈɚ", "n", "ɪ", "ŋ"],
["!"],
]
EXPECTED_PHONEMES = "ɹ|iː|s|ə|n|t| ɹ|ᵻ|s|ɜː|tʃ| æ|t| h|ɑːɹ|v|ɚ|d| h|æ|z| ʃ|oʊ|n| m|ɛ|d|ᵻ|t|eɪ|ɾ|ɪ|ŋ| f|ɔːɹ| æ|z| l|ɪ|ɾ|əl| æ|z| eɪ|t| w|iː|k|s| k|æ|n| æ|k|tʃ|uː|əl|i| ɪ|ŋ|k|ɹ|iː|s| ,| ð|ə| ɡ|ɹ|eɪ| m|æ|ɾ|ɚ| ɪ|n| ð|ə| p|ɑːɹ|t|s| ʌ|v| ð|ə| b|ɹ|eɪ|n| ɹ|ᵻ|s|p|ɑː|n|s|ᵻ|b|əl| f|ɔːɹ| ɪ|m|oʊ|ʃ|ə|n|əl| ɹ|ɛ|ɡ|j|ʊ|l|eɪ|ʃ|ə|n| æ|n|d| l|ɜː|n|ɪ|ŋ| !"
# -----------------------------------------------------------------------------
@ -56,79 +17,84 @@ EXAMPLE_PHONEMES = [
class TextProcessingTextCase(unittest.TestCase):
"""Tests for text to phoneme conversion"""
def test_all_phonemes_in_tts(self):
    """Check that every cleaned gruut phoneme symbol is a known TTS phoneme."""
    known_symbols = set(all_phonemes)

    # Primary and secondary stress marks must be part of the TTS symbol set.
    for stress_mark in (IPA.STRESS_PRIMARY, IPA.STRESS_SECONDARY):
        self.assertIn(stress_mark, known_symbols)

    # For each gruut language, each symbol that survives cleaning must be
    # known to TTS. The phoneme text is handed to clean_gruut_phonemes as-is.
    for language in gruut.get_supported_languages():
        for phoneme in Phonemes.from_language(language):
            for symbol in clean_gruut_phonemes(phoneme.text):
                self.assertIn(symbol, known_symbols)
def test_phoneme_to_sequence(self):
"""Verify example (text -> sequence -> phoneme string) pipeline"""
lang = "en-us"
expected_phoneme_str = " ".join(
"".join(clean_gruut_phonemes(word_phonemes)) for word_phonemes in EXAMPLE_PHONEMES
)
# Ensure that TTS produces same phoneme string
text_cleaner = ["phoneme_cleaners"]
actual_sequence = phoneme_to_sequence(EXAMPLE_TEXT, text_cleaner, lang)
actual_phoneme_str = sequence_to_phoneme(actual_sequence)
self.assertEqual(actual_phoneme_str, expected_phoneme_str)
"""Verify en-us sentence phonemes without blank token"""
self._test_phoneme_to_sequence(add_blank=False)
def test_phoneme_to_sequence_with_blank_token(self):
"""Verify example (text -> sequence -> phoneme string) pipeline with blank token"""
lang = "en-us"
"""Verify en-us sentence phonemes with blank token"""
self._test_phoneme_to_sequence(add_blank=True)
def _test_phoneme_to_sequence(self, add_blank):
text_cleaner = ["phoneme_cleaners"]
sequence = phoneme_to_sequence(EXAMPLE_TEXT, text_cleaner, LANG, add_blank=add_blank, use_espeak_phonemes=True)
text_hat = sequence_to_phoneme(sequence)
text_hat_with_params = sequence_to_phoneme(sequence)
gt = EXPECTED_PHONEMES.replace("|", "")
self.assertEqual(text_hat, text_hat_with_params)
self.assertEqual(text_hat, gt)
# Create with/without blank sequences
sequence_without_blank = phoneme_to_sequence(EXAMPLE_TEXT, text_cleaner, lang, add_blank=False)
sequence_with_blank = phoneme_to_sequence(EXAMPLE_TEXT, text_cleaner, lang, add_blank=True)
# multiple punctuations
text = "Be a voice, not an! echo?"
sequence = phoneme_to_sequence(text, text_cleaner, LANG, add_blank=add_blank, use_espeak_phonemes=True)
text_hat = sequence_to_phoneme(sequence)
text_hat_with_params = sequence_to_phoneme(sequence)
gt = "biː ɐ vɔɪs , nɑːt ɐn ! ɛkoʊ ?"
print(text_hat)
print(len(sequence))
self.assertEqual(text_hat, text_hat_with_params)
self.assertEqual(text_hat, gt)
# With blank sequence should be bigger
self.assertGreater(len(sequence_with_blank), len(sequence_without_blank))
# not ending with punctuation
text = "Be a voice, not an! echo"
sequence = phoneme_to_sequence(text, text_cleaner, LANG, add_blank=add_blank, use_espeak_phonemes=True)
text_hat = sequence_to_phoneme(sequence)
text_hat_with_params = sequence_to_phoneme(sequence)
gt = "biː ɐ vɔɪs , nɑːt ɐn ! ɛkoʊ"
print(text_hat)
print(len(sequence))
self.assertEqual(text_hat, text_hat_with_params)
self.assertEqual(text_hat, gt)
# But phoneme strings should still be identical
phoneme_str_without_blank = sequence_to_phoneme(sequence_without_blank, add_blank=False)
phoneme_str_with_blank = sequence_to_phoneme(sequence_with_blank, add_blank=True)
# original
text = "Be a voice, not an echo!"
sequence = phoneme_to_sequence(text, text_cleaner, LANG, add_blank=add_blank, use_espeak_phonemes=True)
text_hat = sequence_to_phoneme(sequence)
text_hat_with_params = sequence_to_phoneme(sequence)
gt = "biː ɐ vɔɪs , nɑːt ɐn ɛkoʊ !"
print(text_hat)
print(len(sequence))
self.assertEqual(text_hat, text_hat_with_params)
self.assertEqual(text_hat, gt)
self.assertEqual(phoneme_str_with_blank, phoneme_str_without_blank)
# extra space after the sentence
text = "Be a voice, not an! echo. "
sequence = phoneme_to_sequence(text, text_cleaner, LANG, add_blank=add_blank, use_espeak_phonemes=True)
text_hat = sequence_to_phoneme(sequence)
text_hat_with_params = sequence_to_phoneme(sequence)
gt = "biː ɐ vɔɪs , nɑːt ɐn ! ɛkoʊ ."
print(text_hat)
print(len(sequence))
self.assertEqual(text_hat, text_hat_with_params)
self.assertEqual(text_hat, gt)
def test_messy_text(self):
"""Verify text with extra punctuation/whitespace/etc. makes it through the pipeline"""
text = '"Be" a! voice, [NOT]? (an eCHo. '
lang = "en-us"
expected_phonemes = [
["b", "ˈi"],
["ə"],
["!"],
["v", "ˈɔɪ", "s"],
[","],
["n", "ˈɑ", "t"],
["?"],
["ə", "n"],
["ˈɛ", "k", ""],
["."],
]
expected_phoneme_str = " ".join(
"".join(clean_gruut_phonemes(word_phonemes)) for word_phonemes in expected_phonemes
# extra space after the sentence
text = "Be a voice, not an! echo. "
sequence = phoneme_to_sequence(
text, text_cleaner, LANG, enable_eos_bos=True, add_blank=add_blank, use_espeak_phonemes=True
)
text_hat = sequence_to_phoneme(sequence)
text_hat_with_params = sequence_to_phoneme(sequence)
gt = "^biː ɐ vɔɪs , nɑːt ɐn ! ɛkoʊ .~"
print(text_hat)
print(len(sequence))
self.assertEqual(text_hat, text_hat_with_params)
self.assertEqual(text_hat, gt)
# Ensure that TTS produces same phoneme string
text_cleaner = ["phoneme_cleaners"]
actual_sequence = phoneme_to_sequence(text, text_cleaner, lang)
actual_phoneme_str = sequence_to_phoneme(actual_sequence)
self.assertEqual(actual_phoneme_str, expected_phoneme_str)
def test_text2phone(self):
    """Verify the phoneme string returned by ``text2phone`` directly (with ``|`` separators)."""
    # EXPECTED_PHONEMES is the same reference the sequence tests in this class
    # match after calling phoneme_to_sequence(..., use_espeak_phonemes=True).
    # text2phone defaults to use_espeak_phonemes=False, so the eSpeak lexicon
    # has to be requested explicitly for the comparison to be meaningful.
    ph = text2phone(EXAMPLE_TEXT, LANG, use_espeak_phonemes=True)
    self.assertEqual(ph, EXPECTED_PHONEMES)
# -----------------------------------------------------------------------------