Use eSpeak IPA lexicons by default for phoneme models

Michael Hansen 2021-06-15 15:57:08 -04:00 committed by Eren Gölge
parent 618b509204
commit 4d8426fa0a
5 changed files with 97 additions and 154 deletions

View File

@@ -97,6 +97,8 @@ class BaseTTSConfig(BaseTrainingConfig):
             Audio processor config object instance.
         use_phonemes (bool):
             enable / disable phoneme use.
+        use_espeak_phonemes (bool):
+            enable / disable eSpeak-compatible phonemes (only if use_phonemes = `True`).
         compute_input_seq_cache (bool):
             enable / disable precomputation of the phoneme sequences. At the expense of some delay at the beginning of
             the training, It allows faster data loader time and precise limitation with `max_seq_len` and
@@ -136,6 +138,7 @@
     audio: BaseAudioConfig = field(default_factory=BaseAudioConfig)
     # phoneme settings
     use_phonemes: bool = False
+    use_espeak_phonemes: bool = True
     phoneme_language: str = None
     compute_input_seq_cache: bool = False
     text_cleaner: str = MISSING
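For reference, the new flag defaults to `True` and only has an effect when phonemes are enabled. A minimal sketch of a config that opts in (the import path and the surrounding values are assumptions, not part of this diff):

```python
from TTS.tts.configs.shared_configs import BaseTTSConfig  # import path assumed

# Illustrative values only; field names match the dataclass above.
config = BaseTTSConfig(
    use_phonemes=True,          # phonemize input text
    use_espeak_phonemes=True,   # default: eSpeak-compatible IPA lexicons in gruut
    phoneme_language="en-us",
    text_cleaner="phoneme_cleaners",
)
```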

View File

@@ -25,6 +25,7 @@ def text_to_seqvec(text, CONFIG):
             CONFIG.enable_eos_bos_chars,
             tp=CONFIG.characters,
             add_blank=CONFIG.add_blank,
+            use_espeak_phonemes=CONFIG.use_espeak_phonemes
         ),
         dtype=np.int32,
     )
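Callers of the synthesis helpers do not change; the flag is read off the config and forwarded into the phoneme pipeline. A hedged usage sketch (the module path of `text_to_seqvec` is an assumption):

```python
from TTS.tts.utils.synthesis import text_to_seqvec  # module path assumed

# `config` is a phoneme-enabled TTS config such as the one sketched earlier;
# text_to_seqvec returns an np.int32 array of phoneme IDs.
seq = text_to_seqvec("Be a voice, not an echo.", config)
```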

View File

@@ -31,46 +31,7 @@ PHONEME_PUNCTUATION_PATTERN = r"[" + _punctuations.replace(" ", "") + "]+"
 GRUUT_TRANS_TABLE = str.maketrans("g", "ɡ")
-def clean_gruut_phonemes(ph_list):
-    """Decompose, substitute, and clean gruut phonemes for TTS.
-    gruut phonemes may contain any IPA characters (e.g., "ẽ" for the nasalized
-    "e"), and may be composed of multiple characters (e.g., "aɪ" in the English
-    "r[i]ce").
-    TTS phonemes come from a fixed set of symbols, and do not include every
-    possible variation of every vowel/consonant. Here, we decompose diphthongs,
-    etc. into single characters and then filter out Unicode combining characters
-    such as ties. This ensures that (most) phonemes will exist in the TTS symbol
-    table.
-    Args:
-        ph_list (list[str]): list of phonemes from gruut
-    Returns:
-        clean_list (list[str]): decomposed/clean list of phonemes for TTS
-    """
-    cleaned_phonemes = []
-    for phoneme_text in ph_list:
-        phoneme_text = unicodedata.normalize("NFC", phoneme_text)
-        if phoneme_text in phonemes:
-            cleaned_phonemes.append(phoneme_text)
-            continue
-        # Decompose into codepoints (ã -> ["a", "\u0303"])
-        phoneme_text = unicodedata.normalize("NFD", phoneme_text)
-        for codepoint in phoneme_text.translate(GRUUT_TRANS_TABLE):
-            if unicodedata.combining(codepoint) > 0:
-                # Skip combining characters like ties
-                continue
-            cleaned_phonemes.append(codepoint)
-    return cleaned_phonemes
-def text2phone(text, language):
+def text2phone(text, language, use_espeak_phonemes=False):
     """Convert graphemes to phonemes.
     Parameters:
         text (str): text to phonemize
@@ -93,21 +54,32 @@ def text2phone(text, language):
     if gruut.is_language_supported(language):
         # Use gruut for phonemization
+        phonemizer_args = {
+            "remove_stress": True,
+            "ipa_minor_breaks": False,  # don't replace commas/semi-colons with IPA |
+            "ipa_major_breaks": False,  # don't replace periods with IPA ‖
+        }
+        if use_espeak_phonemes:
+            # Use a lexicon/g2p model trained on eSpeak IPA instead of gruut IPA.
+            # This is intended for backwards compatibility with TTS<=v0.0.13
+            # pre-trained models.
+            phonemizer_args["model_prefix"] = "espeak"
         ph_list = gruut.text_to_phonemes(
             text,
             lang=language,
             return_format="word_phonemes",
-            phonemizer_args={
-                "remove_accents": True,  # remove acute/grave accents (Swedish)
-                "ipa_minor_breaks": False,  # don't replace commas/semi-colons with IPA |
-                "ipa_major_breaks": False,  # don't replace periods with IPA ‖
-            },
+            phonemizer_args=phonemizer_args,
         )
         # Join and re-split to break apart diphthongs, suprasegmentals, etc.
-        ph_words = ["|".join(clean_gruut_phonemes(word_phonemes)) for word_phonemes in ph_list]
+        ph_words = ["|".join(word_phonemes) for word_phonemes in ph_list]
         ph = "| ".join(ph_words)
+        # Fix a few phonemes
+        ph = ph.translate(GRUUT_TRANS_TABLE)
         print(" > Phonemes: {}".format(ph))
         return ph
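A sketch of direct use of the updated function (the import is the same one the updated tests use; the sample sentence is taken from those tests):

```python
from TTS.tts.utils.text import text2phone

# With use_espeak_phonemes=True, gruut loads its eSpeak-trained lexicon/g2p model
# ("model_prefix": "espeak"), keeping the IPA compatible with TTS<=v0.0.13 models.
# Phonemes come back "|"-separated within words, with "| " between words.
ph = text2phone("Be a voice, not an echo!", "en-us", use_espeak_phonemes=True)
print(ph)
```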
@@ -132,7 +104,7 @@ def pad_with_eos_bos(phoneme_sequence, tp=None):
     return [_phonemes_to_id[_bos]] + list(phoneme_sequence) + [_phonemes_to_id[_eos]]
-def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False, tp=None, add_blank=False):
+def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False, tp=None, add_blank=False, use_espeak_phonemes=False):
     # pylint: disable=global-statement
     global _phonemes_to_id, _phonemes
     if tp:
@@ -141,7 +113,7 @@ def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False, tp=
     sequence = []
     clean_text = _clean_text(text, cleaner_names)
-    to_phonemes = text2phone(clean_text, language)
+    to_phonemes = text2phone(clean_text, language, use_espeak_phonemes=use_espeak_phonemes)
     if to_phonemes is None:
         print("!! After phoneme conversion the result is None. -- {} ".format(clean_text))
     # iterate by skipping empty strings - NOTE: might be useful to keep it to have a better intonation.
@@ -152,6 +124,7 @@ def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False, tp=
         sequence = pad_with_eos_bos(sequence, tp=tp)
     if add_blank:
         sequence = intersperse(sequence, len(_phonemes))  # add a blank token (new), whose id number is len(_phonemes)
     return sequence
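A round-trip sketch mirroring the updated tests below; the expected string is the ground truth used there:

```python
from TTS.tts.utils.text import phoneme_to_sequence, sequence_to_phoneme

sequence = phoneme_to_sequence(
    "Be a voice, not an echo!",
    ["phoneme_cleaners"],
    "en-us",
    add_blank=False,
    use_espeak_phonemes=True,
)
# Mapping the IDs back should yield the eSpeak-style IPA string:
# "biː ɐ vɔɪs , nɑːt ɐn ɛkoʊ !"
print(sequence_to_phoneme(sequence))
```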

View File

@@ -23,4 +23,4 @@ coqpit
 mecab-python3==1.0.3
 unidic-lite==1.0.8
 # gruut+supported langs
-gruut[cs,de,es,fr,it,nl,pt,ru,sv]~=1.1.0
+gruut[cs,de,es,fr,it,nl,pt,ru,sv]~=1.2.0

View File

@@ -1,54 +1,15 @@
 """Tests for text to phoneme conversion"""
 import unittest
-import gruut
-from gruut_ipa import IPA, Phonemes
-from TTS.tts.utils.text import clean_gruut_phonemes, phoneme_to_sequence
-from TTS.tts.utils.text import phonemes as all_phonemes
-from TTS.tts.utils.text import sequence_to_phoneme
+from TTS.tts.utils.text import phoneme_to_sequence, sequence_to_phoneme, text2phone
 # -----------------------------------------------------------------------------
+LANG = "en-us"
 EXAMPLE_TEXT = "Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase, the grey matter in the parts of the brain responsible for emotional regulation and learning!"
-# Raw phonemes from run of gruut with example text (en-us).
-# This includes IPA ties, etc.
-EXAMPLE_PHONEMES = [
-    ["ɹ", "ˈi", "s", "ə", "n", "t"],
-    ["ɹ", "i", "s", "ˈɚ", "t͡ʃ"],
-    ["ˈæ", "t"],
-    ["h", "ˈɑ", "ɹ", "v", "ɚ", "d"],
-    ["h", "ˈæ", "z"],
-    ["ʃ", "ˈoʊ", "n"],
-    ["m", "ˈɛ", "d", "ɪ", "t", "ˌeɪ", "t", "ɪ", "ŋ"],
-    ["f", "ɚ"],
-    ["ˈæ", "z"],
-    ["l", "ˈɪ", "t", "ə", "l"],
-    ["ˈæ", "z"],
-    ["ˈeɪ", "t"],
-    ["w", "ˈi", "k", "s"],
-    ["k", "ə", "n"],
-    ["ˈæ", "k", "t͡ʃ", "ə", "l", "i"],
-    ["ɪ", "ŋ", "k", "ɹ", "ˈi", "s"],
-    [","],
-    ["ð", "ə"],
-    ["ɡ", "ɹ", "ˈeɪ"],
-    ["m", "ˈæ", "t", "ɚ"],
-    ["ˈɪ", "n"],
-    ["ð", "ə"],
-    ["p", "ˈɑ", "ɹ", "t", "s"],
-    ["ə", "v"],
-    ["ð", "ə"],
-    ["b", "ɹ", "ˈeɪ", "n"],
-    ["ɹ", "i", "s", "p", "ˈɑ", "n", "s", "ɪ", "b", "ə", "l"],
-    ["f", "ɚ"],
-    ["ɪ", "m", "ˈoʊ", "ʃ", "ə", "n", "ə", "l"],
-    ["ɹ", "ˌɛ", "ɡ", "j", "ə", "l", "ˈeɪ", "ʃ", "ə", "n"],
-    ["ˈæ", "n", "d"],
-    ["l", "ˈɚ", "n", "ɪ", "ŋ"],
-    ["!"],
-]
+EXPECTED_PHONEMES = "ɹ|iː|s|ə|n|t| ɹ|ᵻ|s|ɜː|tʃ| æ|t| h|ɑːɹ|v|ɚ|d| h|æ|z| ʃ|oʊ|n| m|ɛ|d|ᵻ|t|eɪ|ɾ|ɪ|ŋ| f|ɔːɹ| æ|z| l|ɪ|ɾ|əl| æ|z| eɪ|t| w|iː|k|s| k|æ|n| æ|k|tʃ|uː|əl|i| ɪ|ŋ|k|ɹ|iː|s| ,| ð|ə| ɡ|ɹ|eɪ| m|æ|ɾ|ɚ| ɪ|n| ð|ə| p|ɑːɹ|t|s| ʌ|v| ð|ə| b|ɹ|eɪ|n| ɹ|ᵻ|s|p|ɑː|n|s|ᵻ|b|əl| f|ɔːɹ| ɪ|m|oʊ|ʃ|ə|n|əl| ɹ|ɛ|ɡ|j|ʊ|l|eɪ|ʃ|ə|n| æ|n|d| l|ɜː|n|ɪ|ŋ| !"
 # -----------------------------------------------------------------------------
@@ -56,79 +17,84 @@ EXAMPLE_PHONEMES = [
 class TextProcessingTextCase(unittest.TestCase):
     """Tests for text to phoneme conversion"""
-    def test_all_phonemes_in_tts(self):
-        """Ensure that all phonemes from gruut are present in TTS phonemes"""
-        tts_phonemes = set(all_phonemes)
-        # Check stress characters
-        for suprasegmental in [IPA.STRESS_PRIMARY, IPA.STRESS_SECONDARY]:
-            self.assertIn(suprasegmental, tts_phonemes)
-        # Check that gruut's phonemes are a subset of TTS phonemes
-        for lang in gruut.get_supported_languages():
-            for phoneme in Phonemes.from_language(lang):
-                for codepoint in clean_gruut_phonemes(phoneme.text):
-                    self.assertIn(codepoint, tts_phonemes)
     def test_phoneme_to_sequence(self):
-        """Verify example (text -> sequence -> phoneme string) pipeline"""
-        lang = "en-us"
-        expected_phoneme_str = " ".join(
-            "".join(clean_gruut_phonemes(word_phonemes)) for word_phonemes in EXAMPLE_PHONEMES
-        )
-        # Ensure that TTS produces same phoneme string
-        text_cleaner = ["phoneme_cleaners"]
-        actual_sequence = phoneme_to_sequence(EXAMPLE_TEXT, text_cleaner, lang)
-        actual_phoneme_str = sequence_to_phoneme(actual_sequence)
-        self.assertEqual(actual_phoneme_str, expected_phoneme_str)
+        """Verify en-us sentence phonemes without blank token"""
+        self._test_phoneme_to_sequence(add_blank=False)
     def test_phoneme_to_sequence_with_blank_token(self):
-        """Verify example (text -> sequence -> phoneme string) pipeline with blank token"""
-        lang = "en-us"
+        """Verify en-us sentence phonemes with blank token"""
+        self._test_phoneme_to_sequence(add_blank=True)
+    def _test_phoneme_to_sequence(self, add_blank):
         text_cleaner = ["phoneme_cleaners"]
+        sequence = phoneme_to_sequence(EXAMPLE_TEXT, text_cleaner, LANG, add_blank=add_blank, use_espeak_phonemes=True)
+        text_hat = sequence_to_phoneme(sequence)
+        text_hat_with_params = sequence_to_phoneme(sequence)
+        gt = EXPECTED_PHONEMES.replace("|", "")
+        self.assertEqual(text_hat, text_hat_with_params)
+        self.assertEqual(text_hat, gt)
-        # Create with/without blank sequences
-        sequence_without_blank = phoneme_to_sequence(EXAMPLE_TEXT, text_cleaner, lang, add_blank=False)
-        sequence_with_blank = phoneme_to_sequence(EXAMPLE_TEXT, text_cleaner, lang, add_blank=True)
-        # With blank sequence should be bigger
-        self.assertGreater(len(sequence_with_blank), len(sequence_without_blank))
-        # But phoneme strings should still be identical
-        phoneme_str_without_blank = sequence_to_phoneme(sequence_without_blank, add_blank=False)
-        phoneme_str_with_blank = sequence_to_phoneme(sequence_with_blank, add_blank=True)
-        self.assertEqual(phoneme_str_with_blank, phoneme_str_without_blank)
-    def test_messy_text(self):
-        """Verify text with extra punctuation/whitespace/etc. makes it through the pipeline"""
-        text = '"Be" a! voice, [NOT]? (an eCHo. '
-        lang = "en-us"
-        expected_phonemes = [
-            ["b", "ˈi"],
-            ["ə"],
-            ["!"],
-            ["v", "ˈɔɪ", "s"],
-            [","],
-            ["n", "ˈɑ", "t"],
-            ["?"],
-            ["ə", "n"],
-            ["ˈɛ", "k", "oʊ"],
-            ["."],
-        ]
-        expected_phoneme_str = " ".join(
-            "".join(clean_gruut_phonemes(word_phonemes)) for word_phonemes in expected_phonemes
-        )
-        # Ensure that TTS produces same phoneme string
-        text_cleaner = ["phoneme_cleaners"]
-        actual_sequence = phoneme_to_sequence(text, text_cleaner, lang)
-        actual_phoneme_str = sequence_to_phoneme(actual_sequence)
-        self.assertEqual(actual_phoneme_str, expected_phoneme_str)
+        # multiple punctuations
+        text = "Be a voice, not an! echo?"
+        sequence = phoneme_to_sequence(text, text_cleaner, LANG, add_blank=add_blank, use_espeak_phonemes=True)
+        text_hat = sequence_to_phoneme(sequence)
+        text_hat_with_params = sequence_to_phoneme(sequence)
+        gt = "biː ɐ vɔɪs , nɑːt ɐn ! ɛkoʊ ?"
+        print(text_hat)
+        print(len(sequence))
+        self.assertEqual(text_hat, text_hat_with_params)
+        self.assertEqual(text_hat, gt)
+        # not ending with punctuation
+        text = "Be a voice, not an! echo"
+        sequence = phoneme_to_sequence(text, text_cleaner, LANG, add_blank=add_blank, use_espeak_phonemes=True)
+        text_hat = sequence_to_phoneme(sequence)
+        text_hat_with_params = sequence_to_phoneme(sequence)
+        gt = "biː ɐ vɔɪs , nɑːt ɐn ! ɛkoʊ"
+        print(text_hat)
+        print(len(sequence))
+        self.assertEqual(text_hat, text_hat_with_params)
+        self.assertEqual(text_hat, gt)
+        # original
+        text = "Be a voice, not an echo!"
+        sequence = phoneme_to_sequence(text, text_cleaner, LANG, add_blank=add_blank, use_espeak_phonemes=True)
+        text_hat = sequence_to_phoneme(sequence)
+        text_hat_with_params = sequence_to_phoneme(sequence)
+        gt = "biː ɐ vɔɪs , nɑːt ɐn ɛkoʊ !"
+        print(text_hat)
+        print(len(sequence))
+        self.assertEqual(text_hat, text_hat_with_params)
+        self.assertEqual(text_hat, gt)
+        # extra space after the sentence
+        text = "Be a voice, not an! echo. "
+        sequence = phoneme_to_sequence(text, text_cleaner, LANG, add_blank=add_blank, use_espeak_phonemes=True)
+        text_hat = sequence_to_phoneme(sequence)
+        text_hat_with_params = sequence_to_phoneme(sequence)
+        gt = "biː ɐ vɔɪs , nɑːt ɐn ! ɛkoʊ ."
+        print(text_hat)
+        print(len(sequence))
+        self.assertEqual(text_hat, text_hat_with_params)
+        self.assertEqual(text_hat, gt)
+        # extra space after the sentence
+        text = "Be a voice, not an! echo. "
+        sequence = phoneme_to_sequence(
+            text, text_cleaner, LANG, enable_eos_bos=True, add_blank=add_blank, use_espeak_phonemes=True
+        )
+        text_hat = sequence_to_phoneme(sequence)
+        text_hat_with_params = sequence_to_phoneme(sequence)
+        gt = "^biː ɐ vɔɪs , nɑːt ɐn ! ɛkoʊ .~"
+        print(text_hat)
+        print(len(sequence))
+        self.assertEqual(text_hat, text_hat_with_params)
+        self.assertEqual(text_hat, gt)
+    def test_text2phone(self):
+        text = "Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase, the grey matter in the parts of the brain responsible for emotional regulation and learning!"
+        ph = text2phone(EXAMPLE_TEXT, LANG)
+        self.assertEqual(ph, EXPECTED_PHONEMES)
 # -----------------------------------------------------------------------------