mirror of https://github.com/coqui-ai/TTS.git
Use eSpeak IPA lexicons by default for phoneme models
commit 4d8426fa0a (parent 618b509204)
@@ -97,6 +97,8 @@ class BaseTTSConfig(BaseTrainingConfig):
             Audio processor config object instance.
         use_phonemes (bool):
            enable / disable phoneme use.
+        use_espeak_phonemes (bool):
+            enable / disable eSpeak-compatible phonemes (only if use_phonemes = `True`).
         compute_input_seq_cache (bool):
             enable / disable precomputation of the phoneme sequences. At the expense of some delay at the beginning of
             the training, It allows faster data loader time and precise limitation with `max_seq_len` and
@@ -136,6 +138,7 @@ class BaseTTSConfig(BaseTrainingConfig):
     audio: BaseAudioConfig = field(default_factory=BaseAudioConfig)
     # phoneme settings
     use_phonemes: bool = False
+    use_espeak_phonemes: bool = True
     phoneme_language: str = None
     compute_input_seq_cache: bool = False
     text_cleaner: str = MISSING
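With this commit, any config derived from BaseTTSConfig defaults to eSpeak-compatible phonemes whenever phoneme inputs are enabled. A minimal sketch of the relevant settings (the import path and the direct use of BaseTTSConfig are illustrative assumptions; the field names come from the hunk above):

    from TTS.tts.configs.shared_configs import BaseTTSConfig  # assumed module path

    config = BaseTTSConfig(
        use_phonemes=True,           # enable phoneme inputs
        use_espeak_phonemes=True,    # new flag; defaults to True in this commit
        phoneme_language="en-us",
        text_cleaner="phoneme_cleaners",
    )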
@@ -25,6 +25,7 @@ def text_to_seqvec(text, CONFIG):
                 CONFIG.enable_eos_bos_chars,
                 tp=CONFIG.characters,
                 add_blank=CONFIG.add_blank,
+                use_espeak_phonemes=CONFIG.use_espeak_phonemes
             ),
             dtype=np.int32,
         )
@@ -31,46 +31,7 @@ PHONEME_PUNCTUATION_PATTERN = r"[" + _punctuations.replace(" ", "") + "]+"
 GRUUT_TRANS_TABLE = str.maketrans("g", "ɡ")
 
 
-def clean_gruut_phonemes(ph_list):
-    """Decompose, substitute, and clean gruut phonemes for TTS.
-
-    gruut phonemes may contain any IPA characters (e.g., "ẽ" for the nasalized
-    "e"), and may be composed of multiple characters (e.g., "aɪ" in the English
-    "r[i]ce").
-
-    TTS phonemes come from a fixed set of symbols, and do not include every
-    possible variation of every vowel/consonant. Here, we decompose dipthongs,
-    etc. into single characters and then filter out Unicode combining characters
-    such as ties. This ensures that (most) phonemes will exist in the TTS symbol
-    table.
-
-    Args:
-        ph_list (list[str]): list of phonemes from gruut
-
-    Returns:
-        clean_list (list[str]): decomposed/clean list of phonemes for TTS
-    """
-    cleaned_phonemes = []
-
-    for phoneme_text in ph_list:
-        phoneme_text = unicodedata.normalize("NFC", phoneme_text)
-        if phoneme_text in phonemes:
-            cleaned_phonemes.append(phoneme_text)
-            continue
-
-        # Decompose into codepoints (ã -> ["a", "\u0303"])
-        phoneme_text = unicodedata.normalize("NFD", phoneme_text)
-        for codepoint in phoneme_text.translate(GRUUT_TRANS_TABLE):
-            if unicodedata.combining(codepoint) > 0:
-                # Skip combining characters like ties
-                continue
-
-            cleaned_phonemes.append(codepoint)
-
-    return cleaned_phonemes
-
-
-def text2phone(text, language):
+def text2phone(text, language, use_espeak_phonemes=False):
     """Convert graphemes to phonemes.
     Parameters:
         text (str): text to phonemize
@@ -93,21 +54,32 @@ def text2phone(text, language):
 
     if gruut.is_language_supported(language):
         # Use gruut for phonemization
+        phonemizer_args = {
+            "remove_stress": True,
+            "ipa_minor_breaks": False,  # don't replace commas/semi-colons with IPA |
+            "ipa_major_breaks": False,  # don't replace periods with IPA ‖
+        }
+
+        if use_espeak_phonemes:
+            # Use a lexicon/g2p model train on eSpeak IPA instead of gruut IPA.
+            # This is intended for backwards compatibility with TTS<=v0.0.13
+            # pre-trained models.
+            phonemizer_args["model_prefix"] = "espeak"
+
         ph_list = gruut.text_to_phonemes(
             text,
             lang=language,
             return_format="word_phonemes",
-            phonemizer_args={
-                "remove_accents": True,  # remove accute/grave accents (Swedish)
-                "ipa_minor_breaks": False,  # don't replace commas/semi-colons with IPA |
-                "ipa_major_breaks": False,  # don't replace periods with IPA ‖
-            },
+            phonemizer_args=phonemizer_args,
         )
 
         # Join and re-split to break apart dipthongs, suprasegmentals, etc.
-        ph_words = ["|".join(clean_gruut_phonemes(word_phonemes)) for word_phonemes in ph_list]
+        ph_words = ["|".join(word_phonemes) for word_phonemes in ph_list]
         ph = "| ".join(ph_words)
 
         # Fix a few phonemes
         ph = ph.translate(GRUUT_TRANS_TABLE)
 
         print(" > Phonemes: {}".format(ph))
         return ph
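The new keyword argument only changes which gruut lexicon/g2p model is loaded (the "espeak" prefix instead of gruut's default), so callers opt in per call. A usage sketch, assuming gruut with English data is installed; the exact phoneme output depends on the lexicon version:

    from TTS.tts.utils.text import text2phone

    # eSpeak-style IPA, matching pre-trained TTS <= v0.0.13 models
    ph = text2phone("Be a voice, not an echo.", "en-us", use_espeak_phonemes=True)
    print(ph)  # pipe-separated phonemes per word, roughly "b|iː| ɐ| v|ɔɪ|s| ..."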
@@ -132,7 +104,7 @@ def pad_with_eos_bos(phoneme_sequence, tp=None):
     return [_phonemes_to_id[_bos]] + list(phoneme_sequence) + [_phonemes_to_id[_eos]]
 
 
-def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False, tp=None, add_blank=False):
+def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False, tp=None, add_blank=False, use_espeak_phonemes=False):
     # pylint: disable=global-statement
     global _phonemes_to_id, _phonemes
     if tp:
@@ -141,7 +113,7 @@ def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False, tp=
 
     sequence = []
     clean_text = _clean_text(text, cleaner_names)
-    to_phonemes = text2phone(clean_text, language)
+    to_phonemes = text2phone(clean_text, language, use_espeak_phonemes=use_espeak_phonemes)
     if to_phonemes is None:
         print("!! After phoneme conversion the result is None. -- {} ".format(clean_text))
     # iterate by skipping empty strings - NOTE: might be useful to keep it to have a better intonation.
@@ -152,6 +124,7 @@ def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False, tp=
         sequence = pad_with_eos_bos(sequence, tp=tp)
     if add_blank:
         sequence = intersperse(sequence, len(_phonemes))  # add a blank token (new), whose id number is len(_phonemes)
 
     return sequence
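End to end, the flag is threaded from the config through text_to_seqvec and phoneme_to_sequence down to text2phone. A round-trip sketch mirroring the updated tests later in this diff (requires gruut's English data; the expected string is taken from the test expectations):

    from TTS.tts.utils.text import phoneme_to_sequence, sequence_to_phoneme

    seq = phoneme_to_sequence(
        "Be a voice, not an echo!",
        ["phoneme_cleaners"],
        "en-us",
        add_blank=False,
        use_espeak_phonemes=True,
    )
    print(sequence_to_phoneme(seq))  # per the tests: "biː ɐ vɔɪs , nɑːt ɐn ɛkoʊ !"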
@@ -23,4 +23,4 @@ coqpit
 mecab-python3==1.0.3
 unidic-lite==1.0.8
 # gruut+supported langs
-gruut[cs,de,es,fr,it,nl,pt,ru,sv]~=1.1.0
+gruut[cs,de,es,fr,it,nl,pt,ru,sv]~=1.2.0
@@ -1,54 +1,15 @@
 """Tests for text to phoneme converstion"""
 
 import unittest
 
-import gruut
-from gruut_ipa import IPA, Phonemes
-
-from TTS.tts.utils.text import clean_gruut_phonemes, phoneme_to_sequence
-from TTS.tts.utils.text import phonemes as all_phonemes
-from TTS.tts.utils.text import sequence_to_phoneme
+from TTS.tts.utils.text import phoneme_to_sequence, sequence_to_phoneme, text2phone
 
 # -----------------------------------------------------------------------------
 
+LANG = "en-us"
+
 EXAMPLE_TEXT = "Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase, the grey matter in the parts of the brain responsible for emotional regulation and learning!"
 
-# Raw phonemes from run of gruut with example text (en-us).
-# This includes IPA ties, etc.
-EXAMPLE_PHONEMES = [
-    ["ɹ", "ˈi", "s", "ə", "n", "t"],
-    ["ɹ", "i", "s", "ˈɚ", "t͡ʃ"],
-    ["ˈæ", "t"],
-    ["h", "ˈɑ", "ɹ", "v", "ɚ", "d"],
-    ["h", "ˈæ", "z"],
-    ["ʃ", "ˈoʊ", "n"],
-    ["m", "ˈɛ", "d", "ɪ", "t", "ˌeɪ", "t", "ɪ", "ŋ"],
-    ["f", "ɚ"],
-    ["ˈæ", "z"],
-    ["l", "ˈɪ", "t", "ə", "l"],
-    ["ˈæ", "z"],
-    ["ˈeɪ", "t"],
-    ["w", "ˈi", "k", "s"],
-    ["k", "ə", "n"],
-    ["ˈæ", "k", "t͡ʃ", "ə", "l", "i"],
-    ["ɪ", "ŋ", "k", "ɹ", "ˈi", "s"],
-    [","],
-    ["ð", "ə"],
-    ["ɡ", "ɹ", "ˈeɪ"],
-    ["m", "ˈæ", "t", "ɚ"],
-    ["ˈɪ", "n"],
-    ["ð", "ə"],
-    ["p", "ˈɑ", "ɹ", "t", "s"],
-    ["ə", "v"],
-    ["ð", "ə"],
-    ["b", "ɹ", "ˈeɪ", "n"],
-    ["ɹ", "i", "s", "p", "ˈɑ", "n", "s", "ɪ", "b", "ə", "l"],
-    ["f", "ɚ"],
-    ["ɪ", "m", "ˈoʊ", "ʃ", "ə", "n", "ə", "l"],
-    ["ɹ", "ˌɛ", "ɡ", "j", "ə", "l", "ˈeɪ", "ʃ", "ə", "n"],
-    ["ˈæ", "n", "d"],
-    ["l", "ˈɚ", "n", "ɪ", "ŋ"],
-    ["!"],
-]
+EXPECTED_PHONEMES = "ɹ|iː|s|ə|n|t| ɹ|ᵻ|s|ɜː|tʃ| æ|t| h|ɑːɹ|v|ɚ|d| h|æ|z| ʃ|oʊ|n| m|ɛ|d|ᵻ|t|eɪ|ɾ|ɪ|ŋ| f|ɔːɹ| æ|z| l|ɪ|ɾ|əl| æ|z| eɪ|t| w|iː|k|s| k|æ|n| æ|k|tʃ|uː|əl|i| ɪ|ŋ|k|ɹ|iː|s| ,| ð|ə| ɡ|ɹ|eɪ| m|æ|ɾ|ɚ| ɪ|n| ð|ə| p|ɑːɹ|t|s| ʌ|v| ð|ə| b|ɹ|eɪ|n| ɹ|ᵻ|s|p|ɑː|n|s|ᵻ|b|əl| f|ɔːɹ| ɪ|m|oʊ|ʃ|ə|n|əl| ɹ|ɛ|ɡ|j|ʊ|l|eɪ|ʃ|ə|n| æ|n|d| l|ɜː|n|ɪ|ŋ| !"
 
 # -----------------------------------------------------------------------------
@@ -56,79 +17,84 @@ EXAMPLE_PHONEMES = [
 class TextProcessingTextCase(unittest.TestCase):
     """Tests for text to phoneme conversion"""
 
-    def test_all_phonemes_in_tts(self):
-        """Ensure that all phonemes from gruut are present in TTS phonemes"""
-        tts_phonemes = set(all_phonemes)
-
-        # Check stress characters
-        for suprasegmental in [IPA.STRESS_PRIMARY, IPA.STRESS_SECONDARY]:
-            self.assertIn(suprasegmental, tts_phonemes)
-
-        # Check that gruut's phonemes are a subset of TTS phonemes
-        for lang in gruut.get_supported_languages():
-            for phoneme in Phonemes.from_language(lang):
-                for codepoint in clean_gruut_phonemes(phoneme.text):
-                    self.assertIn(codepoint, tts_phonemes)
-
     def test_phoneme_to_sequence(self):
-        """Verify example (text -> sequence -> phoneme string) pipeline"""
-        lang = "en-us"
-        expected_phoneme_str = " ".join(
-            "".join(clean_gruut_phonemes(word_phonemes)) for word_phonemes in EXAMPLE_PHONEMES
-        )
-
-        # Ensure that TTS produces same phoneme string
-        text_cleaner = ["phoneme_cleaners"]
-        actual_sequence = phoneme_to_sequence(EXAMPLE_TEXT, text_cleaner, lang)
-        actual_phoneme_str = sequence_to_phoneme(actual_sequence)
-
-        self.assertEqual(actual_phoneme_str, expected_phoneme_str)
+        """Verify en-us sentence phonemes without blank token"""
+        self._test_phoneme_to_sequence(add_blank=False)
 
     def test_phoneme_to_sequence_with_blank_token(self):
-        """Verify example (text -> sequence -> phoneme string) pipeline with blank token"""
-        lang = "en-us"
+        """Verify en-us sentence phonemes with blank token"""
+        self._test_phoneme_to_sequence(add_blank=True)
 
+    def _test_phoneme_to_sequence(self, add_blank):
         text_cleaner = ["phoneme_cleaners"]
+        sequence = phoneme_to_sequence(EXAMPLE_TEXT, text_cleaner, LANG, add_blank=add_blank, use_espeak_phonemes=True)
+        text_hat = sequence_to_phoneme(sequence)
+        text_hat_with_params = sequence_to_phoneme(sequence)
+        gt = EXPECTED_PHONEMES.replace("|", "")
+        self.assertEqual(text_hat, text_hat_with_params)
+        self.assertEqual(text_hat, gt)
 
-        # Create with/without blank sequences
-        sequence_without_blank = phoneme_to_sequence(EXAMPLE_TEXT, text_cleaner, lang, add_blank=False)
-        sequence_with_blank = phoneme_to_sequence(EXAMPLE_TEXT, text_cleaner, lang, add_blank=True)
+        # multiple punctuations
+        text = "Be a voice, not an! echo?"
+        sequence = phoneme_to_sequence(text, text_cleaner, LANG, add_blank=add_blank, use_espeak_phonemes=True)
+        text_hat = sequence_to_phoneme(sequence)
+        text_hat_with_params = sequence_to_phoneme(sequence)
+        gt = "biː ɐ vɔɪs , nɑːt ɐn ! ɛkoʊ ?"
+        print(text_hat)
+        print(len(sequence))
+        self.assertEqual(text_hat, text_hat_with_params)
+        self.assertEqual(text_hat, gt)
 
-        # With blank sequence should be bigger
-        self.assertGreater(len(sequence_with_blank), len(sequence_without_blank))
+        # not ending with punctuation
+        text = "Be a voice, not an! echo"
+        sequence = phoneme_to_sequence(text, text_cleaner, LANG, add_blank=add_blank, use_espeak_phonemes=True)
+        text_hat = sequence_to_phoneme(sequence)
+        text_hat_with_params = sequence_to_phoneme(sequence)
+        gt = "biː ɐ vɔɪs , nɑːt ɐn ! ɛkoʊ"
+        print(text_hat)
+        print(len(sequence))
+        self.assertEqual(text_hat, text_hat_with_params)
+        self.assertEqual(text_hat, gt)
 
-        # But phoneme strings should still be identical
-        phoneme_str_without_blank = sequence_to_phoneme(sequence_without_blank, add_blank=False)
-        phoneme_str_with_blank = sequence_to_phoneme(sequence_with_blank, add_blank=True)
+        # original
+        text = "Be a voice, not an echo!"
+        sequence = phoneme_to_sequence(text, text_cleaner, LANG, add_blank=add_blank, use_espeak_phonemes=True)
+        text_hat = sequence_to_phoneme(sequence)
+        text_hat_with_params = sequence_to_phoneme(sequence)
+        gt = "biː ɐ vɔɪs , nɑːt ɐn ɛkoʊ !"
+        print(text_hat)
+        print(len(sequence))
+        self.assertEqual(text_hat, text_hat_with_params)
+        self.assertEqual(text_hat, gt)
 
-        self.assertEqual(phoneme_str_with_blank, phoneme_str_without_blank)
+        # extra space after the sentence
+        text = "Be a voice, not an! echo. "
+        sequence = phoneme_to_sequence(text, text_cleaner, LANG, add_blank=add_blank, use_espeak_phonemes=True)
+        text_hat = sequence_to_phoneme(sequence)
+        text_hat_with_params = sequence_to_phoneme(sequence)
+        gt = "biː ɐ vɔɪs , nɑːt ɐn ! ɛkoʊ ."
+        print(text_hat)
+        print(len(sequence))
+        self.assertEqual(text_hat, text_hat_with_params)
+        self.assertEqual(text_hat, gt)
 
-    def test_messy_text(self):
-        """Verify text with extra punctuation/whitespace/etc. makes it through the pipeline"""
-        text = '"Be" a! voice, [NOT]? (an eCHo. '
-        lang = "en-us"
-        expected_phonemes = [
-            ["b", "ˈi"],
-            ["ə"],
-            ["!"],
-            ["v", "ˈɔɪ", "s"],
-            [","],
-            ["n", "ˈɑ", "t"],
-            ["?"],
-            ["ə", "n"],
-            ["ˈɛ", "k", "oʊ"],
-            ["."],
-        ]
-        expected_phoneme_str = " ".join(
-            "".join(clean_gruut_phonemes(word_phonemes)) for word_phonemes in expected_phonemes
+        # extra space after the sentence
+        text = "Be a voice, not an! echo. "
+        sequence = phoneme_to_sequence(
+            text, text_cleaner, LANG, enable_eos_bos=True, add_blank=add_blank, use_espeak_phonemes=True
         )
+        text_hat = sequence_to_phoneme(sequence)
+        text_hat_with_params = sequence_to_phoneme(sequence)
+        gt = "^biː ɐ vɔɪs , nɑːt ɐn ! ɛkoʊ .~"
+        print(text_hat)
+        print(len(sequence))
+        self.assertEqual(text_hat, text_hat_with_params)
+        self.assertEqual(text_hat, gt)
 
-        # Ensure that TTS produces same phoneme string
-        text_cleaner = ["phoneme_cleaners"]
-        actual_sequence = phoneme_to_sequence(text, text_cleaner, lang)
-        actual_phoneme_str = sequence_to_phoneme(actual_sequence)
-
-        self.assertEqual(actual_phoneme_str, expected_phoneme_str)
+    def test_text2phone(self):
+        text = "Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase, the grey matter in the parts of the brain responsible for emotional regulation and learning!"
+        ph = text2phone(EXAMPLE_TEXT, LANG)
+        self.assertEqual(ph, EXPECTED_PHONEMES)
 
 
 # -----------------------------------------------------------------------------