From d245b5d48f01cdaf45479b7f924b8a74b6c58b97 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 8 Jun 2021 09:21:01 +0200 Subject: [PATCH 01/11] bump up v0.0.15.1 --- TTS/VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TTS/VERSION b/TTS/VERSION index ceddfb28..58c4b6e9 100644 --- a/TTS/VERSION +++ b/TTS/VERSION @@ -1 +1 @@ -0.0.15 +0.0.15.1 From 67869e77f9ac4d51f9c972395ca7e0b2916ab14f Mon Sep 17 00:00:00 2001 From: Michael Hansen Date: Wed, 2 Jun 2021 10:35:59 -0400 Subject: [PATCH 02/11] Use gruut for phonemization --- TTS/tts/utils/text/__init__.py | 57 ++++++++++++++++++++++++++++++++++ TTS/utils/manage.py | 8 ++--- requirements.txt | 2 ++ 3 files changed, 63 insertions(+), 4 deletions(-) diff --git a/TTS/tts/utils/text/__init__.py b/TTS/tts/utils/text/__init__.py index f9f44167..49e7a08a 100644 --- a/TTS/tts/utils/text/__init__.py +++ b/TTS/tts/utils/text/__init__.py @@ -2,6 +2,7 @@ import re +import gruut from packaging import version from TTS.tts.utils.text import cleaners @@ -25,6 +26,33 @@ _CURLY_RE = re.compile(r"(.*?)\{(.+?)\}(.*)") # Regular expression matching punctuations, ignoring empty space PHONEME_PUNCTUATION_PATTERN = r"[" + _punctuations.replace(" ", "") + "]+" +# language -> source phoneme -> dest phoneme +# Used to make gruut's phonemes fit better with eSpeak's. +GRUUT_PHONEME_MAP = { + "en-us": { + "i": "iː", + "ɑ": "ɑː", + "ɚ": "ɜːɹ", + }, + "de": { + "ʁ": "ɾ", + "g": "ɡ", + "ʔ": "", + }, + "nl": { + "a": "aː", + "e": "eː", + "ʏ": "ɵ", + "ʋ": "w", + "ɹ": "r", + "ɔː": "oː", + }, + "es": { + "ɾ": "r", + "g": "ɣ", + }, +} + def text2phone(text, language): """Convert graphemes to phonemes. @@ -39,10 +67,39 @@ def text2phone(text, language): # TO REVIEW : How to have a good implementation for this? if language == "zh-CN": ph = chinese_text_to_phonemes(text) + print(" > Phonemes: {}".format(ph)) return ph if language == "ja-jp": ph = japanese_text_to_phonemes(text) + print(" > Phonemes: {}".format(ph)) + return ph + + if gruut.is_language_supported(language): + # Use gruut for phonemization + ph_list = gruut.text_to_phonemes( + text, + lang=language, + return_format="word_phonemes", + phonemizer_args={ + "remove_stress": True, # remove primary/secondary stress + "ipa_minor_breaks": False, # don't replace commas/semi-colons with IPA | + "ipa_major_breaks": False, # don't replace periods with IPA ‖ + }, + ) + + ph_map = GRUUT_PHONEME_MAP.get(language) + if ph_map: + # Re-map phonemes to fit with eSpeak conventions + for word in ph_list: + for p_idx, p in enumerate(word): + word[p_idx] = ph_map.get(p, p) + + # Join and re-split to break apart dipthongs, suprasegmentals, etc. + ph_words = ["|".join(word_phonemes) for word_phonemes in ph_list] + ph = "| ".join(ph_words) + + print(" > Phonemes: {}".format(ph)) return ph raise ValueError(f" [!] Language {language} is not supported for phonemization.") diff --git a/TTS/utils/manage.py b/TTS/utils/manage.py index cf7df7de..f5165079 100644 --- a/TTS/utils/manage.py +++ b/TTS/utils/manage.py @@ -102,10 +102,10 @@ class ModelManager(object): output_model_path = os.path.join(output_path, "model_file.pth.tar") output_config_path = os.path.join(output_path, "config.json") # NOTE : band-aid for removing phoneme support - if "needs_phonemizer" in model_item and model_item["needs_phonemizer"]: - raise RuntimeError( - " [!] Use 🐸TTS <= v0.0.13 for this model. Current version does not support phoneme based models." - ) + # if "needs_phonemizer" in model_item and model_item["needs_phonemizer"]: + # raise RuntimeError( + # " [!] Use 🐸TTS <= v0.0.13 for this model. Current version does not support phoneme based models." + # ) if os.path.exists(output_path): print(f" > {model_name} is already downloaded.") else: diff --git a/requirements.txt b/requirements.txt index fde48978..a2fb4132 100644 --- a/requirements.txt +++ b/requirements.txt @@ -22,3 +22,5 @@ coqpit # japanese g2p deps mecab-python3==1.0.3 unidic-lite==1.0.8 +# gruut+supported langs +gruut[cs,de,es,fr,it,nl,pt,ru,sv]~=1.0.0 From 47191f3eccd9ec42c3db16767e6a0be96e0aaa0d Mon Sep 17 00:00:00 2001 From: Michael Hansen Date: Wed, 9 Jun 2021 11:52:10 -0400 Subject: [PATCH 03/11] Add tests for gruut phonemization --- TTS/__init__.py | 1 - TTS/tts/utils/text/__init__.py | 66 ++++++++-------- tests/test_text_processing.py | 137 +++++++++++++++++++++++++++++++++ 3 files changed, 168 insertions(+), 36 deletions(-) create mode 100644 tests/test_text_processing.py diff --git a/TTS/__init__.py b/TTS/__init__.py index da35faf8..5162d4ec 100644 --- a/TTS/__init__.py +++ b/TTS/__init__.py @@ -1,6 +1,5 @@ import os - with open(os.path.join(os.path.dirname(__file__), "VERSION")) as f: version = f.read().strip() diff --git a/TTS/tts/utils/text/__init__.py b/TTS/tts/utils/text/__init__.py index 49e7a08a..14319c44 100644 --- a/TTS/tts/utils/text/__init__.py +++ b/TTS/tts/utils/text/__init__.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- import re +import unicodedata import gruut from packaging import version @@ -26,32 +27,34 @@ _CURLY_RE = re.compile(r"(.*?)\{(.+?)\}(.*)") # Regular expression matching punctuations, ignoring empty space PHONEME_PUNCTUATION_PATTERN = r"[" + _punctuations.replace(" ", "") + "]+" -# language -> source phoneme -> dest phoneme -# Used to make gruut's phonemes fit better with eSpeak's. -GRUUT_PHONEME_MAP = { - "en-us": { - "i": "iː", - "ɑ": "ɑː", - "ɚ": "ɜːɹ", - }, - "de": { - "ʁ": "ɾ", - "g": "ɡ", - "ʔ": "", - }, - "nl": { - "a": "aː", - "e": "eː", - "ʏ": "ɵ", - "ʋ": "w", - "ɹ": "r", - "ɔː": "oː", - }, - "es": { - "ɾ": "r", - "g": "ɣ", - }, -} +# Table for str.translate to fix gruut/TTS phoneme mismatch +GRUUT_TRANS_TABLE = str.maketrans("g", "ɡ") + + +def clean_gruut_phonemes(ph_list): + """Decompose, substitute, and clean gruut phonemes for TTS. + + Parameters: + ph_list (list[str]): list of phonemes from gruut + + Returns: + clean_list (list[str]): decomposed/clean list of phonemes for TTS + Dipthongs, etc. are decomposed into single characters + Unicode combining characters are removed (e.g., ties) + """ + cleaned_phonemes = [] + + for phoneme_text in ph_list: + # Decompose into codepoints (ã -> ["a", "\u0303"]) + phoneme_text = unicodedata.normalize("NFD", phoneme_text) + for codepoint in phoneme_text.translate(GRUUT_TRANS_TABLE): + if unicodedata.combining(codepoint) > 0: + # Skip combining characters like ties + continue + + cleaned_phonemes.append(codepoint) + + return cleaned_phonemes def text2phone(text, language): @@ -82,21 +85,14 @@ def text2phone(text, language): lang=language, return_format="word_phonemes", phonemizer_args={ - "remove_stress": True, # remove primary/secondary stress + "remove_accents": True, # remove accute/grave accents (Swedish) "ipa_minor_breaks": False, # don't replace commas/semi-colons with IPA | "ipa_major_breaks": False, # don't replace periods with IPA ‖ }, ) - ph_map = GRUUT_PHONEME_MAP.get(language) - if ph_map: - # Re-map phonemes to fit with eSpeak conventions - for word in ph_list: - for p_idx, p in enumerate(word): - word[p_idx] = ph_map.get(p, p) - # Join and re-split to break apart dipthongs, suprasegmentals, etc. - ph_words = ["|".join(word_phonemes) for word_phonemes in ph_list] + ph_words = ["|".join(clean_gruut_phonemes(word_phonemes)) for word_phonemes in ph_list] ph = "| ".join(ph_words) print(" > Phonemes: {}".format(ph)) diff --git a/tests/test_text_processing.py b/tests/test_text_processing.py new file mode 100644 index 00000000..f4938ca0 --- /dev/null +++ b/tests/test_text_processing.py @@ -0,0 +1,137 @@ +"""Tests for text to phoneme converstion""" +import unittest + +import gruut +from gruut_ipa import IPA, Phonemes + +from TTS.tts.utils.text import clean_gruut_phonemes, phoneme_to_sequence +from TTS.tts.utils.text import phonemes as all_phonemes +from TTS.tts.utils.text import sequence_to_phoneme + +# ----------------------------------------------------------------------------- + +EXAMPLE_TEXT = "Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase, the grey matter in the parts of the brain responsible for emotional regulation and learning!" + +# Raw phonemes from run of gruut with example text (en-us). +# This includes IPA ties, etc. +EXAMPLE_PHONEMES = [ + ["ɹ", "ˈi", "s", "ə", "n", "t"], + ["ɹ", "i", "s", "ˈɚ", "t͡ʃ"], + ["ˈæ", "t"], + ["h", "ˈɑ", "ɹ", "v", "ɚ", "d"], + ["h", "ˈæ", "z"], + ["ʃ", "ˈoʊ", "n"], + ["m", "ˈɛ", "d", "ɪ", "t", "ˌeɪ", "t", "ɪ", "ŋ"], + ["f", "ɚ"], + ["ˈæ", "z"], + ["l", "ˈɪ", "t", "ə", "l"], + ["ˈæ", "z"], + ["ˈeɪ", "t"], + ["w", "ˈi", "k", "s"], + ["k", "ə", "n"], + ["ˈæ", "k", "t͡ʃ", "ə", "l", "i"], + ["ɪ", "ŋ", "k", "ɹ", "ˈi", "s"], + [","], + ["ð", "ə"], + ["ɡ", "ɹ", "ˈeɪ"], + ["m", "ˈæ", "t", "ɚ"], + ["ˈɪ", "n"], + ["ð", "ə"], + ["p", "ˈɑ", "ɹ", "t", "s"], + ["ə", "v"], + ["ð", "ə"], + ["b", "ɹ", "ˈeɪ", "n"], + ["ɹ", "i", "s", "p", "ˈɑ", "n", "s", "ɪ", "b", "ə", "l"], + ["f", "ɚ"], + ["ɪ", "m", "ˈoʊ", "ʃ", "ə", "n", "ə", "l"], + ["ɹ", "ˌɛ", "ɡ", "j", "ə", "l", "ˈeɪ", "ʃ", "ə", "n"], + ["ˈæ", "n", "d"], + ["l", "ˈɚ", "n", "ɪ", "ŋ"], + ["!"], +] + +# ----------------------------------------------------------------------------- + + +class TextProcessingTextCase(unittest.TestCase): + """Tests for text to phoneme conversion""" + + def test_all_phonemes_in_tts(self): + """Ensure that all phonemes from gruut are present in TTS phonemes""" + tts_phonemes = set(all_phonemes) + + # Check stress characters + for suprasegmental in [IPA.STRESS_PRIMARY, IPA.STRESS_SECONDARY]: + self.assertIn(suprasegmental, tts_phonemes) + + # Check that gruut's phonemes are a subset of TTS phonemes + for lang in gruut.get_supported_languages(): + for phoneme in Phonemes.from_language(lang): + for codepoint in clean_gruut_phonemes(phoneme.text): + + self.assertIn(codepoint, tts_phonemes) + + def test_phoneme_to_sequence(self): + """Verify example (text -> sequence -> phoneme string) pipeline""" + lang = "en-us" + expected_phoneme_str = " ".join( + "".join(clean_gruut_phonemes(word_phonemes)) for word_phonemes in EXAMPLE_PHONEMES + ) + + # Ensure that TTS produces same phoneme string + text_cleaner = ["phoneme_cleaners"] + actual_sequence = phoneme_to_sequence(EXAMPLE_TEXT, text_cleaner, lang) + actual_phoneme_str = sequence_to_phoneme(actual_sequence) + + self.assertEqual(actual_phoneme_str, expected_phoneme_str) + + def test_phoneme_to_sequence_with_blank_token(self): + """Verify example (text -> sequence -> phoneme string) pipeline with blank token""" + lang = "en-us" + text_cleaner = ["phoneme_cleaners"] + + # Create with/without blank sequences + sequence_without_blank = phoneme_to_sequence(EXAMPLE_TEXT, text_cleaner, lang, add_blank=False) + sequence_with_blank = phoneme_to_sequence(EXAMPLE_TEXT, text_cleaner, lang, add_blank=True) + + # With blank sequence should be bigger + self.assertGreater(len(sequence_with_blank), len(sequence_without_blank)) + + # But phoneme strings should still be identical + phoneme_str_without_blank = sequence_to_phoneme(sequence_without_blank, add_blank=False) + phoneme_str_with_blank = sequence_to_phoneme(sequence_with_blank, add_blank=True) + + self.assertEqual(phoneme_str_with_blank, phoneme_str_without_blank) + + def test_messy_text(self): + """Verify text with extra punctuation/whitespace/etc. makes it through the pipeline""" + text = '"Be" a! voice, [NOT]? (an eCHo. ' + lang = "en-us" + expected_phonemes = [ + ["b", "ˈi"], + ["ə"], + ["!"], + ["v", "ˈɔɪ", "s"], + [","], + ["n", "ˈɑ", "t"], + ["?"], + ["ə", "n"], + ["ˈɛ", "k", "oʊ"], + ["."], + ] + expected_phoneme_str = " ".join( + "".join(clean_gruut_phonemes(word_phonemes)) for word_phonemes in expected_phonemes + ) + + # Ensure that TTS produces same phoneme string + text_cleaner = ["phoneme_cleaners"] + actual_sequence = phoneme_to_sequence(text, text_cleaner, lang) + actual_phoneme_str = sequence_to_phoneme(actual_sequence) + + self.assertEqual(actual_phoneme_str, expected_phoneme_str) + + +# ----------------------------------------------------------------------------- + +if __name__ == "__main__": + unittest.main() From 07e8ff193a9b549da8960a42128b464cfcef904e Mon Sep 17 00:00:00 2001 From: Michael Hansen Date: Wed, 9 Jun 2021 13:44:04 -0400 Subject: [PATCH 04/11] Bump version of gruut to 1.1 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index a2fb4132..cb304693 100644 --- a/requirements.txt +++ b/requirements.txt @@ -23,4 +23,4 @@ coqpit mecab-python3==1.0.3 unidic-lite==1.0.8 # gruut+supported langs -gruut[cs,de,es,fr,it,nl,pt,ru,sv]~=1.0.0 +gruut[cs,de,es,fr,it,nl,pt,ru,sv]~=1.1.0 From da6f6a4a01763256341cc55e633f55c47f49ce2b Mon Sep 17 00:00:00 2001 From: Michael Hansen Date: Thu, 10 Jun 2021 10:08:01 -0400 Subject: [PATCH 05/11] Update docstring for clean_gruut_phonemes --- TTS/tts/utils/text/__init__.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/TTS/tts/utils/text/__init__.py b/TTS/tts/utils/text/__init__.py index 14319c44..3d2f5004 100644 --- a/TTS/tts/utils/text/__init__.py +++ b/TTS/tts/utils/text/__init__.py @@ -34,13 +34,21 @@ GRUUT_TRANS_TABLE = str.maketrans("g", "ɡ") def clean_gruut_phonemes(ph_list): """Decompose, substitute, and clean gruut phonemes for TTS. - Parameters: - ph_list (list[str]): list of phonemes from gruut + gruut phonemes may contain any IPA characters (e.g., "ẽ" for the nasalized + "e"), and may be composed of multiple characters (e.g., "aɪ" in the English + "r[i]ce"). + + TTS phonemes come from a fixed set of symbols, and do not include every + possible variation of every vowel/consonant. Here, we decompose dipthongs, + etc. into single characters and then filter out Unicode combining characters + such as ties. This ensures that (most) phonemes will exist in the TTS symbol + table. + + Args: + ph_list (list[str]): list of phonemes from gruut Returns: - clean_list (list[str]): decomposed/clean list of phonemes for TTS - Dipthongs, etc. are decomposed into single characters - Unicode combining characters are removed (e.g., ties) + clean_list (list[str]): decomposed/clean list of phonemes for TTS """ cleaned_phonemes = [] From 618b509204cf045f20dc972bbb3220ed160ab0ca Mon Sep 17 00:00:00 2001 From: Michael Hansen Date: Fri, 11 Jun 2021 10:43:52 -0400 Subject: [PATCH 06/11] =?UTF-8?q?Use=20combined=20characters=20available?= =?UTF-8?q?=20in=20TTS=20phonemes=20(like=20=C3=A7)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- TTS/tts/utils/text/__init__.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/TTS/tts/utils/text/__init__.py b/TTS/tts/utils/text/__init__.py index 3d2f5004..73bd829c 100644 --- a/TTS/tts/utils/text/__init__.py +++ b/TTS/tts/utils/text/__init__.py @@ -53,6 +53,11 @@ def clean_gruut_phonemes(ph_list): cleaned_phonemes = [] for phoneme_text in ph_list: + phoneme_text = unicodedata.normalize("NFC", phoneme_text) + if phoneme_text in phonemes: + cleaned_phonemes.append(phoneme_text) + continue + # Decompose into codepoints (ã -> ["a", "\u0303"]) phoneme_text = unicodedata.normalize("NFD", phoneme_text) for codepoint in phoneme_text.translate(GRUUT_TRANS_TABLE): From 4d8426fa0a26a5724f562941dfdfe2da1f0e7ee9 Mon Sep 17 00:00:00 2001 From: Michael Hansen Date: Tue, 15 Jun 2021 15:57:08 -0400 Subject: [PATCH 07/11] Use eSpeak IPA lexicons by default for phoneme models --- TTS/tts/configs/shared_configs.py | 3 + TTS/tts/utils/synthesis.py | 1 + TTS/tts/utils/text/__init__.py | 69 ++++-------- requirements.txt | 2 +- tests/test_text_processing.py | 176 ++++++++++++------------------ 5 files changed, 97 insertions(+), 154 deletions(-) diff --git a/TTS/tts/configs/shared_configs.py b/TTS/tts/configs/shared_configs.py index 4690e76f..a501a880 100644 --- a/TTS/tts/configs/shared_configs.py +++ b/TTS/tts/configs/shared_configs.py @@ -97,6 +97,8 @@ class BaseTTSConfig(BaseTrainingConfig): Audio processor config object instance. use_phonemes (bool): enable / disable phoneme use. + use_espeak_phonemes (bool): + enable / disable eSpeak-compatible phonemes (only if use_phonemes = `True`). compute_input_seq_cache (bool): enable / disable precomputation of the phoneme sequences. At the expense of some delay at the beginning of the training, It allows faster data loader time and precise limitation with `max_seq_len` and @@ -136,6 +138,7 @@ class BaseTTSConfig(BaseTrainingConfig): audio: BaseAudioConfig = field(default_factory=BaseAudioConfig) # phoneme settings use_phonemes: bool = False + use_espeak_phonemes: bool = True phoneme_language: str = None compute_input_seq_cache: bool = False text_cleaner: str = MISSING diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py index 9f417a1d..da50f1ca 100644 --- a/TTS/tts/utils/synthesis.py +++ b/TTS/tts/utils/synthesis.py @@ -25,6 +25,7 @@ def text_to_seqvec(text, CONFIG): CONFIG.enable_eos_bos_chars, tp=CONFIG.characters, add_blank=CONFIG.add_blank, + use_espeak_phonemes=CONFIG.use_espeak_phonemes ), dtype=np.int32, ) diff --git a/TTS/tts/utils/text/__init__.py b/TTS/tts/utils/text/__init__.py index 73bd829c..350e5934 100644 --- a/TTS/tts/utils/text/__init__.py +++ b/TTS/tts/utils/text/__init__.py @@ -31,46 +31,7 @@ PHONEME_PUNCTUATION_PATTERN = r"[" + _punctuations.replace(" ", "") + "]+" GRUUT_TRANS_TABLE = str.maketrans("g", "ɡ") -def clean_gruut_phonemes(ph_list): - """Decompose, substitute, and clean gruut phonemes for TTS. - - gruut phonemes may contain any IPA characters (e.g., "ẽ" for the nasalized - "e"), and may be composed of multiple characters (e.g., "aɪ" in the English - "r[i]ce"). - - TTS phonemes come from a fixed set of symbols, and do not include every - possible variation of every vowel/consonant. Here, we decompose dipthongs, - etc. into single characters and then filter out Unicode combining characters - such as ties. This ensures that (most) phonemes will exist in the TTS symbol - table. - - Args: - ph_list (list[str]): list of phonemes from gruut - - Returns: - clean_list (list[str]): decomposed/clean list of phonemes for TTS - """ - cleaned_phonemes = [] - - for phoneme_text in ph_list: - phoneme_text = unicodedata.normalize("NFC", phoneme_text) - if phoneme_text in phonemes: - cleaned_phonemes.append(phoneme_text) - continue - - # Decompose into codepoints (ã -> ["a", "\u0303"]) - phoneme_text = unicodedata.normalize("NFD", phoneme_text) - for codepoint in phoneme_text.translate(GRUUT_TRANS_TABLE): - if unicodedata.combining(codepoint) > 0: - # Skip combining characters like ties - continue - - cleaned_phonemes.append(codepoint) - - return cleaned_phonemes - - -def text2phone(text, language): +def text2phone(text, language, use_espeak_phonemes=False): """Convert graphemes to phonemes. Parameters: text (str): text to phonemize @@ -93,21 +54,32 @@ def text2phone(text, language): if gruut.is_language_supported(language): # Use gruut for phonemization + phonemizer_args={ + "remove_stress": True, + "ipa_minor_breaks": False, # don't replace commas/semi-colons with IPA | + "ipa_major_breaks": False, # don't replace periods with IPA ‖ + } + + if use_espeak_phonemes: + # Use a lexicon/g2p model train on eSpeak IPA instead of gruut IPA. + # This is intended for backwards compatibility with TTS<=v0.0.13 + # pre-trained models. + phonemizer_args["model_prefix"] = "espeak" + ph_list = gruut.text_to_phonemes( text, lang=language, return_format="word_phonemes", - phonemizer_args={ - "remove_accents": True, # remove accute/grave accents (Swedish) - "ipa_minor_breaks": False, # don't replace commas/semi-colons with IPA | - "ipa_major_breaks": False, # don't replace periods with IPA ‖ - }, + phonemizer_args=phonemizer_args, ) # Join and re-split to break apart dipthongs, suprasegmentals, etc. - ph_words = ["|".join(clean_gruut_phonemes(word_phonemes)) for word_phonemes in ph_list] + ph_words = ["|".join(word_phonemes) for word_phonemes in ph_list] ph = "| ".join(ph_words) + # Fix a few phonemes + ph = ph.translate(GRUUT_TRANS_TABLE) + print(" > Phonemes: {}".format(ph)) return ph @@ -132,7 +104,7 @@ def pad_with_eos_bos(phoneme_sequence, tp=None): return [_phonemes_to_id[_bos]] + list(phoneme_sequence) + [_phonemes_to_id[_eos]] -def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False, tp=None, add_blank=False): +def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False, tp=None, add_blank=False, use_espeak_phonemes=False): # pylint: disable=global-statement global _phonemes_to_id, _phonemes if tp: @@ -141,7 +113,7 @@ def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False, tp= sequence = [] clean_text = _clean_text(text, cleaner_names) - to_phonemes = text2phone(clean_text, language) + to_phonemes = text2phone(clean_text, language, use_espeak_phonemes=use_espeak_phonemes) if to_phonemes is None: print("!! After phoneme conversion the result is None. -- {} ".format(clean_text)) # iterate by skipping empty strings - NOTE: might be useful to keep it to have a better intonation. @@ -152,6 +124,7 @@ def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False, tp= sequence = pad_with_eos_bos(sequence, tp=tp) if add_blank: sequence = intersperse(sequence, len(_phonemes)) # add a blank token (new), whose id number is len(_phonemes) + return sequence diff --git a/requirements.txt b/requirements.txt index cb304693..046139d0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -23,4 +23,4 @@ coqpit mecab-python3==1.0.3 unidic-lite==1.0.8 # gruut+supported langs -gruut[cs,de,es,fr,it,nl,pt,ru,sv]~=1.1.0 +gruut[cs,de,es,fr,it,nl,pt,ru,sv]~=1.2.0 diff --git a/tests/test_text_processing.py b/tests/test_text_processing.py index f4938ca0..17ee755e 100644 --- a/tests/test_text_processing.py +++ b/tests/test_text_processing.py @@ -1,54 +1,15 @@ """Tests for text to phoneme converstion""" import unittest -import gruut -from gruut_ipa import IPA, Phonemes - -from TTS.tts.utils.text import clean_gruut_phonemes, phoneme_to_sequence -from TTS.tts.utils.text import phonemes as all_phonemes -from TTS.tts.utils.text import sequence_to_phoneme +from TTS.tts.utils.text import phoneme_to_sequence, sequence_to_phoneme, text2phone # ----------------------------------------------------------------------------- +LANG = "en-us" + EXAMPLE_TEXT = "Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase, the grey matter in the parts of the brain responsible for emotional regulation and learning!" -# Raw phonemes from run of gruut with example text (en-us). -# This includes IPA ties, etc. -EXAMPLE_PHONEMES = [ - ["ɹ", "ˈi", "s", "ə", "n", "t"], - ["ɹ", "i", "s", "ˈɚ", "t͡ʃ"], - ["ˈæ", "t"], - ["h", "ˈɑ", "ɹ", "v", "ɚ", "d"], - ["h", "ˈæ", "z"], - ["ʃ", "ˈoʊ", "n"], - ["m", "ˈɛ", "d", "ɪ", "t", "ˌeɪ", "t", "ɪ", "ŋ"], - ["f", "ɚ"], - ["ˈæ", "z"], - ["l", "ˈɪ", "t", "ə", "l"], - ["ˈæ", "z"], - ["ˈeɪ", "t"], - ["w", "ˈi", "k", "s"], - ["k", "ə", "n"], - ["ˈæ", "k", "t͡ʃ", "ə", "l", "i"], - ["ɪ", "ŋ", "k", "ɹ", "ˈi", "s"], - [","], - ["ð", "ə"], - ["ɡ", "ɹ", "ˈeɪ"], - ["m", "ˈæ", "t", "ɚ"], - ["ˈɪ", "n"], - ["ð", "ə"], - ["p", "ˈɑ", "ɹ", "t", "s"], - ["ə", "v"], - ["ð", "ə"], - ["b", "ɹ", "ˈeɪ", "n"], - ["ɹ", "i", "s", "p", "ˈɑ", "n", "s", "ɪ", "b", "ə", "l"], - ["f", "ɚ"], - ["ɪ", "m", "ˈoʊ", "ʃ", "ə", "n", "ə", "l"], - ["ɹ", "ˌɛ", "ɡ", "j", "ə", "l", "ˈeɪ", "ʃ", "ə", "n"], - ["ˈæ", "n", "d"], - ["l", "ˈɚ", "n", "ɪ", "ŋ"], - ["!"], -] +EXPECTED_PHONEMES = "ɹ|iː|s|ə|n|t| ɹ|ᵻ|s|ɜː|tʃ| æ|t| h|ɑːɹ|v|ɚ|d| h|æ|z| ʃ|oʊ|n| m|ɛ|d|ᵻ|t|eɪ|ɾ|ɪ|ŋ| f|ɔːɹ| æ|z| l|ɪ|ɾ|əl| æ|z| eɪ|t| w|iː|k|s| k|æ|n| æ|k|tʃ|uː|əl|i| ɪ|ŋ|k|ɹ|iː|s| ,| ð|ə| ɡ|ɹ|eɪ| m|æ|ɾ|ɚ| ɪ|n| ð|ə| p|ɑːɹ|t|s| ʌ|v| ð|ə| b|ɹ|eɪ|n| ɹ|ᵻ|s|p|ɑː|n|s|ᵻ|b|əl| f|ɔːɹ| ɪ|m|oʊ|ʃ|ə|n|əl| ɹ|ɛ|ɡ|j|ʊ|l|eɪ|ʃ|ə|n| æ|n|d| l|ɜː|n|ɪ|ŋ| !" # ----------------------------------------------------------------------------- @@ -56,79 +17,84 @@ EXAMPLE_PHONEMES = [ class TextProcessingTextCase(unittest.TestCase): """Tests for text to phoneme conversion""" - def test_all_phonemes_in_tts(self): - """Ensure that all phonemes from gruut are present in TTS phonemes""" - tts_phonemes = set(all_phonemes) - - # Check stress characters - for suprasegmental in [IPA.STRESS_PRIMARY, IPA.STRESS_SECONDARY]: - self.assertIn(suprasegmental, tts_phonemes) - - # Check that gruut's phonemes are a subset of TTS phonemes - for lang in gruut.get_supported_languages(): - for phoneme in Phonemes.from_language(lang): - for codepoint in clean_gruut_phonemes(phoneme.text): - - self.assertIn(codepoint, tts_phonemes) - def test_phoneme_to_sequence(self): - """Verify example (text -> sequence -> phoneme string) pipeline""" - lang = "en-us" - expected_phoneme_str = " ".join( - "".join(clean_gruut_phonemes(word_phonemes)) for word_phonemes in EXAMPLE_PHONEMES - ) - - # Ensure that TTS produces same phoneme string - text_cleaner = ["phoneme_cleaners"] - actual_sequence = phoneme_to_sequence(EXAMPLE_TEXT, text_cleaner, lang) - actual_phoneme_str = sequence_to_phoneme(actual_sequence) - - self.assertEqual(actual_phoneme_str, expected_phoneme_str) + """Verify en-us sentence phonemes without blank token""" + self._test_phoneme_to_sequence(add_blank=False) def test_phoneme_to_sequence_with_blank_token(self): - """Verify example (text -> sequence -> phoneme string) pipeline with blank token""" - lang = "en-us" + """Verify en-us sentence phonemes with blank token""" + self._test_phoneme_to_sequence(add_blank=True) + + def _test_phoneme_to_sequence(self, add_blank): text_cleaner = ["phoneme_cleaners"] + sequence = phoneme_to_sequence(EXAMPLE_TEXT, text_cleaner, LANG, add_blank=add_blank, use_espeak_phonemes=True) + text_hat = sequence_to_phoneme(sequence) + text_hat_with_params = sequence_to_phoneme(sequence) + gt = EXPECTED_PHONEMES.replace("|", "") + self.assertEqual(text_hat, text_hat_with_params) + self.assertEqual(text_hat, gt) - # Create with/without blank sequences - sequence_without_blank = phoneme_to_sequence(EXAMPLE_TEXT, text_cleaner, lang, add_blank=False) - sequence_with_blank = phoneme_to_sequence(EXAMPLE_TEXT, text_cleaner, lang, add_blank=True) + # multiple punctuations + text = "Be a voice, not an! echo?" + sequence = phoneme_to_sequence(text, text_cleaner, LANG, add_blank=add_blank, use_espeak_phonemes=True) + text_hat = sequence_to_phoneme(sequence) + text_hat_with_params = sequence_to_phoneme(sequence) + gt = "biː ɐ vɔɪs , nɑːt ɐn ! ɛkoʊ ?" + print(text_hat) + print(len(sequence)) + self.assertEqual(text_hat, text_hat_with_params) + self.assertEqual(text_hat, gt) - # With blank sequence should be bigger - self.assertGreater(len(sequence_with_blank), len(sequence_without_blank)) + # not ending with punctuation + text = "Be a voice, not an! echo" + sequence = phoneme_to_sequence(text, text_cleaner, LANG, add_blank=add_blank, use_espeak_phonemes=True) + text_hat = sequence_to_phoneme(sequence) + text_hat_with_params = sequence_to_phoneme(sequence) + gt = "biː ɐ vɔɪs , nɑːt ɐn ! ɛkoʊ" + print(text_hat) + print(len(sequence)) + self.assertEqual(text_hat, text_hat_with_params) + self.assertEqual(text_hat, gt) - # But phoneme strings should still be identical - phoneme_str_without_blank = sequence_to_phoneme(sequence_without_blank, add_blank=False) - phoneme_str_with_blank = sequence_to_phoneme(sequence_with_blank, add_blank=True) + # original + text = "Be a voice, not an echo!" + sequence = phoneme_to_sequence(text, text_cleaner, LANG, add_blank=add_blank, use_espeak_phonemes=True) + text_hat = sequence_to_phoneme(sequence) + text_hat_with_params = sequence_to_phoneme(sequence) + gt = "biː ɐ vɔɪs , nɑːt ɐn ɛkoʊ !" + print(text_hat) + print(len(sequence)) + self.assertEqual(text_hat, text_hat_with_params) + self.assertEqual(text_hat, gt) - self.assertEqual(phoneme_str_with_blank, phoneme_str_without_blank) + # extra space after the sentence + text = "Be a voice, not an! echo. " + sequence = phoneme_to_sequence(text, text_cleaner, LANG, add_blank=add_blank, use_espeak_phonemes=True) + text_hat = sequence_to_phoneme(sequence) + text_hat_with_params = sequence_to_phoneme(sequence) + gt = "biː ɐ vɔɪs , nɑːt ɐn ! ɛkoʊ ." + print(text_hat) + print(len(sequence)) + self.assertEqual(text_hat, text_hat_with_params) + self.assertEqual(text_hat, gt) - def test_messy_text(self): - """Verify text with extra punctuation/whitespace/etc. makes it through the pipeline""" - text = '"Be" a! voice, [NOT]? (an eCHo. ' - lang = "en-us" - expected_phonemes = [ - ["b", "ˈi"], - ["ə"], - ["!"], - ["v", "ˈɔɪ", "s"], - [","], - ["n", "ˈɑ", "t"], - ["?"], - ["ə", "n"], - ["ˈɛ", "k", "oʊ"], - ["."], - ] - expected_phoneme_str = " ".join( - "".join(clean_gruut_phonemes(word_phonemes)) for word_phonemes in expected_phonemes + # extra space after the sentence + text = "Be a voice, not an! echo. " + sequence = phoneme_to_sequence( + text, text_cleaner, LANG, enable_eos_bos=True, add_blank=add_blank, use_espeak_phonemes=True ) + text_hat = sequence_to_phoneme(sequence) + text_hat_with_params = sequence_to_phoneme(sequence) + gt = "^biː ɐ vɔɪs , nɑːt ɐn ! ɛkoʊ .~" + print(text_hat) + print(len(sequence)) + self.assertEqual(text_hat, text_hat_with_params) + self.assertEqual(text_hat, gt) - # Ensure that TTS produces same phoneme string - text_cleaner = ["phoneme_cleaners"] - actual_sequence = phoneme_to_sequence(text, text_cleaner, lang) - actual_phoneme_str = sequence_to_phoneme(actual_sequence) - - self.assertEqual(actual_phoneme_str, expected_phoneme_str) + def test_text2phone(self): + text = "Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase, the grey matter in the parts of the brain responsible for emotional regulation and learning!" + ph = text2phone(EXAMPLE_TEXT, LANG) + self.assertEqual(ph, EXPECTED_PHONEMES) # ----------------------------------------------------------------------------- From 3f172b84d850b2f1e7716dfb4428843037669b3b Mon Sep 17 00:00:00 2001 From: Michael Hansen Date: Wed, 16 Jun 2021 15:26:36 -0400 Subject: [PATCH 08/11] Fix linting issues --- TTS/bin/extract_tts_spectrograms.py | 1 + TTS/tts/utils/synthesis.py | 2 +- TTS/tts/utils/text/__init__.py | 6 ++++-- tests/test_text_processing.py | 3 ++- 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/TTS/bin/extract_tts_spectrograms.py b/TTS/bin/extract_tts_spectrograms.py index ace7464a..fb3a8321 100755 --- a/TTS/bin/extract_tts_spectrograms.py +++ b/TTS/bin/extract_tts_spectrograms.py @@ -299,4 +299,5 @@ if __name__ == "__main__": args = parser.parse_args() c = load_config(args.config_path) + c.audio["do_trim_silence"] = False # IMPORTANT!!!!!!!!!!!!!!! disable to align mel main(args) diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py index da50f1ca..0ddf7ebe 100644 --- a/TTS/tts/utils/synthesis.py +++ b/TTS/tts/utils/synthesis.py @@ -25,7 +25,7 @@ def text_to_seqvec(text, CONFIG): CONFIG.enable_eos_bos_chars, tp=CONFIG.characters, add_blank=CONFIG.add_blank, - use_espeak_phonemes=CONFIG.use_espeak_phonemes + use_espeak_phonemes=CONFIG.use_espeak_phonemes, ), dtype=np.int32, ) diff --git a/TTS/tts/utils/text/__init__.py b/TTS/tts/utils/text/__init__.py index 350e5934..787394b5 100644 --- a/TTS/tts/utils/text/__init__.py +++ b/TTS/tts/utils/text/__init__.py @@ -54,7 +54,7 @@ def text2phone(text, language, use_espeak_phonemes=False): if gruut.is_language_supported(language): # Use gruut for phonemization - phonemizer_args={ + phonemizer_args = { "remove_stress": True, "ipa_minor_breaks": False, # don't replace commas/semi-colons with IPA | "ipa_major_breaks": False, # don't replace periods with IPA ‖ @@ -104,7 +104,9 @@ def pad_with_eos_bos(phoneme_sequence, tp=None): return [_phonemes_to_id[_bos]] + list(phoneme_sequence) + [_phonemes_to_id[_eos]] -def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False, tp=None, add_blank=False, use_espeak_phonemes=False): +def phoneme_to_sequence( + text, cleaner_names, language, enable_eos_bos=False, tp=None, add_blank=False, use_espeak_phonemes=False +): # pylint: disable=global-statement global _phonemes_to_id, _phonemes if tp: diff --git a/tests/test_text_processing.py b/tests/test_text_processing.py index 17ee755e..4a1ba64f 100644 --- a/tests/test_text_processing.py +++ b/tests/test_text_processing.py @@ -26,6 +26,7 @@ class TextProcessingTextCase(unittest.TestCase): self._test_phoneme_to_sequence(add_blank=True) def _test_phoneme_to_sequence(self, add_blank): + """Verify en-us sentence phonemes""" text_cleaner = ["phoneme_cleaners"] sequence = phoneme_to_sequence(EXAMPLE_TEXT, text_cleaner, LANG, add_blank=add_blank, use_espeak_phonemes=True) text_hat = sequence_to_phoneme(sequence) @@ -92,7 +93,7 @@ class TextProcessingTextCase(unittest.TestCase): self.assertEqual(text_hat, gt) def test_text2phone(self): - text = "Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase, the grey matter in the parts of the brain responsible for emotional regulation and learning!" + """Verify phones directly (with |)""" ph = text2phone(EXAMPLE_TEXT, LANG) self.assertEqual(ph, EXPECTED_PHONEMES) From a41f53fe725241f3f53f3474631c9f8b11d59012 Mon Sep 17 00:00:00 2001 From: Michael Hansen Date: Wed, 16 Jun 2021 18:10:51 -0400 Subject: [PATCH 09/11] Fix silly error in tests --- tests/test_text_processing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_text_processing.py b/tests/test_text_processing.py index 4a1ba64f..3c424a15 100644 --- a/tests/test_text_processing.py +++ b/tests/test_text_processing.py @@ -94,7 +94,7 @@ class TextProcessingTextCase(unittest.TestCase): def test_text2phone(self): """Verify phones directly (with |)""" - ph = text2phone(EXAMPLE_TEXT, LANG) + ph = text2phone(EXAMPLE_TEXT, LANG, use_espeak_phonemes=True) self.assertEqual(ph, EXPECTED_PHONEMES) From 987cf1178b3dd1894cfd4c0b32a56ae4d2af1620 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 25 Jun 2021 14:44:33 +0200 Subject: [PATCH 10/11] Bump up to v0.0.16 --- TTS/VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TTS/VERSION b/TTS/VERSION index 58c4b6e9..e3b86dd9 100644 --- a/TTS/VERSION +++ b/TTS/VERSION @@ -1 +1 @@ -0.0.15.1 +0.0.16 From 6c7bbcaef04a911c3f49c24eb50a55f691ea7194 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 25 Jun 2021 16:52:17 +0200 Subject: [PATCH 11/11] Use `en-us` for testing phoneme models --- tests/tts_tests/test_glow_tts_train.py | 3 ++- tests/tts_tests/test_speedy_speech_train.py | 2 +- tests/vocoder_tests/test_multiband_melgan_train.py | 1 + 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/tts_tests/test_glow_tts_train.py b/tests/tts_tests/test_glow_tts_train.py index 2e675d13..e44f6365 100644 --- a/tests/tts_tests/test_glow_tts_train.py +++ b/tests/tts_tests/test_glow_tts_train.py @@ -16,7 +16,8 @@ config = GlowTTSConfig( num_val_loader_workers=0, text_cleaner="english_cleaners", use_phonemes=True, - phoneme_language="zh-CN", + use_espeak_phonemes=True, + phoneme_language="en-us", phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", run_eval=True, test_delay_epochs=-1, diff --git a/tests/tts_tests/test_speedy_speech_train.py b/tests/tts_tests/test_speedy_speech_train.py index 3f508117..9dcf0ad8 100644 --- a/tests/tts_tests/test_speedy_speech_train.py +++ b/tests/tts_tests/test_speedy_speech_train.py @@ -16,7 +16,7 @@ config = SpeedySpeechConfig( num_val_loader_workers=0, text_cleaner="english_cleaners", use_phonemes=True, - phoneme_language="zh-CN", + phoneme_language="en-us", phoneme_cache_path="tests/data/ljspeech/phoneme_cache/", run_eval=True, test_delay_epochs=-1, diff --git a/tests/vocoder_tests/test_multiband_melgan_train.py b/tests/vocoder_tests/test_multiband_melgan_train.py index 081fb40e..ef362414 100644 --- a/tests/vocoder_tests/test_multiband_melgan_train.py +++ b/tests/vocoder_tests/test_multiband_melgan_train.py @@ -20,6 +20,7 @@ config = MultibandMelganConfig( eval_split_size=1, print_step=1, print_eval=True, + discriminator_model_params={"base_channels": 16, "max_channels": 128, "downsample_factors": [4, 4, 4]}, data_path="tests/data/ljspeech", output_path=output_path, )