# -*- coding: utf-8 -*-

import re
import unicodedata

import gruut
from packaging import version

from TTS.tts.utils.text import cleaners
from TTS.tts.utils.text.chinese_mandarin.phonemizer import chinese_text_to_phonemes
from TTS.tts.utils.text.japanese.phonemizer import japanese_text_to_phonemes
from TTS.tts.utils.text.symbols import _bos, _eos, _punctuations, make_symbols, phonemes, symbols

# pylint: disable=unnecessary-comprehension
# Mappings from symbol to numeric ID and vice versa:
_symbol_to_id = {s: i for i, s in enumerate(symbols)}
_id_to_symbol = {i: s for i, s in enumerate(symbols)}

_phonemes_to_id = {s: i for i, s in enumerate(phonemes)}
_id_to_phonemes = {i: s for i, s in enumerate(phonemes)}

_symbols = symbols
_phonemes = phonemes

# Regular expression matching text enclosed in curly braces:
_CURLY_RE = re.compile(r"(.*?)\{(.+?)\}(.*)")

# Regular expression matching punctuations, ignoring empty space
PHONEME_PUNCTUATION_PATTERN = r"[" + _punctuations.replace(" ", "") + "]+"

# Table for str.translate to fix gruut/TTS phoneme mismatch
GRUUT_TRANS_TABLE = str.maketrans("g", "ɡ")


def clean_gruut_phonemes(ph_list):
    """Decompose, substitute, and clean gruut phonemes for TTS.

    gruut phonemes may contain any IPA characters (e.g., "ẽ" for the nasalized "e"),
    and may be composed of multiple characters (e.g., "aɪ" in the English "r[i]ce").

    TTS phonemes come from a fixed set of symbols, and do not include every possible
    variation of every vowel/consonant. Here, we decompose diphthongs, etc. into
    single characters and then filter out Unicode combining characters such as ties.
    This ensures that (most) phonemes will exist in the TTS symbol table.

    Args:
        ph_list (list[str]): list of phonemes from gruut

    Returns:
        clean_list (list[str]): decomposed/clean list of phonemes for TTS
    """
    cleaned_phonemes = []

    for phoneme_text in ph_list:
        # Decompose into codepoints (ã -> ["a", "\u0303"])
        phoneme_text = unicodedata.normalize("NFD", phoneme_text)
        for codepoint in phoneme_text.translate(GRUUT_TRANS_TABLE):
            if unicodedata.combining(codepoint) > 0:
                # Skip combining characters like ties
                continue

            cleaned_phonemes.append(codepoint)

    return cleaned_phonemes


def text2phone(text, language):
    """Convert graphemes to phonemes.

    Parameters:
        text (str): text to phonemize
        language (str): language of the text

    Returns:
        ph (str): phonemes as a string separated by "|"
            ph = "ɪ|g|ˈ|z|æ|m|p|ə|l"
    """
    # TO REVIEW : How to have a good implementation for this?
    if language == "zh-CN":
        ph = chinese_text_to_phonemes(text)
        print(" > Phonemes: {}".format(ph))
        return ph

    if language == "ja-jp":
        ph = japanese_text_to_phonemes(text)
        print(" > Phonemes: {}".format(ph))
        return ph

    if gruut.is_language_supported(language):
        # Use gruut for phonemization
        ph_list = gruut.text_to_phonemes(
            text,
            lang=language,
            return_format="word_phonemes",
            phonemizer_args={
                "remove_accents": True,  # remove acute/grave accents (Swedish)
                "ipa_minor_breaks": False,  # don't replace commas/semi-colons with IPA |
                "ipa_major_breaks": False,  # don't replace periods with IPA ‖
            },
        )

        # Join and re-split to break apart diphthongs, suprasegmentals, etc.
        ph_words = ["|".join(clean_gruut_phonemes(word_phonemes)) for word_phonemes in ph_list]
        ph = "| ".join(ph_words)

        print(" > Phonemes: {}".format(ph))
        return ph

    raise ValueError(f" [!] Language {language} is not supported for phonemization.")
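# A minimal usage sketch for the phonemization path above (illustrative only;
# the exact phonemes depend on the installed gruut version and lexicon, and the
# expected string below simply mirrors the docstring's example):
#
#   >>> text2phone("example", "en-us")
#   'ɪ|ɡ|ˈ|z|æ|m|p|ə|l'
#
# clean_gruut_phonemes() is what breaks multi-codepoint phonemes apart and
# drops combining marks, e.g.:
#
#   >>> clean_gruut_phonemes(["aɪ", "ẽ"])
#   ['a', 'ɪ', 'e']  # "aɪ" splits in two; the nasalization mark (U+0303) is dropped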
def intersperse(sequence, token):
    result = [token] * (len(sequence) * 2 + 1)
    result[1::2] = sequence
    return result


def pad_with_eos_bos(phoneme_sequence, tp=None):
    # pylint: disable=global-statement
    global _phonemes_to_id, _bos, _eos
    if tp:
        _bos = tp["bos"]
        _eos = tp["eos"]
        _, _phonemes = make_symbols(**tp)
        _phonemes_to_id = {s: i for i, s in enumerate(_phonemes)}

    return [_phonemes_to_id[_bos]] + list(phoneme_sequence) + [_phonemes_to_id[_eos]]


def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False, tp=None, add_blank=False):
    # pylint: disable=global-statement
    global _phonemes_to_id, _phonemes
    if tp:
        _, _phonemes = make_symbols(**tp)
        _phonemes_to_id = {s: i for i, s in enumerate(_phonemes)}

    sequence = []
    clean_text = _clean_text(text, cleaner_names)
    to_phonemes = text2phone(clean_text, language)
    if to_phonemes is None:
        print("!! After phoneme conversion the result is None. -- {}".format(clean_text))
        to_phonemes = ""  # avoid crashing on .split() below; yields an empty sequence
    # Iterate while skipping empty strings. NOTE: keeping the empty strings might yield better intonation.
    for phoneme in filter(None, to_phonemes.split("|")):
        sequence += _phoneme_to_sequence(phoneme)
    # Append EOS char
    if enable_eos_bos:
        sequence = pad_with_eos_bos(sequence, tp=tp)
    if add_blank:
        sequence = intersperse(sequence, len(_phonemes))  # add a blank token (new), whose id number is len(_phonemes)
    return sequence


def sequence_to_phoneme(sequence, tp=None, add_blank=False):
    # pylint: disable=global-statement
    """Converts a sequence of IDs back to a string"""
    global _id_to_phonemes, _phonemes
    if add_blank:
        sequence = list(filter(lambda x: x != len(_phonemes), sequence))
    result = ""
    if tp:
        _, _phonemes = make_symbols(**tp)
        _id_to_phonemes = {i: s for i, s in enumerate(_phonemes)}

    for symbol_id in sequence:
        if symbol_id in _id_to_phonemes:
            s = _id_to_phonemes[symbol_id]
            result += s
    return result.replace("}{", " ")
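# A minimal sketch of the add_blank mechanism used above (illustrative ids):
# the blank token's id is len(_phonemes), i.e. one past the last real symbol,
# so for a hypothetical sequence [4, 7, 2] and a 130-symbol phoneme table:
#
#   >>> intersperse([4, 7, 2], 130)
#   [130, 4, 130, 7, 130, 2, 130]
#
# sequence_to_phoneme(..., add_blank=True) filters exactly these ids back out.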
def text_to_sequence(text, cleaner_names, tp=None, add_blank=False):
    """Converts a string of text to a sequence of IDs corresponding to the symbols in the text.

    The text can optionally have ARPAbet sequences enclosed in curly braces embedded in it.
    For example, "Turn left on {HH AW1 S S T AH0 N} Street."

    Args:
        text: string to convert to a sequence
        cleaner_names: names of the cleaner functions to run the text through
        tp: dictionary of character parameters defining a custom character set.

    Returns:
        List of integers corresponding to the symbols in the text
    """
    # pylint: disable=global-statement
    global _symbol_to_id, _symbols
    if tp:
        _symbols, _ = make_symbols(**tp)
        _symbol_to_id = {s: i for i, s in enumerate(_symbols)}

    sequence = []

    # Check for curly braces and treat their contents as ARPAbet:
    while text:
        m = _CURLY_RE.match(text)
        if not m:
            sequence += _symbols_to_sequence(_clean_text(text, cleaner_names))
            break
        sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names))
        sequence += _arpabet_to_sequence(m.group(2))
        text = m.group(3)
    if add_blank:
        sequence = intersperse(sequence, len(_symbols))  # add a blank token (new), whose id number is len(_symbols)
    return sequence


def sequence_to_text(sequence, tp=None, add_blank=False):
    """Converts a sequence of IDs back to a string"""
    # pylint: disable=global-statement
    global _id_to_symbol, _symbols
    if add_blank:
        sequence = list(filter(lambda x: x != len(_symbols), sequence))

    if tp:
        _symbols, _ = make_symbols(**tp)
        _id_to_symbol = {i: s for i, s in enumerate(_symbols)}

    result = ""
    for symbol_id in sequence:
        if symbol_id in _id_to_symbol:
            s = _id_to_symbol[symbol_id]
            # Enclose ARPAbet back in curly braces:
            if len(s) > 1 and s[0] == "@":
                s = "{%s}" % s[1:]
            result += s
    return result.replace("}{", " ")


def _clean_text(text, cleaner_names):
    for name in cleaner_names:
        # Default to None so a missing cleaner raises the error below instead of AttributeError.
        cleaner = getattr(cleaners, name, None)
        if not cleaner:
            raise ValueError("Unknown cleaner: %s" % name)
        text = cleaner(text)
    return text


def _symbols_to_sequence(syms):
    return [_symbol_to_id[s] for s in syms if _should_keep_symbol(s)]


def _phoneme_to_sequence(phons):
    return [_phonemes_to_id[s] for s in list(phons) if _should_keep_phoneme(s)]


def _arpabet_to_sequence(text):
    return _symbols_to_sequence(["@" + s for s in text.split()])


def _should_keep_symbol(s):
    return s in _symbol_to_id and s not in ["~", "^", "_"]


def _should_keep_phoneme(p):
    return p in _phonemes_to_id and p not in ["~", "^", "_"]
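if __name__ == "__main__":
    # Minimal smoke test / usage sketch, assuming the default character set and
    # the "basic_cleaners" function from TTS.tts.utils.text.cleaners; the ids
    # printed depend on the symbol table, so none are hard-coded here.
    _seq = text_to_sequence("Hello, World!", ["basic_cleaners"])
    print(" > Sequence: {}".format(_seq))
    print(" > Round-trip: {}".format(sequence_to_text(_seq)))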