diff --git a/utils/text/__init__.py b/utils/text/__init__.py index 77486cd0..d97eabf2 100644 --- a/utils/text/__init__.py +++ b/utils/text/__init__.py @@ -1,9 +1,9 @@ # -*- coding: utf-8 -*- import re +import epitran from utils.text import cleaners from utils.text.symbols import symbols, phonemes, _punctuations -from utils.text.cmudict import text2phone # Mappings from symbol to numeric ID and vice versa: _symbol_to_id = {s: i for i, s in enumerate(symbols)} @@ -15,7 +15,21 @@ _id_to_phonemes = {i: s for i, s in enumerate(phonemes)} # Regular expression matching text enclosed in curly braces: _curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)') +# phoneme converter +epi = epitran.Epitran('eng-Latn') + +def text2phone(text): + ''' + Convert graphemes to phonemes. + ''' + try: + ph = epi.trans_list(text, normpunc=True) + except: + ph = None + return ph + + def phoneme_to_sequence(text, cleaner_names): ''' TODO: This ignores punctuations diff --git a/utils/text/cmudict.py b/utils/text/cmudict.py index fe5a311b..1202bf3d 100644 --- a/utils/text/cmudict.py +++ b/utils/text/cmudict.py @@ -1,8 +1,6 @@ # -*- coding: utf-8 -*- import re -import epitran -epi = epitran.Epitran('eng-Latn') # valid_symbols = [ # 'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', @@ -16,67 +14,6 @@ epi = epitran.Epitran('eng-Latn') # ] -_phonemes = { - '$', - '&', - 'a', - 'b', - 'd', - 'd͡ʒ', - 'e', - 'f', - 'h', - 'i', - 'j', - 'k', - 'l', - 'm', - 'n', - 'o', - 'p', - 's', - 't', - 't͡ʃ', - 'u', - 'v', - 'w', - 'z', - '£', - 'à', - 'â', - 'æ', - 'è', - 'é', - 'ê', - 'ð', - 'ü', - 'ŋ', - 'ɑ', - 'ɔ', - 'ə', - 'ɛ', - 'ɡ', - 'ɪ', - 'ɹ', - 'ɹ̩', - 'ʃ', - 'ʊ', - 'ʌ', - 'ʒ', - 'θ' - } - -_phonemes = set(_phonemes) - - -def text2phone(text): - try: - ph = epi.trans_list(text, normpunc=True) - except: - ph = None - return ph - - class CMUDict: '''Thin wrapper around CMUDict data. http://www.speech.cs.cmu.edu/cgi-bin/cmudict''' diff --git a/utils/text/symbols.py b/utils/text/symbols.py index e63f1572..4c6eeb38 100644 --- a/utils/text/symbols.py +++ b/utils/text/symbols.py @@ -11,13 +11,63 @@ _pad = '_' _eos = '~' _characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'(),-.:;? ' _punctuations = '!\'(),-.:;? ' +_phonemes = [ + '$', + '&', + 'a', + 'b', + 'd', + 'd͡ʒ', + 'e', + 'f', + 'h', + 'i', + 'j', + 'k', + 'l', + 'm', + 'n', + 'o', + 'p', + 's', + 't', + 't͡ʃ', + 'u', + 'v', + 'w', + 'z', + '£', + 'à', + 'â', + 'æ', + 'è', + 'é', + 'ê', + 'ð', + 'ü', + 'ŋ', + 'ɑ', + 'ɔ', + 'ə', + 'ɛ', + 'ɡ', + 'ɪ', + 'ɹ', + 'ɹ̩', + 'ʃ', + 'ʊ', + 'ʌ', + 'ʒ', + 'θ' +] +_phonemes = sorted(list(set(_phonemes))) # Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters): -_arpabet = ['@' + s for s in cmudict._phonemes] +_arpabet = ['@' + s for s in _phonemes] # Export all symbols: symbols = [_pad, _eos] + list(_characters) + _arpabet -phonemes = [_pad, _eos] + list(cmudict._phonemes) + list(_punctuations) +phonemes = [_pad, _eos] + list(_phonemes) + list(_punctuations) if __name__ == '__main__': print(" > TTS symbols ")