Move phoneme computation to __init__ and put char list in symbols.py

2019-01-04 16:18:49 +01:00 · 2019-01-04 16:18:49 +01:00 · df49e93684
parent da2f064bc5
commit df49e93684
3 changed files with 67 additions and 66 deletions
--- a/utils/text/init.py
+++ b/utils/text/init.py
@ -1,9 +1,9 @@
 # -*- coding: utf-8 -*-
 import re
 import epitran
 from utils.text import cleaners
 from utils.text.symbols import symbols, phonemes, _punctuations
 from utils.text.cmudict import text2phone
 # Mappings from symbol to numeric ID and vice versa:
 _symbol_to_id = {s: i for i, s in enumerate(symbols)}
@ -15,6 +15,20 @@ _id_to_phonemes = {i: s for i, s in enumerate(phonemes)}
 # Regular expression matching text enclosed in curly braces. It captures three
 # groups: the text before the braces, the braced content, and the remainder.
 _curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)')
 # Module-level Epitran converter built once for the 'eng-Latn' code; it is
 # the object text2phone calls to turn graphemes into phonemes.
 epi = epitran.Epitran('eng-Latn')
 def text2phone(text):
    '''
    Convert graphemes to phonemes.

    Passes `text` to the module-level Epitran converter (`epi`) with
    `normpunc=True`.

    Args:
        text: input text to convert.

    Returns:
        Whatever `epi.trans_list` returns for `text`, or None when the
        conversion raises an exception.
    '''
    try:
        return epi.trans_list(text, normpunc=True)
    except Exception:
        # Best-effort conversion: callers receive None on failure. Only
        # ordinary errors are swallowed, so KeyboardInterrupt and SystemExit
        # still propagate instead of being silently turned into None.
        return None
 def phoneme_to_sequence(text, cleaner_names):
    '''
--- a/utils/text/cmudict.py
+++ b/utils/text/cmudict.py
@ -1,8 +1,6 @@
 # -*- coding: utf-8 -*-
 import re
 import epitran
 epi = epitran.Epitran('eng-Latn')
 # valid_symbols = [
 #     'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1',
@ -16,67 +14,6 @@ epi = epitran.Epitran('eng-Latn')
 # ]
 # Set of phoneme symbols known to this module. Some entries are
 # multi-codepoint strings (e.g. tie-bar sequences), so each one is kept as its
 # own string rather than being split into characters.
 _phonemes = set((
    '$', '&', '£',
    'a', 'b', 'd', 'd͡ʒ', 'e', 'f', 'h', 'i', 'j', 'k', 'l', 'm',
    'n', 'o', 'p', 's', 't', 't͡ʃ', 'u', 'v', 'w', 'z',
    'à', 'â', 'æ', 'è', 'é', 'ê', 'ð', 'ü',
    'ŋ', 'ɑ', 'ɔ', 'ə', 'ɛ', 'ɡ', 'ɪ', 'ɹ', 'ɹ̩',
    'ʃ', 'ʊ', 'ʌ', 'ʒ', 'θ',
 ))
 def text2phone(text):
    '''
    Convert graphemes to phonemes.

    Passes `text` to the module-level Epitran converter (`epi`) with
    `normpunc=True`.

    Args:
        text: input text to convert.

    Returns:
        Whatever `epi.trans_list` returns for `text`, or None when the
        conversion raises an exception.
    '''
    try:
        return epi.trans_list(text, normpunc=True)
    except Exception:
        # Best-effort conversion: callers receive None on failure. Only
        # ordinary errors are swallowed, so KeyboardInterrupt and SystemExit
        # still propagate instead of being silently turned into None.
        return None
 class CMUDict:
    '''Thin wrapper around CMUDict data. http://www.speech.cs.cmu.edu/cgi-bin/cmudict'''
--- a/utils/text/symbols.py
+++ b/utils/text/symbols.py
@ -11,13 +11,63 @@ _pad = '_'
 # NOTE(review): '~' is presumably the end-of-sequence marker (going by the
 # name `_eos`) - confirm against how the symbol tables are consumed.
 _eos = '~'
 # Grapheme inventory: ASCII uppercase and lowercase letters followed by
 # punctuation marks and the space character.
 _characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'(),-.:;? '
 # Punctuation marks plus the space character; these are the same characters
 # that form the tail of `_characters`.
 _punctuations = '!\'(),-.:;? '
 # Phoneme symbol inventory, de-duplicated and sorted so that the resulting
 # list (and any index derived from it) has a deterministic order. Some entries
 # are multi-codepoint strings (e.g. tie-bar sequences) and must stay intact.
 _phonemes = sorted({
    '$', '&', '£',
    'a', 'b', 'd', 'd͡ʒ', 'e', 'f', 'h', 'i', 'j', 'k', 'l', 'm',
    'n', 'o', 'p', 's', 't', 't͡ʃ', 'u', 'v', 'w', 'z',
    'à', 'â', 'æ', 'è', 'é', 'ê', 'ð', 'ü',
    'ŋ', 'ɑ', 'ɔ', 'ə', 'ɛ', 'ɡ', 'ɪ', 'ɹ', 'ɹ̩',
    'ʃ', 'ʊ', 'ʌ', 'ʒ', 'θ',
 })
 # Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters):
-_arpabet = ['@' + s for s in cmudict._phonemes]
+_arpabet = ['@' + s for s in _phonemes]
 # Export all symbols:
 symbols = [_pad, _eos] + list(_characters) + _arpabet
-phonemes = [_pad, _eos] + list(cmudict._phonemes) + list(_punctuations)
+phonemes = [_pad, _eos] + list(_phonemes) + list(_punctuations)
 if __name__ == '__main__':
    print(" > TTS symbols ")