Move phoneme computation to __init__ and put the char list in symbols.py

This commit is contained in:
Eren Golge 2019-01-04 16:18:49 +01:00
parent da2f064bc5
commit df49e93684
3 changed files with 67 additions and 66 deletions

View File

@ -1,9 +1,9 @@
# -*- coding: utf-8 -*-
import re
import epitran
from utils.text import cleaners
from utils.text.symbols import symbols, phonemes, _punctuations
from utils.text.cmudict import text2phone
# Mappings from symbol to numeric ID and vice versa:
_symbol_to_id = {s: i for i, s in enumerate(symbols)}
@ -15,7 +15,21 @@ _id_to_phonemes = {i: s for i, s in enumerate(phonemes)}
# Regular expression matching text enclosed in curly braces.
# Capture groups: (1) the text before the braces, matched lazily,
# (2) the non-empty text inside the braces, (3) everything after them.
_curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)')
# Phoneme converter: one module-level Epitran instance, constructed with the
# 'eng-Latn' code and reused by every text2phone() call.
epi = epitran.Epitran('eng-Latn')
def text2phone(text):
    '''
    Convert graphemes to phonemes.

    Args:
        text: input handed unchanged to ``epi.trans_list``.

    Returns:
        The value produced by ``epi.trans_list(text, normpunc=True)``, or
        ``None`` when that call raises an exception.
    '''
    try:
        return epi.trans_list(text, normpunc=True)
    except Exception:
        # Best-effort conversion: a failed transliteration is reported to the
        # caller as None. Catching Exception instead of using a bare
        # ``except:`` lets KeyboardInterrupt and SystemExit propagate, so a
        # long-running job can still be interrupted.
        return None
def phoneme_to_sequence(text, cleaner_names):
'''
TODO: This ignores punctuations

View File

@ -1,8 +1,6 @@
# -*- coding: utf-8 -*-
import re
import epitran
epi = epitran.Epitran('eng-Latn')
# valid_symbols = [
# 'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1',
@ -16,67 +14,6 @@ epi = epitran.Epitran('eng-Latn')
# ]
_phonemes = {
'$',
'&',
'a',
'b',
'd',
'd͡ʒ',
'e',
'f',
'h',
'i',
'j',
'k',
'l',
'm',
'n',
'o',
'p',
's',
't',
't͡ʃ',
'u',
'v',
'w',
'z',
'£',
'à',
'â',
'æ',
'è',
'é',
'ê',
'ð',
'ü',
'ŋ',
'ɑ',
'ɔ',
'ə',
'ɛ',
'ɡ',
'ɪ',
'ɹ',
'ɹ̩',
'ʃ',
'ʊ',
'ʌ',
'ʒ',
'θ'
}
_phonemes = set(_phonemes)
def text2phone(text):
    '''
    Transliterate ``text`` with the module-level ``epi`` converter.

    Args:
        text: input handed unchanged to ``epi.trans_list``.

    Returns:
        The value produced by ``epi.trans_list(text, normpunc=True)``, or
        ``None`` when that call raises an exception.
    '''
    try:
        return epi.trans_list(text, normpunc=True)
    except Exception:
        # Failure is signalled to the caller as None (best effort). Exception
        # is caught rather than everything, so KeyboardInterrupt and
        # SystemExit are no longer swallowed.
        return None
class CMUDict:
'''Thin wrapper around CMUDict data. http://www.speech.cs.cmu.edu/cgi-bin/cmudict'''

View File

@ -11,13 +11,63 @@ _pad = '_'
_eos = '~'
_characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'(),-.:;? '
_punctuations = '!\'(),-.:;? '
_phonemes = [
'$',
'&',
'a',
'b',
'd',
'd͡ʒ',
'e',
'f',
'h',
'i',
'j',
'k',
'l',
'm',
'n',
'o',
'p',
's',
't',
't͡ʃ',
'u',
'v',
'w',
'z',
'£',
'à',
'â',
'æ',
'è',
'é',
'ê',
'ð',
'ü',
'ŋ',
'ɑ',
'ɔ',
'ə',
'ɛ',
'ɡ',
'ɪ',
'ɹ',
'ɹ̩',
'ʃ',
'ʊ',
'ʌ',
'ʒ',
'θ'
]
_phonemes = sorted(list(set(_phonemes)))
# Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters):
_arpabet = ['@' + s for s in cmudict._phonemes]
_arpabet = ['@' + s for s in _phonemes]
# Export all symbols:
symbols = [_pad, _eos] + list(_characters) + _arpabet
phonemes = [_pad, _eos] + list(cmudict._phonemes) + list(_punctuations)
phonemes = [_pad, _eos] + list(_phonemes) + list(_punctuations)
if __name__ == '__main__':
print(" > TTS symbols ")