Move phoneme computation to __init__ and put char list in symbols.py

2019-01-04 16:18:49 +01:00 · 2019-01-04 16:18:49 +01:00 · df49e93684
parent da2f064bc5
commit df49e93684
3 changed files with 67 additions and 66 deletions
--- a/utils/text/__init__.py
+++ b/utils/text/__init__.py
@@ -1,9 +1,9 @@
 # -*- coding: utf-8 -*-

 import re
+import epitran
 from utils.text import cleaners
 from utils.text.symbols import symbols, phonemes, _punctuations
-from utils.text.cmudict import text2phone

 # Mappings from symbol to numeric ID and vice versa:
 _symbol_to_id = {s: i for i, s in enumerate(symbols)}
@@ -15,7 +15,21 @@ _id_to_phonemes = {i: s for i, s in enumerate(phonemes)}
 # Regular expression matching text enclosed in curly braces:
 _curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)')

+# phoneme converter
+epi = epitran.Epitran('eng-Latn')

+
+def text2phone(text):
+    '''
+    Convert graphemes to phonemes.
+    '''
+    try:
+        ph = epi.trans_list(text, normpunc=True)
+    except:
+        ph = None
+    return ph
+
+   
 def phoneme_to_sequence(text, cleaner_names):
    '''
    TODO: This ignores punctuations
--- a/utils/text/cmudict.py
+++ b/utils/text/cmudict.py
@@ -1,8 +1,6 @@
 # -*- coding: utf-8 -*-

 import re
-import epitran
-epi = epitran.Epitran('eng-Latn')

 # valid_symbols = [
 #     'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1',
@@ -16,67 +14,6 @@ epi = epitran.Epitran('eng-Latn')
 # ]


-_phonemes = {
-    '$',
-    '&',
-    'a',
-    'b',
-    'd',
-    'd͡ʒ',
-    'e',
-    'f',
-    'h',
-    'i',
-    'j',
-    'k',
-    'l',
-    'm',
-    'n',
-    'o',
-    'p',
-    's',
-    't',
-    't͡ʃ',
-    'u',
-    'v',
-    'w',
-    'z',
-    '£',
-    'à',
-    'â',
-    'æ',
-    'è',
-    'é',
-    'ê',
-    'ð',
-    'ü',
-    'ŋ',
-    'ɑ',
-    'ɔ',
-    'ə',
-    'ɛ',
-    'ɡ',
-    'ɪ',
-    'ɹ',
-    'ɹ̩',
-    'ʃ',
-    'ʊ',
-    'ʌ',
-    'ʒ',
-    'θ'
-    }
-
-_phonemes = set(_phonemes)
-
-
-def text2phone(text):
-    try:
-        ph = epi.trans_list(text, normpunc=True)
-    except:
-        ph = None
-    return ph
-
-
 class CMUDict:
    '''Thin wrapper around CMUDict data. http://www.speech.cs.cmu.edu/cgi-bin/cmudict'''

--- a/utils/text/symbols.py
+++ b/utils/text/symbols.py
@@ -11,13 +11,63 @@ _pad = '_'
 _eos = '~'
 _characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'(),-.:;? '
 _punctuations = '!\'(),-.:;? '
+_phonemes = [
+    '$',
+    '&',
+    'a',
+    'b',
+    'd',
+    'd͡ʒ',
+    'e',
+    'f',
+    'h',
+    'i',
+    'j',
+    'k',
+    'l',
+    'm',
+    'n',
+    'o',
+    'p',
+    's',
+    't',
+    't͡ʃ',
+    'u',
+    'v',
+    'w',
+    'z',
+    '£',
+    'à',
+    'â',
+    'æ',
+    'è',
+    'é',
+    'ê',
+    'ð',
+    'ü',
+    'ŋ',
+    'ɑ',
+    'ɔ',
+    'ə',
+    'ɛ',
+    'ɡ',
+    'ɪ',
+    'ɹ',
+    'ɹ̩',
+    'ʃ',
+    'ʊ',
+    'ʌ',
+    'ʒ',
+    'θ'
+]
+_phonemes = sorted(list(set(_phonemes)))

 # Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters):
-_arpabet = ['@' + s for s in cmudict._phonemes]
+_arpabet = ['@' + s for s in _phonemes]

 # Export all symbols:
 symbols = [_pad, _eos] + list(_characters) + _arpabet
-phonemes = [_pad, _eos] + list(cmudict._phonemes) + list(_punctuations)
+phonemes = [_pad, _eos] + list(_phonemes) + list(_punctuations)

 if __name__ == '__main__':
    print(" > TTS symbols ")