phonemizer updates for utils.text

2019-01-08 17:08:50 +01:00 · 2019-01-08 17:08:50 +01:00 · e1cb7c1501
parent e7a119a427
commit e1cb7c1501
3 changed files with 77 additions and 69 deletions
--- a/utils/text/init.py
+++ b/utils/text/init.py
@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 import re
-import epitran
+import phonemizer
 from utils.text import cleaners
 from utils.text.symbols import symbols, phonemes, _punctuations
@ -15,18 +15,24 @@ _id_to_phonemes = {i: s for i, s in enumerate(phonemes)}
 # Regular expression matching text enclosed in curly braces:
 _curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)')
-# phoneme converter
+# Regular expression matching runs of punctuation, excluding the space character
-epi = epitran.Epitran('eng-Latn')
+pat = r'['+_punctuations[:-1]+']+'
 def text2phone(text):
    '''
    Convert graphemes to phonemes.
    '''
-    try:
+    seperator = phonemizer.separator.Separator(' ', '', '|')
-        ph = epi.trans_list(text, normpunc=True)
+    #try:
-    except:
+    punctuations = re.findall(pat, text)
-        ph = None
+    ph = phonemizer.phonemize(text, separator=seperator, strip=False, njobs=1, backend='espeak', language='en-us')
    # Replace \n with matching punctuations.
    for punct in punctuations[:-1]:
        ph = ph.replace(' \n', punct+'| ', 1)
    ph = ph[:-1] + punctuations[-1]
    #except:
    #    ph = None
    return ph
@ -36,18 +42,13 @@ def phoneme_to_sequence(text, cleaner_names):
    '''
    sequence = []
    clean_text = _clean_text(text, cleaner_names)
-    for word in clean_text.split():
+    phonemes = text2phone(clean_text)
-        phonemes_text = text2phone(word)
+    print(phonemes.replace('|', ''))
    if phonemes is None:
        print("!! After phoneme conversion the result is None. -- {} ".format(clean_text))
    for phoneme in phonemes.split('|'):
        # print(word, ' -- ', phonemes_text)
-        if phonemes_text == None:
+        sequence += _phoneme_to_sequence(phoneme)
            print("!! After phoneme conversion the result is None. -- {} ".format(word))
            continue
        sequence += _phoneme_to_sequence(phonemes_text)
        if word[0] in _punctuations:
            sequence.append(_phonemes_to_id[word[0]])
        elif word[-1] in _punctuations:
            sequence.append(_phonemes_to_id[word[-1]])
        sequence.append(_phonemes_to_id[' '])
    # Append EOS char
    sequence.append(_phonemes_to_id['~'])
    return sequence
@ -122,7 +123,7 @@ def _symbols_to_sequence(symbols):
 def _phoneme_to_sequence(phonemes):
-    return [_phonemes_to_id[s] for s in phonemes if _should_keep_phoneme(s)]
+    return [_phonemes_to_id[s] for s in list(phonemes) if _should_keep_phoneme(s)]
 def _arpabet_to_sequence(text):
--- a/utils/text/cleaners.py
+++ b/utils/text/cleaners.py
@ -86,3 +86,12 @@ def english_cleaners(text):
    text = expand_abbreviations(text)
    text = collapse_whitespace(text)
    return text
 def phoneme_cleaners(text):
    '''Pipeline for phonemes mode, including number and abbreviation expansion.

    Passes `text` through convert_to_ascii, expand_numbers,
    expand_abbreviations and collapse_whitespace, in that order, and
    returns the result of the last step.
    '''
    text = convert_to_ascii(text)
    text = expand_numbers(text)
    text = expand_abbreviations(text)
    text = collapse_whitespace(text)
    return text
--- a/utils/text/symbols.py
+++ b/utils/text/symbols.py
@ -11,55 +11,53 @@ _pad = '_'
 _eos = '~'
 _characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'(),-.:;? '
 _punctuations = '!\'(),-.:;? '
-_phonemes = [
+_phonemes = ['l',
-    '$',
+             'ɹ',
-    '&',
+             'ɜ',
-    'a',
+             'ɚ',
    'b',
    'd',
    'd͡ʒ',
    'e',
    'f',
    'h',
    'i',
    'j',
             'k',
    'l',
    'm',
    'n',
    'o',
    'p',
    's',
    't',
    't͡ʃ',
             'u',
-    'v',
+             'ʔ',
    'w',
    'z',
    '£',
    'à',
    'â',
    'æ',
    'è',
    'é',
    'ê',
             'ð',
-    'ü',
+             'ɐ',
-    'ŋ',
+             'ɾ',
             'ɑ',
             'ɔ',
-    'ə',
+             'b',
             'ɛ',
-    'ɡ',
+             't',
-    'ɪ',
+             'v',
-    'ɹ',
+             'n',
-    'ɹ̩',
+             'm',
    'ʃ',
             'ʊ',
             'ŋ',
             's',
             'ʌ',
             'o',
             'ʃ',
             'i',
             'p',
             'æ',
             'e',
             'a',
             'ʒ',
-    'θ'
+             ' ',
-]
+             'h',
             'ɪ',
             'ɡ',
             'f',
             'r',
             'w',
             'ɫ',
             'd',
             'x',
             'ː',
             'ᵻ',
             'ə',
             'j',
             'θ',
             'z']
 _phonemes = sorted(list(set(_phonemes)))
 # Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters):