phonemizer updates for utils.text

This commit is contained in:
Eren Golge 2019-01-08 17:08:50 +01:00
parent e7a119a427
commit e1cb7c1501
3 changed files with 77 additions and 69 deletions

View File

@ -1,7 +1,7 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import re import re
import epitran import phonemizer
from utils.text import cleaners from utils.text import cleaners
from utils.text.symbols import symbols, phonemes, _punctuations from utils.text.symbols import symbols, phonemes, _punctuations
@ -15,39 +15,40 @@ _id_to_phonemes = {i: s for i, s in enumerate(phonemes)}
# Regular expression matching text enclosed in curly braces: # Regular expression matching text enclosed in curly braces:
_curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)') _curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)')
# phoneme converter # Regular expression matching punctuations, ignoring empty space
epi = epitran.Epitran('eng-Latn') pat = r'['+_punctuations[:-1]+']+'
def text2phone(text): def text2phone(text):
''' '''
Convert graphemes to phonemes. Convert graphemes to phonemes.
''' '''
try: seperator = phonemizer.separator.Separator(' ', '', '|')
ph = epi.trans_list(text, normpunc=True) #try:
except: punctuations = re.findall(pat, text)
ph = None ph = phonemizer.phonemize(text, separator=seperator, strip=False, njobs=1, backend='espeak', language='en-us')
# Replace \n with matching punctuations.
for punct in punctuations[:-1]:
ph = ph.replace(' \n', punct+'| ', 1)
ph = ph[:-1] + punctuations[-1]
#except:
# ph = None
return ph return ph
def phoneme_to_sequence(text, cleaner_names): def phoneme_to_sequence(text, cleaner_names):
''' '''
TODO: This ignores punctuations TODO: This ignores punctuations
''' '''
sequence = [] sequence = []
clean_text = _clean_text(text, cleaner_names) clean_text = _clean_text(text, cleaner_names)
for word in clean_text.split(): phonemes = text2phone(clean_text)
phonemes_text = text2phone(word) print(phonemes.replace('|', ''))
if phonemes is None:
print("!! After phoneme conversion the result is None. -- {} ".format(clean_text))
for phoneme in phonemes.split('|'):
# print(word, ' -- ', phonemes_text) # print(word, ' -- ', phonemes_text)
if phonemes_text == None: sequence += _phoneme_to_sequence(phoneme)
print("!! After phoneme conversion the result is None. -- {} ".format(word))
continue
sequence += _phoneme_to_sequence(phonemes_text)
if word[0] in _punctuations:
sequence.append(_phonemes_to_id[word[0]])
elif word[-1] in _punctuations:
sequence.append(_phonemes_to_id[word[-1]])
sequence.append(_phonemes_to_id[' '])
# Append EOS char # Append EOS char
sequence.append(_phonemes_to_id['~']) sequence.append(_phonemes_to_id['~'])
return sequence return sequence
@ -122,7 +123,7 @@ def _symbols_to_sequence(symbols):
def _phoneme_to_sequence(phonemes): def _phoneme_to_sequence(phonemes):
return [_phonemes_to_id[s] for s in phonemes if _should_keep_phoneme(s)] return [_phonemes_to_id[s] for s in list(phonemes) if _should_keep_phoneme(s)]
def _arpabet_to_sequence(text): def _arpabet_to_sequence(text):

View File

@ -86,3 +86,12 @@ def english_cleaners(text):
text = expand_abbreviations(text) text = expand_abbreviations(text)
text = collapse_whitespace(text) text = collapse_whitespace(text)
return text return text
def phoneme_cleaners(text):
    '''Cleaning pipeline for phonemes mode.

    Passes `text` through convert_to_ascii, expand_numbers,
    expand_abbreviations and collapse_whitespace, in that order, feeding
    each step's output into the next, and returns the final result.
    '''
    pipeline = (
        convert_to_ascii,
        expand_numbers,
        expand_abbreviations,
        collapse_whitespace,
    )
    for step in pipeline:
        text = step(text)
    return text

View File

@ -11,55 +11,53 @@ _pad = '_'
_eos = '~' _eos = '~'
_characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'(),-.:;? ' _characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'(),-.:;? '
_punctuations = '!\'(),-.:;? ' _punctuations = '!\'(),-.:;? '
_phonemes = [ _phonemes = ['l',
'$', 'ɹ',
'&', 'ɜ',
'a', 'ɚ',
'b', 'k',
'd', 'u',
'd͡ʒ', 'ʔ',
'e', 'ð',
'f', 'ɐ',
'h', 'ɾ',
'i', 'ɑ',
'j', 'ɔ',
'k', 'b',
'l', 'ɛ',
'm', 't',
'n', 'v',
'o', 'n',
'p', 'm',
's', 'ʊ',
't', 'ŋ',
't͡ʃ', 's',
'u', 'ʌ',
'v', 'o',
'w', 'ʃ',
'z', 'i',
'£', 'p',
'à', 'æ',
'â', 'e',
'æ', 'a',
'è', 'ʒ',
'é', ' ',
'ê', 'h',
'ð', 'ɪ',
'ü', 'ɡ',
'ŋ', 'f',
'ɑ', 'r',
'ɔ', 'w',
'ə', 'ɫ',
'ɛ', 'd',
'ɡ', 'x',
'ɪ', 'ː',
'ɹ', '',
'ɹ̩', 'ə',
'ʃ', 'j',
'ʊ', 'θ',
'ʌ', 'z']
'ʒ',
'θ'
]
_phonemes = sorted(list(set(_phonemes))) _phonemes = sorted(list(set(_phonemes)))
# Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters): # Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters):