From e1cb7c150166e2d911c65f1210a5023089f696c1 Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Tue, 8 Jan 2019 17:08:50 +0100 Subject: [PATCH] phonemizer updates for utils.text --- utils/text/__init__.py | 41 +++++++++--------- utils/text/cleaners.py | 9 ++++ utils/text/symbols.py | 96 +++++++++++++++++++++--------------------- 3 files changed, 77 insertions(+), 69 deletions(-) diff --git a/utils/text/__init__.py b/utils/text/__init__.py index d97eabf2..3ba93b25 100644 --- a/utils/text/__init__.py +++ b/utils/text/__init__.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- import re -import epitran +import phonemizer from utils.text import cleaners from utils.text.symbols import symbols, phonemes, _punctuations @@ -15,39 +15,40 @@ _id_to_phonemes = {i: s for i, s in enumerate(phonemes)} # Regular expression matching text enclosed in curly braces: _curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)') -# phoneme converter -epi = epitran.Epitran('eng-Latn') +# Regular expression matchinf punctuations, ignoring empty space +pat = r'['+_punctuations[:-1]+']+' def text2phone(text): ''' Convert graphemes to phonemes. ''' - try: - ph = epi.trans_list(text, normpunc=True) - except: - ph = None + seperator = phonemizer.separator.Separator(' ', '', '|') + #try: + punctuations = re.findall(pat, text) + ph = phonemizer.phonemize(text, separator=seperator, strip=False, njobs=1, backend='espeak', language='en-us') + # Replace \n with matching punctuations. + for punct in punctuations[:-1]: + ph = ph.replace(' \n', punct+'| ', 1) + ph = ph[:-1] + punctuations[-1] + #except: + # ph = None return ph - + def phoneme_to_sequence(text, cleaner_names): ''' TODO: This ignores punctuations ''' sequence = [] clean_text = _clean_text(text, cleaner_names) - for word in clean_text.split(): - phonemes_text = text2phone(word) + phonemes = text2phone(clean_text) + print(phonemes.replace('|', '')) + if phonemes is None: + print("!! After phoneme conversion the result is None. -- {} ".format(clean_text)) + for phoneme in phonemes.split('|'): # print(word, ' -- ', phonemes_text) - if phonemes_text == None: - print("!! After phoneme conversion the result is None. -- {} ".format(word)) - continue - sequence += _phoneme_to_sequence(phonemes_text) - if word[0] in _punctuations: - sequence.append(_phonemes_to_id[word[0]]) - elif word[-1] in _punctuations: - sequence.append(_phonemes_to_id[word[-1]]) - sequence.append(_phonemes_to_id[' ']) + sequence += _phoneme_to_sequence(phoneme) # Aeepnd EOS char sequence.append(_phonemes_to_id['~']) return sequence @@ -122,7 +123,7 @@ def _symbols_to_sequence(symbols): def _phoneme_to_sequence(phonemes): - return [_phonemes_to_id[s] for s in phonemes if _should_keep_phoneme(s)] + return [_phonemes_to_id[s] for s in list(phonemes) if _should_keep_phoneme(s)] def _arpabet_to_sequence(text): diff --git a/utils/text/cleaners.py b/utils/text/cleaners.py index a33f91b5..bb07b2ff 100644 --- a/utils/text/cleaners.py +++ b/utils/text/cleaners.py @@ -86,3 +86,12 @@ def english_cleaners(text): text = expand_abbreviations(text) text = collapse_whitespace(text) return text + + +def phoneme_cleaners(text): + '''Pipeline for phonemes mode, including number and abbreviation expansion.''' + text = convert_to_ascii(text) + text = expand_numbers(text) + text = expand_abbreviations(text) + text = collapse_whitespace(text) + return text diff --git a/utils/text/symbols.py b/utils/text/symbols.py index 4c6eeb38..6788d68f 100644 --- a/utils/text/symbols.py +++ b/utils/text/symbols.py @@ -11,55 +11,53 @@ _pad = '_' _eos = '~' _characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'(),-.:;? ' _punctuations = '!\'(),-.:;? ' -_phonemes = [ - '$', - '&', - 'a', - 'b', - 'd', - 'd͡ʒ', - 'e', - 'f', - 'h', - 'i', - 'j', - 'k', - 'l', - 'm', - 'n', - 'o', - 'p', - 's', - 't', - 't͡ʃ', - 'u', - 'v', - 'w', - 'z', - '£', - 'à', - 'â', - 'æ', - 'è', - 'é', - 'ê', - 'ð', - 'ü', - 'ŋ', - 'ɑ', - 'ɔ', - 'ə', - 'ɛ', - 'ɡ', - 'ɪ', - 'ɹ', - 'ɹ̩', - 'ʃ', - 'ʊ', - 'ʌ', - 'ʒ', - 'θ' -] +_phonemes = ['l', + 'ɹ', + 'ɜ', + 'ɚ', + 'k', + 'u', + 'ʔ', + 'ð', + 'ɐ', + 'ɾ', + 'ɑ', + 'ɔ', + 'b', + 'ɛ', + 't', + 'v', + 'n', + 'm', + 'ʊ', + 'ŋ', + 's', + 'ʌ', + 'o', + 'ʃ', + 'i', + 'p', + 'æ', + 'e', + 'a', + 'ʒ', + ' ', + 'h', + 'ɪ', + 'ɡ', + 'f', + 'r', + 'w', + 'ɫ', + 'd', + 'x', + 'ː', + 'ᵻ', + 'ə', + 'j', + 'θ', + 'z'] + _phonemes = sorted(list(set(_phonemes))) # Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters):