mirror of https://github.com/coqui-ai/TTS.git
phonemizer updates for utils.text
This commit is contained in:
parent
e7a119a427
commit
e1cb7c1501
|
@ -1,7 +1,7 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
import re
|
||||
import epitran
|
||||
import phonemizer
|
||||
from utils.text import cleaners
|
||||
from utils.text.symbols import symbols, phonemes, _punctuations
|
||||
|
||||
|
@ -15,39 +15,40 @@ _id_to_phonemes = {i: s for i, s in enumerate(phonemes)}
|
|||
# Regular expression matching text enclosed in curly braces:
|
||||
_curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)')
|
||||
|
||||
# phoneme converter
|
||||
epi = epitran.Epitran('eng-Latn')
|
||||
# Regular expression matching runs of punctuation, excluding the space character
|
||||
pat = r'['+_punctuations[:-1]+']+'
|
||||
|
||||
|
||||
def text2phone(text):
|
||||
'''
|
||||
Convert graphemes to phonemes.
|
||||
'''
|
||||
try:
|
||||
ph = epi.trans_list(text, normpunc=True)
|
||||
except:
|
||||
ph = None
|
||||
seperator = phonemizer.separator.Separator(' ', '', '|')
|
||||
#try:
|
||||
punctuations = re.findall(pat, text)
|
||||
ph = phonemizer.phonemize(text, separator=seperator, strip=False, njobs=1, backend='espeak', language='en-us')
|
||||
# Replace \n with matching punctuations.
|
||||
for punct in punctuations[:-1]:
|
||||
ph = ph.replace(' \n', punct+'| ', 1)
|
||||
ph = ph[:-1] + punctuations[-1]
|
||||
#except:
|
||||
# ph = None
|
||||
return ph
|
||||
|
||||
|
||||
|
||||
def phoneme_to_sequence(text, cleaner_names):
|
||||
'''
|
||||
TODO: This ignores punctuations
|
||||
'''
|
||||
sequence = []
|
||||
clean_text = _clean_text(text, cleaner_names)
|
||||
for word in clean_text.split():
|
||||
phonemes_text = text2phone(word)
|
||||
phonemes = text2phone(clean_text)
|
||||
print(phonemes.replace('|', ''))
|
||||
if phonemes is None:
|
||||
print("!! After phoneme conversion the result is None. -- {} ".format(clean_text))
|
||||
for phoneme in phonemes.split('|'):
|
||||
# print(word, ' -- ', phonemes_text)
|
||||
if phonemes_text == None:
|
||||
print("!! After phoneme conversion the result is None. -- {} ".format(word))
|
||||
continue
|
||||
sequence += _phoneme_to_sequence(phonemes_text)
|
||||
if word[0] in _punctuations:
|
||||
sequence.append(_phonemes_to_id[word[0]])
|
||||
elif word[-1] in _punctuations:
|
||||
sequence.append(_phonemes_to_id[word[-1]])
|
||||
sequence.append(_phonemes_to_id[' '])
|
||||
sequence += _phoneme_to_sequence(phoneme)
|
||||
# Append EOS char
|
||||
sequence.append(_phonemes_to_id['~'])
|
||||
return sequence
|
||||
|
@ -122,7 +123,7 @@ def _symbols_to_sequence(symbols):
|
|||
|
||||
|
||||
def _phoneme_to_sequence(phonemes):
|
||||
return [_phonemes_to_id[s] for s in phonemes if _should_keep_phoneme(s)]
|
||||
return [_phonemes_to_id[s] for s in list(phonemes) if _should_keep_phoneme(s)]
|
||||
|
||||
|
||||
def _arpabet_to_sequence(text):
|
||||
|
|
|
@ -86,3 +86,12 @@ def english_cleaners(text):
|
|||
text = expand_abbreviations(text)
|
||||
text = collapse_whitespace(text)
|
||||
return text
|
||||
|
||||
|
||||
def phoneme_cleaners(text):
    '''Cleaner pipeline used in phoneme mode.

    Runs `text` through ASCII conversion, number expansion, abbreviation
    expansion and whitespace collapsing, in that order, and returns the
    resulting string.
    '''
    # Each step takes the text and returns the transformed text; the order
    # below is the order in which the steps are applied.
    pipeline = (
        convert_to_ascii,
        expand_numbers,
        expand_abbreviations,
        collapse_whitespace,
    )
    for step in pipeline:
        text = step(text)
    return text
|
||||
|
|
|
@ -11,55 +11,53 @@ _pad = '_'
|
|||
_eos = '~'
|
||||
_characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'(),-.:;? '
|
||||
_punctuations = '!\'(),-.:;? '
|
||||
_phonemes = [
|
||||
'$',
|
||||
'&',
|
||||
'a',
|
||||
'b',
|
||||
'd',
|
||||
'd͡ʒ',
|
||||
'e',
|
||||
'f',
|
||||
'h',
|
||||
'i',
|
||||
'j',
|
||||
'k',
|
||||
'l',
|
||||
'm',
|
||||
'n',
|
||||
'o',
|
||||
'p',
|
||||
's',
|
||||
't',
|
||||
't͡ʃ',
|
||||
'u',
|
||||
'v',
|
||||
'w',
|
||||
'z',
|
||||
'£',
|
||||
'à',
|
||||
'â',
|
||||
'æ',
|
||||
'è',
|
||||
'é',
|
||||
'ê',
|
||||
'ð',
|
||||
'ü',
|
||||
'ŋ',
|
||||
'ɑ',
|
||||
'ɔ',
|
||||
'ə',
|
||||
'ɛ',
|
||||
'ɡ',
|
||||
'ɪ',
|
||||
'ɹ',
|
||||
'ɹ̩',
|
||||
'ʃ',
|
||||
'ʊ',
|
||||
'ʌ',
|
||||
'ʒ',
|
||||
'θ'
|
||||
]
|
||||
_phonemes = ['l',
|
||||
'ɹ',
|
||||
'ɜ',
|
||||
'ɚ',
|
||||
'k',
|
||||
'u',
|
||||
'ʔ',
|
||||
'ð',
|
||||
'ɐ',
|
||||
'ɾ',
|
||||
'ɑ',
|
||||
'ɔ',
|
||||
'b',
|
||||
'ɛ',
|
||||
't',
|
||||
'v',
|
||||
'n',
|
||||
'm',
|
||||
'ʊ',
|
||||
'ŋ',
|
||||
's',
|
||||
'ʌ',
|
||||
'o',
|
||||
'ʃ',
|
||||
'i',
|
||||
'p',
|
||||
'æ',
|
||||
'e',
|
||||
'a',
|
||||
'ʒ',
|
||||
' ',
|
||||
'h',
|
||||
'ɪ',
|
||||
'ɡ',
|
||||
'f',
|
||||
'r',
|
||||
'w',
|
||||
'ɫ',
|
||||
'd',
|
||||
'x',
|
||||
'ː',
|
||||
'ᵻ',
|
||||
'ə',
|
||||
'j',
|
||||
'θ',
|
||||
'z']
|
||||
|
||||
_phonemes = sorted(list(set(_phonemes)))
|
||||
|
||||
# Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters):
|
||||
|
|
Loading…
Reference in New Issue