coqui-tts/utils/text/cmudict.py

142 lines
3.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# -*- coding: utf-8 -*-
import re
import epitran
# Shared Epitran instance created with the 'eng-Latn' code; text2phone()
# below calls its trans_list() method.
epi = epitran.Epitran('eng-Latn')
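# NOTE(review): the ARPAbet symbol list below is commented out, but
# _get_pronunciation() still tests membership in `_valid_symbol_set`, which
# has no definition in the visible part of this file -- confirm which symbol
# set is intended before relying on CMUDict parsing.
# valid_symbols = [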
# 'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1',
# 'AH2', 'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0',
# 'AY1', 'AY2', 'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0',
# 'ER1', 'ER2', 'EY', 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0',
# 'IH1', 'IH2', 'IY', 'IY0', 'IY1', 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG',
# 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0', 'OY1', 'OY2', 'P', 'R', 'S', 'SH',
# 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW', 'UW0', 'UW1', 'UW2', 'V', 'W',
# 'Y', 'Z', 'ZH'
# ]
# Set of phoneme symbols: ASCII letters and signs, accented Latin letters and
# IPA letters. Some entries are multi-code-point sequences (a tie bar or a
# combining mark), so edit them with care.
# Nothing in the visible part of this module reads this set.
# NOTE(review): presumably the symbol inventory expected from Epitran output
# for 'eng-Latn' -- confirm against the caller.
_phonemes = {
'$',
'&',
'a',
'b',
'd',
'd͡ʒ',
'e',
'f',
'h',
'i',
'j',
'k',
'l',
'm',
'n',
'o',
'p',
's',
't',
't͡ʃ',
'u',
'v',
'w',
'z',
'£',
'à',
'â',
'æ',
'è',
'é',
'ê',
'ð',
'ü',
'ŋ',
'ɑ',
'ɔ',
'ə',
'ɛ',
'ɡ',
'ɪ',
'ɹ',
'ɹ̩',
'ʃ',
'ʊ',
'ʌ',
'ʒ',
'θ'
}
# The literal above is already a set; this call only makes a copy of it.
_phonemes = set(_phonemes)
def text2phone(text):
    '''Transliterate ``text`` with the module-level Epitran instance.

    Calls ``epi.trans_list(text, normpunc=True)`` and returns its result
    unchanged.

    Returns ``None`` when the call raises: this is a deliberate best-effort
    conversion, so failures are reported to the caller as ``None`` rather
    than propagated.
    '''
    try:
        return epi.trans_list(text, normpunc=True)
    except Exception:
        # Catch Exception rather than using a bare ``except`` so that
        # KeyboardInterrupt / SystemExit still propagate to the caller.
        return None
class CMUDict:
    '''Thin wrapper around CMUDict data. http://www.speech.cs.cmu.edu/cgi-bin/cmudict'''

    def __init__(self, file_or_path, keep_ambiguous=True):
        '''Load the dictionary entries.

        Args:
            file_or_path: a ``str`` path, opened with latin-1 encoding, or
                any other object, which is handed to ``_parse_cmudict`` as is.
            keep_ambiguous: when false, only words with exactly one
                pronunciation are kept.
        '''
        if isinstance(file_or_path, str):
            with open(file_or_path, encoding='latin-1') as handle:
                parsed = _parse_cmudict(handle)
        else:
            parsed = _parse_cmudict(file_or_path)

        if not keep_ambiguous:
            parsed = {w: prons for w, prons in parsed.items() if len(prons) == 1}

        self._entries = parsed

    def __len__(self):
        '''Number of words stored in the dictionary.'''
        return len(self._entries)

    def lookup(self, word):
        '''Returns list of ARPAbet pronunciations of the given word.

        The word is upper-cased before the lookup; ``None`` is returned when
        it has no entry.
        '''
        key = word.upper()
        return self._entries.get(key)

    def get_arpabet(self, word, cmudict, punctuation_symbols):
        '''Replace ``word`` by its first pronunciation wrapped in braces.

        At most one leading and one trailing character found in
        ``punctuation_symbols`` is split off and re-attached around the
        result. The pronunciation comes from ``cmudict.lookup``; this method
        does not read the instance's own entries. When ``lookup`` returns
        ``None`` the stripped word itself is used.
        '''
        prefix = suffix = ''
        if len(word) > 0 and word[0] in punctuation_symbols:
            prefix, word = word[0], word[1:]
        if len(word) > 0 and word[-1] in punctuation_symbols:
            suffix, word = word[-1], word[:-1]

        pronunciations = cmudict.lookup(word)
        if pronunciations is None:
            core = word
        else:
            core = '{%s}' % pronunciations[0]
        return prefix + core + suffix
# Matches a parenthesised run of digits such as "(1)"; _parse_cmudict() strips
# these from the word field (presumably CMUdict's alternate-pronunciation
# markers, e.g. "WORD(1)" -- confirm against the dictionary file).
_alt_re = re.compile(r'\([0-9]+\)')
def _parse_cmudict(file):
    '''Build a ``{word: [pronunciation, ...]}`` dict from dictionary lines.

    Args:
        file: iterable of text lines (for example an open file object).

    Returns:
        dict mapping each word, with any ``(N)`` suffix removed, to the list
        of pronunciations accepted by ``_get_pronunciation``, in file order.

    Lines are skipped when they are empty, do not start with ``A``-``Z`` or an
    apostrophe, contain no separator, or carry a pronunciation that
    ``_get_pronunciation`` rejects.
    '''
    cmudict = {}
    for line in file:
        # Only entries starting with an upper-case ASCII letter or an
        # apostrophe are dictionary words; everything else is ignored.
        if not line or not ('A' <= line[0] <= 'Z' or line[0] == "'"):
            continue

        # NOTE(review): only parts[1] is used below, so with a single-space
        # separator just the first token after the word reaches
        # _get_pronunciation -- confirm this separator matches the format of
        # the dictionary file in use.
        parts = line.split(' ')
        if len(parts) < 2:
            # No separator on this line: there is no pronunciation field, so
            # skip it instead of failing the whole parse with an IndexError.
            continue

        word = _alt_re.sub('', parts[0])
        pronunciation = _get_pronunciation(parts[1])
        if pronunciation:
            cmudict.setdefault(word, []).append(pronunciation)
    return cmudict
def _get_pronunciation(s):
    '''Validate a pronunciation string.

    ``s`` is stripped of surrounding whitespace and split on single spaces.
    Returns the tokens re-joined with single spaces when every token is in
    ``_valid_symbol_set``, otherwise ``None``.
    '''
    # NOTE(review): `_valid_symbol_set` has no definition in the visible part
    # of this file -- confirm where it is supposed to come from.
    symbols = s.strip().split(' ')
    if any(symbol not in _valid_symbol_set for symbol in symbols):
        return None
    return ' '.join(symbols)