# Mirror of https://github.com/coqui-ai/TTS.git (33 lines, 1.0 KiB, Python)
# coding: utf-8
|
|
# Code based on https://github.com/carpedm20/multi-speaker-tacotron-tensorflow/blob/master/text/korean.py
|
|
import re
|
|
|
|
from TTS.tts.utils.text.korean.ko_dictionary import english_dictionary, etc_dictionary
|
|
|
|
|
|
def normalize(text):
    """Normalize a raw text string.

    The steps run in this order:

    1. Strip leading and trailing whitespace.
    2. Delete every character matched by the Han character class below
       (CJK radicals, ideographs and compatibility ideographs).
    3. Apply the ``etc_dictionary`` substitutions.
    4. Replace ASCII-letter words found in ``english_dictionary``.
    5. Lower-case whatever remains.

    Args:
        text: The input string.

    Returns:
        The normalized string.
    """
    stripped = text.strip()
    without_han = re.sub("[⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]", "", stripped)
    substituted = normalize_with_dictionary(without_han, etc_dictionary)
    # Lower-casing happens last, so the English lookup sees the original casing.
    return normalize_english(substituted).lower()
|
|
|
|
|
|
def normalize_with_dictionary(text, dic):
    """Replace every occurrence of a key of ``dic`` in ``text`` with its value.

    Keys are matched literally (they are regex-escaped) and in a single pass,
    so replacement output is never re-scanned. When several keys could match
    at the same position, the longest key wins.

    Args:
        text: The string to transform.
        dic: Mapping from literal substrings to their replacement strings.

    Returns:
        ``text`` with all key occurrences substituted; ``text`` unchanged if
        ``dic`` is empty or none of its keys occur.
    """
    if not dic:
        # An empty alternation would match the empty string at every position.
        return text
    # Regex alternation takes the first alternative that matches, so longer
    # keys must come first or a key that is a prefix of another shadows it.
    # ``sorted`` is stable, so keys of equal length keep their original order.
    keys_longest_first = sorted(dic, key=len, reverse=True)
    pattern = re.compile("|".join(re.escape(key) for key in keys_longest_first))
    return pattern.sub(lambda match: dic[match.group()], text)
|
|
|
|
|
|
def normalize_english(text):
    """Replace ASCII-letter words that have an entry in ``english_dictionary``.

    Every maximal run of ``A-Z``/``a-z`` letters is looked up exactly as
    written (the lookup is case-sensitive). Words without an entry are left
    untouched.

    Args:
        text: The string to transform.

    Returns:
        The string with known English words substituted.
    """

    def replace_word(match):
        token = match.group()
        # Fall back to the word itself when the dictionary has no entry.
        return english_dictionary.get(token, token)

    return re.sub("([A-Za-z]+)", replace_word, text)
|