Fix Chinese language code: rename 'zh' to 'zh-cn' in lookup tables and accept both codes in language checks

This commit is contained in:
Bojie Li 2023-11-16 21:51:16 +00:00
parent 7e4375da2b
commit 5f0d7c52a8
1 changed file with 7 additions and 6 deletions

View File

@ -20,7 +20,7 @@ from spacy.lang.es import Spanish
def get_spacy_lang(lang): def get_spacy_lang(lang):
if lang == "zh": if lang in ["zh", "zh-cn"]:
return Chinese() return Chinese()
elif lang == "ja": elif lang == "ja":
return Japanese() return Japanese()
@ -170,7 +170,7 @@ _abbreviations = {
# There are not many common abbreviations in Arabic as in English. # There are not many common abbreviations in Arabic as in English.
] ]
], ],
"zh": [ "zh-cn": [
(re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
for x in [ for x in [
# Chinese doesn't typically use abbreviations in the same way as Latin-based scripts. # Chinese doesn't typically use abbreviations in the same way as Latin-based scripts.
@ -335,7 +335,7 @@ _symbols_multilingual = {
("°", " درجة "), ("°", " درجة "),
] ]
], ],
"zh": [ "zh-cn": [
# Chinese # Chinese
(re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
for x in [ for x in [
@ -519,7 +519,7 @@ def _expand_number(m, lang="en"):
def expand_numbers_multilingual(text, lang="en"): def expand_numbers_multilingual(text, lang="en"):
if lang == "zh": if lang in ["zh", "zh-cn"]:
text = zh_num2words()(text) text = zh_num2words()(text)
else: else:
if lang in ["en", "ru"]: if lang in ["en", "ru"]:
@ -602,6 +602,7 @@ class VoiceBpeTokenizer:
"pt": 203, "pt": 203,
"pl": 224, "pl": 224,
"zh": 82, "zh": 82,
"zh-cn": 82,
"ar": 166, "ar": 166,
"cs": 186, "cs": 186,
"ru": 182, "ru": 182,
@ -627,9 +628,9 @@ class VoiceBpeTokenizer:
) )
def preprocess_text(self, txt, lang): def preprocess_text(self, txt, lang):
if lang in {"ar", "cs", "de", "en", "es", "fr", "hu", "it", "nl", "pl", "pt", "ru", "tr", "zh", "ko"}: if lang in {"ar", "cs", "de", "en", "es", "fr", "hu", "it", "nl", "pl", "pt", "ru", "tr", "zh", "zh-cn", "ko"}:
txt = multilingual_cleaners(txt, lang) txt = multilingual_cleaners(txt, lang)
if lang == "zh": if lang == "zh" or lang == "zh-cn":
txt = chinese_transliterate(txt) txt = chinese_transliterate(txt)
if lang == "ko": if lang == "ko":
txt = korean_transliterate(txt) txt = korean_transliterate(txt)