From 5f0d7c52a8dbf0e5ff06282625bc948b58ced903 Mon Sep 17 00:00:00 2001 From: Bojie Li Date: Thu, 16 Nov 2023 21:51:16 +0000 Subject: [PATCH] fix Chinese lang from 'zh' to 'zh-cn' --- TTS/tts/layers/xtts/tokenizer.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/TTS/tts/layers/xtts/tokenizer.py b/TTS/tts/layers/xtts/tokenizer.py index 1ef655a3..424c8bb3 100644 --- a/TTS/tts/layers/xtts/tokenizer.py +++ b/TTS/tts/layers/xtts/tokenizer.py @@ -20,7 +20,7 @@ from spacy.lang.es import Spanish def get_spacy_lang(lang): - if lang == "zh": + if lang in ["zh", "zh-cn"]: return Chinese() elif lang == "ja": return Japanese() @@ -170,7 +170,7 @@ _abbreviations = { # There are not many common abbreviations in Arabic as in English. ] ], - "zh": [ + "zh-cn": [ (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) for x in [ # Chinese doesn't typically use abbreviations in the same way as Latin-based scripts. @@ -335,7 +335,7 @@ _symbols_multilingual = { ("°", " درجة "), ] ], - "zh": [ + "zh-cn": [ # Chinese (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) for x in [ @@ -519,7 +519,7 @@ def _expand_number(m, lang="en"): def expand_numbers_multilingual(text, lang="en"): - if lang == "zh": + if lang in ["zh", "zh-cn"]: text = zh_num2words()(text) else: if lang in ["en", "ru"]: @@ -602,6 +602,7 @@ class VoiceBpeTokenizer: "pt": 203, "pl": 224, "zh": 82, + "zh-cn": 82, "ar": 166, "cs": 186, "ru": 182, @@ -627,9 +628,9 @@ class VoiceBpeTokenizer: ) def preprocess_text(self, txt, lang): - if lang in {"ar", "cs", "de", "en", "es", "fr", "hu", "it", "nl", "pl", "pt", "ru", "tr", "zh", "ko"}: + if lang in {"ar", "cs", "de", "en", "es", "fr", "hu", "it", "nl", "pl", "pt", "ru", "tr", "zh", "zh-cn", "ko"}: txt = multilingual_cleaners(txt, lang) - if lang == "zh": + if lang == "zh" or lang == "zh-cn": txt = chinese_transliterate(txt) if lang == "ko": txt = korean_transliterate(txt)