2nd version of the tokenizer fix

This commit is contained in:
WeberJulian 2023-10-06 17:39:04 +02:00
parent 1ec341857e
commit 2fdf51ebd2
1 changed file with 5 additions and 3 deletions

View File

@@ -223,9 +223,11 @@ class VoiceBpeTokenizer:
results = kks.convert(txt) results = kks.convert(txt)
txt = " ".join([result["kana"] for result in results]) txt = " ".join([result["kana"] for result in results])
txt = basic_cleaners(txt) txt = basic_cleaners(txt)
# elif lang == "en": elif lang == "en":
# txt = english_cleaners(txt) if txt[:4] == "[en]":
# English cleaner remove the language tag [en] txt = txt[4:]
txt = english_cleaners(txt)
txt = "[en]" + txt
elif lang == "ar": elif lang == "ar":
txt = arabic_cleaners(txt) txt = arabic_cleaners(txt)
elif lang == "zh-cn": elif lang == "zh-cn":