feat(xtts): support hindi in tokenizer (#64)

Add proper tokenizer support for the Hindi language, which prevents a crash when fine-tuning on Hindi.

Co-authored-by: Akshat Bhardwaj <157223825+akshatrocky@users.noreply.github.com>
This commit is contained in:
Enno Hermann 2024-09-12 20:29:21 +01:00 committed by GitHub
parent 233dfb54ae
commit 1920328822
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
1 changed file with 25 additions and 4 deletions

View File

@ -233,6 +233,12 @@ _abbreviations = {
# Korean doesn't typically use abbreviations in the same way as Latin-based scripts.
]
],
"hi": [
(re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
for x in [
# Hindi doesn't typically use abbreviations in the same way as Latin-based scripts.
]
],
}
@ -429,6 +435,18 @@ _symbols_multilingual = {
("°", ""),
]
],
"hi": [
(re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
for x in [
("&", " और "),
("@", " ऐट दी रेट "),
("%", " प्रतिशत "),
("#", " हैश "),
("$", " डॉलर "),
("£", " पाउंड "),
("°", " डिग्री "),
]
],
}
@ -454,6 +472,7 @@ _ordinal_re = {
"tr": re.compile(r"([0-9]+)(\.|inci|nci|uncu|üncü|\.)"),
"hu": re.compile(r"([0-9]+)(\.|adik|edik|odik|edik|ödik|ödike|ik)"),
"ko": re.compile(r"([0-9]+)(번째|번|차|째)"),
"hi": re.compile(r"([0-9]+)(st|nd|rd|th)"), # TODO: verify; these are the English ordinal suffixes, reused here as a placeholder for Hindi
}
_number_re = re.compile(r"[0-9]+")
_currency_re = {
@ -505,6 +524,7 @@ def _expand_currency(m, lang="en", currency="USD"):
"tr": ", ",
"hu": ", ",
"ko": ", ",
"hi": ", ",
}
if amount.is_integer():
@ -644,7 +664,7 @@ class VoiceBpeTokenizer:
)
def preprocess_text(self, txt, lang):
if lang in {"ar", "cs", "de", "en", "es", "fr", "hu", "it", "nl", "pl", "pt", "ru", "tr", "zh", "ko"}:
if lang in {"ar", "cs", "de", "en", "es", "fr", "hi", "hu", "it", "nl", "pl", "pt", "ru", "tr", "zh", "ko"}:
txt = multilingual_cleaners(txt, lang)
if lang == "zh":
txt = chinese_transliterate(txt)
@ -652,9 +672,6 @@ class VoiceBpeTokenizer:
txt = korean_transliterate(txt)
elif lang == "ja":
txt = japanese_cleaners(txt, self.katsu)
elif lang == "hi":
# @manmay will implement this
txt = basic_cleaners(txt)
else:
raise NotImplementedError(f"Language '{lang}' is not supported.")
return txt
@ -777,6 +794,9 @@ def test_expand_numbers_multilingual():
("12.5 초 안에.", "십이 점 다섯 초 안에.", "ko"),
("50 명의 병사가 있었다.", "오십 명의 병사가 있었다.", "ko"),
("이것은 1 번째 테스트입니다", "이것은 첫 번째 테스트입니다", "ko"),
# Hindi
("12.5 सेकंड में।", "साढ़े बारह सेकंड में।", "hi"),
("50 सैनिक थे।", "पचास सैनिक थे।", "hi"),
]
for a, b, lang in test_cases:
out = expand_numbers_multilingual(a, lang=lang)
@ -846,6 +866,7 @@ def test_symbols_multilingual():
("Pilim %14 dolu.", "Pilim yüzde 14 dolu.", "tr"),
("Az akkumulátorom töltöttsége 14%", "Az akkumulátorom töltöttsége 14 százalék", "hu"),
("배터리 잔량이 14%입니다.", "배터리 잔량이 14 퍼센트입니다.", "ko"),
("मेरे पास 14% बैटरी है।", "मेरे पास चौदह प्रतिशत बैटरी है।", "hi"),
]
for a, b, lang in test_cases: