feat(xtts): support hindi in tokenizer (#64)

Added proper tokenizer support for the Hindi language, which prevents a crash when fine-tuning on Hindi.

Co-authored-by: Akshat Bhardwaj <157223825+akshatrocky@users.noreply.github.com>
This commit is contained in:
Enno Hermann 2024-09-12 20:29:21 +01:00 committed by GitHub
parent 233dfb54ae
commit 1920328822
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
1 changed file with 25 additions and 4 deletions

View File

@ -233,6 +233,12 @@ _abbreviations = {
# Korean doesn't typically use abbreviations in the same way as Latin-based scripts. # Korean doesn't typically use abbreviations in the same way as Latin-based scripts.
] ]
], ],
"hi": [
(re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
for x in [
# Hindi doesn't typically use abbreviations in the same way as Latin-based scripts.
]
],
} }
@ -429,6 +435,18 @@ _symbols_multilingual = {
("°", ""), ("°", ""),
] ]
], ],
"hi": [
(re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
for x in [
("&", " और "),
("@", " ऐट दी रेट "),
("%", " प्रतिशत "),
("#", " हैश "),
("$", " डॉलर "),
("£", " पाउंड "),
("°", " डिग्री "),
]
],
} }
@ -454,6 +472,7 @@ _ordinal_re = {
"tr": re.compile(r"([0-9]+)(\.|inci|nci|uncu|üncü|\.)"), "tr": re.compile(r"([0-9]+)(\.|inci|nci|uncu|üncü|\.)"),
"hu": re.compile(r"([0-9]+)(\.|adik|edik|odik|edik|ödik|ödike|ik)"), "hu": re.compile(r"([0-9]+)(\.|adik|edik|odik|edik|ödik|ödike|ik)"),
"ko": re.compile(r"([0-9]+)(번째|번|차|째)"), "ko": re.compile(r"([0-9]+)(번째|번|차|째)"),
"hi": re.compile(r"([0-9]+)(st|nd|rd|th)"), # To check
} }
_number_re = re.compile(r"[0-9]+") _number_re = re.compile(r"[0-9]+")
_currency_re = { _currency_re = {
@ -505,6 +524,7 @@ def _expand_currency(m, lang="en", currency="USD"):
"tr": ", ", "tr": ", ",
"hu": ", ", "hu": ", ",
"ko": ", ", "ko": ", ",
"hi": ", ",
} }
if amount.is_integer(): if amount.is_integer():
@ -644,7 +664,7 @@ class VoiceBpeTokenizer:
) )
def preprocess_text(self, txt, lang): def preprocess_text(self, txt, lang):
if lang in {"ar", "cs", "de", "en", "es", "fr", "hu", "it", "nl", "pl", "pt", "ru", "tr", "zh", "ko"}: if lang in {"ar", "cs", "de", "en", "es", "fr", "hi", "hu", "it", "nl", "pl", "pt", "ru", "tr", "zh", "ko"}:
txt = multilingual_cleaners(txt, lang) txt = multilingual_cleaners(txt, lang)
if lang == "zh": if lang == "zh":
txt = chinese_transliterate(txt) txt = chinese_transliterate(txt)
@ -652,9 +672,6 @@ class VoiceBpeTokenizer:
txt = korean_transliterate(txt) txt = korean_transliterate(txt)
elif lang == "ja": elif lang == "ja":
txt = japanese_cleaners(txt, self.katsu) txt = japanese_cleaners(txt, self.katsu)
elif lang == "hi":
# @manmay will implement this
txt = basic_cleaners(txt)
else: else:
raise NotImplementedError(f"Language '{lang}' is not supported.") raise NotImplementedError(f"Language '{lang}' is not supported.")
return txt return txt
@ -777,6 +794,9 @@ def test_expand_numbers_multilingual():
("12.5 초 안에.", "십이 점 다섯 초 안에.", "ko"), ("12.5 초 안에.", "십이 점 다섯 초 안에.", "ko"),
("50 명의 병사가 있었다.", "오십 명의 병사가 있었다.", "ko"), ("50 명의 병사가 있었다.", "오십 명의 병사가 있었다.", "ko"),
("이것은 1 번째 테스트입니다", "이것은 첫 번째 테스트입니다", "ko"), ("이것은 1 번째 테스트입니다", "이것은 첫 번째 테스트입니다", "ko"),
# Hindi
("12.5 सेकंड में।", "साढ़े बारह सेकंड में।", "hi"),
("50 सैनिक थे।", "पचास सैनिक थे।", "hi"),
] ]
for a, b, lang in test_cases: for a, b, lang in test_cases:
out = expand_numbers_multilingual(a, lang=lang) out = expand_numbers_multilingual(a, lang=lang)
@ -846,6 +866,7 @@ def test_symbols_multilingual():
("Pilim %14 dolu.", "Pilim yüzde 14 dolu.", "tr"), ("Pilim %14 dolu.", "Pilim yüzde 14 dolu.", "tr"),
("Az akkumulátorom töltöttsége 14%", "Az akkumulátorom töltöttsége 14 százalék", "hu"), ("Az akkumulátorom töltöttsége 14%", "Az akkumulátorom töltöttsége 14 százalék", "hu"),
("배터리 잔량이 14%입니다.", "배터리 잔량이 14 퍼센트입니다.", "ko"), ("배터리 잔량이 14%입니다.", "배터리 잔량이 14 퍼센트입니다.", "ko"),
("मेरे पास 14% बैटरी है।", "मेरे पास चौदह प्रतिशत बैटरी है।", "hi"),
] ]
for a, b, lang in test_cases: for a, b, lang in test_cases: