From 0f81d748c633c144c14bc27eda43d80431fb3a37 Mon Sep 17 00:00:00 2001 From: manmay-nakhashi Date: Fri, 1 Dec 2023 12:50:14 +0530 Subject: [PATCH] add hindi cleaner --- TTS/tts/layers/xtts/tokenizer.py | 67 +++++++++++++++++++++++++++----- 1 file changed, 58 insertions(+), 9 deletions(-) diff --git a/TTS/tts/layers/xtts/tokenizer.py b/TTS/tts/layers/xtts/tokenizer.py index 1a3cc47a..641ddd80 100644 --- a/TTS/tts/layers/xtts/tokenizer.py +++ b/TTS/tts/layers/xtts/tokenizer.py @@ -14,7 +14,7 @@ from spacy.lang.es import Spanish from spacy.lang.ja import Japanese from spacy.lang.zh import Chinese from tokenizers import Tokenizer - +from num_to_words import num_to_word from TTS.tts.layers.xtts.zh_num2words import TextNorm as zh_num2words @@ -130,6 +130,18 @@ _abbreviations = { ("jr", "junior"), ] ], + "hi": [ + (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) + for x in [ + ("डॉ", "डॉक्टर"), + ("सं", "संवत्"), + ("प", "पृष्ठ"), + ("उदा", "उदाहरण"), + ("वि", "विद्यालय"), + ("कु", "कुमारी"), + ("संवि", "संविधान") + ] + ], "pt": [ (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) for x in [ @@ -425,6 +437,19 @@ _symbols_multilingual = { ("°", " 도 "), ] ], + "hi": [ + (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1]) + for x in [ + ("&", " और "), + ("@", " एट "), + ("%", " प्रतिशत "), + ("#", " हैश "), + ("$", " डॉलर "), + ("£", " पाउंड "), + ("₹", " रुपए"), + ("°", " डिग्री "), + ] + ], } @@ -450,12 +475,16 @@ _ordinal_re = { "tr": re.compile(r"([0-9]+)(\.|inci|nci|uncu|üncü|\.)"), "hu": re.compile(r"([0-9]+)(\.|adik|edik|odik|edik|ödik|ödike|ik)"), "ko": re.compile(r"([0-9]+)(번째|번|차|째)"), + "hi": re.compile(r"([0-9]+)(वें|वां|री|रा|रीं|ठा|ठी|ठे)"), } _number_re = re.compile(r"[0-9]+") _currency_re = { "USD": re.compile(r"((\$[0-9\.\,]*[0-9]+)|([0-9\.\,]*[0-9]+\$))"), "GBP": re.compile(r"((£[0-9\.\,]*[0-9]+)|([0-9\.\,]*[0-9]+£))"), "EUR": re.compile(r"(([0-9\.\,]*[0-9]+€)|((€[0-9\.\,]*[0-9]+)))"), + "INR": re.compile(r"((₹[0-9\.\,]*[0-9]+)|([0-9\.\,]*[0-9]+₹))"), + "JPY": re.compile(r"((¥[0-9\.\,]*[0-9]+)|([0-9\.\,]*[0-9]+¥))"), + "KRW": re.compile(r"((₩[0-9\.\,]*[0-9]+)|([0-9\.\,]*[0-9]+₩))"), } _comma_number_re = re.compile(r"\b\d{1,3}(,\d{3})*(\.\d+)?\b") @@ -478,8 +507,14 @@ def _remove_dots(m): def _expand_decimal_point(m, lang="en"): - amount = m.group(1).replace(",", ".") - return num2words(float(amount), lang=lang if lang != "cs" else "cz") + whole, decimal = m.group(1).replace(",", ".").split(".") + if lang == "hi": + whole_word = num_to_word(int(whole), lang=lang) + decimal_word = num_to_word(int(decimal), lang=lang) + return f"{whole_word} दशमलव {decimal_word}" + else: + return num2words(float(m.group(1).replace(",", ".")), lang=lang if lang != "cs" else "cz") + def _expand_currency(m, lang="en", currency="USD"): @@ -501,6 +536,7 @@ def _expand_currency(m, lang="en", currency="USD"): "tr": ", ", "hu": ", ", "ko": ", ", + "hi": ", ", } if amount.is_integer(): @@ -512,11 +548,17 @@ def _expand_currency(m, lang="en", currency="USD"): def _expand_ordinal(m, lang="en"): - return num2words(int(m.group(1)), ordinal=True, lang=lang if lang != "cs" else "cz") + if lang == "hi": + return num_to_word(int(m.group(1)), lang=lang) + else: + return num2words(int(m.group(1)), ordinal=True, lang=lang if lang != "cs" else "cz") def _expand_number(m, lang="en"): - return num2words(int(m.group(0)), lang=lang if lang != "cs" else "cz") + if lang == "hi": + return num_to_word(int(m.group(0)), lang=lang) + else: + return num2words(int(m.group(0)), lang=lang if lang != "cs" else "cz") def expand_numbers_multilingual(text, lang="en"): @@ -531,10 +573,12 @@ def expand_numbers_multilingual(text, lang="en"): text = re.sub(_currency_re["GBP"], lambda m: _expand_currency(m, lang, "GBP"), text) text = re.sub(_currency_re["USD"], lambda m: _expand_currency(m, lang, "USD"), text) text = re.sub(_currency_re["EUR"], lambda m: _expand_currency(m, lang, "EUR"), text) + text = re.sub(_currency_re["INR"], lambda m: _expand_currency(m, lang, "INR"), text) except: pass if lang != "tr": text = re.sub(_decimal_number_re, lambda m: _expand_decimal_point(m, lang), text) + print(text) text = re.sub(_ordinal_re[lang], lambda m: _expand_ordinal(m, lang), text) text = re.sub(_number_re, lambda m: _expand_number(m, lang), text) return text @@ -628,7 +672,7 @@ class VoiceBpeTokenizer: ) def preprocess_text(self, txt, lang): - if lang in {"ar", "cs", "de", "en", "es", "fr", "hu", "it", "nl", "pl", "pt", "ru", "tr", "zh", "ko"}: + if lang in {"ar", "cs", "de", "en", "es", "fr", "hu", "it", "nl", "pl", "pt", "ru", "tr", "zh", "ko", "hi"}: txt = multilingual_cleaners(txt, lang) if lang == "zh": txt = chinese_transliterate(txt) @@ -636,9 +680,6 @@ class VoiceBpeTokenizer: txt = korean_transliterate(txt) elif lang == "ja": txt = japanese_cleaners(txt, self.katsu) - elif lang == "hi": - # @manmay will implement this - txt = basic_cleaners(txt) else: raise NotImplementedError(f"Language '{lang}' is not supported.") return txt @@ -761,6 +802,9 @@ def test_expand_numbers_multilingual(): ("12.5 초 안에.", "십이 점 다섯 초 안에.", "ko"), ("50 명의 병사가 있었다.", "오십 명의 병사가 있었다.", "ko"), ("이것은 1 번째 테스트입니다", "이것은 첫 번째 테스트입니다", "ko"), + ("मुझे 50 सैनिक मिले।", "मुझे पचास सैनिक मिले।", "hi"), + ("यह ₹20 का है।", "यह ₹बीस का है।", "hi"), + ("आज 25.5° तापमान है।", "आज पच्चीस दशमलव पाँच° तापमान है।", "hi"), ] for a, b, lang in test_cases: out = expand_numbers_multilingual(a, lang=lang) @@ -803,6 +847,7 @@ def test_abbreviations_multilingual(): ("Dr. Ayşe burada.", "doktor Ayşe burada.", "tr"), # Hungarian ("Dr. Szabó itt van.", "doktor Szabó itt van.", "hu"), + ("डॉ. शर्मा यहाँ हैं।", "डॉक्टर शर्मा यहाँ हैं।", "hi"), ] for a, b, lang in test_cases: @@ -830,6 +875,10 @@ def test_symbols_multilingual(): ("Pilim %14 dolu.", "Pilim yüzde 14 dolu.", "tr"), ("Az akkumulátorom töltöttsége 14%", "Az akkumulátorom töltöttsége 14 százalék", "hu"), ("배터리 잔량이 14%입니다.", "배터리 잔량이 14 퍼센트입니다.", "ko"), + ("मेरी बैटरी 14% बची है", "मेरी बैटरी 14 प्रतिशत बची है", "hi"), + ("क्या आप @ घर हैं?", "क्या आप एट घर हैं?", "hi"), + ("यह 20₹ का है।", "यह बीस रुपए का है।", "hi"), + ("आज तापमान 30° है", "आज तापमान 30 डिग्री है", "hi"), ] for a, b, lang in test_cases: