From cd83991067aedf9bb196ab43d96727753875c5f0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Mon, 17 Apr 2023 12:54:00 +0200
Subject: [PATCH] Add BN phonemizer

---
 TTS/tts/utils/text/bangla/__init__.py         |   0
 TTS/tts/utils/text/bangla/phonemizer.py       | 120 ++++++++++++++++++
 TTS/tts/utils/text/phonemizers/__init__.py    |   4 +
 .../text/phonemizers/bangla_phonemizer.py     |  62 +++++++++
 4 files changed, 186 insertions(+)
 create mode 100644 TTS/tts/utils/text/bangla/__init__.py
 create mode 100644 TTS/tts/utils/text/bangla/phonemizer.py
 create mode 100644 TTS/tts/utils/text/phonemizers/bangla_phonemizer.py

diff --git a/TTS/tts/utils/text/bangla/__init__.py b/TTS/tts/utils/text/bangla/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/TTS/tts/utils/text/bangla/phonemizer.py b/TTS/tts/utils/text/bangla/phonemizer.py
new file mode 100644
index 00000000..3b2391ac
--- /dev/null
+++ b/TTS/tts/utils/text/bangla/phonemizer.py
@@ -0,0 +1,120 @@
+import re
+from typing import List
+
+import bangla
+from bnnumerizer import numerize
+from bnunicodenormalizer import Normalizer
+
+# Shared normalizer instance, created once at import time.
+bnorm = Normalizer()
+
+# Patterns are compiled once here instead of on every call.
+_MULTI_SPACE_RE = re.compile(" +")
+_WHITESPACE_RE = re.compile(r"\s+")
+_SENTENCE_END_RE = re.compile("[।!?]")
+# A ':' directly between two Bangla digits, e.g. the colon in "৩:৪".
+_DIGIT_COLON_RE = re.compile(r"(?<=[০-৯]):(?=[০-৯])")
+
+
+attribution_dict = {
+    "সাঃ": "সাল্লাল্লাহু আলাইহি ওয়া সাল্লাম",
+    "আঃ": "আলাইহিস সালাম",
+    "রাঃ": "রাদিআল্লাহু আনহু",
+    "রহঃ": "রহমাতুল্লাহি আলাইহি",
+    "রহিঃ": "রহিমাহুল্লাহ",
+    "হাফিঃ": "হাফিযাহুল্লাহ",
+    "বায়ান": "বাইআন",
+    "দাঃবাঃ": "দামাত বারাকাতুহুম,দামাত বারাকাতুল্লাহ",
+    # "আয়াত" : "আইআত",#আইআত
+    # "ওয়া" : "ওআ",
+    # "ওয়াসাল্লাম" : "ওআসাল্লাম",
+    # "কেন" : "কেনো",
+    # "কোন" : "কোনো",
+    # "বল" : "বলো",
+    # "চল" : "চলো",
+    # "কর" : "করো",
+    # "রাখ" : "রাখো",
+    "’": "",
+    "‘": "",
+    # "য়" : "অ",
+    # "সম্প্রদায়" : "সম্প্রদাই",
+    # "রয়েছে" : "রইছে",
+    # "রয়েছ" : "রইছ",
+    "/": " বাই ",
+}
+
+
+def tag_text(text: str) -> str:
+    """Collapse every run of spaces in ``text`` into a single space.
+
+    The previous body also wrapped the text in "start"/"end" markers and
+    re-inserted each non-Arabic span unchanged. That round trip altered
+    nothing, but removing the markers afterwards deleted every literal
+    "start" or "end" in the input (e.g. "weekend" became "week").
+    """
+    return _MULTI_SPACE_RE.sub(" ", text)
+
+
+def normalize(sen: str) -> str:
+    """Normalize ``sen`` word by word with the shared ``bnorm`` instance.
+
+    Words whose normalized form is ``None`` are dropped; the remaining ones
+    are joined with single spaces.
+    """
+    _words = (bnorm(word)["normalized"] for word in sen.split())
+    return " ".join(word for word in _words if word is not None)
+
+
+def expand_full_attribution(text: str) -> str:
+    """Replace each ``attribution_dict`` key found in ``text`` by its expansion.
+
+    Every expansion is run through :func:`normalize` before it is inserted.
+    """
+    for word, expansion in attribution_dict.items():
+        if word in text:
+            text = text.replace(word, normalize(expansion))
+    return text
+
+
+def collapse_whitespace(text: str) -> str:
+    """Replace every run of whitespace in ``text`` with a single space."""
+    return _WHITESPACE_RE.sub(" ", text)
+
+
+def bangla_text_to_phonemes(text: str, seperator: str = "|") -> str:
+    """Clean up Bangla ``text`` and return it as "।"-terminated sentences.
+
+    Args:
+        text (str): Input text. ASCII digits are converted to Bangla digits.
+        seperator (str): Unused. Kept, misspelling included, so existing
+            positional and keyword callers keep working.
+
+    Returns:
+        str: The processed sentences concatenated, each one ending in "।".
+    """
+    # english numbers to bangla conversion
+    if re.search("[0-9]", text) is not None:
+        text = bangla.convert_english_digit_to_bangla_digit(text)
+
+    # Read ':' between two Bangla digits as ' এর '. The old character class
+    # "[০, ১, ...]" also contained ',' and ' ', so text such as "a : b" was
+    # rewritten as well.
+    text = _DIGIT_COLON_RE.sub(" এর ", text)
+
+    # numerize text
+    text = numerize(text)
+
+    # collapse repeated spaces
+    text = tag_text(text)
+
+    sentences = []
+    # Split based on sentence ending characters
+    for sentence in _SENTENCE_END_RE.split(text.strip()):
+        res = normalize(sentence.replace("\n", ""))
+        # expand attributes
+        res = collapse_whitespace(expand_full_attribution(res))
+        # Skip empty pieces (such as the one after a trailing "।") so the
+        # output no longer ends in a doubled "।।".
+        if res.strip():
+            sentences.append(res + "।")
+    return "".join(sentences)
diff --git 
a/TTS/tts/utils/text/phonemizers/__init__.py b/TTS/tts/utils/text/phonemizers/__init__.py
index a5a341e9..d443fb24 100644
--- a/TTS/tts/utils/text/phonemizers/__init__.py
+++ b/TTS/tts/utils/text/phonemizers/__init__.py
@@ -1,3 +1,4 @@
+from TTS.tts.utils.text.phonemizers.bangla_phonemizer import BN_Phonemizer
 from TTS.tts.utils.text.phonemizers.base import BasePhonemizer
 from TTS.tts.utils.text.phonemizers.espeak_wrapper import ESpeak
 from TTS.tts.utils.text.phonemizers.gruut_wrapper import Gruut
@@ -28,6 +29,7 @@ DEF_LANG_TO_PHONEMIZER["en"] = DEF_LANG_TO_PHONEMIZER["en-us"]
 DEF_LANG_TO_PHONEMIZER["ja-jp"] = JA_JP_Phonemizer.name()
 DEF_LANG_TO_PHONEMIZER["zh-cn"] = ZH_CN_Phonemizer.name()
 DEF_LANG_TO_PHONEMIZER["ko-kr"] = KO_KR_Phonemizer.name()
+DEF_LANG_TO_PHONEMIZER["bn"] = BN_Phonemizer.name()
 
 
 def get_phonemizer_by_name(name: str, **kwargs) -> BasePhonemizer:
@@ -50,6 +52,8 @@ def get_phonemizer_by_name(name: str, **kwargs) -> BasePhonemizer:
         return JA_JP_Phonemizer(**kwargs)
     if name == "ko_kr_phonemizer":
         return KO_KR_Phonemizer(**kwargs)
+    if name == "bn_phonemizer":
+        return BN_Phonemizer(**kwargs)
     raise ValueError(f"Phonemizer {name} not found")
 
 
diff --git a/TTS/tts/utils/text/phonemizers/bangla_phonemizer.py b/TTS/tts/utils/text/phonemizers/bangla_phonemizer.py
new file mode 100644
index 00000000..c844c029
--- /dev/null
+++ b/TTS/tts/utils/text/phonemizers/bangla_phonemizer.py
@@ -0,0 +1,62 @@
+from typing import Dict
+
+from TTS.tts.utils.text.bangla.phonemizer import bangla_text_to_phonemes
+from TTS.tts.utils.text.phonemizers.base import BasePhonemizer
+
+_DEF_ZH_PUNCS = "、.,[]()?!〽~『』「」【】"  # NOTE(review): CJK-style set with no Bangla danda "।" - confirm
+
+
+class BN_Phonemizer(BasePhonemizer):
+    """🐸TTS bn phonemizer using functions in `TTS.tts.utils.text.bangla.phonemizer`
+
+    Args:
+        punctuations (str):
+            Set of characters to be treated as punctuation. Defaults to `_DEF_ZH_PUNCS`.
+
+        keep_puncs (bool):
+            If True, keep the punctuations after phonemization. Defaults to False.
+
+    Note:
+        `_phonemize` returns the string produced by `bangla_text_to_phonemes`
+        for the input text, without any further processing.
+
+    TODO: someone with Bangla knowledge should check this implementation
+    """
+
+    language = "bn"
+
+    def __init__(self, punctuations=_DEF_ZH_PUNCS, keep_puncs=False, **kwargs):  # pylint: disable=unused-argument
+        super().__init__(self.language, punctuations=punctuations, keep_puncs=keep_puncs)
+
+    @staticmethod
+    def name():
+        return "bn_phonemizer"
+
+    @staticmethod
+    def phonemize_bn(text: str, separator: str = "|") -> str:
+        ph = bangla_text_to_phonemes(text, separator)
+        return ph
+
+    def _phonemize(self, text, separator):
+        return self.phonemize_bn(text, separator)
+
+    @staticmethod
+    def supported_languages() -> Dict:
+        return {"bn": "Bangla"}
+
+    def version(self) -> str:
+        return "0.0.1"
+
+    def is_available(self) -> bool:
+        return True
+
+
+if __name__ == "__main__":
+    text = "রাসূলুল্লাহ সাল্লাল্লাহু আলাইহি ওয়া সাল্লাম শিক্ষা দিয়েছেন যে, কেউ যদি কোন খারাপ কিছুর সম্মুখীন হয়, তখনও যেন বলে,।"
+    e = BN_Phonemizer()
+    print(e.supported_languages())
+    print(e.version())
+    print(e.language)
+    print(e.name())
+    print(e.is_available())
+    print("`" + e.phonemize(text) + "`")