Add BN phonemizer

This commit is contained in:
Eren Gölge 2023-04-17 12:54:00 +02:00
parent 36be05290d
commit cd83991067
4 changed files with 190 additions and 0 deletions

View File

View File

@ -0,0 +1,124 @@
import re
from typing import List
from bnnumerizer import numerize
from bnunicodenormalizer import Normalizer
import bangla
from .pinyinToPhonemes import PINYIN_DICT
# Shared Normalizer instance, built once when the module is imported.
# normalize() calls it on every whitespace-separated word it receives.
bnorm = Normalizer()
# Abbreviation -> expansion table used by expand_full_attribution().
# Keys are matched as plain substrings of the text (not whole words) and are
# applied in the order they are listed here.
#
# NOTE(review): the literal previously carried two identical entries whose key
# and value were both the empty string. A repeated key in a dict literal is
# silently collapsed, and an empty key matches every text while replacing
# nothing, so both were dropped. If they originally held characters that were
# meant to be stripped from the input, re-add those characters explicitly.
attribution_dict = {
    "সাঃ": "সাল্লাল্লাহু আলাইহি ওয়া সাল্লাম",
    "আঃ": "আলাইহিস সালাম",
    "রাঃ": "রাদিআল্লাহু আনহু",
    "রহঃ": "রহমাতুল্লাহি আলাইহি",
    "রহিঃ": "রহিমাহুল্লাহ",
    "হাফিঃ": "হাফিযাহুল্লাহ",
    "বায়ান": "বাইআন",
    "দাঃবাঃ": "দামাত বারাকাতুহুম,দামাত বারাকাতুল্লাহ",
    # Disabled respellings, kept for reference:
    # "আয়াত" : "আইআত",#আইআত
    # "ওয়া" : "ওআ",
    # "ওয়াসাল্লাম" : "ওআসাল্লাম",
    # "কেন" : "কেনো",
    # "কোন" : "কোনো",
    # "বল" : "বলো",
    # "চল" : "চলো",
    # "কর" : "করো",
    # "রাখ" : "রাখো",
    # "য়" : "অ",
    # "সম্প্রদায়" : "সম্প্রদাই",
    # "রয়েছে" : "রইছে",
    # "রয়েছ" : "রইছ",
    "/": " বাই ",
}
def tag_text(text: str) -> str:
    """Collapse every run of spaces in ``text`` into a single space.

    The earlier body wrapped the text in the sentinel words "start" and "end",
    split it on the Arabic Unicode block (U+0600-U+06FF), replaced each
    resulting segment with itself, and finally removed the sentinels with a
    global ``str.replace``. The segment substitution never changed the text,
    while the global sentinel removal also deleted any "start" or "end" that
    was part of the caller's own input (for example "send" became "s").
    Only the space collapsing had an intended effect, so that is all that is
    kept.

    NOTE(review): if this function is supposed to emit per-script markers
    around the non-Arabic segments, add them back with delimiters that cannot
    collide with ordinary words.

    Args:
        text: Input text.

    Returns:
        ``text`` with each run of one or more space characters replaced by a
        single space. Other whitespace (tabs, newlines) is left untouched.
    """
    return re.sub(" +", " ", text)
def normalize(sen):
    """Run each whitespace-separated word of ``sen`` through ``bnorm``.

    For every word the ``"normalized"`` entry of the mapping returned by
    ``bnorm`` is taken; words for which that entry is ``None`` are dropped.
    The surviving words are joined with single spaces, so any leading,
    trailing or repeated whitespace in ``sen`` does not survive.
    """
    kept = []
    for token in sen.split():
        cleaned = bnorm(token)["normalized"]
        if cleaned is not None:
            kept.append(cleaned)
    return " ".join(kept)
def expand_full_attribution(text):
    """Replace every abbreviation listed in ``attribution_dict`` found in ``text``.

    Entries are tried in dictionary order and matched as plain substrings.
    Each expansion is passed through ``normalize`` before it is inserted.

    NOTE(review): ``normalize`` splits on whitespace and re-joins, so padding
    spaces inside an expansion (such as the ones around the "/" entry) are not
    carried into the result -- confirm that is acceptable.
    """
    for short_form, long_form in attribution_dict.items():
        if short_form not in text:
            # Skip the normalize() call for abbreviations that do not occur.
            continue
        text = text.replace(short_form, normalize(long_form))
    return text
def collapse_whitespace(text):
    """Return ``text`` with each run of whitespace replaced by one space."""
    whitespace_run = re.compile(r"\s+")
    return whitespace_run.sub(" ", text)
def bangla_text_to_phonemes(text: str, seperator: str = "|") -> str:
    """Prepare Bangla text for the ``bn`` phonemizer.

    Processing order:

    1. If the text contains an ASCII digit, convert digits with
       ``bangla.convert_english_digit_to_bangla_digit``.
    2. Replace a ``:`` that sits directly between two Bangla digits with
       ``" এর "``.
    3. Pass the text through ``numerize`` and then ``tag_text``.
    4. Strip the text, split it on the sentence enders ``।``, ``!`` and ``?``,
       and for each piece remove newlines, apply ``normalize``,
       ``expand_full_attribution`` and ``collapse_whitespace``.
    5. Concatenate the processed pieces.

    The steps visible in this function only rewrite and normalize text; no
    phoneme symbols are produced here.

    NOTE(review): the sentence enders are consumed by the split and nothing is
    inserted between the processed pieces, so the last word of one sentence
    ends up adjacent to the first word of the next. This is unchanged from the
    earlier behaviour; confirm whether a terminator should be appended.

    Args:
        text: Raw input text.
        seperator: Not used by this function. Kept, spelling included, so
            existing positional and keyword callers keep working.

    Returns:
        The processed pieces joined into one string.
    """
    # Digits are converted only when at least one ASCII digit is present.
    if re.search("[0-9]", text) is not None:
        text = bangla.convert_english_digit_to_bangla_digit(text)

    # U+09E6-U+09EF are the ten Bangla digits. The class is written as a range
    # so that commas and spaces no longer count as "digits", and lookarounds
    # leave the digits unconsumed so every colon in a chain such as 1:2:3
    # (written with Bangla digits) is replaced.
    text = re.sub(r"(?<=[\u09E6-\u09EF]):(?=[\u09E6-\u09EF])", " এর ", text)

    # numerize text
    text = numerize(text)
    # tag sections
    text = tag_text(text)

    # An unconditional step that replaced the empty string with the empty
    # string used to sit here; it could not change the text and was removed.

    # Split based on sentence ending characters
    pieces = []
    for sentence in re.split("[।!?]", text.strip()):
        piece = sentence.replace("\n", "")
        piece = normalize(piece)
        # expand attributes
        piece = expand_full_attribution(piece)
        piece = collapse_whitespace(piece)
        pieces.append(piece)
    return "".join(pieces)

View File

@ -1,3 +1,4 @@
from TTS.tts.utils.text.phonemizers.bangla_phonemizer import BN_Phonemizer
from TTS.tts.utils.text.phonemizers.base import BasePhonemizer
from TTS.tts.utils.text.phonemizers.espeak_wrapper import ESpeak
from TTS.tts.utils.text.phonemizers.gruut_wrapper import Gruut
@ -28,6 +29,7 @@ DEF_LANG_TO_PHONEMIZER["en"] = DEF_LANG_TO_PHONEMIZER["en-us"]
DEF_LANG_TO_PHONEMIZER["ja-jp"] = JA_JP_Phonemizer.name()
DEF_LANG_TO_PHONEMIZER["zh-cn"] = ZH_CN_Phonemizer.name()
DEF_LANG_TO_PHONEMIZER["ko-kr"] = KO_KR_Phonemizer.name()
DEF_LANG_TO_PHONEMIZER["bn"] = BN_Phonemizer.name()
def get_phonemizer_by_name(name: str, **kwargs) -> BasePhonemizer:
@ -50,6 +52,8 @@ def get_phonemizer_by_name(name: str, **kwargs) -> BasePhonemizer:
return JA_JP_Phonemizer(**kwargs)
if name == "ko_kr_phonemizer":
return KO_KR_Phonemizer(**kwargs)
if name == "bn_phonemizer":
return BN_Phonemizer(**kwargs)
raise ValueError(f"Phonemizer {name} not found")

View File

@ -0,0 +1,62 @@
from typing import Dict
from TTS.tts.utils.text.bangla.phonemizer import bangla_text_to_phonemes
from TTS.tts.utils.text.phonemizers.base import BasePhonemizer
# Default punctuation set used by BN_Phonemizer.__init__ when the caller does
# not supply one.
# NOTE(review): the name says ZH and the set contains CJK marks but not the
# Bangla danda; presumably carried over from the Chinese phonemizer -- confirm
# it is the right default for Bangla.
_DEF_ZH_PUNCS = "、.,[]()?!〽~『』「」【】"
class BN_Phonemizer(BasePhonemizer):
    """🐸TTS phonemizer for Bangla, registered under the name ``bn_phonemizer``.

    All text processing is delegated to ``bangla_text_to_phonemes`` from
    ``TTS.tts.utils.text.bangla.phonemizer``.

    Args:
        punctuations (str):
            Set of characters to be treated as punctuation. Defaults to `_DEF_ZH_PUNCS`.

        keep_puncs (bool):
            If True, keep the punctuations after phonemization. Defaults to False.

    Any further keyword arguments are accepted and ignored.

    TODO: someone with Bangla knowledge should check this implementation
    """

    language = "bn"

    def __init__(self, punctuations=_DEF_ZH_PUNCS, keep_puncs=False, **kwargs):  # pylint: disable=unused-argument
        super().__init__(self.language, punctuations=punctuations, keep_puncs=keep_puncs)

    @staticmethod
    def name():
        """Return the registry name of this phonemizer."""
        return "bn_phonemizer"

    @staticmethod
    def supported_languages() -> Dict:
        """Return the mapping of supported language codes to display names."""
        return {"bn": "Bangla"}

    @staticmethod
    def phonemize_bn(text: str, separator: str = "|") -> str:
        """Forward ``text`` and ``separator`` to ``bangla_text_to_phonemes``."""
        return bangla_text_to_phonemes(text, separator)

    def _phonemize(self, text, separator):
        """Phonemization hook; delegates to ``phonemize_bn``."""
        return self.phonemize_bn(text, separator)

    def version(self) -> str:
        """Return the version string of this phonemizer."""
        return "0.0.1"

    def is_available(self) -> bool:
        """Always report the phonemizer as available."""
        return True
if __name__ == "__main__":
    # Manual smoke run: print the phonemizer's metadata, then the processed
    # form of one sample sentence wrapped in backticks.
    sample = "রাসূলুল্লাহ সাল্লাল্লাহু আলাইহি ওয়া সাল্লাম শিক্ষা দিয়েছেন যে, কেউ যদি কোন খারাপ কিছুর সম্মুখীন হয়, তখনও যেন বলে,।"
    phonemizer = BN_Phonemizer()

    print(phonemizer.supported_languages())
    print(phonemizer.version())
    print(phonemizer.language)
    print(phonemizer.name())
    print(phonemizer.is_available())

    processed = phonemizer.phonemize(sample)
    print("`" + processed + "`")