Enhance text_normalization

WeberJulian 2023-10-20 06:03:01 -03:00
parent e18d6ada2f
commit d07b37b19c
3 changed files with 1650 additions and 183 deletions


@@ -1,206 +1,469 @@
import json
import os
import re
import json
import inflect
import pandas as pd
import pypinyin
import torch
from num2words import num2words
from tokenizers import Tokenizer
from unidecode import unidecode
from TTS.tts.utils.text.cleaners import english_cleaners
import pypinyin
import cutlet
from num2words import num2words
from TTS.tts.layers.xtts.zh_num2words import TextNorm as zh_num2words
_inflect = inflect.engine()
_comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])")
_decimal_number_re = re.compile(r"([0-9]+\.[0-9]+)")
_pounds_re = re.compile(r"£([0-9\,]*[0-9]+)")
_dollars_re = re.compile(r"\$([0-9\.\,]*[0-9]+)")
_ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)")
_number_re = re.compile(r"[0-9]+")
def _remove_commas(m):
return m.group(1).replace(",", "")
def _expand_decimal_point(m):
return m.group(1).replace(".", " point ")
def _expand_dollars(m):
match = m.group(1)
parts = match.split(".")
if len(parts) > 2:
return match + " dollars" # Unexpected format
dollars = int(parts[0]) if parts[0] else 0
cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
if dollars and cents:
dollar_unit = "dollar" if dollars == 1 else "dollars"
cent_unit = "cent" if cents == 1 else "cents"
return "%s %s, %s %s" % (dollars, dollar_unit, cents, cent_unit)
elif dollars:
dollar_unit = "dollar" if dollars == 1 else "dollars"
return "%s %s" % (dollars, dollar_unit)
elif cents:
cent_unit = "cent" if cents == 1 else "cents"
return "%s %s" % (cents, cent_unit)
else:
return "zero dollars"
def _expand_ordinal(m):
return _inflect.number_to_words(m.group(0))
def _expand_number(m):
num = int(m.group(0))
if num > 1000 and num < 3000:
if num == 2000:
return "two thousand"
elif num > 2000 and num < 2010:
return "two thousand " + _inflect.number_to_words(num % 100)
elif num % 100 == 0:
return _inflect.number_to_words(num // 100) + " hundred"
else:
return _inflect.number_to_words(num, andword="", zero="oh", group=2).replace(", ", " ")
else:
return _inflect.number_to_words(num, andword="")
def normalize_numbers(text):
text = re.sub(_comma_number_re, _remove_commas, text)
text = re.sub(_pounds_re, r"\1 pounds", text)
text = re.sub(_dollars_re, _expand_dollars, text)
text = re.sub(_decimal_number_re, _expand_decimal_point, text)
text = re.sub(_ordinal_re, _expand_ordinal, text)
text = re.sub(_number_re, _expand_number, text)
return text
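The English-only path above applies its substitutions in a fixed order: commas stripped, pounds, dollars, decimals, ordinals, then any remaining digits. A minimal sketch of the expected behaviour, assuming inflect is installed (output shown is approximate):

print(normalize_numbers("I paid $5.50 on the 3rd of May 1987"))
# -> "I paid five dollars, fifty cents on the third of May nineteen eighty-seven"
# The dollar step emits "5 dollars, 50 cents"; the final digit pass then spells out 5 and 50.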
# Regular expression matching whitespace:
_whitespace_re = re.compile(r"\s+")
# List of (regular expression, replacement) pairs for abbreviations:
_abbreviations = [
(re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
for x in [
("mrs", "misess"),
("mr", "mister"),
("dr", "doctor"),
("st", "saint"),
("co", "company"),
("jr", "junior"),
("maj", "major"),
("gen", "general"),
("drs", "doctors"),
("rev", "reverend"),
("lt", "lieutenant"),
("hon", "honorable"),
("sgt", "sergeant"),
("capt", "captain"),
("esq", "esquire"),
("ltd", "limited"),
("col", "colonel"),
("ft", "fort"),
]
]
_abbreviations = {
"en": [
(re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
for x in [
("mrs", "misess"),
("mr", "mister"),
("dr", "doctor"),
("st", "saint"),
("co", "company"),
("jr", "junior"),
("maj", "major"),
("gen", "general"),
("drs", "doctors"),
("rev", "reverend"),
("lt", "lieutenant"),
("hon", "honorable"),
("sgt", "sergeant"),
("capt", "captain"),
("esq", "esquire"),
("ltd", "limited"),
("col", "colonel"),
("ft", "fort"),
]
],
"es": [
(re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
for x in [
("sra", "señora"),
("sr", "señor"),
("dr", "doctor"),
("dra", "doctora"),
("st", "santo"),
("co", "compañía"),
("jr", "junior"),
("ltd", "limitada"),
]
],
"fr": [
(re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
for x in [
("mme", "madame"),
("mr", "monsieur"),
("dr", "docteur"),
("st", "saint"),
("co", "compagnie"),
("jr", "junior"),
("ltd", "limitée"),
]
],
"de": [
(re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
for x in [
("fr", "frau"),
("dr", "doktor"),
("st", "sankt"),
("co", "firma"),
("jr", "junior"),
]
],
"pt": [
(re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
for x in [
("sra", "senhora"),
("sr", "senhor"),
("dr", "doutor"),
("dra", "doutora"),
("st", "santo"),
("co", "companhia"),
("jr", "júnior"),
("ltd", "limitada"),
]
],
"it": [
(re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
for x in [
#("sig.ra", "signora"),
("sig", "signore"),
("dr", "dottore"),
("st", "santo"),
("co", "compagnia"),
("jr", "junior"),
("ltd", "limitata"),
]
],
"pl": [
(re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
for x in [
("p", "pani"),
("m", "pan"),
("dr", "doktor"),
("sw", "święty"),
("jr", "junior"),
]
],
"ar": [
(re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
for x in [
# Arabic has fewer common abbreviations of this kind than English does.
]
],
"zh-cn": [
(re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
for x in [
# Chinese doesn't typically use abbreviations in the same way as Latin-based scripts.
]
],
"cs": [
(re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
for x in [
("dr", "doktor"), # doctor
("ing", "inženýr"), # engineer
("p", "pan"), # Could also map to pani for woman but no easy way to do it
# Other abbreviations would be specialized and not as common.
]
],
"ru": [
(re.compile("\\b%s\\b" % x[0], re.IGNORECASE), x[1])
for x in [
("г-жа", "госпожа"), # Mrs.
("г", "господин"), # Mr.
("д-р", "доктор"), # doctor
# Other abbreviations are less common or specialized.
]
],
"nl": [
(re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
for x in [
("dhr", "de heer"), # Mr.
("mevr", "mevrouw"), # Mrs.
("dr", "dokter"), # doctor
("jhr", "jonkheer"), # young lord or nobleman
# Dutch uses more abbreviations, but these are the most common ones.
]
],
"tr": [
(re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
for x in [
("b", "bay"), # Mr.
("byk", "büyük"), # büyük
("dr", "doktor"), # doctor
# Add other Turkish abbreviations here if needed.
]
],
}
def expand_abbreviations(text):
for regex, replacement in _abbreviations:
def expand_abbreviations_multilingual(text, lang='en'):
for regex, replacement in _abbreviations[lang]:
text = re.sub(regex, replacement, text)
return text
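Since every pattern is compiled with re.IGNORECASE and anchored on a word boundary, expansion also works on already-lowercased input. A quick sketch of the intended behaviour:

print(expand_abbreviations_multilingual("mr. smith and dr. jones", lang="en"))
# -> "mister smith and doctor jones"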
_symbols_multilingual = {
'en': [
(re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
for x in [
("&", " and "),
("@", " at "),
("%", " percent "),
("#", " hash "),
("$", " dollar "),
("£", " pound "),
("°", " degree ")
]
],
'es': [
(re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
for x in [
("&", " y "),
("@", " arroba "),
("%", " por ciento "),
("#", " numeral "),
("$", " dolar "),
("£", " libra "),
("°", " grados ")
]
],
'fr': [
(re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
for x in [
("&", " et "),
("@", " arobase "),
("%", " pour cent "),
("#", " dièse "),
("$", " dollar "),
("£", " livre "),
("°", " degrés ")
]
],
'de': [
(re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
for x in [
("&", " und "),
("@", " at "),
("%", " prozent "),
("#", " raute "),
("$", " dollar "),
("£", " pfund "),
("°", " grad ")
]
],
'pt': [
(re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
for x in [
("&", " e "),
("@", " arroba "),
("%", " por cento "),
("#", " cardinal "),
("$", " dólar "),
("£", " libra "),
("°", " graus ")
]
],
'it': [
(re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
for x in [
("&", " e "),
("@", " chiocciola "),
("%", " per cento "),
("#", " cancelletto "),
("$", " dollaro "),
("£", " sterlina "),
("°", " gradi ")
]
],
'pl': [
(re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
for x in [
("&", " i "),
("@", " małpa "),
("%", " procent "),
("#", " krzyżyk "),
("$", " dolar "),
("£", " funt "),
("°", " stopnie ")
]
],
"ar": [
# Arabic
(re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
for x in [
("&", " و "),
("@", " على "),
("%", " في المئة "),
("#", " رقم "),
("$", " دولار "),
("£", " جنيه "),
("°", " درجة ")
]
],
"zh-cn": [
# Chinese
(re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
for x in [
("&", ""),
("@", ""),
("%", " 百分之 "),
("#", ""),
("$", " 美元 "),
("£", " 英镑 "),
("°", "")
]
],
"cs": [
# Czech
(re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
for x in [
("&", " a "),
("@", " na "),
("%", " procento "),
("#", " křížek "),
("$", " dolar "),
("£", " libra "),
("°", " stupně ")
]
],
"ru": [
# Russian
(re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
for x in [
("&", " и "),
("@", " собака "),
("%", " процентов "),
("#", " номер "),
("$", " доллар "),
("£", " фунт "),
("°", " градус ")
]
],
"nl": [
# Dutch
(re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
for x in [
("&", " en "),
("@", " bij "),
("%", " procent "),
("#", " hekje "),
("$", " dollar "),
("£", " pond "),
("°", " graden ")
]
],
"tr": [
(re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
for x in [
("&", " ve "),
("@", " at "),
("%", " yüzde "),
("#", " diyez "),
("$", " dolar "),
("£", " sterlin "),
("°", " derece ")
]
],
}
def expand_numbers(text):
return normalize_numbers(text)
def expand_symbols_multilingual(text, lang='en'):
for regex, replacement in _symbols_multilingual[lang]:
text = re.sub(regex, replacement, text)
text = text.replace("  ", " ") # Ensure there are no double spaces
return text.strip()
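Each replacement pads the symbol with spaces, so adjacent whitespace can double up; the single double-space replace pass catches most but not all of it before the final strip. A sketch of the expected behaviour:

print(expand_symbols_multilingual("50% & 20°", lang="fr"))
# -> roughly "50 pour cent et 20 degrés" (possibly with a residual double space)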
_ordinal_re = {
"en": re.compile(r"([0-9]+)(st|nd|rd|th)"),
"es": re.compile(r"([0-9]+)(º|ª|er|o|a|os|as)"),
"fr": re.compile(r"([0-9]+)(º|ª|er|re|e|ème)"),
"de": re.compile(r"([0-9]+)(st|nd|rd|th|º|ª|\.(?=\s|$))"),
"pt": re.compile(r"([0-9]+)(º|ª|o|a|os|as)"),
"it": re.compile(r"([0-9]+)(º|°|ª|o|a|i|e)"),
"pl": re.compile(r"([0-9]+)(º|ª|st|nd|rd|th)"),
"ar": re.compile(r"([0-9]+)(ون|ين|ث|ر|ى)"),
"cs": re.compile(r"([0-9]+)\.(?=\s|$)"), # In Czech, a dot is often used after the number to indicate ordinals.
"ru": re.compile(r"([0-9]+)(-й|-я|-е|-ое|-ье|-го)"),
"nl": re.compile(r"([0-9]+)(de|ste|e)"),
"tr": re.compile(r"([0-9]+)(\.|inci|nci|uncu|üncü|\.)"),
}
_number_re = re.compile(r"[0-9]+")
_currency_re = {
'USD': re.compile(r"((\$[0-9\.\,]*[0-9]+)|([0-9\.\,]*[0-9]+\$))"),
'GBP': re.compile(r"((£[0-9\.\,]*[0-9]+)|([0-9\.\,]*[0-9]+£))"),
'EUR': re.compile(r"(([0-9\.\,]*[0-9]+€)|((€[0-9\.\,]*[0-9]+)))")
}
_comma_number_re = re.compile(r"\b\d{1,3}(,\d{3})*(\.\d+)?\b")
_dot_number_re = re.compile(r"\b\d{1,3}(\.\d{3})*(\,\d+)?\b")
_decimal_number_re = re.compile(r"([0-9]+[.,][0-9]+)")
def _remove_commas(m):
text = m.group(0)
if "," in text:
text = text.replace(",", "")
return text
def _remove_dots(m):
text = m.group(0)
if "." in text:
text = text.replace(".", "")
return text
def _expand_decimal_point(m, lang='en'):
amount = m.group(1).replace(",", ".")
return num2words(float(amount), lang=lang if lang != "cs" else "cz")
def _expand_currency(m, lang='en', currency='USD'):
amount = float((re.sub(r'[^\d.]', '', m.group(0).replace(",", "."))))
full_amount = num2words(amount, to='currency', currency=currency, lang=lang if lang != "cs" else "cz")
and_equivalents = {
"en": ", ",
"es": " con ",
"fr": " et ",
"de": " und ",
"pt": " e ",
"it": " e ",
"pl": ", ",
"cs": ", ",
"ru": ", ",
"nl": ", ",
"ar": ", ",
"tr": ", ",
}
if amount.is_integer():
last_and = full_amount.rfind(and_equivalents[lang])
if last_and != -1:
full_amount = full_amount[:last_and]
return full_amount
def _expand_ordinal(m, lang='en'):
return num2words(int(m.group(1)), ordinal=True, lang=lang if lang != "cs" else "cz")
def _expand_number(m, lang='en'):
return num2words(int(m.group(0)), lang=lang if lang != "cs" else "cz")
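All three expanders delegate to num2words, which historically registers Czech under the legacy code "cz"; that is why lang is remapped above. Two illustrative calls showing what num2words should return:

print(num2words(3, ordinal=True, lang="es")) # -> "tercero"
print(num2words(3.14, lang="en")) # -> "three point one four"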
def expand_numbers_multilingual(text, lang='en'):
if lang == "zh-cn":
text = zh_num2words()(text)
else:
if lang in ["en", "ru"]:
text = re.sub(_comma_number_re, _remove_commas, text)
else:
text = re.sub(_dot_number_re, _remove_dots, text)
try:
text = re.sub(_currency_re['GBP'], lambda m: _expand_currency(m, lang, 'GBP'), text)
text = re.sub(_currency_re['USD'], lambda m: _expand_currency(m, lang, 'USD'), text)
text = re.sub(_currency_re['EUR'], lambda m: _expand_currency(m, lang, 'EUR'), text)
except:
pass
if lang != "tr":
text = re.sub(_decimal_number_re, lambda m: _expand_decimal_point(m, lang), text)
text = re.sub(_ordinal_re[lang], lambda m: _expand_ordinal(m, lang), text)
text = re.sub(_number_re, lambda m: _expand_number(m, lang), text)
return text
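Ordering matters here: currency amounts are consumed before the decimal rule, so "$5.50" is read as dollars and cents rather than "five point five zero". A sketch of the expected result:

print(expand_numbers_multilingual("The cake costs $5.50", lang="en"))
# -> "The cake costs five dollars, fifty cents"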
def lowercase(text):
return text.lower()
def collapse_whitespace(text):
return re.sub(_whitespace_re, " ", text)
def convert_to_ascii(text):
return unidecode(text)
def multilingual_cleaners(text, lang):
text = text.replace('"', '')
if lang=="tr":
text = text.replace("İ", "i")
text = text.replace("Ö", "ö")
text = text.replace("Ü", "ü")
text = lowercase(text)
text = expand_numbers_multilingual(text, lang)
text = expand_abbreviations_multilingual(text, lang)
text = expand_symbols_multilingual(text, lang=lang)
text = collapse_whitespace(text)
return text
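The full cleaner therefore lowercases first, then expands numbers, abbreviations, and symbols, and finally collapses whitespace. An end-to-end sketch for English (output is approximate):

print(multilingual_cleaners("Mr. Smith bought 3 apples for $5", "en"))
# -> "mister smith bought three apples for five dollars"
# The ", zero cents" tail is trimmed because the amount is an integer.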
def basic_cleaners(text):
"""Basic pipeline that lowercases and collapses whitespace without transliteration."""
text = lowercase(text)
text = collapse_whitespace(text)
text = text.replace('"', "")
return text
def chinese_transliterate(text):
return "".join([p[0] for p in pypinyin.pinyin(text, style=pypinyin.Style.TONE3, heteronym=False, neutral_tone_with_five=True)])
def expand_numbers_multilang(text, lang):
# TODO: Handle text more carefully. Currently, it just converts numbers without any context.
# Find all numbers in the input string
numbers = re.findall(r"\d+", text)
# Transliterate the numbers to text
for num in numbers:
transliterated_num = "".join(num2words(num, lang=lang))
text = text.replace(num, transliterated_num, 1)
return text
def transliteration_cleaners(text):
"""Pipeline for non-English text that transliterates to ASCII."""
text = convert_to_ascii(text)
def japanese_cleaners(text, katsu):
text = katsu.romaji(text)
text = lowercase(text)
text = collapse_whitespace(text)
return text
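cutlet romanizes the Japanese input before the usual lowercase and whitespace passes. A sketch, assuming cutlet and its fugashi/unidic dependencies are installed (the exact romanization depends on cutlet's tokenizer):

katsu = cutlet.Cutlet()
print(japanese_cleaners("日本語", katsu))
# -> something like "nihongo"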
def multilingual_cleaners(text, lang):
text = lowercase(text)
text = expand_numbers_multilang(text, lang)
text = collapse_whitespace(text)
text = text.replace('"', "")
if lang == "tr":
text = text.replace("İ", "i")
text = text.replace("Ö", "ö")
text = text.replace("Ü", "ü")
return text
def remove_extraneous_punctuation(word):
replacement_punctuation = {"{": "(", "}": ")", "[": "(", "]": ")", "`": "'", "": "-", "": "-", "`": "'", "ʼ": "'"}
replace = re.compile(
"|".join([re.escape(k) for k in sorted(replacement_punctuation, key=len, reverse=True)]), flags=re.DOTALL
)
word = replace.sub(lambda x: replacement_punctuation[x.group(0)], word)
# TODO: some of these are spoken ('@', '%', '+', etc). Integrate them into the cleaners.
extraneous = re.compile(r"^[@#%_=\$\^&\*\+\\]$")
word = extraneous.sub("", word)
return word
def arabic_cleaners(text):
text = lowercase(text)
text = collapse_whitespace(text)
return text
def chinese_cleaners(text):
text = lowercase(text)
text = "".join(
[p[0] for p in pypinyin.pinyin(text, style=pypinyin.Style.TONE3, heteronym=False, neutral_tone_with_five=True)]
)
return text
class VoiceBpeTokenizer:
def __init__(self, vocab_file=None, preprocess=None):
self.tokenizer = None
self.katsu = None
if vocab_file is not None:
with open(vocab_file, "r", encoding="utf-8") as f:
@@ -216,24 +479,17 @@ class VoiceBpeTokenizer:
self.tokenizer = Tokenizer.from_file(vocab_file)
def preprocess_text(self, txt, lang):
if lang == "ja":
import pykakasi
kks = pykakasi.kakasi()
results = kks.convert(txt)
txt = " ".join([result["kana"] for result in results])
txt = basic_cleaners(txt)
elif lang == "en":
if txt[:4] == "[en]":
txt = txt[4:]
txt = english_cleaners(txt)
txt = "[en]" + txt
elif lang == "ar":
txt = arabic_cleaners(txt)
elif lang == "zh-cn":
txt = chinese_cleaners(txt)
else:
if lang in ["en", "es", "fr", "de", "pt", "it", "pl", "ar", "cs", "ru", "nl", "tr", "zh-cn"]:
txt = multilingual_cleaners(txt, lang)
if lang == "zh-cn":
txt = chinese_transliterate(txt)
elif lang == "ja":
if self.katsu is None:
import cutlet
self.katsu = cutlet.Cutlet()
txt = japanese_cleaners(txt, self.katsu)
else:
raise NotImplementedError()
return txt
def encode(self, txt, lang):
@@ -250,3 +506,6 @@ class VoiceBpeTokenizer:
txt = txt.replace("[STOP]", "")
txt = txt.replace("[UNK]", "")
return txt
def __len__(self):
return self.tokenizer.get_vocab_size()
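A hypothetical usage sketch of the tokenizer, assuming an XTTS vocab.json is available locally (the file name is illustrative, not part of this commit):

tokenizer = VoiceBpeTokenizer(vocab_file="vocab.json") # hypothetical path
ids = tokenizer.encode("Mr. Smith bought 3 apples for $5", lang="en") # token ids for the cleaned text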

File diff suppressed because it is too large


@@ -2,3 +2,4 @@
# japanese g2p deps
mecab-python3==1.0.6
unidic-lite==1.0.8
cutlet