diff --git a/TTS/tts/utils/text/chinese_mandarin/phonemizer.py b/TTS/tts/utils/text/chinese_mandarin/phonemizer.py index 29cac160..727c881e 100644 --- a/TTS/tts/utils/text/chinese_mandarin/phonemizer.py +++ b/TTS/tts/utils/text/chinese_mandarin/phonemizer.py @@ -19,7 +19,7 @@ def _chinese_pinyin_to_phoneme(pinyin: str) -> str: return phoneme + tone -def chinese_text_to_phonemes(text: str) -> str: +def chinese_text_to_phonemes(text: str, seperator: str = "|") -> str: tokenized_text = jieba.cut(text, HMM=False) tokenized_text = " ".join(tokenized_text) pinyined_text: List[str] = _chinese_character_to_pinyin(tokenized_text) @@ -34,4 +34,4 @@ def chinese_text_to_phonemes(text: str) -> str: else: # is ponctuation or other results += list(token) - return "|".join(results) + return seperator.join(results) diff --git a/TTS/tts/utils/text/french/__init__.py b/TTS/tts/utils/text/french/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/TTS/tts/utils/text/abbreviations.py b/TTS/tts/utils/text/french/abbreviations.py similarity index 66% rename from TTS/tts/utils/text/abbreviations.py rename to TTS/tts/utils/text/french/abbreviations.py index 7e44b90c..f580dfed 100644 --- a/TTS/tts/utils/text/abbreviations.py +++ b/TTS/tts/utils/text/french/abbreviations.py @@ -1,30 +1,5 @@ import re -# List of (regular expression, replacement) pairs for abbreviations in english: -abbreviations_en = [ - (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) - for x in [ - ("mrs", "misess"), - ("mr", "mister"), - ("dr", "doctor"), - ("st", "saint"), - ("co", "company"), - ("jr", "junior"), - ("maj", "major"), - ("gen", "general"), - ("drs", "doctors"), - ("rev", "reverend"), - ("lt", "lieutenant"), - ("hon", "honorable"), - ("sgt", "sergeant"), - ("capt", "captain"), - ("esq", "esquire"), - ("ltd", "limited"), - ("col", "colonel"), - ("ft", "fort"), - ] -] - # List of (regular expression, replacement) pairs for abbreviations in french: abbreviations_fr = [ (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) diff --git a/TTS/tts/utils/text/number_norm.py b/TTS/tts/utils/text/number_norm.py deleted file mode 100644 index e8377ede..00000000 --- a/TTS/tts/utils/text/number_norm.py +++ /dev/null @@ -1,97 +0,0 @@ -""" from https://github.com/keithito/tacotron """ - -import re -from typing import Dict - -import inflect - -_inflect = inflect.engine() -_comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])") -_decimal_number_re = re.compile(r"([0-9]+\.[0-9]+)") -_currency_re = re.compile(r"(£|\$|¥)([0-9\,\.]*[0-9]+)") -_ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)") -_number_re = re.compile(r"-?[0-9]+") - - -def _remove_commas(m): - return m.group(1).replace(",", "") - - -def _expand_decimal_point(m): - return m.group(1).replace(".", " point ") - - -def __expand_currency(value: str, inflection: Dict[float, str]) -> str: - parts = value.replace(",", "").split(".") - if len(parts) > 2: - return f"{value} {inflection[2]}" # Unexpected format - text = [] - integer = int(parts[0]) if parts[0] else 0 - if integer > 0: - integer_unit = inflection.get(integer, inflection[2]) - text.append(f"{integer} {integer_unit}") - fraction = int(parts[1]) if len(parts) > 1 and parts[1] else 0 - if fraction > 0: - fraction_unit = inflection.get(fraction / 100, inflection[0.02]) - text.append(f"{fraction} {fraction_unit}") - if len(text) == 0: - return f"zero {inflection[2]}" - return " ".join(text) - - -def _expand_currency(m: "re.Match") -> str: - currencies = { - "$": { - 0.01: "cent", - 0.02: "cents", - 1: "dollar", - 2: "dollars", - }, - "€": { - 0.01: "cent", - 0.02: "cents", - 1: "euro", - 2: "euros", - }, - "£": { - 0.01: "penny", - 0.02: "pence", - 1: "pound sterling", - 2: "pounds sterling", - }, - "¥": { - # TODO rin - 0.02: "sen", - 2: "yen", - }, - } - unit = m.group(1) - currency = currencies[unit] - value = m.group(2) - return __expand_currency(value, currency) - - -def _expand_ordinal(m): - return _inflect.number_to_words(m.group(0)) - - -def _expand_number(m): - num = int(m.group(0)) - if 1000 < num < 3000: - if num == 2000: - return "two thousand" - if 2000 < num < 2010: - return "two thousand " + _inflect.number_to_words(num % 100) - if num % 100 == 0: - return _inflect.number_to_words(num // 100) + " hundred" - return _inflect.number_to_words(num, andword="", zero="oh", group=2).replace(", ", " ") - return _inflect.number_to_words(num, andword="") - - -def normalize_numbers(text): - text = re.sub(_comma_number_re, _remove_commas, text) - text = re.sub(_currency_re, _expand_currency, text) - text = re.sub(_decimal_number_re, _expand_decimal_point, text) - text = re.sub(_ordinal_re, _expand_ordinal, text) - text = re.sub(_number_re, _expand_number, text) - return text diff --git a/TTS/tts/utils/text/time.py b/TTS/tts/utils/text/time.py deleted file mode 100644 index c8ac09e7..00000000 --- a/TTS/tts/utils/text/time.py +++ /dev/null @@ -1,47 +0,0 @@ -import re - -import inflect - -_inflect = inflect.engine() - -_time_re = re.compile( - r"""\b - ((0?[0-9])|(1[0-1])|(1[2-9])|(2[0-3])) # hours - : - ([0-5][0-9]) # minutes - \s*(a\\.m\\.|am|pm|p\\.m\\.|a\\.m|p\\.m)? # am/pm - \b""", - re.IGNORECASE | re.X, -) - - -def _expand_num(n: int) -> str: - return _inflect.number_to_words(n) - - -def _expand_time_english(match: "re.Match") -> str: - hour = int(match.group(1)) - past_noon = hour >= 12 - time = [] - if hour > 12: - hour -= 12 - elif hour == 0: - hour = 12 - past_noon = True - time.append(_expand_num(hour)) - - minute = int(match.group(6)) - if minute > 0: - if minute < 10: - time.append("oh") - time.append(_expand_num(minute)) - am_pm = match.group(7) - if am_pm is None: - time.append("p m" if past_noon else "a m") - else: - time.extend(list(am_pm.replace(".", ""))) - return " ".join(time) - - -def expand_time_english(text: str) -> str: - return re.sub(_time_re, _expand_time_english, text)