# coqui-tts/TTS/tts/utils/text/chinese_mandarin/numbers.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Licensed under WTFPL or the Unlicense or CC0.
# This uses Python 3, but it's easy to port to Python 2 by changing
# strings to u'xx'.

import itertools
import re


def _num2chinese(num: str, big=False, simp=True, o=False, twoalt=False) -> str:
    """Convert numerical arabic numbers (0->9) to chinese hanzi numbers (〇 -> 九)

    Args:
        num (str): arabic number to convert
        big (bool, optional): use financial characters. Defaults to False.
        simp (bool, optional): use simplified characters instead of tradictional characters. Defaults to True.
        o (bool, optional): use 〇 for 'zero'. Defaults to False.
        twoalt (bool, optional): use 两/兩 for 'two' when appropriate. Defaults to False.

    Raises:
        ValueError: if number is more than 1e48
        ValueError: if 'e' exposent in number

    Returns:
        str: converted number as hanzi characters
    """

    # check num first
    nd = str(num)
    if abs(float(nd)) >= 1e48:
        raise ValueError("number out of range")
    if "e" in nd:
        raise ValueError("scientific notation is not supported")
    c_symbol = "正负点" if simp else "正負點"
    if o:  # formal
        twoalt = False
    if big:
        c_basic = "零壹贰叁肆伍陆柒捌玖" if simp else "零壹貳參肆伍陸柒捌玖"
        c_unit1 = "拾佰仟"
        c_twoalt = "贰" if simp else "貳"
    else:
        c_basic = "〇一二三四五六七八九" if o else "零一二三四五六七八九"
        c_unit1 = "十百千"
        if twoalt:
            c_twoalt = "两" if simp else "兩"
        else:
            c_twoalt = "二"
    c_unit2 = "万亿兆京垓秭穰沟涧正载" if simp else "萬億兆京垓秭穰溝澗正載"
    revuniq = lambda l: "".join(k for k, g in itertools.groupby(reversed(l)))
    nd = str(num)
    result = []
    if nd[0] == "+":
        result.append(c_symbol[0])
    elif nd[0] == "-":
        result.append(c_symbol[1])
    if "." in nd:
        integer, remainder = nd.lstrip("+-").split(".")
    else:
        integer, remainder = nd.lstrip("+-"), None
    if int(integer):
        splitted = [integer[max(i - 4, 0) : i] for i in range(len(integer), 0, -4)]
        intresult = []
        for nu, unit in enumerate(splitted):
            # special cases
            if int(unit) == 0:  # 0000
                intresult.append(c_basic[0])
                continue
            if nu > 0 and int(unit) == 2:  # 0002
                intresult.append(c_twoalt + c_unit2[nu - 1])
                continue
            ulist = []
            unit = unit.zfill(4)
            for nc, ch in enumerate(reversed(unit)):
                if ch == "0":
                    if ulist:  # ???0
                        ulist.append(c_basic[0])
                elif nc == 0:
                    ulist.append(c_basic[int(ch)])
                elif nc == 1 and ch == "1" and unit[1] == "0":
                    # special case for tens
                    # edit the 'elif' if you don't like
                    # 十四, 三千零十四, 三千三百一十四
                    ulist.append(c_unit1[0])
                elif nc > 1 and ch == "2":
                    ulist.append(c_twoalt + c_unit1[nc - 1])
                else:
                    ulist.append(c_basic[int(ch)] + c_unit1[nc - 1])
            ustr = revuniq(ulist)
            if nu == 0:
                intresult.append(ustr)
            else:
                intresult.append(ustr + c_unit2[nu - 1])
        result.append(revuniq(intresult).strip(c_basic[0]))
    else:
        result.append(c_basic[0])
    if remainder:
        result.append(c_symbol[2])
        result.append("".join(c_basic[int(ch)] for ch in remainder))
    return "".join(result)


def _number_replace(match) -> str:
    """function to apply in a match, transform all numbers in a match by chinese characters

    If the matched text cannot be spelled out as a single number (for example
    a digit run too long for `_num2chinese`), the digits are read one by one
    instead, so that one oversized number does not abort the whole substitution.

    Args:
        match (re.Match): numbers regex matches

    Returns:
        str: replaced characters for the numbers
    """
    match_str: str = match.group()
    try:
        return _num2chinese(match_str)
    except ValueError:
        # Fallback: digit-by-digit reading; non-digit characters are kept as-is.
        digits = "零一二三四五六七八九"
        return "".join(digits[int(ch)] if ch in "0123456789" else ch for ch in match_str)


def replace_numbers_to_characters_in_text(text: str) -> str:
    """Spell out every run of ASCII digits in a text as chinese characters (simplified)

    Each maximal run of the digits 0-9 is handed to `_number_replace` and
    substituted with what it returns; everything else is left untouched.

    Args:
        text (str): input text to transform

    Returns:
        str: output text
    """
    digit_run = re.compile(r"[0-9]+")
    return digit_run.sub(_number_replace, text)