From da30c3c9b36b90e8c7c77ea853872b2723a7d1f8 Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Wed, 21 Nov 2018 17:06:10 +0100 Subject: [PATCH] change numbers.py to number_norm.py --- utils/text/number_norm.py | 129 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 129 insertions(+) create mode 100644 utils/text/number_norm.py diff --git a/utils/text/number_norm.py b/utils/text/number_norm.py new file mode 100644 index 00000000..9cc6f4df --- /dev/null +++ b/utils/text/number_norm.py @@ -0,0 +1,129 @@ +import re + +_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') +_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') +_pounds_re = re.compile(r'£([0-9\,]*[0-9]+)') +_dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)') +_ordinal_re = re.compile(r'([0-9]+)(st|nd|rd|th)') +_number_re = re.compile(r'[0-9]+') + +_units = [ + '', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', + 'ten', 'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen', 'sixteen', + 'seventeen', 'eighteen', 'nineteen' +] + +_tens = [ + '', + 'ten', + 'twenty', + 'thirty', + 'forty', + 'fifty', + 'sixty', + 'seventy', + 'eighty', + 'ninety', +] + +_digit_groups = [ + '', + 'thousand', + 'million', + 'billion', + 'trillion', + 'quadrillion', +] + +_ordinal_suffixes = [ + ('one', 'first'), + ('two', 'second'), + ('three', 'third'), + ('five', 'fifth'), + ('eight', 'eighth'), + ('nine', 'ninth'), + ('twelve', 'twelfth'), + ('ty', 'tieth'), +] + + +def _remove_commas(m): + return m.group(1).replace(',', '') + + +def _expand_decimal_point(m): + return m.group(1).replace('.', ' point ') + + +def _expand_dollars(m): + match = m.group(1) + parts = match.split('.') + if len(parts) > 2: + return match + ' dollars' # Unexpected format + dollars = int(parts[0]) if parts[0] else 0 + cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 + if dollars and cents: + dollar_unit = 'dollar' if dollars == 1 else 'dollars' + cent_unit = 'cent' if cents == 1 else 'cents' + return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit) + elif dollars: + dollar_unit = 'dollar' if dollars == 1 else 'dollars' + return '%s %s' % (dollars, dollar_unit) + elif cents: + cent_unit = 'cent' if cents == 1 else 'cents' + return '%s %s' % (cents, cent_unit) + else: + return 'zero dollars' + + +def _standard_number_to_words(n, digit_group): + parts = [] + if n >= 1000: + # Format next higher digit group. + parts.append(_standard_number_to_words(n // 1000, digit_group + 1)) + n = n % 1000 + + if n >= 100: + parts.append('%s hundred' % _units[n // 100]) + if n % 100 >= len(_units): + parts.append(_tens[(n % 100) // 10]) + parts.append(_units[(n % 100) % 10]) + else: + parts.append(_units[n % 100]) + if n > 0: + parts.append(_digit_groups[digit_group]) + return ' '.join([x for x in parts if x]) + + +def _number_to_words(n): + # Handle special cases first, then go to the standard case: + if n >= 1000000000000000000: + return str(n) # Too large, just return the digits + elif n == 0: + return 'zero' + elif n % 100 == 0 and n % 1000 != 0 and n < 3000: + return _standard_number_to_words(n // 100, 0) + ' hundred' + else: + return _standard_number_to_words(n, 0) + + +def _expand_number(m): + return _number_to_words(int(m.group(0))) + + +def _expand_ordinal(m): + num = _number_to_words(int(m.group(1))) + for suffix, replacement in _ordinal_suffixes: + if num.endswith(suffix): + return num[:-len(suffix)] + replacement + return num + 'th' + + +def normalize_numbers(text): + text = re.sub(_comma_number_re, _remove_commas, text) + text = re.sub(_pounds_re, r'\1 pounds', text) + text = re.sub(_dollars_re, _expand_dollars, text) + text = re.sub(_decimal_number_re, _expand_decimal_point, text) + text = re.sub(_ordinal_re, _expand_ordinal, text) + text = re.sub(_number_re, _expand_number, text) + return text