new number text preprocessor

2018-04-06 05:03:26 -07:00 · 2018-04-06 05:03:26 -07:00 · e6bf09f3c6
parent a3f02b674a
commit e6bf09f3c6
2 changed files with 94 additions and 23 deletions
--- a/utils/text/cleaners.py
+++ b/utils/text/cleaners.py
@@ -1,6 +1,3 @@
 # -*- coding: utf-8 -*-
 '''
 Cleaners are transformations that run over the input text at both training and eval time.
--- a/utils/text/numbers.py
+++ b/utils/text/numbers.py
@@ -1,17 +1,67 @@
 # -*- coding: utf-8 -*-
 import inflect
 import re
 # Module-level inflect engine instance.
 _inflect = inflect.engine()
 # A digit, then one or more digits/commas, then a digit (e.g. "1,234"); the
 # whole number is captured as group 1.
 _comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
 # Digits, a literal dot, digits (e.g. "3.14"); captured as group 1.
 _decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')
 # A pound sign followed by an amount made of digits and commas; group 1 is the
 # amount without the currency sign.
 _pounds_re = re.compile(r'£([0-9\,]*[0-9]+)')
 # A dollar sign followed by an amount made of digits, dots and commas; group 1
 # is the amount without the currency sign.
 _dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)')
 # Digits followed by an ordinal suffix. The '-' line is the previous pattern;
 # the '+' line additionally captures the digits as group 1, which
 # _expand_ordinal reads via m.group(1).
-_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)')
+_ordinal_re = re.compile(r'([0-9]+)(st|nd|rd|th)')
 # Any run of digits.
 _number_re = re.compile(r'[0-9]+')
 # Words for 0-19, indexed by value. Index 0 is the empty string, which
 # _standard_number_to_words filters out when it joins the parts; that function
 # also uses len(_units) (20) as the threshold above which a tens word is needed.
 _units = [
  '',
  'one',
  'two',
  'three',
  'four',
  'five',
  'six',
  'seven',
  'eight',
  'nine',
  'ten',
  'eleven',
  'twelve',
  'thirteen',
  'fourteen',
  'fifteen',
  'sixteen',
  'seventeen',
  'eighteen',
  'nineteen'
 ]
 # Words for multiples of ten, indexed by the tens digit. In the visible code
 # _standard_number_to_words only indexes this for values 20-99, so entries 0
 # and 1 act as placeholders that keep the index aligned with the digit.
 _tens = [
  '',
  'ten',
  'twenty',
  'thirty',
  'forty',
  'fifty',
  'sixty',
  'seventy',
  'eighty',
  'ninety',
 ]
 # Scale word for each successive group of three digits, indexed by group
 # number (0 = lowest three digits, which take no scale word). The table ends
 # at 'quadrillion' (index 5), which covers every value below the 10**18
 # cut-off applied in _number_to_words.
 _digit_groups = [
  '',
  'thousand',
  'million',
  'billion',
  'trillion',
  'quadrillion',
 ]
 # (cardinal ending, ordinal ending) pairs for irregular ordinals. _expand_ordinal
 # tries them in order and rewrites the end of the spelled-out number with the
 # first pair that matches (e.g. '...one' -> '...first', '...ty' -> '...tieth');
 # when none matches it simply appends 'th'.
 _ordinal_suffixes = [
  ('one', 'first'),
  ('two', 'second'),
  ('three', 'third'),
  ('five', 'fifth'),
  ('eight', 'eighth'),
  ('nine', 'ninth'),
  ('twelve', 'twelfth'),
  ('ty', 'tieth'),
 ]
 def _remove_commas(m):
    '''Return group 1 of the regex match m with every comma stripped out.'''
    captured = m.group(1)
    # Splitting on ',' and re-joining with nothing drops all commas.
    return ''.join(captured.split(','))
@@ -42,23 +92,47 @@ def _expand_dollars(m):
        return 'zero dollars'
-def _expand_ordinal(m):
+def _standard_number_to_words(n, digit_group):
-    return _inflect.number_to_words(m.group(0))
+    parts = []
    # Spells out the integer n in English, three digits at a time.
    #   n            -- value of the digits from this group upward.
    #   digit_group  -- index into _digit_groups of n's lowest three digits
    #                   (0 for the units group, 1 for thousands, ...).
    # Returns the non-empty word pieces joined by single spaces; the result is
    # the empty string when n is 0.
    # (The '-' line above is a removed line of the old _expand_ordinal.)
    if n >= 1000:
        # Format next higher digit group.
        parts.append(_standard_number_to_words(n // 1000, digit_group + 1))
        n = n % 1000
    # From here on n is only this group's own value (0-999).
    if n >= 100:
        parts.append('%s hundred' % _units[n // 100])
    # _units covers 0-19 directly; 20-99 need a tens word plus a units word.
    if n % 100 >= len(_units):
        parts.append(_tens[(n % 100) // 10])
        parts.append(_units[(n % 100) % 10])
    else:
        parts.append(_units[n % 100])
    # Only name the scale when this group is non-zero, so that e.g. 1000000
    # yields 'one million' without a trailing 'thousand'.
    if n > 0:
        parts.append(_digit_groups[digit_group])
    # Drop the empty placeholders contributed by zero digits / group 0.
    return ' '.join([x for x in parts if x])
 def _number_to_words(n):
    '''Spell out the integer n in English words.

    Special cases are checked before the generic conversion:
      * values of 1000000000000000000 or more are returned as their digits;
      * 0 becomes 'zero';
      * values below 3000 that are a multiple of 100 but not of 1000 are read
        as a count of hundreds (1100 -> 'eleven hundred').
    Everything else is delegated to _standard_number_to_words.
    '''
    too_large = 1000000000000000000
    if n >= too_large:
        # Beyond the largest named digit group: fall back to the raw digits.
        return str(n)
    if n == 0:
        return 'zero'
    hundreds = n // 100
    whole_hundreds = n % 100 == 0
    # A multiple of 100 is also a multiple of 1000 exactly when its hundreds
    # count ends in 0.
    if whole_hundreds and hundreds % 10 != 0 and n < 3000:
        return _standard_number_to_words(hundreds, 0) + ' hundred'
    return _standard_number_to_words(n, 0)
 def _expand_number(m):
    # m is a regex match; its whole matched text (group 0) is parsed as an int
    # and spelled out with _number_to_words.
    # The '-' lines below are the previous inflect-based implementation that
    # this commit removes; the '+' lines are what replaces them.
-    num = int(m.group(0))
+    return _number_to_words(int(m.group(0)))
-    if num > 1000 and num < 3000:
+
-        if num == 2000:
+
-            return 'two thousand'
+def _expand_ordinal(m):
    # m is a regex match whose group 1 holds the digits of an ordinal such as
    # '21st'. The digits are spelled out as a cardinal ('twenty one') and the
    # ending is then turned into its ordinal form: the first matching pair in
    # _ordinal_suffixes rewrites the tail ('twenty first'); if no pair matches,
    # 'th' is appended ('four' -> 'fourth').
    # The '-' lines interleaved below belong to the old _expand_number body
    # that this commit removes.
-        elif num > 2000 and num < 2010:
+    num = _number_to_words(int(m.group(1)))
-            return 'two thousand ' + _inflect.number_to_words(num % 100)
+    for suffix, replacement in _ordinal_suffixes:
-        elif num % 100 == 0:
+        if num.endswith(suffix):
-            return _inflect.number_to_words(num // 100) + ' hundred'
+            return num[:-len(suffix)] + replacement
-        else:
+    return num + 'th'
    # NOTE(review): the three lines below look like the tail of the removed
    # _expand_number body whose '-' diff markers were lost -- confirm against
    # the original commit.
            return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ')
    else:
        return _inflect.number_to_words(num, andword='')
 def normalize_numbers(text):