newer number normalization

2020-07-08 10:26:20 +02:00 · 2020-07-08 10:26:20 +02:00 · 7c671cae5a
parent 512188469b
commit 7c671cae5a
1 changed file with 40 additions and 41 deletions
--- a/utils/text/number_norm.py
+++ b/utils/text/number_norm.py
@@ -1,10 +1,8 @@
 # -*- coding: utf-8 -*-
 """ from https://github.com/keithito/tacotron """
 import inflect
 import re
 _inflect = inflect.engine()
 _comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
 _decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')
@@ -15,58 +13,59 @@ _number_re = re.compile(r'[0-9]+')
 def _remove_commas(m):
-  return m.group(1).replace(',', '')
+    return m.group(1).replace(',', '')
 def _expand_decimal_point(m):
-  return m.group(1).replace('.', ' point ')
+    return m.group(1).replace('.', ' point ')
 def _expand_dollars(m):
-  match = m.group(1)
+    match = m.group(1)
-  parts = match.split('.')
+    parts = match.split('.')
-  if len(parts) > 2:
+    if len(parts) > 2:
-    return match + ' dollars'  # Unexpected format
+        return match + ' dollars'  # Unexpected format
-  dollars = int(parts[0]) if parts[0] else 0
+    dollars = int(parts[0]) if parts[0] else 0
-  cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
+    cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
-  if dollars and cents:
+    if dollars and cents:
-    dollar_unit = 'dollar' if dollars == 1 else 'dollars'
+        dollar_unit = 'dollar' if dollars == 1 else 'dollars'
-    cent_unit = 'cent' if cents == 1 else 'cents'
+        cent_unit = 'cent' if cents == 1 else 'cents'
-    return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit)
+        return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit)
-  elif dollars:
+    elif dollars:
-    dollar_unit = 'dollar' if dollars == 1 else 'dollars'
+        dollar_unit = 'dollar' if dollars == 1 else 'dollars'
-    return '%s %s' % (dollars, dollar_unit)
+        return '%s %s' % (dollars, dollar_unit)
-  elif cents:
+    elif cents:
-    cent_unit = 'cent' if cents == 1 else 'cents'
+        cent_unit = 'cent' if cents == 1 else 'cents'
-    return '%s %s' % (cents, cent_unit)
+        return '%s %s' % (cents, cent_unit)
-  else:
+    else:
-    return 'zero dollars'
+        return 'zero dollars'
 def _expand_ordinal(m):
-  return _inflect.number_to_words(m.group(0))
+    return _inflect.number_to_words(m.group(0))
 def _expand_number(m):
-  num = int(m.group(0))
+    num = int(m.group(0))
-  if num > 1000 and num < 3000:
+    if 1000 < num < 3000:
-    if num == 2000:
+        if num == 2000:
-      return 'two thousand'
+            return 'two thousand'
-    elif num > 2000 and num < 2010:
+        if 2000 < num < 2010:
-      return 'two thousand ' + _inflect.number_to_words(num % 100)
+            return 'two thousand ' + _inflect.number_to_words(num % 100)
-    elif num % 100 == 0:
+        if num % 100 == 0:
-      return _inflect.number_to_words(num // 100) + ' hundred'
+            return _inflect.number_to_words(num // 100) + ' hundred'
-    else:
+        return _inflect.number_to_words(num,
-      return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ')
+                                        andword='',
-  else:
+                                        zero='oh',
                                        group=2).replace(', ', ' ')
    return _inflect.number_to_words(num, andword='')
 def normalize_numbers(text):
-  text = re.sub(_comma_number_re, _remove_commas, text)
+    text = re.sub(_comma_number_re, _remove_commas, text)
-  text = re.sub(_pounds_re, r'\1 pounds', text)
+    text = re.sub(_pounds_re, r'\1 pounds', text)
-  text = re.sub(_dollars_re, _expand_dollars, text)
+    text = re.sub(_dollars_re, _expand_dollars, text)
-  text = re.sub(_decimal_number_re, _expand_decimal_point, text)
+    text = re.sub(_decimal_number_re, _expand_decimal_point, text)
-  text = re.sub(_ordinal_re, _expand_ordinal, text)
+    text = re.sub(_ordinal_re, _expand_ordinal, text)
-  text = re.sub(_number_re, _expand_number, text)
+    text = re.sub(_number_re, _expand_number, text)
-  return text
+    return text