expand more currencies

This commit is contained in:
Jörg Thalheim 2020-12-05 09:45:57 +01:00
parent 58687f9c34
commit 76138687d3
No known key found for this signature in database
GPG Key ID: 003F2096411B5F92
2 changed files with 51 additions and 22 deletions

View File

@ -122,8 +122,8 @@ def portuguese_cleaners(text):
def phoneme_cleaners(text): def phoneme_cleaners(text):
'''Pipeline for phonemes mode, including number and abbreviation expansion.''' '''Pipeline for phonemes mode, including number and abbreviation expansion.'''
text = convert_to_ascii(text)
text = expand_numbers(text) text = expand_numbers(text)
text = convert_to_ascii(text)
text = expand_abbreviations(text) text = expand_abbreviations(text)
text = replace_symbols(text) text = replace_symbols(text)
text = remove_aux_symbols(text) text = remove_aux_symbols(text)

View File

@ -2,12 +2,12 @@
import inflect import inflect
import re import re
from typing import Dict
_inflect = inflect.engine() _inflect = inflect.engine()
_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') _comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') _decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')
_pounds_re = re.compile(r'£([0-9\,]*[0-9]+)') _currency_re = re.compile(r'(£|\$|¥)([0-9\,\.]*[0-9]+)')
_dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)')
_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)') _ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)')
_number_re = re.compile(r'[0-9]+') _number_re = re.compile(r'[0-9]+')
@ -20,24 +20,54 @@ def _expand_decimal_point(m):
return m.group(1).replace('.', ' point ') return m.group(1).replace('.', ' point ')
def _expand_dollars(m): def __expand_currency(value: str, inflection: Dict[float, str]) -> str:
match = m.group(1) parts = value.replace(",", "").split('.')
parts = match.split('.')
if len(parts) > 2: if len(parts) > 2:
return match + ' dollars' # Unexpected format return f"{value} {inflection[2]}" # Unexpected format
dollars = int(parts[0]) if parts[0] else 0 text = []
cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 integer = int(parts[0]) if parts[0] else 0
if dollars and cents: if integer > 0:
dollar_unit = 'dollar' if dollars == 1 else 'dollars' integer_unit = inflection.get(integer, inflection[2])
cent_unit = 'cent' if cents == 1 else 'cents' text.append(f"{integer} {integer_unit}")
return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit) fraction = int(parts[1]) if len(parts) > 1 and parts[1] else 0
if dollars: if fraction > 0:
dollar_unit = 'dollar' if dollars == 1 else 'dollars' fraction_unit = inflection.get(fraction/100, inflection[0.02])
return '%s %s' % (dollars, dollar_unit) text.append(f"{fraction} {fraction_unit}")
if cents: if len(text) == 0:
cent_unit = 'cent' if cents == 1 else 'cents' return f"zero {inflection[2]}"
return '%s %s' % (cents, cent_unit) return " ".join(text)
return 'zero dollars'
def _expand_currency(m: "re.Match") -> str:
    """Substitution callback that spells out a matched currency amount.

    Args:
        m: A match whose group 1 is the currency symbol and whose group 2 is
            the numeric amount (the layout produced by ``_currency_re``).

    Returns:
        The amount in words-and-digits form as built by
        ``__expand_currency`` for the matched currency.

    Raises:
        KeyError: If the matched symbol has no entry in the table below.
    """
    # Unit names per symbol: 1 / 2 are the singular / plural main unit,
    # 0.01 / 0.02 the singular / plural subunit.
    currencies = {
        "$": {
            0.01: "cent",
            0.02: "cents",
            1: "dollar",
            2: "dollars",
        },
        # NOTE(review): _currency_re as defined in this file matches only
        # £, $ and ¥, so this entry is used only once "€" is added there.
        "€": {
            0.01: "cent",
            0.02: "cents",
            1: "euro",
            2: "euros",
        },
        "£": {
            0.01: "penny",
            0.02: "pence",
            1: "pound sterling",
            2: "pounds sterling",
        },
        "¥": {
            # TODO rin
            # Only the plural keys are given; __expand_currency falls back
            # to them when a singular key is absent.
            0.02: "sen",
            2: "yen",
        },
    }
    unit = m.group(1)
    currency = currencies[unit]
    value = m.group(2)
    return __expand_currency(value, currency)
def _expand_ordinal(m): def _expand_ordinal(m):
@ -62,8 +92,7 @@ def _expand_number(m):
def normalize_numbers(text): def normalize_numbers(text):
text = re.sub(_comma_number_re, _remove_commas, text) text = re.sub(_comma_number_re, _remove_commas, text)
text = re.sub(_pounds_re, r'\1 pounds', text) text = re.sub(_currency_re, _expand_currency, text)
text = re.sub(_dollars_re, _expand_dollars, text)
text = re.sub(_decimal_number_re, _expand_decimal_point, text) text = re.sub(_decimal_number_re, _expand_decimal_point, text)
text = re.sub(_ordinal_re, _expand_ordinal, text) text = re.sub(_ordinal_re, _expand_ordinal, text)
text = re.sub(_number_re, _expand_number, text) text = re.sub(_number_re, _expand_number, text)