mirror of https://github.com/coqui-ai/TTS.git
Update model entry
This commit is contained in:
parent
9b5c295fcf
commit
b47afc5275
|
@ -3,17 +3,17 @@
|
||||||
"multilingual": {
|
"multilingual": {
|
||||||
"multi-dataset": {
|
"multi-dataset": {
|
||||||
"xtts_v2": {
|
"xtts_v2": {
|
||||||
"description": "",
|
"description": "XTTS-v2 by Coqui with 16 languages.",
|
||||||
"hf_url": [
|
"hf_url": [
|
||||||
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/v1.0.0/model.pth",
|
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/model.pth",
|
||||||
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/v1.0.0/config.json",
|
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/config.json",
|
||||||
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/v1.0.0/vocab.json",
|
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/vocab.json",
|
||||||
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1.1/hash.md5"
|
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/hash.md5"
|
||||||
],
|
],
|
||||||
"default_vocoder": null,
|
"default_vocoder": null,
|
||||||
"commit": "e9a1953e",
|
"commit": "480a6cdf7",
|
||||||
"license": "CPML",
|
"license": "CPML",
|
||||||
"contact": "",
|
"contact": "info@coqui.ai",
|
||||||
"tos_required": true
|
"tos_required": true
|
||||||
},
|
},
|
||||||
"xtts_v1": {
|
"xtts_v1": {
|
||||||
|
|
|
@ -8,6 +8,9 @@ from num2words import num2words
|
||||||
from tokenizers import Tokenizer
|
from tokenizers import Tokenizer
|
||||||
|
|
||||||
from TTS.tts.layers.xtts.zh_num2words import TextNorm as zh_num2words
|
from TTS.tts.layers.xtts.zh_num2words import TextNorm as zh_num2words
|
||||||
|
from hangul_romanize import Transliter
|
||||||
|
from hangul_romanize.rule import academic
|
||||||
|
|
||||||
|
|
||||||
_whitespace_re = re.compile(r"\s+")
|
_whitespace_re = re.compile(r"\s+")
|
||||||
|
|
||||||
|
@ -112,7 +115,7 @@ _abbreviations = {
|
||||||
# There are not many common abbreviations in Arabic as in English.
|
# There are not many common abbreviations in Arabic as in English.
|
||||||
]
|
]
|
||||||
],
|
],
|
||||||
"zh-cn": [
|
"zh": [
|
||||||
(re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
|
(re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
|
||||||
for x in [
|
for x in [
|
||||||
# Chinese doesn't typically use abbreviations in the same way as Latin-based scripts.
|
# Chinese doesn't typically use abbreviations in the same way as Latin-based scripts.
|
||||||
|
@ -155,6 +158,22 @@ _abbreviations = {
|
||||||
# Add other Turkish abbreviations here if needed.
|
# Add other Turkish abbreviations here if needed.
|
||||||
]
|
]
|
||||||
],
|
],
|
||||||
|
"hu": [
|
||||||
|
(re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
|
||||||
|
for x in [
|
||||||
|
("dr", "doktor"), # doctor
|
||||||
|
("b", "bácsi"), # Mr.
|
||||||
|
("nőv", "nővér"), # nurse
|
||||||
|
# Add other Hungarian abbreviations here if needed.
|
||||||
|
]
|
||||||
|
],
|
||||||
|
"ko": [
|
||||||
|
(re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
|
||||||
|
for x in [
|
||||||
|
# Korean doesn't typically use abbreviations in the same way as Latin-based scripts.
|
||||||
|
|
||||||
|
]
|
||||||
|
]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -262,7 +281,7 @@ _symbols_multilingual = {
|
||||||
("°", " درجة "),
|
("°", " درجة "),
|
||||||
]
|
]
|
||||||
],
|
],
|
||||||
"zh-cn": [
|
"zh": [
|
||||||
# Chinese
|
# Chinese
|
||||||
(re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
|
(re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
|
||||||
for x in [
|
for x in [
|
||||||
|
@ -326,6 +345,31 @@ _symbols_multilingual = {
|
||||||
("°", " derece "),
|
("°", " derece "),
|
||||||
]
|
]
|
||||||
],
|
],
|
||||||
|
"hu": [
|
||||||
|
(re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
|
||||||
|
for x in [
|
||||||
|
("&", " és "),
|
||||||
|
("@", " kukac "),
|
||||||
|
("%", " százalék "),
|
||||||
|
("#", " kettőskereszt "),
|
||||||
|
("$", " dollár "),
|
||||||
|
("£", " font "),
|
||||||
|
("°", " fok ")
|
||||||
|
]
|
||||||
|
],
|
||||||
|
"ko": [
|
||||||
|
# Korean
|
||||||
|
(re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
|
||||||
|
for x in [
|
||||||
|
("&", " 그리고 "),
|
||||||
|
("@", " 에 "),
|
||||||
|
("%", " 퍼센트 "),
|
||||||
|
("#", " 번호 "),
|
||||||
|
("$", " 달러 "),
|
||||||
|
("£", " 파운드 "),
|
||||||
|
("°", " 도 ")
|
||||||
|
]
|
||||||
|
]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -349,6 +393,8 @@ _ordinal_re = {
|
||||||
"ru": re.compile(r"([0-9]+)(-й|-я|-е|-ое|-ье|-го)"),
|
"ru": re.compile(r"([0-9]+)(-й|-я|-е|-ое|-ье|-го)"),
|
||||||
"nl": re.compile(r"([0-9]+)(de|ste|e)"),
|
"nl": re.compile(r"([0-9]+)(de|ste|e)"),
|
||||||
"tr": re.compile(r"([0-9]+)(\.|inci|nci|uncu|üncü|\.)"),
|
"tr": re.compile(r"([0-9]+)(\.|inci|nci|uncu|üncü|\.)"),
|
||||||
|
"hu": re.compile(r"([0-9]+)(\.|adik|edik|odik|edik|ödik|ödike|ik)"),
|
||||||
|
"ko": re.compile(r"([0-9]+)(번째|번|차|째)"),
|
||||||
}
|
}
|
||||||
_number_re = re.compile(r"[0-9]+")
|
_number_re = re.compile(r"[0-9]+")
|
||||||
_currency_re = {
|
_currency_re = {
|
||||||
|
@ -398,6 +444,8 @@ def _expand_currency(m, lang="en", currency="USD"):
|
||||||
"nl": ", ",
|
"nl": ", ",
|
||||||
"ar": ", ",
|
"ar": ", ",
|
||||||
"tr": ", ",
|
"tr": ", ",
|
||||||
|
"hu": ", ",
|
||||||
|
"ko": ", ",
|
||||||
}
|
}
|
||||||
|
|
||||||
if amount.is_integer():
|
if amount.is_integer():
|
||||||
|
@ -415,9 +463,14 @@ def _expand_ordinal(m, lang="en"):
|
||||||
def _expand_number(m, lang="en"):
|
def _expand_number(m, lang="en"):
|
||||||
return num2words(int(m.group(0)), lang=lang if lang != "cs" else "cz")
|
return num2words(int(m.group(0)), lang=lang if lang != "cs" else "cz")
|
||||||
|
|
||||||
|
<<<<<<< HEAD
|
||||||
|
|
||||||
def expand_numbers_multilingual(text, lang="en"):
|
def expand_numbers_multilingual(text, lang="en"):
|
||||||
if lang == "zh-cn":
|
if lang == "zh-cn":
|
||||||
|
=======
|
||||||
|
def expand_numbers_multilingual(text, lang='en'):
|
||||||
|
if lang == "zh" or lang == "zh-cn":
|
||||||
|
>>>>>>> Update model entry
|
||||||
text = zh_num2words()(text)
|
text = zh_num2words()(text)
|
||||||
else:
|
else:
|
||||||
if lang in ["en", "ru"]:
|
if lang in ["en", "ru"]:
|
||||||
|
@ -472,30 +525,41 @@ def chinese_transliterate(text):
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def japanese_cleaners(text, katsu):
|
def japanese_cleaners(text, katsu):
|
||||||
text = katsu.romaji(text)
|
text = katsu.romaji(text)
|
||||||
text = lowercase(text)
|
text = lowercase(text)
|
||||||
return text
|
return text
|
||||||
|
|
||||||
|
<<<<<<< HEAD
|
||||||
|
|
||||||
class VoiceBpeTokenizer:
|
class VoiceBpeTokenizer:
|
||||||
def __init__(self, vocab_file=None, preprocess=None):
|
def __init__(self, vocab_file=None, preprocess=None):
|
||||||
self.tokenizer = None
|
self.tokenizer = None
|
||||||
self.katsu = None
|
self.katsu = None
|
||||||
|
=======
|
||||||
|
>>>>>>> Update model entry
|
||||||
|
|
||||||
if vocab_file is not None:
|
def korean_cleaners(text):
|
||||||
with open(vocab_file, "r", encoding="utf-8") as f:
|
r = Transliter(academic)
|
||||||
vocab = json.load(f)
|
return r.translit(text)
|
||||||
|
|
||||||
self.language = vocab["model"]["language"] if "language" in vocab["model"] else None
|
|
||||||
|
|
||||||
if preprocess is None:
|
def preprocess_text(txt, lang):
|
||||||
self.preprocess = "pre_tokenizer" in vocab and vocab["pre_tokenizer"]
|
if lang in ["en", "es", "fr", "de", "pt", "it", "pl", "zh", "ar", "cs", "ru", "nl", "tr", "hu"]:
|
||||||
else:
|
txt = multilingual_cleaners(txt, lang)
|
||||||
self.preprocess = preprocess
|
elif lang == "ja":
|
||||||
|
txt = japanese_cleaners(txt)
|
||||||
|
elif lang == "zh-cn" or lang == "zh":
|
||||||
|
txt = chinese_transliterate(txt)
|
||||||
|
elif lang == "ko":
|
||||||
|
txt = korean_cleaners(txt)
|
||||||
|
else:
|
||||||
|
raise NotImplementedError()
|
||||||
|
return txt
|
||||||
|
|
||||||
self.tokenizer = Tokenizer.from_file(vocab_file)
|
|
||||||
|
|
||||||
|
<<<<<<< HEAD
|
||||||
def preprocess_text(self, txt, lang):
|
def preprocess_text(self, txt, lang):
|
||||||
if lang in ["en", "es", "fr", "de", "pt", "it", "pl", "ar", "cs", "ru", "nl", "tr", "zh-cn"]:
|
if lang in ["en", "es", "fr", "de", "pt", "it", "pl", "ar", "cs", "ru", "nl", "tr", "zh-cn"]:
|
||||||
txt = multilingual_cleaners(txt, lang)
|
txt = multilingual_cleaners(txt, lang)
|
||||||
|
@ -510,10 +574,20 @@ class VoiceBpeTokenizer:
|
||||||
else:
|
else:
|
||||||
raise NotImplementedError()
|
raise NotImplementedError()
|
||||||
return txt
|
return txt
|
||||||
|
=======
|
||||||
|
DEFAULT_VOCAB_FILE = os.path.join(
|
||||||
|
os.path.dirname(os.path.realpath(__file__)), "../data/tokenizer.json"
|
||||||
|
)
|
||||||
|
|
||||||
|
class VoiceBpeTokenizer:
|
||||||
|
def __init__(self, vocab_file=None):
|
||||||
|
self.tokenizer = None
|
||||||
|
if vocab_file is not None:
|
||||||
|
self.tokenizer = Tokenizer.from_file(vocab_file)
|
||||||
|
>>>>>>> Update model entry
|
||||||
|
|
||||||
def encode(self, txt, lang):
|
def encode(self, txt, lang):
|
||||||
if self.preprocess:
|
txt = preprocess_text(txt, lang)
|
||||||
txt = self.preprocess_text(txt, lang)
|
|
||||||
txt = f"[{lang}]{txt}"
|
txt = f"[{lang}]{txt}"
|
||||||
txt = txt.replace(" ", "[SPACE]")
|
txt = txt.replace(" ", "[SPACE]")
|
||||||
return self.tokenizer.encode(txt).ids
|
return self.tokenizer.encode(txt).ids
|
||||||
|
@ -532,3 +606,171 @@ class VoiceBpeTokenizer:
|
||||||
|
|
||||||
def get_number_tokens(self):
|
def get_number_tokens(self):
|
||||||
return max(self.tokenizer.get_vocab().values()) + 1
|
return max(self.tokenizer.get_vocab().values()) + 1
|
||||||
|
|
||||||
|
|
||||||
|
def test_expand_numbers_multilingual():
|
||||||
|
test_cases = [
|
||||||
|
# English
|
||||||
|
("In 12.5 seconds.", 'In twelve point five seconds.', 'en'),
|
||||||
|
("There were 50 soldiers.", 'There were fifty soldiers.', 'en'),
|
||||||
|
("This is a 1st test", 'This is a first test', 'en'),
|
||||||
|
("That will be $20 sir.", 'That will be twenty dollars sir.', 'en'),
|
||||||
|
("That will be 20€ sir.", 'That will be twenty euro sir.', 'en'),
|
||||||
|
("That will be 20.15€ sir.", 'That will be twenty euro, fifteen cents sir.', 'en'),
|
||||||
|
("That's 100,000.5.", 'That\'s one hundred thousand point five.', 'en'),
|
||||||
|
# French
|
||||||
|
("En 12,5 secondes.", 'En douze virgule cinq secondes.', 'fr'),
|
||||||
|
("Il y avait 50 soldats.", 'Il y avait cinquante soldats.', 'fr'),
|
||||||
|
("Ceci est un 1er test", 'Ceci est un premier test', 'fr'),
|
||||||
|
("Cela vous fera $20 monsieur.", 'Cela vous fera vingt dollars monsieur.', 'fr'),
|
||||||
|
("Cela vous fera 20€ monsieur.", 'Cela vous fera vingt euros monsieur.', 'fr'),
|
||||||
|
("Cela vous fera 20,15€ monsieur.", 'Cela vous fera vingt euros et quinze centimes monsieur.', 'fr'),
|
||||||
|
("Ce sera 100.000,5.", 'Ce sera cent mille virgule cinq.', 'fr'),
|
||||||
|
# German
|
||||||
|
("In 12,5 Sekunden.", 'In zwölf Komma fünf Sekunden.', 'de'),
|
||||||
|
("Es gab 50 Soldaten.", 'Es gab fünfzig Soldaten.', 'de'),
|
||||||
|
("Dies ist ein 1. Test", 'Dies ist ein erste Test', 'de'), # Issue with gender
|
||||||
|
("Das macht $20 Herr.", 'Das macht zwanzig Dollar Herr.', 'de'),
|
||||||
|
("Das macht 20€ Herr.", 'Das macht zwanzig Euro Herr.', 'de'),
|
||||||
|
("Das macht 20,15€ Herr.", 'Das macht zwanzig Euro und fünfzehn Cent Herr.', 'de'),
|
||||||
|
# Spanish
|
||||||
|
("En 12,5 segundos.", 'En doce punto cinco segundos.', 'es'),
|
||||||
|
("Había 50 soldados.", 'Había cincuenta soldados.', 'es'),
|
||||||
|
("Este es un 1er test", 'Este es un primero test', 'es'),
|
||||||
|
("Eso le costará $20 señor.", 'Eso le costará veinte dólares señor.', 'es'),
|
||||||
|
("Eso le costará 20€ señor.", 'Eso le costará veinte euros señor.', 'es'),
|
||||||
|
("Eso le costará 20,15€ señor.", 'Eso le costará veinte euros con quince céntimos señor.', 'es'),
|
||||||
|
# Italian
|
||||||
|
("In 12,5 secondi.", 'In dodici virgola cinque secondi.', 'it'),
|
||||||
|
("C'erano 50 soldati.", "C'erano cinquanta soldati.", 'it'),
|
||||||
|
("Questo è un 1° test", 'Questo è un primo test', 'it'),
|
||||||
|
("Ti costerà $20 signore.", 'Ti costerà venti dollari signore.', 'it'),
|
||||||
|
("Ti costerà 20€ signore.", 'Ti costerà venti euro signore.', 'it'),
|
||||||
|
("Ti costerà 20,15€ signore.", 'Ti costerà venti euro e quindici centesimi signore.', 'it'),
|
||||||
|
# Portuguese
|
||||||
|
("Em 12,5 segundos.", 'Em doze vírgula cinco segundos.', 'pt'),
|
||||||
|
("Havia 50 soldados.", 'Havia cinquenta soldados.', 'pt'),
|
||||||
|
("Este é um 1º teste", 'Este é um primeiro teste', 'pt'),
|
||||||
|
("Isso custará $20 senhor.", 'Isso custará vinte dólares senhor.', 'pt'),
|
||||||
|
("Isso custará 20€ senhor.", 'Isso custará vinte euros senhor.', 'pt'),
|
||||||
|
("Isso custará 20,15€ senhor.", 'Isso custará vinte euros e quinze cêntimos senhor.', 'pt'), # "cêntimos" should be "centavos" num2words issue
|
||||||
|
# Polish
|
||||||
|
("W 12,5 sekundy.", 'W dwanaście przecinek pięć sekundy.', 'pl'),
|
||||||
|
("Było 50 żołnierzy.", 'Było pięćdziesiąt żołnierzy.', 'pl'),
|
||||||
|
("To będzie kosztować 20€ panie.", 'To będzie kosztować dwadzieścia euro panie.', 'pl'),
|
||||||
|
("To będzie kosztować 20,15€ panie.", 'To będzie kosztować dwadzieścia euro, piętnaście centów panie.', 'pl'),
|
||||||
|
# Arabic
|
||||||
|
("في الـ 12,5 ثانية.", 'في الـ اثنا عشر , خمسون ثانية.', 'ar'),
|
||||||
|
("كان هناك 50 جنديًا.", 'كان هناك خمسون جنديًا.', 'ar'),
|
||||||
|
# ("ستكون النتيجة $20 يا سيد.", 'ستكون النتيجة عشرون دولار يا سيد.', 'ar'), # $ and € are missing from num2words
|
||||||
|
# ("ستكون النتيجة 20€ يا سيد.", 'ستكون النتيجة عشرون يورو يا سيد.', 'ar'),
|
||||||
|
# Czech
|
||||||
|
("Za 12,5 vteřiny.", 'Za dvanáct celá pět vteřiny.', 'cs'),
|
||||||
|
("Bylo tam 50 vojáků.", 'Bylo tam padesát vojáků.', 'cs'),
|
||||||
|
("To bude stát 20€ pane.", 'To bude stát dvacet euro pane.', 'cs'),
|
||||||
|
("To bude 20.15€ pane.", 'To bude dvacet euro, patnáct centů pane.', 'cs'),
|
||||||
|
# Russian
|
||||||
|
("Через 12.5 секунды.", 'Через двенадцать запятая пять секунды.', 'ru'),
|
||||||
|
("Там было 50 солдат.", 'Там было пятьдесят солдат.', 'ru'),
|
||||||
|
("Это будет 20.15€ сэр.", 'Это будет двадцать евро, пятнадцать центов сэр.', 'ru'),
|
||||||
|
("Это будет стоить 20€ господин.", 'Это будет стоить двадцать евро господин.', 'ru'),
|
||||||
|
# Dutch
|
||||||
|
("In 12,5 seconden.", 'In twaalf komma vijf seconden.', 'nl'),
|
||||||
|
("Er waren 50 soldaten.", 'Er waren vijftig soldaten.', 'nl'),
|
||||||
|
("Dat wordt dan $20 meneer.", 'Dat wordt dan twintig dollar meneer.', 'nl'),
|
||||||
|
("Dat wordt dan 20€ meneer.", 'Dat wordt dan twintig euro meneer.', 'nl'),
|
||||||
|
# Chinese (Simplified)
|
||||||
|
("在12.5秒内", '在十二点五秒内', 'zh'),
|
||||||
|
("有50名士兵", '有五十名士兵', 'zh'),
|
||||||
|
# ("那将是$20先生", '那将是二十美元先生', 'zh'), currency doesn't work
|
||||||
|
# ("那将是20€先生", '那将是二十欧元先生', 'zh'),
|
||||||
|
# Turkish
|
||||||
|
# ("12,5 saniye içinde.", 'On iki virgül beş saniye içinde.', 'tr'), # decimal doesn't work for TR
|
||||||
|
("50 asker vardı.", 'elli asker vardı.', 'tr'),
|
||||||
|
("Bu 1. test", 'Bu birinci test', 'tr'),
|
||||||
|
# ("Bu 100.000,5.", 'Bu yüz bin virgül beş.', 'tr'),
|
||||||
|
# Hungarian
|
||||||
|
("12,5 másodperc alatt.", 'tizenkettő egész öt tized másodperc alatt.', 'hu'),
|
||||||
|
("50 katona volt.", 'ötven katona volt.', 'hu'),
|
||||||
|
("Ez az 1. teszt", 'Ez az első teszt', 'hu'),
|
||||||
|
# Korean
|
||||||
|
("12.5 초 안에.", '십이 점 다섯 초 안에.', 'ko'),
|
||||||
|
("50 명의 병사가 있었다.", '오십 명의 병사가 있었다.', 'ko'),
|
||||||
|
("이것은 1 번째 테스트입니다", '이것은 첫 번째 테스트입니다', 'ko'),
|
||||||
|
]
|
||||||
|
for a, b, lang in test_cases:
|
||||||
|
out = expand_numbers_multilingual(a, lang=lang)
|
||||||
|
assert out == b, f"'{out}' vs '{b}'"
|
||||||
|
|
||||||
|
def test_abbreviations_multilingual():
|
||||||
|
test_cases = [
|
||||||
|
# English
|
||||||
|
("Hello Mr. Smith.", 'Hello mister Smith.', 'en'),
|
||||||
|
("Dr. Jones is here.", 'doctor Jones is here.', 'en'),
|
||||||
|
# Spanish
|
||||||
|
("Hola Sr. Garcia.", 'Hola señor Garcia.', 'es'),
|
||||||
|
("La Dra. Martinez es muy buena.", 'La doctora Martinez es muy buena.', 'es'),
|
||||||
|
# French
|
||||||
|
("Bonjour Mr. Dupond.", 'Bonjour monsieur Dupond.', 'fr'),
|
||||||
|
("Mme. Moreau est absente aujourd'hui.", 'madame Moreau est absente aujourd\'hui.', 'fr'),
|
||||||
|
# German
|
||||||
|
("Frau Dr. Müller ist sehr klug.", 'Frau doktor Müller ist sehr klug.', 'de'),
|
||||||
|
# Portuguese
|
||||||
|
("Olá Sr. Silva.", 'Olá senhor Silva.', 'pt'),
|
||||||
|
("Dra. Costa, você está disponível?", 'doutora Costa, você está disponível?', 'pt'),
|
||||||
|
# Italian
|
||||||
|
("Buongiorno, Sig. Rossi.", 'Buongiorno, signore Rossi.', 'it'),
|
||||||
|
#("Sig.ra Bianchi, posso aiutarti?", 'signora Bianchi, posso aiutarti?', 'it'), # Issue with matching that pattern
|
||||||
|
# Polish
|
||||||
|
("Dzień dobry, P. Kowalski.", 'Dzień dobry, pani Kowalski.', 'pl'),
|
||||||
|
("M. Nowak, czy mogę zadać pytanie?", 'pan Nowak, czy mogę zadać pytanie?', 'pl'),
|
||||||
|
# Czech
|
||||||
|
("P. Novák", "pan Novák", 'cs'),
|
||||||
|
("Dr. Vojtěch", "doktor Vojtěch", 'cs'),
|
||||||
|
# Dutch
|
||||||
|
("Dhr. Jansen", "de heer Jansen", 'nl'),
|
||||||
|
("Mevr. de Vries", "mevrouw de Vries", 'nl'),
|
||||||
|
# Russian
|
||||||
|
("Здравствуйте Г-н Иванов.", "Здравствуйте господин Иванов.", 'ru'),
|
||||||
|
("Д-р Смирнов здесь, чтобы увидеть вас.", "доктор Смирнов здесь, чтобы увидеть вас.", 'ru'),
|
||||||
|
# Turkish
|
||||||
|
("Merhaba B. Yılmaz.", "Merhaba bay Yılmaz.", 'tr'),
|
||||||
|
("Dr. Ayşe burada.", "doktor Ayşe burada.", 'tr'),
|
||||||
|
# Hungarian
|
||||||
|
("Dr. Szabó itt van.", "doktor Szabó itt van.", 'hu'),
|
||||||
|
]
|
||||||
|
|
||||||
|
for a, b, lang in test_cases:
|
||||||
|
out = expand_abbreviations_multilingual(a, lang=lang)
|
||||||
|
assert out == b, f"'{out}' vs '{b}'"
|
||||||
|
|
||||||
|
def test_symbols_multilingual():
|
||||||
|
test_cases = [
|
||||||
|
("I have 14% battery", "I have 14 percent battery", "en"),
|
||||||
|
("Te veo @ la fiesta", "Te veo arroba la fiesta", "es"),
|
||||||
|
("J'ai 14° de fièvre", "J'ai 14 degrés de fièvre", "fr"),
|
||||||
|
("Die Rechnung beträgt £ 20", "Die Rechnung beträgt pfund 20", "de"),
|
||||||
|
("O meu email é ana&joao@gmail.com", "O meu email é ana e joao arroba gmail.com", "pt"),
|
||||||
|
("linguaggio di programmazione C#", "linguaggio di programmazione C cancelletto", "it"),
|
||||||
|
("Moja temperatura to 36.6°", "Moja temperatura to 36.6 stopnie", "pl"),
|
||||||
|
("Mám 14% baterie", "Mám 14 procento baterie", "cs"),
|
||||||
|
("Těším se na tebe @ party", "Těším se na tebe na party", "cs"),
|
||||||
|
("У меня 14% заряда", "У меня 14 процентов заряда", "ru"),
|
||||||
|
("Я буду @ дома", "Я буду собака дома", "ru"),
|
||||||
|
("Ik heb 14% batterij", "Ik heb 14 procent batterij", "nl"),
|
||||||
|
("Ik zie je @ het feest", "Ik zie je bij het feest", "nl"),
|
||||||
|
("لدي 14% في البطارية", "لدي 14 في المئة في البطارية", "ar"),
|
||||||
|
("我的电量为 14%", "我的电量为 14 百分之", "zh"),
|
||||||
|
("Pilim %14 dolu.", "Pilim yüzde 14 dolu.", "tr"),
|
||||||
|
("Az akkumulátorom töltöttsége 14%", "Az akkumulátorom töltöttsége 14 százalék", "hu"),
|
||||||
|
("배터리 잔량이 14%입니다.", "배터리 잔량이 14 퍼센트입니다.", "ko")
|
||||||
|
]
|
||||||
|
|
||||||
|
for a, b, lang in test_cases:
|
||||||
|
out = expand_symbols_multilingual(a, lang=lang)
|
||||||
|
assert out == b, f"'{out}' vs '{b}'"
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
test_expand_numbers_multilingual()
|
||||||
|
test_abbreviations_multilingual()
|
||||||
|
test_symbols_multilingual()
|
Loading…
Reference in New Issue