mirror of https://github.com/coqui-ai/TTS.git
Update model entry
parent 9b5c295fcf
commit b47afc5275
@@ -3,17 +3,17 @@
        "multilingual": {
            "multi-dataset": {
                "xtts_v2": {
                    "description": "",
                    "description": "XTTS-v2 by Coqui with 16 languages.",
                    "hf_url": [
                        "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/v1.0.0/model.pth",
                        "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/v1.0.0/config.json",
                        "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/v1.0.0/vocab.json",
                        "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1.1/hash.md5"
                        "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/model.pth",
                        "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/config.json",
                        "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/vocab.json",
                        "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/hash.md5"
                    ],
                    "default_vocoder": null,
                    "commit": "e9a1953e",
                    "commit": "480a6cdf7",
                    "license": "CPML",
                    "contact": "",
                    "contact": "info@coqui.ai",
                    "tos_required": true
                },
                "xtts_v1": {

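A minimal usage sketch for the entry above, assuming the standard TTS Python API (keyword names can differ between releases). The files listed under hf_url are downloaded on first use, and since tos_required is true the CPML terms must be accepted at that point.

from TTS.api import TTS

# The manifest entry above is addressed by this model name; the four hf_url files
# (model.pth, config.json, vocab.json, hash.md5) are fetched on first use.
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2")
tts.tts_to_file(
    text="Hello world!",
    speaker_wav="reference.wav",  # short reference clip used for voice cloning (placeholder path)
    language="en",
    file_path="output.wav",
)
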
@@ -8,6 +8,9 @@ from num2words import num2words
from tokenizers import Tokenizer

from TTS.tts.layers.xtts.zh_num2words import TextNorm as zh_num2words
from hangul_romanize import Transliter
from hangul_romanize.rule import academic


_whitespace_re = re.compile(r"\s+")

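The two new imports support the Korean cleaner added further down in this diff. A stand-alone sketch of what hangul_romanize does with the academic rule:

from hangul_romanize import Transliter
from hangul_romanize.rule import academic

transliter = Transliter(academic)
print(transliter.translit("안녕하세요"))  # prints a Latin-script romanization of the Korean input
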
@@ -112,7 +115,7 @@ _abbreviations = {
            # There are not many common abbreviations in Arabic as in English.
        ]
    ],
    "zh-cn": [
    "zh": [
        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
        for x in [
            # Chinese doesn't typically use abbreviations in the same way as Latin-based scripts.

@@ -155,6 +158,22 @@ _abbreviations = {
            # Add other Turkish abbreviations here if needed.
        ]
    ],
    "hu": [
        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
        for x in [
            ("dr", "doktor"),  # doctor
            ("b", "bácsi"),  # Mr.
            ("nőv", "nővér"),  # nurse
            # Add other Hungarian abbreviations here if needed.
        ]
    ],
    "ko": [
        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
        for x in [
            # Korean doesn't typically use abbreviations in the same way as Latin-based scripts.
        ]
    ]
}

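The new hu and ko tables keep the existing shape: one list of (compiled pattern, replacement) pairs per language, applied in order by expand_abbreviations_multilingual. A self-contained sketch with a hypothetical, trimmed-down table:

import re

# Hypothetical, trimmed-down table for illustration only.
hu_abbreviations = [
    (re.compile(r"\bdr\.", re.IGNORECASE), "doktor"),
    (re.compile(r"\bnőv\.", re.IGNORECASE), "nővér"),
]

def expand_abbreviations(text, table):
    # Apply each (pattern, replacement) pair in order.
    for regex, replacement in table:
        text = regex.sub(replacement, text)
    return text

print(expand_abbreviations("Dr. Szabó itt van.", hu_abbreviations))  # doktor Szabó itt van.
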
@@ -262,7 +281,7 @@ _symbols_multilingual = {
            ("°", " درجة "),
        ]
    ],
    "zh-cn": [
    "zh": [
        # Chinese
        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
        for x in [

@@ -326,6 +345,31 @@ _symbols_multilingual = {
            ("°", " derece "),
        ]
    ],
    "hu": [
        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
        for x in [
            ("&", " és "),
            ("@", " kukac "),
            ("%", " százalék "),
            ("#", " kettőskereszt "),
            ("$", " dollár "),
            ("£", " font "),
            ("°", " fok ")
        ]
    ],
    "ko": [
        # Korean
        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
        for x in [
            ("&", " 그리고 "),
            ("@", " 에 "),
            ("%", " 퍼센트 "),
            ("#", " 번호 "),
            ("$", " 달러 "),
            ("£", " 파운드 "),
            ("°", " 도 ")
        ]
    ]
}

@@ -349,6 +393,8 @@ _ordinal_re = {
    "ru": re.compile(r"([0-9]+)(-й|-я|-е|-ое|-ье|-го)"),
    "nl": re.compile(r"([0-9]+)(de|ste|e)"),
    "tr": re.compile(r"([0-9]+)(\.|inci|nci|uncu|üncü|\.)"),
    "hu": re.compile(r"([0-9]+)(\.|adik|edik|odik|edik|ödik|ödike|ik)"),
    "ko": re.compile(r"([0-9]+)(번째|번|차|째)"),
}
_number_re = re.compile(r"[0-9]+")
_currency_re = {

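Each ordinal pattern captures the number in group 1 and the language-specific suffix in group 2; expansion then goes through num2words with to="ordinal". A sketch with a hypothetical English pattern (the hu and ko patterns above are matched the same way):

import re
from num2words import num2words

ordinal_en = re.compile(r"([0-9]+)(st|nd|rd|th)")  # illustrative stand-in for _ordinal_re["en"]

def expand_ordinals(text, lang="en"):
    # Replace "1st" -> "first", "2nd" -> "second", ...
    return ordinal_en.sub(lambda m: num2words(int(m.group(1)), lang=lang, to="ordinal"), text)

print(expand_ordinals("This is a 1st test"))  # This is a first test
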
@@ -398,6 +444,8 @@ def _expand_currency(m, lang="en", currency="USD"):
        "nl": ", ",
        "ar": ", ",
        "tr": ", ",
        "hu": ", ",
        "ko": ", ",
    }

    if amount.is_integer():

@@ -415,9 +463,14 @@ def _expand_ordinal(m, lang="en"):
def _expand_number(m, lang="en"):
    return num2words(int(m.group(0)), lang=lang if lang != "cs" else "cz")

<<<<<<< HEAD

def expand_numbers_multilingual(text, lang="en"):
    if lang == "zh-cn":
=======
def expand_numbers_multilingual(text, lang='en'):
    if lang == "zh" or lang == "zh-cn":
>>>>>>> Update model entry
        text = zh_num2words()(text)
    else:
        if lang in ["en", "ru"]:

@@ -472,30 +525,41 @@ def chinese_transliterate(text):
    )


def japanese_cleaners(text, katsu):
    text = katsu.romaji(text)
    text = lowercase(text)
    return text

<<<<<<< HEAD

class VoiceBpeTokenizer:
    def __init__(self, vocab_file=None, preprocess=None):
        self.tokenizer = None
        self.katsu = None
=======
>>>>>>> Update model entry

        if vocab_file is not None:
            with open(vocab_file, "r", encoding="utf-8") as f:
                vocab = json.load(f)
def korean_cleaners(text):
    r = Transliter(academic)
    return r.translit(text)

            self.language = vocab["model"]["language"] if "language" in vocab["model"] else None

            if preprocess is None:
                self.preprocess = "pre_tokenizer" in vocab and vocab["pre_tokenizer"]
            else:
                self.preprocess = preprocess
def preprocess_text(txt, lang):
    if lang in ["en", "es", "fr", "de", "pt", "it", "pl", "zh", "ar", "cs", "ru", "nl", "tr", "hu"]:
        txt = multilingual_cleaners(txt, lang)
    elif lang == "ja":
        txt = japanese_cleaners(txt)
    elif lang == "zh-cn" or lang == "zh":
        txt = chinese_transliterate(txt)
    elif lang == "ko":
        txt = korean_cleaners(txt)
    else:
        raise NotImplementedError()
    return txt

            self.tokenizer = Tokenizer.from_file(vocab_file)

<<<<<<< HEAD
    def preprocess_text(self, txt, lang):
        if lang in ["en", "es", "fr", "de", "pt", "it", "pl", "ar", "cs", "ru", "nl", "tr", "zh-cn"]:
            txt = multilingual_cleaners(txt, lang)

@@ -510,10 +574,20 @@ class VoiceBpeTokenizer:
        else:
            raise NotImplementedError()
        return txt
=======
DEFAULT_VOCAB_FILE = os.path.join(
    os.path.dirname(os.path.realpath(__file__)), "../data/tokenizer.json"
)


class VoiceBpeTokenizer:
    def __init__(self, vocab_file=None):
        self.tokenizer = None
        if vocab_file is not None:
            self.tokenizer = Tokenizer.from_file(vocab_file)
>>>>>>> Update model entry

    def encode(self, txt, lang):
        if self.preprocess:
            txt = self.preprocess_text(txt, lang)
        txt = preprocess_text(txt, lang)
        txt = f"[{lang}]{txt}"
        txt = txt.replace(" ", "[SPACE]")
        return self.tokenizer.encode(txt).ids

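Assuming the conflict above is resolved so that encode() runs, the intended flow is: clean the text for the language, prefix the [lang] tag, map spaces to [SPACE], and BPE-encode with the tokenizers vocab. A hedged usage sketch with placeholder file names:

from TTS.tts.layers.xtts.tokenizer import VoiceBpeTokenizer

tok = VoiceBpeTokenizer(vocab_file="vocab.json")  # e.g. the vocab.json listed in the model entry above
ids = tok.encode("Hello Mr. Smith.", lang="en")   # cleaned per language, tagged "[en]...", spaces -> [SPACE], then BPE ids
print(ids[:10])
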
@@ -532,3 +606,171 @@ class VoiceBpeTokenizer:

    def get_number_tokens(self):
        return max(self.tokenizer.get_vocab().values()) + 1


def test_expand_numbers_multilingual():
    test_cases = [
        # English
        ("In 12.5 seconds.", 'In twelve point five seconds.', 'en'),
        ("There were 50 soldiers.", 'There were fifty soldiers.', 'en'),
        ("This is a 1st test", 'This is a first test', 'en'),
        ("That will be $20 sir.", 'That will be twenty dollars sir.', 'en'),
        ("That will be 20€ sir.", 'That will be twenty euro sir.', 'en'),
        ("That will be 20.15€ sir.", 'That will be twenty euro, fifteen cents sir.', 'en'),
        ("That's 100,000.5.", 'That\'s one hundred thousand point five.', 'en'),
        # French
        ("En 12,5 secondes.", 'En douze virgule cinq secondes.', 'fr'),
        ("Il y avait 50 soldats.", 'Il y avait cinquante soldats.', 'fr'),
        ("Ceci est un 1er test", 'Ceci est un premier test', 'fr'),
        ("Cela vous fera $20 monsieur.", 'Cela vous fera vingt dollars monsieur.', 'fr'),
        ("Cela vous fera 20€ monsieur.", 'Cela vous fera vingt euros monsieur.', 'fr'),
        ("Cela vous fera 20,15€ monsieur.", 'Cela vous fera vingt euros et quinze centimes monsieur.', 'fr'),
        ("Ce sera 100.000,5.", 'Ce sera cent mille virgule cinq.', 'fr'),
        # German
        ("In 12,5 Sekunden.", 'In zwölf Komma fünf Sekunden.', 'de'),
        ("Es gab 50 Soldaten.", 'Es gab fünfzig Soldaten.', 'de'),
        ("Dies ist ein 1. Test", 'Dies ist ein erste Test', 'de'),  # Issue with gender
        ("Das macht $20 Herr.", 'Das macht zwanzig Dollar Herr.', 'de'),
        ("Das macht 20€ Herr.", 'Das macht zwanzig Euro Herr.', 'de'),
        ("Das macht 20,15€ Herr.", 'Das macht zwanzig Euro und fünfzehn Cent Herr.', 'de'),
        # Spanish
        ("En 12,5 segundos.", 'En doce punto cinco segundos.', 'es'),
        ("Había 50 soldados.", 'Había cincuenta soldados.', 'es'),
        ("Este es un 1er test", 'Este es un primero test', 'es'),
        ("Eso le costará $20 señor.", 'Eso le costará veinte dólares señor.', 'es'),
        ("Eso le costará 20€ señor.", 'Eso le costará veinte euros señor.', 'es'),
        ("Eso le costará 20,15€ señor.", 'Eso le costará veinte euros con quince céntimos señor.', 'es'),
        # Italian
        ("In 12,5 secondi.", 'In dodici virgola cinque secondi.', 'it'),
        ("C'erano 50 soldati.", "C'erano cinquanta soldati.", 'it'),
        ("Questo è un 1° test", 'Questo è un primo test', 'it'),
        ("Ti costerà $20 signore.", 'Ti costerà venti dollari signore.', 'it'),
        ("Ti costerà 20€ signore.", 'Ti costerà venti euro signore.', 'it'),
        ("Ti costerà 20,15€ signore.", 'Ti costerà venti euro e quindici centesimi signore.', 'it'),
        # Portuguese
        ("Em 12,5 segundos.", 'Em doze vírgula cinco segundos.', 'pt'),
        ("Havia 50 soldados.", 'Havia cinquenta soldados.', 'pt'),
        ("Este é um 1º teste", 'Este é um primeiro teste', 'pt'),
        ("Isso custará $20 senhor.", 'Isso custará vinte dólares senhor.', 'pt'),
        ("Isso custará 20€ senhor.", 'Isso custará vinte euros senhor.', 'pt'),
        ("Isso custará 20,15€ senhor.", 'Isso custará vinte euros e quinze cêntimos senhor.', 'pt'),  # "cêntimos" should be "centavos" num2words issue
        # Polish
        ("W 12,5 sekundy.", 'W dwanaście przecinek pięć sekundy.', 'pl'),
        ("Było 50 żołnierzy.", 'Było pięćdziesiąt żołnierzy.', 'pl'),
        ("To będzie kosztować 20€ panie.", 'To będzie kosztować dwadzieścia euro panie.', 'pl'),
        ("To będzie kosztować 20,15€ panie.", 'To będzie kosztować dwadzieścia euro, piętnaście centów panie.', 'pl'),
        # Arabic
        ("في الـ 12,5 ثانية.", 'في الـ اثنا عشر , خمسون ثانية.', 'ar'),
        ("كان هناك 50 جنديًا.", 'كان هناك خمسون جنديًا.', 'ar'),
        # ("ستكون النتيجة $20 يا سيد.", 'ستكون النتيجة عشرون دولار يا سيد.', 'ar'), # $ and € are missing from num2words
        # ("ستكون النتيجة 20€ يا سيد.", 'ستكون النتيجة عشرون يورو يا سيد.', 'ar'),
        # Czech
        ("Za 12,5 vteřiny.", 'Za dvanáct celá pět vteřiny.', 'cs'),
        ("Bylo tam 50 vojáků.", 'Bylo tam padesát vojáků.', 'cs'),
        ("To bude stát 20€ pane.", 'To bude stát dvacet euro pane.', 'cs'),
        ("To bude 20.15€ pane.", 'To bude dvacet euro, patnáct centů pane.', 'cs'),
        # Russian
        ("Через 12.5 секунды.", 'Через двенадцать запятая пять секунды.', 'ru'),
        ("Там было 50 солдат.", 'Там было пятьдесят солдат.', 'ru'),
        ("Это будет 20.15€ сэр.", 'Это будет двадцать евро, пятнадцать центов сэр.', 'ru'),
        ("Это будет стоить 20€ господин.", 'Это будет стоить двадцать евро господин.', 'ru'),
        # Dutch
        ("In 12,5 seconden.", 'In twaalf komma vijf seconden.', 'nl'),
        ("Er waren 50 soldaten.", 'Er waren vijftig soldaten.', 'nl'),
        ("Dat wordt dan $20 meneer.", 'Dat wordt dan twintig dollar meneer.', 'nl'),
        ("Dat wordt dan 20€ meneer.", 'Dat wordt dan twintig euro meneer.', 'nl'),
        # Chinese (Simplified)
        ("在12.5秒内", '在十二点五秒内', 'zh'),
        ("有50名士兵", '有五十名士兵', 'zh'),
        # ("那将是$20先生", '那将是二十美元先生', 'zh'), currency doesn't work
        # ("那将是20€先生", '那将是二十欧元先生', 'zh'),
        # Turkish
        # ("12,5 saniye içinde.", 'On iki virgül beş saniye içinde.', 'tr'), # decimal doesn't work for TR
        ("50 asker vardı.", 'elli asker vardı.', 'tr'),
        ("Bu 1. test", 'Bu birinci test', 'tr'),
        # ("Bu 100.000,5.", 'Bu yüz bin virgül beş.', 'tr'),
        # Hungarian
        ("12,5 másodperc alatt.", 'tizenkettő egész öt tized másodperc alatt.', 'hu'),
        ("50 katona volt.", 'ötven katona volt.', 'hu'),
        ("Ez az 1. teszt", 'Ez az első teszt', 'hu'),
        # Korean
        ("12.5 초 안에.", '십이 점 다섯 초 안에.', 'ko'),
        ("50 명의 병사가 있었다.", '오십 명의 병사가 있었다.', 'ko'),
        ("이것은 1 번째 테스트입니다", '이것은 첫 번째 테스트입니다', 'ko'),
    ]
    for a, b, lang in test_cases:
        out = expand_numbers_multilingual(a, lang=lang)
        assert out == b, f"'{out}' vs '{b}'"


def test_abbreviations_multilingual():
    test_cases = [
        # English
        ("Hello Mr. Smith.", 'Hello mister Smith.', 'en'),
        ("Dr. Jones is here.", 'doctor Jones is here.', 'en'),
        # Spanish
        ("Hola Sr. Garcia.", 'Hola señor Garcia.', 'es'),
        ("La Dra. Martinez es muy buena.", 'La doctora Martinez es muy buena.', 'es'),
        # French
        ("Bonjour Mr. Dupond.", 'Bonjour monsieur Dupond.', 'fr'),
        ("Mme. Moreau est absente aujourd'hui.", 'madame Moreau est absente aujourd\'hui.', 'fr'),
        # German
        ("Frau Dr. Müller ist sehr klug.", 'Frau doktor Müller ist sehr klug.', 'de'),
        # Portuguese
        ("Olá Sr. Silva.", 'Olá senhor Silva.', 'pt'),
        ("Dra. Costa, você está disponível?", 'doutora Costa, você está disponível?', 'pt'),
        # Italian
        ("Buongiorno, Sig. Rossi.", 'Buongiorno, signore Rossi.', 'it'),
        # ("Sig.ra Bianchi, posso aiutarti?", 'signora Bianchi, posso aiutarti?', 'it'),  # Issue with matching that pattern
        # Polish
        ("Dzień dobry, P. Kowalski.", 'Dzień dobry, pani Kowalski.', 'pl'),
        ("M. Nowak, czy mogę zadać pytanie?", 'pan Nowak, czy mogę zadać pytanie?', 'pl'),
        # Czech
        ("P. Novák", "pan Novák", 'cs'),
        ("Dr. Vojtěch", "doktor Vojtěch", 'cs'),
        # Dutch
        ("Dhr. Jansen", "de heer Jansen", 'nl'),
        ("Mevr. de Vries", "mevrouw de Vries", 'nl'),
        # Russian
        ("Здравствуйте Г-н Иванов.", "Здравствуйте господин Иванов.", 'ru'),
        ("Д-р Смирнов здесь, чтобы увидеть вас.", "доктор Смирнов здесь, чтобы увидеть вас.", 'ru'),
        # Turkish
        ("Merhaba B. Yılmaz.", "Merhaba bay Yılmaz.", 'tr'),
        ("Dr. Ayşe burada.", "doktor Ayşe burada.", 'tr'),
        # Hungarian
        ("Dr. Szabó itt van.", "doktor Szabó itt van.", 'hu'),
    ]

    for a, b, lang in test_cases:
        out = expand_abbreviations_multilingual(a, lang=lang)
        assert out == b, f"'{out}' vs '{b}'"


def test_symbols_multilingual():
    test_cases = [
        ("I have 14% battery", "I have 14 percent battery", "en"),
        ("Te veo @ la fiesta", "Te veo arroba la fiesta", "es"),
        ("J'ai 14° de fièvre", "J'ai 14 degrés de fièvre", "fr"),
        ("Die Rechnung beträgt £ 20", "Die Rechnung beträgt pfund 20", "de"),
        ("O meu email é ana&joao@gmail.com", "O meu email é ana e joao arroba gmail.com", "pt"),
        ("linguaggio di programmazione C#", "linguaggio di programmazione C cancelletto", "it"),
        ("Moja temperatura to 36.6°", "Moja temperatura to 36.6 stopnie", "pl"),
        ("Mám 14% baterie", "Mám 14 procento baterie", "cs"),
        ("Těším se na tebe @ party", "Těším se na tebe na party", "cs"),
        ("У меня 14% заряда", "У меня 14 процентов заряда", "ru"),
        ("Я буду @ дома", "Я буду собака дома", "ru"),
        ("Ik heb 14% batterij", "Ik heb 14 procent batterij", "nl"),
        ("Ik zie je @ het feest", "Ik zie je bij het feest", "nl"),
        ("لدي 14% في البطارية", "لدي 14 في المئة في البطارية", "ar"),
        ("我的电量为 14%", "我的电量为 14 百分之", "zh"),
        ("Pilim %14 dolu.", "Pilim yüzde 14 dolu.", "tr"),
        ("Az akkumulátorom töltöttsége 14%", "Az akkumulátorom töltöttsége 14 százalék", "hu"),
        ("배터리 잔량이 14%입니다.", "배터리 잔량이 14 퍼센트입니다.", "ko")
    ]

    for a, b, lang in test_cases:
        out = expand_symbols_multilingual(a, lang=lang)
        assert out == b, f"'{out}' vs '{b}'"


if __name__ == "__main__":
    test_expand_numbers_multilingual()
    test_abbreviations_multilingual()
    test_symbols_multilingual()

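The test helpers above run when the module is executed directly via its __main__ block; one way to trigger them from Python (assuming num2words and hangul_romanize are installed):

import runpy

# Executes TTS/tts/layers/xtts/tokenizer.py as a script, which calls the three test_* functions.
runpy.run_module("TTS.tts.layers.xtts.tokenizer", run_name="__main__")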