feat(cleaners): add multilingual phoneme cleaner

This doesn't convert numbers into English words.
This commit is contained in:
Enno Hermann 2024-06-14 15:06:03 +02:00
parent 063e9e9de9
commit e5c208d254
2 changed files with 22 additions and 4 deletions

View File

@ -3,6 +3,7 @@
# TODO: pick the cleaner for languages dynamically
import re
from typing import Optional
from anyascii import anyascii
@ -44,8 +45,8 @@ def remove_aux_symbols(text):
return text
def replace_symbols(text, lang="en"):
"""Replace symbols based on the lenguage tag.
def replace_symbols(text, lang: Optional[str] = "en"):
"""Replace symbols based on the language tag.
Args:
text:
@ -122,7 +123,11 @@ def english_cleaners(text):
def phoneme_cleaners(text):
"""Pipeline for phonemes mode, including number and abbreviation expansion."""
"""Pipeline for phonemes mode, including number and abbreviation expansion.
NB: This cleaner converts numbers into English words, for other languages
use multilingual_phoneme_cleaners().
"""
text = en_normalize_numbers(text)
text = expand_abbreviations(text)
text = replace_symbols(text)
@ -131,6 +136,14 @@ def phoneme_cleaners(text):
return text
def multilingual_phoneme_cleaners(text):
    """Pipeline for phonemes mode that does not expand numbers or abbreviations.

    Unlike phoneme_cleaners(), this pipeline never calls en_normalize_numbers()
    or expand_abbreviations(), so digits are passed through unchanged instead
    of being spelled out as English words.

    Args:
        text: Input text to clean.

    Returns:
        The text after symbol replacement, auxiliary-symbol removal and
        whitespace collapsing.
    """
    # lang=None: call replace_symbols() without a language tag.
    text = replace_symbols(text, lang=None)
    text = remove_aux_symbols(text)
    text = collapse_whitespace(text)
    return text
def french_cleaners(text):
"""Pipeline for French text. There is no need to expand numbers, phonemizer already does that"""
text = expand_abbreviations(text, lang="fr")

View File

@ -1,6 +1,6 @@
#!/usr/bin/env python3
from TTS.tts.utils.text.cleaners import english_cleaners, phoneme_cleaners
from TTS.tts.utils.text.cleaners import english_cleaners, multilingual_phoneme_cleaners, phoneme_cleaners
def test_time() -> None:
@ -19,3 +19,8 @@ def test_currency() -> None:
def test_expand_numbers() -> None:
    """phoneme_cleaners() spells out integers, including a leading minus sign."""
    expected_by_input = {
        "-1": "minus one",
        "1": "one",
    }
    for raw, spoken in expected_by_input.items():
        assert phoneme_cleaners(raw) == spoken
def test_multilingual_phoneme_cleaners() -> None:
    """Parentheses are dropped, a colon becomes a comma, and digits are kept."""
    cases = [
        ("(Hello)", "Hello"),
        ("1:", "1,"),
    ]
    for raw, cleaned in cases:
        assert multilingual_phoneme_cleaners(raw) == cleaned