Expand tokenizer to take word length of > 250

This commit is contained in:
owos 2024-03-04 14:00:56 +00:00
parent cc3ba97b49
commit 6148baa952
1 changed file with 3 additions and 3 deletions

View File

@ -32,7 +32,7 @@ def get_spacy_lang(lang):
return English() return English()
def split_sentence(text, lang, text_split_length=250): def split_sentence(text, lang, text_split_length=400):
"""Preprocess the input text""" """Preprocess the input text"""
text_splits = [] text_splits = []
if text_split_length is not None and len(text) >= text_split_length: if text_split_length is not None and len(text) >= text_split_length:
@ -595,7 +595,7 @@ class VoiceBpeTokenizer:
if vocab_file is not None: if vocab_file is not None:
self.tokenizer = Tokenizer.from_file(vocab_file) self.tokenizer = Tokenizer.from_file(vocab_file)
self.char_limits = { self.char_limits = {
"en": 250, "en": 400,
"de": 253, "de": 253,
"fr": 273, "fr": 273,
"es": 239, "es": 239,
@ -621,7 +621,7 @@ class VoiceBpeTokenizer:
def check_input_length(self, txt, lang): def check_input_length(self, txt, lang):
lang = lang.split("-")[0] # remove the region lang = lang.split("-")[0] # remove the region
limit = self.char_limits.get(lang, 250) limit = self.char_limits.get(lang, 400)
if len(txt) > limit: if len(txt) > limit:
print( print(
f"[!] Warning: The text length exceeds the character limit of {limit} for language '{lang}', this might cause truncated audio." f"[!] Warning: The text length exceeds the character limit of {limit} for language '{lang}', this might cause truncated audio."