mirror of https://github.com/coqui-ai/TTS.git
expand tokenizer to accept text len > 250
commit 6148baa952
parent cc3ba97b49
@@ -32,7 +32,7 @@ def get_spacy_lang(lang):
         return English()


-def split_sentence(text, lang, text_split_length=250):
+def split_sentence(text, lang, text_split_length=400):
     """Preprocess the input text"""
     text_splits = []
     if text_split_length is not None and len(text) >= text_split_length:
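For reference, a minimal runnable sketch of what raising text_split_length from 250 to 400 changes: inputs shorter than the threshold now pass through as a single chunk instead of being split. This is illustrative only; the real split_sentence segments with spaCy (see get_spacy_lang above), which a naive period split stands in for here.

def split_sentence_sketch(text, text_split_length=400):
    # Assumption: the repo's implementation uses spaCy sentence
    # segmentation; a naive period split stands in for it to show
    # the threshold behavior.
    if text_split_length is not None and len(text) >= text_split_length:
        return [s.strip() + "." for s in text.split(".") if s.strip()]
    return [text]

print(len(split_sentence_sketch("word. " * 50)))   # 300 chars: 1 chunk now, was split at the old 250 default
print(len(split_sentence_sketch("word. " * 100)))  # 600 chars: still split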
@@ -595,7 +595,7 @@ class VoiceBpeTokenizer:
         if vocab_file is not None:
             self.tokenizer = Tokenizer.from_file(vocab_file)
         self.char_limits = {
-            "en": 250,
+            "en": 400,
             "de": 253,
             "fr": 273,
             "es": 239,
@@ -621,7 +621,7 @@ class VoiceBpeTokenizer:

     def check_input_length(self, txt, lang):
         lang = lang.split("-")[0]  # remove the region
-        limit = self.char_limits.get(lang, 250)
+        limit = self.char_limits.get(lang, 400)
         if len(txt) > limit:
             print(
                 f"[!] Warning: The text length exceeds the character limit of {limit} for language '{lang}', this might cause truncated audio."
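The combined effect of the new English limit and the new fallback default can be checked with a self-contained sketch of check_input_length. The dict below includes only the entries visible in this hunk (it is truncated in the diff), and the standalone function is an illustrative restructuring of the method above, not repo code.

char_limits = {"en": 400, "de": 253, "fr": 273, "es": 239}

def check_input_length(txt, lang):
    lang = lang.split("-")[0]  # remove the region, e.g. "en-US" -> "en"
    limit = char_limits.get(lang, 400)  # unlisted languages now fall back to 400
    if len(txt) > limit:
        print(
            f"[!] Warning: The text length exceeds the character limit of {limit} for language '{lang}', this might cause truncated audio."
        )

check_input_length("x" * 300, "en-US")  # silent now; warned under the old 250 limit
check_input_length("x" * 300, "de")     # still warns: the German limit stays at 253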