Move lang token add to tokenizer

WeberJulian 2023-10-24 11:00:56 +02:00
parent 6fa46d197d
commit c1133724a1
2 changed files with 3 additions and 5 deletions


@@ -483,13 +483,10 @@ class VoiceBpeTokenizer:
         if lang == "zh-cn":
             txt = chinese_transliterate(txt)
         elif lang == "ja":
-            assert txt[:4] == "[ja]", "Japanese speech should start with the [ja] token."
-            txt = txt[4:]
             if self.katsu is None:
                 import cutlet
                 self.katsu = cutlet.Cutlet()
             txt = japanese_cleaners(txt, self.katsu)
-            txt = "[ja]" + txt
         else:
             raise NotImplementedError()
         return txt
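
Note: with the language token now prepended in `encode` (next hunk), the Japanese branch no longer needs to strip `[ja]` before transliteration and re-add it afterwards. A minimal string-level sketch of the old caller contract versus the new one (illustrative only, not code from this commit):

```python
# Old contract: callers sent "[ja]"-prefixed text, which preprocess_text
# stripped before transliteration and re-added afterwards.
old_input = "[ja]こんにちは"
# New contract: callers send plain text; encode() prepends "[ja]" later.
new_input = "こんにちは"
assert old_input[:4] == "[ja]" and old_input[4:] == new_input
```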
@@ -497,6 +494,7 @@ class VoiceBpeTokenizer:
     def encode(self, txt, lang):
         if self.preprocess:
             txt = self.preprocess_text(txt, lang)
+        txt = f"[{lang}]{txt}"
         txt = txt.replace(" ", "[SPACE]")
         return self.tokenizer.encode(txt).ids
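
With this change, `encode` prepends the language token for every language in one place. A standalone sketch of the pre-BPE string pipeline it now applies (mirrors the two lines above; the BPE step itself is omitted):

```python
# Standalone sketch of the string transform encode() now applies before BPE.
def pre_bpe(txt: str, lang: str) -> str:
    txt = f"[{lang}]{txt}"           # language token added here for all languages
    return txt.replace(" ", "[SPACE]")

assert pre_bpe("hello world", "en") == "[en]hello[SPACE]world"
```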


@@ -610,7 +610,7 @@ class Xtts(BaseTTS):
         decoder="hifigan",
         **hf_generate_kwargs,
     ):
-        text = f"[{language}]{text.strip().lower()}"
+        text = text.strip().lower()
         text_tokens = torch.IntTensor(self.tokenizer.encode(text, lang=language)).unsqueeze(0).to(self.device)
         assert (
@@ -722,7 +722,7 @@ class Xtts(BaseTTS):
         assert hasattr(
             self, "hifigan_decoder"
         ), "`inference_stream` requires use_hifigan to be set to true in the config.model_args, diffusion is too slow to stream."
-        text = f"[{language}]{text.strip().lower()}"
+        text = text.strip().lower()
         text_tokens = torch.IntTensor(self.tokenizer.encode(text, lang=language)).unsqueeze(0).to(self.device)
         fake_inputs = self.gpt.compute_embeddings(
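
After this commit, both `inference` and `inference_stream` pass bare, lowercased text and rely on the tokenizer for the language token. A hedged caller-side sketch (assumes `model` is a loaded `Xtts` instance; this snippet is not part of the commit):

```python
import torch

# Assumption: `model` is a loaded Xtts instance with the updated tokenizer.
text = "Hello there.".strip().lower()
# encode() now produces ids for "[en]hello[SPACE]there." internally.
text_tokens = torch.IntTensor(model.tokenizer.encode(text, lang="en")).unsqueeze(0)
```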