mirror of https://github.com/coqui-ai/TTS.git
Move lang token add to tokenizer
This commit is contained in:
parent 6fa46d197d
commit c1133724a1
@@ -483,13 +483,10 @@ class VoiceBpeTokenizer:
         if lang == "zh-cn":
             txt = chinese_transliterate(txt)
         elif lang == "ja":
-            assert txt[:4] == "[ja]", "Japanese speech should start with the [ja] token."
-            txt = txt[4:]
             if self.katsu is None:
                 import cutlet
                 self.katsu = cutlet.Cutlet()
             txt = japanese_cleaners(txt, self.katsu)
-            txt = "[ja]" + txt
         else:
             raise NotImplementedError()
         return txt
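The Japanese branch keeps its cutlet dependency; this hunk only drops the requirement that callers pass in (and get back) a leading [ja] token. A minimal sketch of the romaji step that the japanese_cleaners helper is assumed to build on, with cutlet installed:

import cutlet

# Mirrors the lazy `self.katsu` initialization in the hunk above.
katsu = cutlet.Cutlet()

# Cutlet.romaji() transliterates Japanese text to romaji; japanese_cleaners()
# is assumed to wrap a call like this before further cleaning.
print(katsu.romaji("音声合成"))  # e.g. "Onsei gousei"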
@@ -497,6 +494,7 @@ class VoiceBpeTokenizer:
     def encode(self, txt, lang):
         if self.preprocess:
             txt = self.preprocess_text(txt, lang)
+        txt = f"[{lang}]{txt}"
         txt = txt.replace(" ", "[SPACE]")
         return self.tokenizer.encode(txt).ids
 
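With this hunk, encode owns the language token: call sites pass raw text plus a language code, and the tokenizer prepends [{lang}] and maps spaces to [SPACE] before BPE. A minimal usage sketch, assuming the tokenizer is built from a vocab file (the path is illustrative):

from TTS.tts.layers.xtts.tokenizer import VoiceBpeTokenizer

tokenizer = VoiceBpeTokenizer(vocab_file="vocab.json")  # illustrative path

# Callers no longer prepend "[en]" themselves; encode() adds it.
ids = tokenizer.encode("hello world", lang="en")
# After optional per-language preprocessing, the string handed to the BPE
# model becomes "[en]hello[SPACE]world".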
@@ -610,7 +610,7 @@ class Xtts(BaseTTS):
         decoder="hifigan",
         **hf_generate_kwargs,
     ):
-        text = f"[{language}]{text.strip().lower()}"
+        text = text.strip().lower()
         text_tokens = torch.IntTensor(self.tokenizer.encode(text, lang=language)).unsqueeze(0).to(self.device)
 
         assert (
@@ -722,7 +722,7 @@ class Xtts(BaseTTS):
         assert hasattr(
             self, "hifigan_decoder"
         ), "`inference_stream` requires use_hifigan to be set to true in the config.model_args, diffusion is too slow to stream."
-        text = f"[{language}]{text.strip().lower()}"
+        text = text.strip().lower()
         text_tokens = torch.IntTensor(self.tokenizer.encode(text, lang=language)).unsqueeze(0).to(self.device)
 
         fake_inputs = self.gpt.compute_embeddings(
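Both inference paths now share the same, simpler text preparation: normalize the string and let the tokenizer add the language token exactly once. A hypothetical helper summarizing the shared lines above (the name and signature are illustrative, not part of the repo):

import torch

def prepare_text_tokens(tokenizer, text, language, device):
    # Normalize, encode (the tokenizer prepends "[{language}]"), then batch.
    text = text.strip().lower()
    ids = tokenizer.encode(text, lang=language)
    return torch.IntTensor(ids).unsqueeze(0).to(device)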