Add inference with precomputed latents

2023-09-28 17:51:49 +02:00 · 2023-09-28 17:51:49 +02:00 · a45cf83b34
parent a0f657c764
commit a45cf83b34
1 changed files with 53 additions and 27 deletions
--- a/TTS/tts/models/xtts.py
+++ b/TTS/tts/models/xtts.py
@ -478,10 +478,10 @@ class Xtts(BaseTTS):
            "decoder_sampler": config.decoder_sampler,
        }
        settings.update(kwargs)  # allow overriding of preset settings with kwargs
-        return self.inference(text, ref_audio_path, language, **settings)
+        return self.full_inference(text, ref_audio_path, language, **settings)

    @torch.no_grad()
-    def inference(
+    def full_inference(
        self,
        text,
        ref_audio_path,
@ -557,6 +557,56 @@ class Xtts(BaseTTS):
            Generated audio clip(s) as a torch tensor. Shape 1,S if k=1 else, (k,1,S) where S is the sample length.
            Sample rate is 24kHz.
        """
+        (
+            gpt_cond_latent,
+            diffusion_conditioning,
+            speaker_embedding
+        ) = self.get_conditioning_latents(audio_path=ref_audio_path, gpt_cond_len=gpt_cond_len)
+        return self.inference(
+            text,
+            language,
+            gpt_cond_latent,
+            speaker_embedding,
+            diffusion_conditioning,
+            temperature=temperature,
+            length_penalty=length_penalty,
+            repetition_penalty=repetition_penalty,
+            top_k=top_k,
+            top_p=top_p,
+            do_sample=do_sample,
+            decoder_iterations=decoder_iterations,
+            cond_free=cond_free,
+            cond_free_k=cond_free_k,
+            diffusion_temperature=diffusion_temperature,
+            decoder_sampler=decoder_sampler,
+            use_hifigan=use_hifigan,
+            **hf_generate_kwargs,
+        )
+    
+    @torch.no_grad()
+    def inference(
+        self,
+        text,
+        language,
+        gpt_cond_latent,
+        speaker_embedding,
+        diffusion_conditioning,
+        # GPT inference
+        temperature=0.65,
+        length_penalty=1,
+        repetition_penalty=2.0,
+        top_k=50,
+        top_p=0.85,
+        do_sample=True,
+        # Decoder inference
+        decoder_iterations=100,
+        cond_free=True,
+        cond_free_k=2,
+        diffusion_temperature=1.0,
+        decoder_sampler="ddim",
+        use_hifigan=True,
+        **hf_generate_kwargs,
+    ):
        text = f"[{language}]{text.strip().lower()}"
        text_tokens = torch.IntTensor(self.tokenizer.encode(text, lang=language)).unsqueeze(0).to(self.device)

@ -564,12 +614,6 @@ class Xtts(BaseTTS):
            text_tokens.shape[-1] < self.args.gpt_max_text_tokens
        ), " ❗ XTTS can only generate text with a maximum of 400 tokens."

-        (
-            gpt_cond_latent,
-            diffusion_conditioning,
-            speaker_embedding
-        ) = self.get_conditioning_latents(audio_path=ref_audio_path, gpt_cond_len=gpt_cond_len)
-
        if not use_hifigan:
            diffuser = load_discrete_vocoder_diffuser(
                desired_diffusion_steps=decoder_iterations,
@ -636,18 +680,6 @@ class Xtts(BaseTTS):

        return {"wav": wav.cpu().numpy().squeeze()}

-    def inference_speaker_cond(self, ref_audio_path, gpt_cond_len=3):
-        (
-            gpt_cond_latent,
-            diffusion_conditioning,
-            speaker_embedding
-        ) = self.get_conditioning_latents(audio_path=ref_audio_path, gpt_cond_len=3)
-        return {
-            "gpt_cond_latent": gpt_cond_latent,
-            "speaker_embedding": speaker_embedding,
-            "diffusion_conditioning": diffusion_conditioning,
-        }
-
    def handle_chunks(self, wav_gen, wav_gen_prev, wav_overlap, overlap_len):
        """Handle chunk formatting in streaming mode"""
        wav_chunk = wav_gen[:-overlap_len]
@ -668,7 +700,6 @@ class Xtts(BaseTTS):
        language,
        gpt_cond_latent,
        speaker_embedding,
-        diffusion_conditioning,
        # Streaming
        stream_chunk_size=20,
        overlap_wav_len=1024,
@ -678,14 +709,8 @@ class Xtts(BaseTTS):
        repetition_penalty=2.0,
        top_k=50,
        top_p=0.85,
-        gpt_cond_len=4,
        do_sample=True,
        # Decoder inference
-        decoder_iterations=100,
-        cond_free=True,
-        cond_free_k=2,
-        diffusion_temperature=1.0,
-        decoder_sampler="ddim",
        **hf_generate_kwargs,
    ):
        text = f"[{language}]{text.strip().lower()}"
@ -707,6 +732,7 @@ class Xtts(BaseTTS):
            repetition_penalty=float(repetition_penalty),
            output_attentions=False,
            output_hidden_states=True,
+            **hf_generate_kwargs,
        )

        last_tokens = []