mirror of https://github.com/coqui-ai/TTS.git
Use non-enhanced hifigan for test samples
parent 077a849b3b
commit 1fb6c203ab
@@ -169,3 +169,4 @@ wandb
 depot/*
 coqui_recipes/*
 local_scripts/*
+coqui_demos/*
@@ -214,7 +214,7 @@ class GPTTrainer(BaseTTS):
         print(" | > Synthesizing test sentences.")
         for idx, s_info in enumerate(self.config.test_sentences):
             wav = self.xtts.synthesize(
-                s_info["text"], self.config, s_info["speaker_wav"], s_info["language"], gpt_cond_len=3
+                s_info["text"], self.config, s_info["speaker_wav"], s_info["language"], gpt_cond_len=3, decoder="ne_hifigan"
             )["wav"]
             test_audios["{}-audio".format(idx)] = wav

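This change routes the trainer's test-sentence synthesis through the plain (non-enhanced) HiFi-GAN decoder via the new decoder="ne_hifigan" argument. A minimal sketch of the same call outside the trainer follows; the paths and the speaker reference clip are placeholders, and the loading boilerplate is the usual XTTS recipe rather than anything introduced by this commit:

from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts

config = XttsConfig()
config.load_json("/path/to/config.json")  # placeholder path
model = Xtts.init_from_config(config)
model.load_checkpoint(config, checkpoint_dir="/path/to/checkpoint/")  # placeholder path

# Same call shape as in the trainer hunk above: request the non-enhanced HiFi-GAN decoder.
out = model.synthesize(
    "This is a test sentence.",
    config,
    "/path/to/speaker.wav",  # placeholder reference clip
    "en",
    gpt_cond_len=3,
    decoder="ne_hifigan",
)
wav = out["wav"]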
@@ -596,6 +596,10 @@ class Xtts(BaseTTS):
         text = text.strip().lower()
         text_tokens = torch.IntTensor(self.tokenizer.encode(text, lang=language)).unsqueeze(0).to(self.device)

+        # print(" > Input text: ", text)
+        # print(" > Input text preprocessed: ",self.tokenizer.preprocess_text(text, language))
+        # print(" > Input tokens: ", text_tokens)
+        # print(" > Decoded text: ", self.tokenizer.decode(text_tokens[0].cpu().numpy()))
         assert (
             text_tokens.shape[-1] < self.args.gpt_max_text_tokens
         ), " ❗ XTTS can only generate text with a maximum of 400 tokens."
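The four added lines are commented-out tokenization debug prints; the assert that follows them is the hard cap on prompt length. A hedged sketch of applying the same guard from the caller's side, reusing the exact calls from the hunk (model stands for a loaded Xtts instance; the helper name is invented for illustration):

import torch

def fits_token_budget(model, text: str, language: str) -> bool:
    # Mirror the model's own preprocessing: normalize, then tokenize.
    text = text.strip().lower()
    text_tokens = torch.IntTensor(model.tokenizer.encode(text, lang=language)).unsqueeze(0)
    # Same condition as the assert: strictly below gpt_max_text_tokens (400).
    return text_tokens.shape[-1] < model.args.gpt_max_text_tokens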
@@ -671,7 +675,7 @@ class Xtts(BaseTTS):
             )
             wav = self.vocoder.inference(mel)

-        return {"wav": wav.cpu().numpy().squeeze()}
+        return {"wav": wav.cpu().numpy().squeeze(), "gpt_latents": gpt_latents, "speaker_embedding": speaker_embedding, "diffusion_conditioning": diffusion_conditioning}

    def handle_chunks(self, wav_gen, wav_gen_prev, wav_overlap, overlap_len):
        """Handle chunk formatting in streaming mode"""
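The return value grows from a lone "wav" entry to a dict that also exposes the intermediate representations, so existing callers such as the trainer's )["wav"] lookup keep working and the new keys are purely additive. A sketch of reading them, assuming out is the dict returned by the patched method:

wav = out["wav"]                                        # as before: audio as a squeezed numpy array
gpt_latents = out["gpt_latents"]                        # new: GPT latents fed to the decoder
speaker_embedding = out["speaker_embedding"]            # new: speaker conditioning embedding
diffusion_conditioning = out["diffusion_conditioning"]  # new: conditioning for the diffusion decoder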