Use non-enhanced hifigan for test samples

This commit is contained in:
Edresson Casanova 2023-11-01 10:01:54 -03:00 committed by Eren Gölge
parent 077a849b3b
commit 1fb6c203ab
3 changed files with 7 additions and 2 deletions

1
.gitignore vendored
View File

@@ -169,3 +169,4 @@ wandb
depot/*
coqui_recipes/*
local_scripts/*
coqui_demos/*

View File

@@ -214,7 +214,7 @@ class GPTTrainer(BaseTTS):
print(" | > Synthesizing test sentences.")
for idx, s_info in enumerate(self.config.test_sentences):
wav = self.xtts.synthesize(
s_info["text"], self.config, s_info["speaker_wav"], s_info["language"], gpt_cond_len=3
s_info["text"], self.config, s_info["speaker_wav"], s_info["language"], gpt_cond_len=3, decoder="ne_hifigan"
)["wav"]
test_audios["{}-audio".format(idx)] = wav

View File

@@ -596,6 +596,10 @@ class Xtts(BaseTTS):
text = text.strip().lower()
text_tokens = torch.IntTensor(self.tokenizer.encode(text, lang=language)).unsqueeze(0).to(self.device)
# print(" > Input text: ", text)
# print(" > Input text preprocessed: ",self.tokenizer.preprocess_text(text, language))
# print(" > Input tokens: ", text_tokens)
# print(" > Decoded text: ", self.tokenizer.decode(text_tokens[0].cpu().numpy()))
assert (
text_tokens.shape[-1] < self.args.gpt_max_text_tokens
), " ❗ XTTS can only generate text with a maximum of 400 tokens."
@@ -671,7 +675,7 @@ class Xtts(BaseTTS):
)
wav = self.vocoder.inference(mel)
return {"wav": wav.cpu().numpy().squeeze()}
return {"wav": wav.cpu().numpy().squeeze(), "gpt_latents": gpt_latents, "speaker_embedding": speaker_embedding, "diffusion_conditioning": diffusion_conditioning}
def handle_chunks(self, wav_gen, wav_gen_prev, wav_overlap, overlap_len):
"""Handle chunk formatting in streaming mode"""