mirror of https://github.com/coqui-ai/TTS.git
Use non-enhanced hifigan for test samples
parent 077a849b3b
commit 1fb6c203ab
@@ -169,3 +169,4 @@ wandb
 depot/*
 coqui_recipes/*
 local_scripts/*
+coqui_demos/*
@@ -214,7 +214,7 @@ class GPTTrainer(BaseTTS):
         print(" | > Synthesizing test sentences.")
         for idx, s_info in enumerate(self.config.test_sentences):
             wav = self.xtts.synthesize(
-                s_info["text"], self.config, s_info["speaker_wav"], s_info["language"], gpt_cond_len=3
+                s_info["text"], self.config, s_info["speaker_wav"], s_info["language"], gpt_cond_len=3, decoder="ne_hifigan"
             )["wav"]
             test_audios["{}-audio".format(idx)] = wav

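This change routes the trainer's test-sentence synthesis through the plain (non-enhanced) HiFi-GAN decoder via the new decoder="ne_hifigan" argument. A minimal sketch of the same call outside the trainer follows; the paths and the speaker reference clip are placeholders, and the loading boilerplate is the usual XTTS recipe rather than anything introduced by this commit:

from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts

config = XttsConfig()
config.load_json("/path/to/config.json")  # placeholder path
model = Xtts.init_from_config(config)
model.load_checkpoint(config, checkpoint_dir="/path/to/checkpoint/")  # placeholder path

# Same call shape as in the trainer hunk above: request the non-enhanced HiFi-GAN decoder.
out = model.synthesize(
    "This is a test sentence.",
    config,
    "/path/to/speaker.wav",  # placeholder reference clip
    "en",
    gpt_cond_len=3,
    decoder="ne_hifigan",
)
wav = out["wav"]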
@@ -596,6 +596,10 @@ class Xtts(BaseTTS):
         text = text.strip().lower()
         text_tokens = torch.IntTensor(self.tokenizer.encode(text, lang=language)).unsqueeze(0).to(self.device)

+        # print(" > Input text: ", text)
+        # print(" > Input text preprocessed: ",self.tokenizer.preprocess_text(text, language))
+        # print(" > Input tokens: ", text_tokens)
+        # print(" > Decoded text: ", self.tokenizer.decode(text_tokens[0].cpu().numpy()))
         assert (
             text_tokens.shape[-1] < self.args.gpt_max_text_tokens
         ), " ❗ XTTS can only generate text with a maximum of 400 tokens."
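The four added lines are commented-out tokenization debug prints; the assert that follows them is the hard cap on prompt length. A hedged sketch of applying the same guard from the caller's side, reusing the exact calls from the hunk (model stands for a loaded Xtts instance; the helper name is invented for illustration):

import torch

def fits_token_budget(model, text: str, language: str) -> bool:
    # Mirror the model's own preprocessing: normalize, then tokenize.
    text = text.strip().lower()
    text_tokens = torch.IntTensor(model.tokenizer.encode(text, lang=language)).unsqueeze(0)
    # Same condition as the assert: strictly below gpt_max_text_tokens (400).
    return text_tokens.shape[-1] < model.args.gpt_max_text_tokens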
@@ -671,7 +675,7 @@ class Xtts(BaseTTS):
             )
             wav = self.vocoder.inference(mel)

-        return {"wav": wav.cpu().numpy().squeeze()}
+        return {"wav": wav.cpu().numpy().squeeze(), "gpt_latents": gpt_latents, "speaker_embedding": speaker_embedding, "diffusion_conditioning": diffusion_conditioning}

    def handle_chunks(self, wav_gen, wav_gen_prev, wav_overlap, overlap_len):
        """Handle chunk formatting in streaming mode"""
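The return value grows from a lone "wav" entry to a dict that also exposes the intermediate representations, so existing callers such as the trainer's )["wav"] lookup keep working and the new keys are purely additive. A sketch of reading them, assuming out is the dict returned by the patched method:

wav = out["wav"]                                        # as before: audio as a squeezed numpy array
gpt_latents = out["gpt_latents"]                        # new: GPT latents fed to the decoder
speaker_embedding = out["speaker_embedding"]            # new: speaker conditioning embedding
diffusion_conditioning = out["diffusion_conditioning"]  # new: conditioning for the diffusion decoder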