From ee20e309583d5c39a99b58c982127ea1f7256de9 Mon Sep 17 00:00:00 2001 From: Edresson Casanova Date: Mon, 5 Dec 2022 09:15:01 -0300 Subject: [PATCH] Fix VITS multi-speaker voice conversion inference --- TTS/tts/models/vits.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index 9dfdc067..518809b3 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -1211,8 +1211,8 @@ class Vits(BaseTTS): assert self.num_speakers > 0, "num_speakers have to be larger than 0." # speaker embedding if self.args.use_speaker_embedding and not self.args.use_d_vector_file: - g_src = self.emb_g(speaker_cond_src).unsqueeze(-1) - g_tgt = self.emb_g(speaker_cond_tgt).unsqueeze(-1) + g_src = self.emb_g(torch.from_numpy((np.array(speaker_cond_src))).unsqueeze(0)).unsqueeze(-1) + g_tgt = self.emb_g(torch.from_numpy((np.array(speaker_cond_tgt))).unsqueeze(0)).unsqueeze(-1) elif not self.args.use_speaker_embedding and self.args.use_d_vector_file: g_src = F.normalize(speaker_cond_src).unsqueeze(-1) g_tgt = F.normalize(speaker_cond_tgt).unsqueeze(-1)