mirror of https://github.com/coqui-ai/TTS.git
Fix voice conversion inference (#1583)
* Add voice conversion zoo test * Fix style * Fix unit test
This commit is contained in:
parent
e282da5161
commit
ee99a6c1e2
|
@ -171,7 +171,7 @@ If you don't specify any models, then it uses LJSpeech based English model.
|
|||
help="wav file(s) to condition a multi-speaker TTS model with a Speaker Encoder. You can give multiple file paths. The d_vectors is computed as their average.",
|
||||
default=None,
|
||||
)
|
||||
parser.add_argument("--gst_style", help="Wav path file for GST stylereference.", default=None)
|
||||
parser.add_argument("--gst_style", help="Wav path file for GST style reference.", default=None)
|
||||
parser.add_argument(
|
||||
"--list_speaker_idxs",
|
||||
help="List available speaker ids for the defined multi-speaker model.",
|
||||
|
|
|
@ -1127,7 +1127,7 @@ class Vits(BaseTTS):
|
|||
self.config.audio.hop_length,
|
||||
self.config.audio.win_length,
|
||||
center=False,
|
||||
).transpose(1, 2)
|
||||
)
|
||||
y_lengths = torch.tensor([y.size(-1)]).to(y.device)
|
||||
speaker_cond_src = reference_speaker_id if reference_speaker_id is not None else reference_d_vector
|
||||
speaker_cond_tgt = speaker_id if speaker_id is not None else d_vector
|
||||
|
@ -1157,7 +1157,7 @@ class Vits(BaseTTS):
|
|||
else:
|
||||
raise RuntimeError(" [!] Voice conversion is only supported on multi-speaker models.")
|
||||
|
||||
z, _, _, y_mask = self.posterior_encoder(y.transpose(1, 2), y_lengths, g=g_src)
|
||||
z, _, _, y_mask = self.posterior_encoder(y, y_lengths, g=g_src)
|
||||
z_p = self.flow(z, y_mask, g=g_src)
|
||||
z_hat = self.flow(z_p, y_mask, g=g_tgt, reverse=True)
|
||||
o_hat = self.waveform_decoder(z_hat * y_mask, g=g_tgt)
|
||||
|
|
|
@ -315,7 +315,7 @@ class Synthesizer(object):
|
|||
# get the speaker embedding or speaker id for the reference wav file
|
||||
reference_speaker_embedding = None
|
||||
reference_speaker_id = None
|
||||
if self.tts_speakers_file or hasattr(self.tts_model.speaker_manager, "speaker_ids"):
|
||||
if self.tts_speakers_file or hasattr(self.tts_model.speaker_manager, "ids"):
|
||||
if reference_speaker_name and isinstance(reference_speaker_name, str):
|
||||
if self.tts_config.use_d_vector_file:
|
||||
# get the speaker embedding from the saved d_vectors.
|
||||
|
|
|
@ -122,7 +122,7 @@ class TestVits(unittest.TestCase):
|
|||
args = VitsArgs(num_speakers=num_speakers, use_speaker_embedding=True)
|
||||
model = Vits(args)
|
||||
|
||||
ref_inp = torch.randn(1, spec_len, 513)
|
||||
ref_inp = torch.randn(1, 513, spec_len)
|
||||
ref_inp_len = torch.randint(1, spec_effective_len, (1,))
|
||||
ref_spk_id = torch.randint(1, num_speakers, (1,))
|
||||
tgt_spk_id = torch.randint(1, num_speakers, (1,))
|
||||
|
|
|
@ -3,7 +3,7 @@ import glob
|
|||
import os
|
||||
import shutil
|
||||
|
||||
from tests import get_tests_output_path, run_cli
|
||||
from tests import get_tests_data_path, get_tests_output_path, run_cli
|
||||
from TTS.tts.utils.languages import LanguageManager
|
||||
from TTS.tts.utils.speakers import SpeakerManager
|
||||
from TTS.utils.generic_utils import get_user_data_dir
|
||||
|
@ -56,3 +56,16 @@ def test_run_all_models():
|
|||
folders = glob.glob(os.path.join(manager.output_prefix, "*"))
|
||||
assert len(folders) == len(model_names)
|
||||
shutil.rmtree(manager.output_prefix)
|
||||
|
||||
|
||||
def test_voice_conversion():
    """Run voice conversion inference through the ``tts`` CLI with the YourTTS model.

    Two LJSpeech clips from the tests data directory are passed to the command,
    one as ``--speaker_wav`` and one as ``--reference_wav``, with the language
    set to ``en``. The result is written to ``output.wav`` in the tests output
    directory.

    No assertion is made on the produced audio here; presumably ``run_cli``
    fails the test when the command exits with an error -- verify against its
    definition in ``tests``.
    """
    print(" > Run voice conversion inference using YourTTS model.")

    # Model and language selection for the CLI call.
    model_name = "tts_models/multilingual/multi-dataset/your_tts"
    language_id = "en"

    # Input clips and output location.
    speaker_wav = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav")
    reference_wav = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0032.wav")
    output_path = os.path.join(get_tests_output_path(), "output.wav")

    # The command keeps its trailing space so the string matches the original exactly.
    command = (
        f"tts --model_name {model_name}"
        f" --out_path {output_path}"
        f" --speaker_wav {speaker_wav}"
        f" --reference_wav {reference_wav}"
        f" --language_idx {language_id} "
    )
    run_cli(command)
|
||||
|
|
Loading…
Reference in New Issue