Fix tts_with_vc (#3275)

* Revert "fix for issue 3067"

This reverts commit 041b4b6723.

Fixes #3143. The original issue (#3067) was caused by people trying to use
tts.tts_with_vc_to_file() with XTTS, and it was "fixed" in #3109. But XTTS has
integrated VC, so you can just call tts.tts_to_file(..., speaker_wav="...");
there is no point in passing the output through FreeVC afterwards. This change
therefore reverts that commit, because it breaks tts.tts_with_vc_to_file() for
any model that doesn't have integrated VC, i.e. all models this method is meant for.

* fix: support multi-speaker models in tts_with_vc/tts_with_vc_to_file

* fix: only compute spk embeddings for models that support it

Fixes #1440. Passing a `speaker_wav` argument to regular Vits models failed
because they don't support voice cloning. That argument is now simply ignored
for such models.
This commit is contained in:
Enno Hermann 2023-11-24 12:26:37 +01:00 committed by GitHub
parent 2af0220996
commit 8c5227ed84
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 20 additions and 5 deletions

View File

@ -440,7 +440,7 @@ class TTS(nn.Module):
save_wav(wav=wav, path=file_path, sample_rate=self.voice_converter.vc_config.audio.output_sample_rate) save_wav(wav=wav, path=file_path, sample_rate=self.voice_converter.vc_config.audio.output_sample_rate)
return file_path return file_path
def tts_with_vc(self, text: str, language: str = None, speaker_wav: str = None): def tts_with_vc(self, text: str, language: str = None, speaker_wav: str = None, speaker: str = None):
"""Convert text to speech with voice conversion. """Convert text to speech with voice conversion.
It combines tts with voice conversion to fake voice cloning. It combines tts with voice conversion to fake voice cloning.
@ -457,17 +457,25 @@ class TTS(nn.Module):
speaker_wav (str, optional): speaker_wav (str, optional):
Path to a reference wav file to use for voice cloning with supporting models like YourTTS. Path to a reference wav file to use for voice cloning with supporting models like YourTTS.
Defaults to None. Defaults to None.
speaker (str, optional):
Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by
`tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None.
""" """
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp: with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
# Lazy code... save it to a temp file to resample it while reading it for VC # Lazy code... save it to a temp file to resample it while reading it for VC
self.tts_to_file(text=text, speaker=None, language=language, file_path=fp.name, speaker_wav=speaker_wav) self.tts_to_file(text=text, speaker=speaker, language=language, file_path=fp.name)
if self.voice_converter is None: if self.voice_converter is None:
self.load_vc_model_by_name("voice_conversion_models/multilingual/vctk/freevc24") self.load_vc_model_by_name("voice_conversion_models/multilingual/vctk/freevc24")
wav = self.voice_converter.voice_conversion(source_wav=fp.name, target_wav=speaker_wav) wav = self.voice_converter.voice_conversion(source_wav=fp.name, target_wav=speaker_wav)
return wav return wav
def tts_with_vc_to_file( def tts_with_vc_to_file(
self, text: str, language: str = None, speaker_wav: str = None, file_path: str = "output.wav" self,
text: str,
language: str = None,
speaker_wav: str = None,
file_path: str = "output.wav",
speaker: str = None,
): ):
"""Convert text to speech with voice conversion and save to file. """Convert text to speech with voice conversion and save to file.
@ -484,6 +492,9 @@ class TTS(nn.Module):
Defaults to None. Defaults to None.
file_path (str, optional): file_path (str, optional):
Output file path. Defaults to "output.wav". Output file path. Defaults to "output.wav".
speaker (str, optional):
Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by
`tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None.
""" """
wav = self.tts_with_vc(text=text, language=language, speaker_wav=speaker_wav) wav = self.tts_with_vc(text=text, language=language, speaker_wav=speaker_wav, speaker=speaker)
save_wav(wav=wav, path=file_path, sample_rate=self.voice_converter.vc_config.audio.output_sample_rate) save_wav(wav=wav, path=file_path, sample_rate=self.voice_converter.vc_config.audio.output_sample_rate)

View File

@ -358,7 +358,11 @@ class Synthesizer(nn.Module):
) )
# compute a new d_vector from the given clip. # compute a new d_vector from the given clip.
if speaker_wav is not None and self.tts_model.speaker_manager is not None: if (
speaker_wav is not None
and self.tts_model.speaker_manager is not None
and self.tts_model.speaker_manager.encoder_ap is not None
):
speaker_embedding = self.tts_model.speaker_manager.compute_embedding_from_clip(speaker_wav) speaker_embedding = self.tts_model.speaker_manager.compute_embedding_from_clip(speaker_wav)
vocoder_device = "cpu" vocoder_device = "cpu"