mirror of https://github.com/coqui-ai/TTS.git
Add support for inference using a specific reference file instead of the averaged embeddings
This commit is contained in:
parent 856e185641
commit 251e1c289d
@@ -252,17 +252,24 @@ class Synthesizer(object):
        if speaker_name and isinstance(speaker_name, str):
            if self.tts_config.use_d_vector_file:
                # get the average speaker embedding from the saved d_vectors.
                if speaker_name in self.tts_model.speaker_manager.ids:
                    speaker_embedding = self.tts_model.speaker_manager.get_mean_embedding(
                        speaker_name, num_samples=None, randomize=False
                    )
                else:
                    speaker_embedding = self.tts_model.speaker_manager.embeddings[speaker_name]["embedding"]

                speaker_embedding = np.array(speaker_embedding)[None, :]  # [1 x embedding_dim]

                if style_speaker_name is not None:
                    if style_speaker_name in self.tts_model.speaker_manager.ids:
                        style_speaker_embedding = self.tts_model.speaker_manager.get_mean_embedding(
                            style_speaker_name, num_samples=None, randomize=False
                        )
                    else:
                        style_speaker_embedding = self.tts_model.speaker_manager.embeddings[style_speaker_name]["embedding"]

                    style_speaker_embedding = np.array(style_speaker_embedding)[None, :]  # [1 x embedding_dim]
            else:
                # get speaker idx from the speaker name
                speaker_id = self.tts_model.speaker_manager.ids[speaker_name]
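In words: if the requested name is a known speaker ID, the synthesizer still averages every d-vector saved for that speaker; any other string is now treated as a key into the d-vector file, typically a reference clip filename, and that single embedding is used as-is. Below is a minimal sketch of the fallback with toy data; the dict shapes mirror the embeddings and ids attributes used above, but the values are made up.

import numpy as np

# toy stand-ins for SpeakerManager.embeddings and SpeakerManager.ids
embeddings = {
    "LJ001-0001.wav": {"name": "ljspeech-1", "embedding": [0.1, 0.2, 0.3]},
    "LJ001-0002.wav": {"name": "ljspeech-1", "embedding": [0.3, 0.1, 0.2]},
}
ids = {"ljspeech-1": 0}

def lookup(speaker_name):
    if speaker_name in ids:
        # known speaker: average all clip embeddings saved for that speaker
        clips = [e["embedding"] for e in embeddings.values() if e["name"] == speaker_name]
        emb = np.stack(clips).mean(axis=0)
    else:
        # unknown name: treat it as a single reference clip key
        emb = np.array(embeddings[speaker_name]["embedding"])
    return emb[None, :]  # [1 x embedding_dim]

print(lookup("ljspeech-1"))      # averaged over both clips
print(lookup("LJ001-0001.wav"))  # one clip's embedding, used verbatim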
@@ -322,11 +329,16 @@ class Synthesizer(object):
                getattr(self.tts_config, "model_args", None)
                and getattr(self.tts_config.model_args, "use_external_emotions_embeddings", False)
            ):
                if emotion_name in self.tts_model.emotion_manager.ids:
                    # get the average emotion embedding from the saved embeddings.
                    emotion_embedding = self.tts_model.emotion_manager.get_mean_embedding(
                        emotion_name, num_samples=None, randomize=False
                    )
                else:
                    emotion_embedding = self.tts_model.emotion_manager.embeddings[emotion_name]["embedding"]

                emotion_embedding = np.array(emotion_embedding)[None, :]  # [1 x embedding_dim]

            else:
                # get emotion idx from the emotion name
                emotion_id = self.tts_model.emotion_manager.ids[emotion_name]
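The emotion branch mirrors the speaker branch above, down to the same get_mean_embedding signature. As a point of reference, here is a hedged sketch of what a helper with that signature plausibly computes; the semantics of num_samples and randomize (subset size, random vs. deterministic subset selection) are assumptions, not a transcription of the library's code.

import random
import numpy as np

def get_mean_embedding(name, embeddings, num_samples=None, randomize=False):
    # collect every saved embedding whose "name" field matches
    clips = [e["embedding"] for e in embeddings.values() if e["name"] == name]
    if num_samples is not None:
        # assumption: average over a subset only, picked at random when
        # randomize=True, otherwise the first num_samples clips
        clips = random.sample(clips, num_samples) if randomize else clips[:num_samples]
    return np.stack(clips).mean(axis=0)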
@@ -27,6 +27,7 @@ config = VitsConfig(
    print_eval=True,
    test_sentences=[
        ["Be a voice, not an echo.", "ljspeech-1", None, None, "ljspeech-1"],
        ["Be a voice, not an echo.", "LJ001-0001.wav", None, None, "LJ001-0002.wav"],
    ],
)
# set audio config
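The added test sentence puts wav filenames where the first row uses speaker names ("LJ001-0001.wav" apparently as the speaker reference, "LJ001-0002.wav" as the style reference), so evaluation also exercises the new single-reference path. For those keys to resolve, the saved d-vector file must contain one entry per clip; a sketch of the assumed on-disk layout follows (illustrative values, not real LJSpeech embeddings).

import json

# assumed speakers.json layout: clip name -> {"name": speaker, "embedding": [...]}
speakers_json = {
    "LJ001-0001.wav": {"name": "ljspeech-1", "embedding": [0.1, 0.2, 0.3]},
    "LJ001-0002.wav": {"name": "ljspeech-1", "embedding": [0.3, 0.1, 0.2]},
}
with open("speakers.json", "w", encoding="utf-8") as f:
    json.dump(speakers_json, f, indent=2)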
@@ -83,6 +84,13 @@ continue_emotion_path = os.path.join(continue_path, "emotions.json")
inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --emotion_idx {emotion_id} --speakers_file_path {continue_speakers_path} --emotions_file_path {continue_emotion_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}"
run_cli(inference_command)

# inference with one reference embedding instead of average
speaker_id = "LJ001-0001.wav"
emotion_id = "LJ001-0002.wav"
inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --emotion_idx {emotion_id} --speakers_file_path {continue_speakers_path} --emotions_file_path {continue_emotion_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}"
run_cli(inference_command)

# restore the model and continue training for one more epoch
command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} "
run_cli(command_train)
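The two inference invocations are identical apart from the values bound to speaker_id and emotion_id: the existing --speaker_idx and --emotion_idx flags now also accept a clip key from the saved embedding files. A small guard like the sketch below, reusing the test's continue_speakers_path and continue_emotion_path variables and assuming the JSON layout sketched earlier, would fail fast if those keys were missing instead of letting the CLI error later.

import json

def assert_key(path, key):
    # fail fast if the reference clip key is absent from the saved file
    with open(path, "r", encoding="utf-8") as f:
        saved = json.load(f)
    assert key in saved, f"{key} not found in {path}"

assert_key(continue_speakers_path, "LJ001-0001.wav")
assert_key(continue_emotion_path, "LJ001-0002.wav")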