mirror of https://github.com/coqui-ai/TTS.git
Add prosody encoder params on config
parent 95409be0bc
commit 66e3f5388e
@@ -179,10 +179,13 @@ If you don't specify any models, then it uses LJSpeech based English model.
        default=None,
    )
    parser.add_argument("--gst_style", help="Wav path file for GST style reference.", default=None)
<<<<<<< HEAD
    parser.add_argument(
        "--capacitron_style_wav", type=str, help="Wav path file for Capacitron prosody reference.", default=None
    )
    parser.add_argument("--capacitron_style_text", type=str, help="Transcription of the reference.", default=None)
=======
>>>>>>> 3a524b05... Add prosody encoder params on config
    parser.add_argument(
        "--list_speaker_idxs",
        help="List available speaker ids for the defined multi-speaker model.",
@@ -321,6 +324,7 @@ If you don't specify any models, then it uses LJSpeech based English model.
        args.speaker_idx,
        args.language_idx,
        args.speaker_wav,
        style_wav=args.gst_style,
        reference_wav=args.reference_wav,
        style_wav=args.capacitron_style_wav,
        style_text=args.capacitron_style_text,
@@ -117,7 +117,7 @@ def load_tts_samples(
        if eval_split:
            if meta_file_val:
                meta_data_eval = formatter(root_path, meta_file_val, ignored_speakers=ignored_speakers)
                meta_data_eval = [{**item, **{"language": language}} for item in meta_data_eval]
                meta_data_eval = [{**item, **{"language": language, "speech_style": speech_style}} for item in meta_data_eval]
            else:
                meta_data_eval, meta_data_train = split_dataset(meta_data_train, eval_split_max_size, eval_split_size)
            meta_data_eval_all += meta_data_eval
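For context, a rough sketch of what each training sample carries after the change above: the dataset-level speech_style tag is attached next to language. Only those two keys come from this diff; the remaining keys and all values are illustrative placeholders.

# Illustrative sample dict produced by load_tts_samples() after the change above.
sample = {
    "text": "Be a voice, not an echo.",
    "audio_file": "tests/data/ljspeech/wavs/LJ001-0001.wav",
    "speaker_name": "ljspeech-1",
    "language": "en",
    "speech_style": "style1",  # new per-dataset style tag added by this commit
}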
@@ -223,6 +223,7 @@ class Tacotron(BaseTacotron):
        encoder_outputs = self.encoder(inputs)
        if self.gst and self.use_gst:
            # B x gst_dim
<<<<<<< HEAD
            encoder_outputs = self.compute_gst(encoder_outputs, aux_input["style_mel"], aux_input["d_vectors"])
        if self.capacitron_vae and self.use_capacitron_vae:
            if aux_input["style_text"] is not None:
@@ -246,6 +247,9 @@ class Tacotron(BaseTacotron):
                if self.capacitron_vae.capacitron_use_speaker_embedding
                else None,
            )
=======
            encoder_outputs = self.compute_gst(encoder_outputs, aux_input["style_feature"], aux_input["d_vectors"])
>>>>>>> 3a524b05... Add prosody encoder params on config
        if self.num_speakers > 1:
            if not self.use_d_vector_file:
                # B x 1 x speaker_embed_dim
@@ -250,7 +250,7 @@ class Tacotron2(BaseTacotron):

        if self.gst and self.use_gst:
            # B x gst_dim
            encoder_outputs = self.compute_gst(encoder_outputs, aux_input["style_mel"], aux_input["d_vectors"])
            encoder_outputs = self.compute_gst(encoder_outputs, aux_input["style_feature"], aux_input["d_vectors"])

        if self.capacitron_vae and self.use_capacitron_vae:
            if aux_input["style_text"] is not None:
@@ -546,6 +546,8 @@ class VitsArgs(Coqpit):
    # prosody encoder
    use_prosody_encoder: bool = False
    prosody_embedding_dim: int = 0
    prosody_encoder_num_heads: int = 1
    prosody_encoder_num_tokens: int = 5

    detach_dp_input: bool = True
    use_language_embedding: bool = False
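For illustration, a minimal sketch of setting the new fields from a training script. Only the field names come from the hunk above; the VitsConfig import path and the concrete values are assumptions.

from TTS.tts.configs.vits_config import VitsConfig  # assumed import path

config = VitsConfig()
# enable the prosody (style-token) encoder and size it from the config
# instead of relying on hard-coded values
config.model_args.use_prosody_encoder = True
config.model_args.prosody_embedding_dim = 64        # placeholder size
config.model_args.prosody_encoder_num_heads = 1
config.model_args.prosody_encoder_num_tokens = 5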
@@ -685,8 +687,8 @@ class Vits(BaseTTS):
        if self.args.use_prosody_encoder:
            self.prosody_encoder = GST(
                num_mel=self.args.hidden_channels,
                num_heads=1,
                num_style_tokens=5,
                num_heads=self.args.prosody_encoder_num_heads,
                num_style_tokens=self.args.prosody_encoder_num_tokens,
                gst_embedding_dim=self.args.prosody_embedding_dim,
            )
            self.speaker_reversal_classifier = ReversalClassifier(
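GST here is the repo's Global Style Token layer. Purely to illustrate what the num_heads / num_style_tokens / gst_embedding_dim knobs control, below is a self-contained toy version of attention over a learned token bank; it is an assumption for illustration, not the repo implementation.

import torch
from torch import nn


class StyleTokenAttentionSketch(nn.Module):
    """Toy illustration of a style-token layer: a pooled reference embedding
    attends over a small bank of learned style tokens and returns a prosody
    embedding. Not the repo's GST implementation."""

    def __init__(self, ref_dim=192, num_heads=1, num_style_tokens=5, embedding_dim=64):
        super().__init__()
        # learned style tokens, shared across all utterances
        self.style_tokens = nn.Parameter(torch.randn(num_style_tokens, embedding_dim))
        self.query_proj = nn.Linear(ref_dim, embedding_dim)
        self.attention = nn.MultiheadAttention(embed_dim=embedding_dim, num_heads=num_heads, batch_first=True)

    def forward(self, reference):
        # reference: [B, ref_dim], e.g. a time-pooled posterior-encoder feature
        query = self.query_proj(reference).unsqueeze(1)      # [B, 1, E]
        tokens = torch.tanh(self.style_tokens).unsqueeze(0)  # [1, num_tokens, E]
        tokens = tokens.expand(reference.size(0), -1, -1)    # [B, num_tokens, E]
        prosody, _ = self.attention(query, tokens, tokens)   # [B, 1, E]
        return prosody.squeeze(1)                            # [B, E]


print(StyleTokenAttentionSketch()(torch.randn(2, 192)).shape)  # torch.Size([2, 64])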
@@ -916,7 +918,7 @@ class Vits(BaseTTS):
    @staticmethod
    def _set_cond_input(aux_input: Dict):
        """Set the speaker conditioning input based on the multi-speaker mode."""
        sid, g, lid, eid, eg = None, None, None, None, None
        sid, g, lid, eid, eg, pf = None, None, None, None, None, None
        if "speaker_ids" in aux_input and aux_input["speaker_ids"] is not None:
            sid = aux_input["speaker_ids"]
            if sid.ndim == 0:
@@ -941,7 +943,11 @@ class Vits(BaseTTS):
            if eg.ndim == 2:
                eg = eg.unsqueeze_(0)

        return sid, g, lid, eid, eg
        if "style_feature" in aux_input and aux_input["style_feature"] is not None:
            pf = aux_input["style_feature"]
            if pf.ndim == 2:
                pf = pf.unsqueeze_(0)
        return sid, g, lid, eid, eg, pf

    def _set_speaker_input(self, aux_input: Dict):
        d_vectors = aux_input.get("d_vectors", None)
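As a usage sketch, aux_input can now carry the style reference under the new style_feature key. The key names mirror the code above; the tensor shape is an assumption (roughly a linear spectrogram of [1, n_fft // 2 + 1, T_frames]).

import torch

# Hypothetical aux_input for Vits.inference() after this change: the style
# reference spectrogram travels under the new "style_feature" key.
aux_input = {
    "x_lengths": None,
    "d_vectors": None,
    "speaker_ids": torch.tensor([0]),
    "language_ids": None,
    "emotion_embeddings": None,
    "emotion_ids": None,
    # assumed shape; a 2D tensor is unsqueezed to a batch of one by _set_cond_input()
    "style_feature": torch.randn(1, 513, 120),
}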
@@ -1061,7 +1067,7 @@ class Vits(BaseTTS):
            - syn_cons_emb: :math:`[B, 1, speaker_encoder.proj_dim]`
        """
        outputs = {}
        sid, g, lid, eid, eg = self._set_cond_input(aux_input)
        sid, g, lid, eid, eg, _ = self._set_cond_input(aux_input)
        # speaker embedding
        if self.args.use_speaker_embedding and sid is not None:
            g = self.emb_g(sid).unsqueeze(-1)  # [b, h, 1]
@@ -1091,14 +1097,14 @@ class Vits(BaseTTS):
        if self.args.use_prosody_encoder:
            pros_emb = self.prosody_encoder(z).transpose(1, 2)
            _, l_pros_speaker = self.speaker_reversal_classifier(pros_emb.transpose(1, 2), sid, x_mask=None)

        # print("Encoder input", x.shape)
        x, m_p, logs_p, x_mask = self.text_encoder(x, x_lengths, lang_emb=lang_emb, emo_emb=eg, pros_emb=pros_emb)

        # print("X shape:", x.shape, "m_p shape:", m_p.shape, "x_mask:", x_mask.shape, "x_lengths:", x_lengths.shape)
        # flow layers
        z_p = self.flow(z, y_mask, g=g)

        # print("Y mask:", y_mask.shape)
        # duration predictor
        g_dp = g
        g_dp = g if self.args.condition_dp_on_speaker else None
        if eg is not None and (self.args.use_emotion_embedding or self.args.use_external_emotions_embeddings) and self.args.emotion_just_encoder:
            if g_dp is None:
                g_dp = eg
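To make the conditioning bookkeeping concrete, a tiny standalone sketch of the channel-wise torch.cat pattern used for g_dp above; the embedding sizes are made up.

import torch

g = torch.randn(2, 256, 1)        # speaker embedding  [B, h_spk, 1]
eg = torch.randn(2, 128, 1)       # emotion embedding  [B, h_emo, 1]
pros_emb = torch.randn(2, 64, 1)  # prosody embedding  [B, h_pros, 1]

# start from the speaker embedding (or None) and stack each extra conditioning
# vector along the channel dimension, as the duration predictor input above does
g_dp = g
for cond in (eg, pros_emb):
    g_dp = cond if g_dp is None else torch.cat([g_dp, cond], dim=1)

print(g_dp.shape)  # torch.Size([2, 448, 1])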
@@ -1190,6 +1196,7 @@ class Vits(BaseTTS):
            "language_ids": None,
            "emotion_embeddings": None,
            "emotion_ids": None,
            "style_feature": None,
        },
    ):  # pylint: disable=dangerous-default-value
        """
@@ -1210,7 +1217,7 @@ class Vits(BaseTTS):
            - m_p: :math:`[B, C, T_dec]`
            - logs_p: :math:`[B, C, T_dec]`
        """
        sid, g, lid, eid, eg = self._set_cond_input(aux_input)
        sid, g, lid, eid, eg, pf = self._set_cond_input(aux_input)
        x_lengths = self._set_x_lengths(x, aux_input)

        # speaker embedding
@@ -1233,29 +1240,42 @@ class Vits(BaseTTS):
        if self.args.use_language_embedding and lid is not None:
            lang_emb = self.emb_l(lid).unsqueeze(-1)

        x, m_p, logs_p, x_mask = self.text_encoder(x, x_lengths, lang_emb=lang_emb, emo_emb=eg)
        # prosody embedding
        pros_emb = None
        if self.args.use_prosody_encoder:
            # extract posterior encoder feature
            pf_lengths = torch.tensor([pf.size(-1)]).to(pf.device)
            z_pro, _, _, _ = self.posterior_encoder(pf, pf_lengths, g=g)
            pros_emb = self.prosody_encoder(z_pro).transpose(1, 2)

        x, m_p, logs_p, x_mask = self.text_encoder(x, x_lengths, lang_emb=lang_emb, emo_emb=eg, pros_emb=pros_emb)

        # duration predictor
        g_dp = g if self.args.condition_dp_on_speaker else None
        if eg is not None and (self.args.use_emotion_embedding or self.args.use_external_emotions_embeddings) and self.args.emotion_just_encoder:
            if g is None:
            if g_dp is None:
                g_dp = eg
            else:
                g_dp = torch.cat([g, eg], dim=1)  # [b, h1+h2, 1]
        else:
            g_dp = g
                g_dp = torch.cat([g_dp, eg], dim=1)  # [b, h1+h2, 1]

        if self.args.use_prosody_encoder:
            if g_dp is None:
                g_dp = pros_emb
            else:
                g_dp = torch.cat([g_dp, pros_emb], dim=1)  # [b, h1+h2, 1]

        if self.args.use_sdp:
            logw = self.duration_predictor(
                x,
                x_mask,
                g=g_dp if self.args.condition_dp_on_speaker else None,
                g=g_dp,
                reverse=True,
                noise_scale=self.inference_noise_scale_dp,
                lang_emb=lang_emb,
            )
        else:
            logw = self.duration_predictor(
                x, x_mask, g=g_dp if self.args.condition_dp_on_speaker else None, lang_emb=lang_emb
                x, x_mask, g=g_dp, lang_emb=lang_emb
            )

        w = torch.exp(logw) * x_mask * self.length_scale
@@ -1277,6 +1297,7 @@ class Vits(BaseTTS):

        o = self.waveform_decoder((z * y_mask)[:, :, : self.max_inference_len], g=g)

<<<<<<< HEAD
        outputs = {
            "model_outputs": o,
            "alignments": attn.squeeze(1),
@@ -1287,8 +1308,24 @@ class Vits(BaseTTS):
            "logs_p": logs_p,
            "y_mask": y_mask,
        }
=======
        outputs = {"model_outputs": o, "alignments": attn.squeeze(1), "z": z, "z_p": z_p, "m_p": m_p, "logs_p": logs_p, "durations": w_ceil}
>>>>>>> 3a524b05... Add prosody encoder params on config
        return outputs

    def compute_style_feature(self, style_wav_path):
        style_wav, sr = torchaudio.load(style_wav_path)
        if sr != self.config.audio.sample_rate:
            raise RuntimeError(f" [!] Style reference needs to have a sampling rate equal to {self.config.audio.sample_rate} !!")
        y = wav_to_spec(
            style_wav,
            self.config.audio.fft_size,
            self.config.audio.hop_length,
            self.config.audio.win_length,
            center=False,
        )
        return y

    @torch.no_grad()
    def inference_voice_conversion(
        self, reference_wav, speaker_id=None, d_vector=None, reference_speaker_id=None, reference_d_vector=None
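wav_to_spec is the repo's linear-spectrogram helper. As a rough, self-contained stand-in (an assumption, not the repo function), the feature fed to the posterior encoder is essentially a magnitude STFT:

import torch


def linear_spectrogram_sketch(wav, n_fft=1024, hop_length=256, win_length=1024):
    """Approximate stand-in for wav_to_spec: magnitude STFT of a [1, T] waveform."""
    window = torch.hann_window(win_length)
    spec = torch.stft(
        wav,
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=win_length,
        window=window,
        center=False,
        return_complex=True,
    )
    return spec.abs()  # [1, n_fft // 2 + 1, n_frames]


print(linear_spectrogram_sketch(torch.randn(1, 22050)).shape)  # torch.Size([1, 513, 83])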
@@ -14,18 +14,18 @@ def numpy_to_torch(np_array, dtype, cuda=False):
    return tensor


def compute_style_mel(style_wav, ap, cuda=False):
    style_mel = torch.FloatTensor(ap.melspectrogram(ap.load_wav(style_wav, sr=ap.sample_rate))).unsqueeze(0)
def compute_style_feature(style_wav, ap, cuda=False):
    style_feature = torch.FloatTensor(ap.melspectrogram(ap.load_wav(style_wav, sr=ap.sample_rate))).unsqueeze(0)
    if cuda:
        return style_mel.cuda()
    return style_mel
        return style_feature.cuda()
    return style_feature


def run_model_torch(
    model: nn.Module,
    inputs: torch.Tensor,
    speaker_id: int = None,
    style_mel: torch.Tensor = None,
    style_feature: torch.Tensor = None,
    style_text: str = None,
    d_vector: torch.Tensor = None,
    language_id: torch.Tensor = None,
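A possible call site for the renamed helper. The AudioProcessor construction follows the usual repo pattern but is an assumption, as is running this outside a full model setup.

from TTS.tts.configs.vits_config import VitsConfig
from TTS.tts.utils.synthesis import compute_style_feature
from TTS.utils.audio import AudioProcessor

config = VitsConfig()  # stands in for a loaded model config
ap = AudioProcessor(**config.audio.to_dict())
style_feature = compute_style_feature("tests/data/ljspeech/wavs/LJ001-0001.wav", ap, cuda=False)
print(style_feature.shape)  # [1, num_mels, T_frames]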
@@ -38,7 +38,7 @@ def run_model_torch(
        model (nn.Module): The model to run inference.
        inputs (torch.Tensor): Input tensor with character ids.
        speaker_id (int, optional): Input speaker ids for multi-speaker models. Defaults to None.
        style_mel (torch.Tensor, optional): Spectrograms used for voice styling. Defaults to None.
        style_feature (torch.Tensor, optional): Spectrograms used for voice styling. Defaults to None.
        d_vector (torch.Tensor, optional): d-vector for multi-speaker models. Defaults to None.

    Returns:
@@ -55,7 +55,7 @@ def run_model_torch(
            "x_lengths": input_lengths,
            "speaker_ids": speaker_id,
            "d_vectors": d_vector,
            "style_mel": style_mel,
            "style_feature": style_feature,
            "style_text": style_text,
            "language_ids": language_id,
            "emotion_ids": emotion_id,
@@ -170,16 +170,19 @@ def synthesis(
    """
    # GST or Capacitron processing
    # TODO: need to handle the case of setting both gst and capacitron to true somewhere
    style_mel = None
    style_feature = None
    if CONFIG.has("gst") and CONFIG.gst and style_wav is not None:
        if isinstance(style_wav, dict):
            style_mel = style_wav
            style_feature = style_wav
        else:
            style_mel = compute_style_mel(style_wav, model.ap, cuda=use_cuda)
            style_feature = compute_style_feature(style_wav, model.ap, cuda=use_cuda)

    if CONFIG.has("capacitron_vae") and CONFIG.use_capacitron_vae and style_wav is not None:
        style_mel = compute_style_mel(style_wav, model.ap, cuda=use_cuda)
        style_mel = style_mel.transpose(1, 2)  # [1, time, depth]
        style_feature = compute_style_feature(style_wav, model.ap, cuda=use_cuda)
        style_feature = style_feature.transpose(1, 2)  # [1, time, depth]

    if hasattr(model, 'compute_style_feature'):
        style_feature = model.compute_style_feature(style_wav)

    # convert text to sequence of token IDs
    text_inputs = np.asarray(
@@ -202,9 +205,9 @@ def synthesis(
    if emotion_embedding is not None:
        emotion_embedding = embedding_to_torch(emotion_embedding, cuda=use_cuda)

    if not isinstance(style_mel, dict):
    if not isinstance(style_feature, dict):
        # GST or Capacitron style mel
        style_mel = numpy_to_torch(style_mel, torch.float, cuda=use_cuda)
        style_feature = numpy_to_torch(style_feature, torch.float, cuda=use_cuda)
    if style_text is not None:
        style_text = np.asarray(
            model.tokenizer.text_to_ids(style_text, language=language_id),
@@ -220,7 +223,7 @@ def synthesis(
        model,
        text_inputs,
        speaker_id,
        style_mel,
        style_feature,
        style_text,
        d_vector=d_vector,
        language_id=language_id,
@@ -47,7 +47,7 @@ config.model_args.use_emotion_embedding = False
config.model_args.emotion_embedding_dim = 256
config.model_args.emotion_just_encoder = True
config.model_args.external_emotions_embs_file = "tests/data/ljspeech/speakers.json"

config.use_style_weighted_sampler = True
# consistency loss
# config.model_args.use_emotion_encoder_as_loss = True
# config.model_args.encoder_model_path = "/raid/edresson/dev/Checkpoints/Coqui-Realesead/tts_models--multilingual--multi-dataset--your_tts/model_se.pth.tar"
@@ -64,6 +64,13 @@ command_train = (
    "--coqpit.datasets.0.meta_file_val metadata.csv "
    "--coqpit.datasets.0.path tests/data/ljspeech "
    "--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt "
    "--coqpit.datasets.0.speech_style style1 "
    "--coqpit.datasets.1.name ljspeech_test "
    "--coqpit.datasets.1.meta_file_train metadata.csv "
    "--coqpit.datasets.1.meta_file_val metadata.csv "
    "--coqpit.datasets.1.path tests/data/ljspeech "
    "--coqpit.datasets.1.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt "
    "--coqpit.datasets.1.speech_style style2 "
    "--coqpit.test_delay_epochs 0"
)
run_cli(command_train)
@@ -26,7 +26,7 @@ config = VitsConfig(
    print_step=1,
    print_eval=True,
    test_sentences=[
        ["Be a voice, not an echo.", "ljspeech-1", None, None, "ljspeech-1"],
        ["Be a voice, not an echo.", "ljspeech-1", "tests/data/ljspeech/wavs/LJ001-0001.wav", None, None],
    ],
)
# set audio config
@@ -38,11 +38,11 @@ config.model_args.use_speaker_embedding = True
config.model_args.use_d_vector_file = False
config.model_args.d_vector_file = "tests/data/ljspeech/speakers.json"
config.model_args.speaker_embedding_channels = 128
config.model_args.d_vector_dim = 256
config.model_args.d_vector_dim = 128

# prosody embedding
config.model_args.use_prosody_encoder = True
config.model_args.prosody_embedding_dim = 256
config.model_args.prosody_embedding_dim = 64

config.save_json(config_path)
@@ -67,12 +67,11 @@ continue_config_path = os.path.join(continue_path, "config.json")
continue_restore_path, _ = get_last_checkpoint(continue_path)
out_wav_path = os.path.join(get_tests_output_path(), "output.wav")
speaker_id = "ljspeech-1"
emotion_id = "ljspeech-3"
style_wav_path = "tests/data/ljspeech/wavs/LJ001-0001.wav"
continue_speakers_path = os.path.join(continue_path, "speakers.json")
continue_emotion_path = os.path.join(continue_path, "speakers.json")


inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --emotion_idx {emotion_id} --speakers_file_path {continue_speakers_path} --emotions_file_path {continue_emotion_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}"
inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path} --gst_style {style_wav_path}"
run_cli(inference_command)

# restore the model and continue training for one more epoch