From 569decba64022784faeb37a0260ab73e17d4de66 Mon Sep 17 00:00:00 2001
From: Edresson Casanova
Date: Fri, 17 Jun 2022 17:39:17 -0300
Subject: [PATCH] Add Pitch Predictor conditioned on enc

---
 TTS/bin/compute_vits_alignments.py            |   3 +
 TTS/tts/layers/vits/networks.py               |   9 +-
 TTS/tts/models/vits.py                        | 109 ++++++++++++------
 .../mas_alignments/alignments/LJ001-0001.npy  | Bin 0 -> 475460 bytes
 .../mas_alignments/alignments/LJ001-0002.npy  | Bin 0 -> 19036 bytes
 .../mas_alignments/alignments/LJ001-0003.npy  | Bin 0 -> 479360 bytes
 .../mas_alignments/alignments/LJ001-0004.npy  | Bin 0 -> 138032 bytes
 .../mas_alignments/alignments/LJ001-0005.npy  | Bin 0 -> 365880 bytes
 .../mas_alignments/alignments/LJ001-0006.npy  | Bin 0 -> 139004 bytes
 .../mas_alignments/alignments/LJ001-0007.npy  | Bin 0 -> 332248 bytes
 .../mas_alignments/alignments/LJ001-0008.npy  | Bin 0 -> 12368 bytes
 ..._vits_speaker_emb_with_pitch_predictor.py} |  15 ++-
 12 files changed, 93 insertions(+), 43 deletions(-)
 create mode 100644 tests/data/ljspeech/mas_alignments/alignments/LJ001-0001.npy
 create mode 100644 tests/data/ljspeech/mas_alignments/alignments/LJ001-0002.npy
 create mode 100644 tests/data/ljspeech/mas_alignments/alignments/LJ001-0003.npy
 create mode 100644 tests/data/ljspeech/mas_alignments/alignments/LJ001-0004.npy
 create mode 100644 tests/data/ljspeech/mas_alignments/alignments/LJ001-0005.npy
 create mode 100644 tests/data/ljspeech/mas_alignments/alignments/LJ001-0006.npy
 create mode 100644 tests/data/ljspeech/mas_alignments/alignments/LJ001-0007.npy
 create mode 100644 tests/data/ljspeech/mas_alignments/alignments/LJ001-0008.npy
 rename tests/tts_tests/{test_vits_speaker_emb_with_prosody_encoder_with_pitch_predictor.py => test_vits_speaker_emb_with_pitch_predictor.py} (89%)

diff --git a/TTS/bin/compute_vits_alignments.py b/TTS/bin/compute_vits_alignments.py
index df4e0b65..1d70d87b 100644
--- a/TTS/bin/compute_vits_alignments.py
+++ b/TTS/bin/compute_vits_alignments.py
@@ -69,6 +69,9 @@ def extract_aligments(
         for idx in range(tokens.shape[0]):
             wav_file_path = item_idx[idx]
             alignment = alignments[idx]
+            spec_length = spec_lens[idx]
+            token_length = token_lenghts[idx]
+            alignment = alignment[:token_length, :spec_length]
             # set paths
             align_file_name = os.path.splitext(os.path.basename(wav_file_path))[0] + ".npy"
             os.makedirs(os.path.join(output_path, "alignments"), exist_ok=True)
diff --git a/TTS/tts/layers/vits/networks.py b/TTS/tts/layers/vits/networks.py
index cf3cd14b..dbcb7313 100644
--- a/TTS/tts/layers/vits/networks.py
+++ b/TTS/tts/layers/vits/networks.py
@@ -40,6 +40,7 @@ class TextEncoder(nn.Module):
         language_emb_dim: int = None,
         emotion_emb_dim: int = None,
         prosody_emb_dim: int = None,
+        pitch_dim: int = None,
     ):
         """Text Encoder for VITS model.
@@ -70,6 +71,9 @@ class TextEncoder(nn.Module):
         if prosody_emb_dim:
             hidden_channels += prosody_emb_dim
 
+        if pitch_dim:
+            hidden_channels += pitch_dim
+
         self.encoder = RelativePositionTransformer(
             in_channels=hidden_channels,
             out_channels=hidden_channels,
@@ -85,7 +89,7 @@ class TextEncoder(nn.Module):
 
         self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
 
-    def forward(self, x, x_lengths, lang_emb=None, emo_emb=None, pros_emb=None):
+    def forward(self, x, x_lengths, lang_emb=None, emo_emb=None, pros_emb=None, pitch_emb=None):
         """
         Shapes:
             - x: :math:`[B, T]`
@@ -105,6 +109,9 @@ class TextEncoder(nn.Module):
         if pros_emb is not None:
             x = torch.cat((x, pros_emb.transpose(2, 1).expand(x.size(0), x.size(1), -1)), dim=-1)
 
+        if pitch_emb is not None:
+            x = torch.cat((x, pitch_emb.transpose(2, 1)), dim=-1)
+
         x = torch.transpose(x, 1, -1)  # [b, h, t]
         x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)  # [b, 1, t]
 
diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py
index c54850c6..eb2e9976 100644
--- a/TTS/tts/models/vits.py
+++ b/TTS/tts/models/vits.py
@@ -265,7 +265,9 @@ class VitsDataset(TTSDataset):
         self.pad_id = self.tokenizer.characters.pad_id
         self.model_args = model_args
         self.compute_pitch = compute_pitch
-
+        self.use_precomputed_alignments = model_args.use_precomputed_alignments
+        self.alignments_cache_path = model_args.alignments_cache_path
+
         if self.compute_pitch:
             self.f0_dataset = VITSF0Dataset(config, samples=self.samples, ap=self.ap,
                 cache_path=self.f0_cache_path, precompute_num_workers=self.precompute_num_workers
@@ -289,6 +291,11 @@ class VitsDataset(TTSDataset):
         if self.compute_pitch:
             f0 = self.get_f0(idx)["f0"]
 
+        alignments = None
+        if self.use_precomputed_alignments:
+            align_file = os.path.join(self.alignments_cache_path, os.path.splitext(wav_filename)[0] + ".npy")
+            alignments = self.get_attn_mask(align_file)
+
         # after phonemization the text length may change
         # this is a shameful 🤭 hack to prevent longer phonemes
         # TODO: find a better fix
@@ -305,6 +312,8 @@ class VitsDataset(TTSDataset):
             "speaker_name": item["speaker_name"],
             "language_name": item["language"],
             "pitch": f0,
+            "alignments": alignments,
+
         }
 
     @property
@@ -365,6 +374,18 @@ class VitsDataset(TTSDataset):
             pitch = torch.FloatTensor(pitch)[:, None, :].contiguous()  # B x 1 xT
         else:
             pitch = None
+
+        padded_alignments = None
+        if self.use_precomputed_alignments:
+            alignments = batch["alignments"]
+            max_len_1 = max((x.shape[0] for x in alignments))
+            max_len_2 = max((x.shape[1] for x in alignments))
+            padded_alignments = []
+            for x in alignments:
+                padded_alignment = np.pad(x, ((0, max_len_1 - x.shape[0]), (0, max_len_2 - x.shape[1])), mode="constant", constant_values=0)
+                padded_alignments.append(padded_alignment)
+
+            padded_alignments = torch.FloatTensor(np.stack(padded_alignments)).unsqueeze(1)
 
         return {
             "tokens": token_padded,
@@ -378,6 +399,7 @@ class VitsDataset(TTSDataset):
             "language_names": batch["language_name"],
             "audio_files": batch["wav_file"],
             "raw_text": batch["raw_text"],
+            "alignments": padded_alignments,
         }
 
 
@@ -385,7 +407,6 @@ class VitsDataset(TTSDataset):
 # MODEL DEFINITION
 ##############################
 
-
 @dataclass
 class VitsArgs(Coqpit):
     """VITS model arguments.
@@ -664,6 +685,9 @@ class VitsArgs(Coqpit):
     pitch_predictor_dropout_p: float = 0.1
     pitch_embedding_kernel_size: int = 3
     detach_pp_input: bool = False
+    use_precomputed_alignments: bool = False
+    alignments_cache_path: str = ""
+    pitch_embedding_dim: int = 0
     detach_dp_input: bool = True
     use_language_embedding: bool = False
 
@@ -751,6 +775,7 @@ class Vits(BaseTTS):
             language_emb_dim=self.embedded_language_dim,
             emotion_emb_dim=self.args.emotion_embedding_dim if not self.args.use_noise_scale_predictor else 0,
             prosody_emb_dim=self.args.prosody_embedding_dim if not self.args.use_noise_scale_predictor else 0,
+            pitch_dim=self.args.pitch_embedding_dim if self.args.use_pitch and self.args.use_pitch_on_enc_input else 0,
         )
 
         self.posterior_encoder = PosteriorEncoder(
@@ -791,6 +816,9 @@ class Vits(BaseTTS):
         if self.args.use_prosody_encoder and not self.args.use_noise_scale_predictor:
             dp_extra_inp_dim += self.args.prosody_embedding_dim
 
+        if self.args.use_pitch and self.args.use_pitch_on_enc_input:
+            dp_extra_inp_dim += self.args.pitch_embedding_dim
+
         if self.args.use_sdp:
             self.duration_predictor = StochasticDurationPredictor(
                 self.args.hidden_channels + dp_extra_inp_dim,
@@ -814,13 +842,13 @@ class Vits(BaseTTS):
         if self.args.use_pitch:
             if self.args.use_pitch_on_enc_input:
                 self.pitch_predictor_vocab_emb = nn.Embedding(self.args.num_chars, self.args.hidden_channels)
-            else:
-                self.pitch_emb = nn.Conv1d(
-                    1,
-                    self.args.hidden_channels,
-                    kernel_size=self.args.pitch_predictor_kernel_size,
-                    padding=int((self.args.pitch_predictor_kernel_size - 1) / 2),
-                )
+
+            self.pitch_emb = nn.Conv1d(
+                1,
+                self.args.hidden_channels if not self.args.use_pitch_on_enc_input else self.args.pitch_embedding_dim,
+                kernel_size=self.args.pitch_predictor_kernel_size,
+                padding=int((self.args.pitch_predictor_kernel_size - 1) / 2),
+            )
             self.pitch_predictor = DurationPredictor(
                 self.args.hidden_channels,
                 self.args.pitch_predictor_hidden_channels,
@@ -1241,17 +1269,16 @@ class Vits(BaseTTS):
         )
 
         pitch_loss = None
-        gt_avg_pitch = None
+        pred_avg_pitch_emb = None
+        gt_avg_pitch_emb = None
         if pitch is not None:
             gt_avg_pitch = average_over_durations(pitch, dr.squeeze()).detach()
             pitch_loss = torch.sum(torch.sum((gt_avg_pitch - pred_avg_pitch) ** 2, [1, 2]) / torch.sum(x_mask))
-            if not self.args.use_pitch_on_enc_input:
-                gt_agv_pitch = self.pitch_emb(gt_avg_pitch)
+            gt_avg_pitch_emb = self.pitch_emb(gt_avg_pitch)
         else:
-            if not self.args.use_pitch_on_enc_input:
-                pred_avg_pitch = self.pitch_emb(pred_avg_pitch)
+            pred_avg_pitch_emb = self.pitch_emb(pred_avg_pitch)
 
-        return pitch_loss, gt_agv_pitch, pred_avg_pitch
+        return pitch_loss, gt_avg_pitch_emb, pred_avg_pitch_emb
 
     def forward_mas(self, outputs, z_p, m_p, logs_p, x, x_mask, y_mask, g, lang_emb):
         # find the alignment path
@@ -1313,6 +1340,7 @@ class Vits(BaseTTS):
         y_lengths: torch.tensor,
         waveform: torch.tensor,
         pitch: torch.tensor,
+        alignments: torch.tensor,
         aux_input={
             "d_vectors": None,
             "speaker_ids": None,
@@ -1389,6 +1417,21 @@ class Vits(BaseTTS):
             if self.args.use_speaker_embedding or self.args.use_d_vector_file:
                 g = F.normalize(self.speaker_embedding_squeezer(g.squeeze(-1))).unsqueeze(-1)
 
+        # duration predictor
+        g_dp = g if self.args.condition_dp_on_speaker else None
+        if eg is not None and (self.args.use_emotion_embedding or self.args.use_external_emotions_embeddings):
+            if g_dp is None:
+                g_dp = eg
+            else:
+                g_dp = torch.cat([g_dp, eg], dim=1)  # [b, h1+h2, 1]
+
+        pitch_loss = None
+        gt_avg_pitch_emb = None
+        if self.args.use_pitch and self.args.use_pitch_on_enc_input:
+            if alignments is None:
+                raise RuntimeError(" [!] To condition the pitch on the text encoder you need to provide external alignments!")
+            pitch_loss, gt_avg_pitch_emb, _ = self.forward_pitch_predictor(x, x_lengths, pitch, alignments.sum(3), g_dp)
+
         # posterior encoder
         z, m_q, logs_q, y_mask = self.posterior_encoder(y, y_lengths, g=g)
 
@@ -1418,13 +1461,14 @@ class Vits(BaseTTS):
                 _, l_pros_speaker = self.speaker_reversal_classifier(pros_emb.transpose(1, 2), sid, x_mask=None)
             if self.args.use_prosody_enc_emo_classifier:
                 _, l_pros_emotion = self.pros_enc_emotion_classifier(pros_emb.transpose(1, 2), eid, x_mask=None)
-        x_input = x
+
         x, m_p, logs_p, x_mask = self.text_encoder(
             x,
             x_lengths,
             lang_emb=lang_emb,
             emo_emb=eg if not self.args.use_noise_scale_predictor else None,
             pros_emb=pros_emb if not self.args.use_noise_scale_predictor else None,
+            pitch_emb=gt_avg_pitch_emb if self.args.use_pitch and self.args.use_pitch_on_enc_input else None,
         )
 
         # reversal speaker loss to force the encoder to be speaker identity free
@@ -1437,14 +1481,6 @@ class Vits(BaseTTS):
         if self.args.use_text_enc_emo_classifier:
             _, l_text_emotion = self.emo_text_enc_classifier(m_p.transpose(1, 2), eid, x_mask=x_mask)
 
-        # duration predictor
-        g_dp = g if self.args.condition_dp_on_speaker else None
-        if eg is not None and (self.args.use_emotion_embedding or self.args.use_external_emotions_embeddings):
-            if g_dp is None:
-                g_dp = eg
-            else:
-                g_dp = torch.cat([g_dp, eg], dim=1)  # [b, h1+h2, 1]
-
         if self.args.use_prosody_encoder:
             if g_dp is None:
                 g_dp = pros_emb
@@ -1453,7 +1489,6 @@ class Vits(BaseTTS):
 
         outputs, attn = self.forward_mas(outputs, z_p, m_p, logs_p, x, x_mask, y_mask, g=g_dp, lang_emb=lang_emb)
 
-        pitch_loss = None
         if self.args.use_pitch and not self.args.use_pitch_on_enc_input:
             pitch_loss, gt_avg_pitch_emb, _ = self.forward_pitch_predictor(m_p, x_lengths, pitch, attn.sum(3), g_dp)
             m_p = m_p + gt_avg_pitch_emb
@@ -1631,14 +1666,6 @@ class Vits(BaseTTS):
 
             pros_emb = pros_emb.transpose(1, 2)
 
-        x, m_p, logs_p, x_mask = self.text_encoder(
-            x,
-            x_lengths,
-            lang_emb=lang_emb,
-            emo_emb=eg if not self.args.use_noise_scale_predictor else None,
-            pros_emb=pros_emb if not self.args.use_noise_scale_predictor else None,
-        )
-
         # duration predictor
         g_dp = g if self.args.condition_dp_on_speaker else None
         if eg is not None and (self.args.use_emotion_embedding or self.args.use_external_emotions_embeddings):
@@ -1647,6 +1674,19 @@ class Vits(BaseTTS):
             else:
                 g_dp = torch.cat([g_dp, eg], dim=1)  # [b, h1+h2, 1]
 
+        pred_avg_pitch_emb = None
+        if self.args.use_pitch and self.args.use_pitch_on_enc_input:
+            _, _, pred_avg_pitch_emb = self.forward_pitch_predictor(x, x_lengths, g_pp=g_dp)
+
+        x, m_p, logs_p, x_mask = self.text_encoder(
+            x,
+            x_lengths,
+            lang_emb=lang_emb,
+            emo_emb=eg if not self.args.use_noise_scale_predictor else None,
+            pros_emb=pros_emb if not self.args.use_noise_scale_predictor else None,
+            pitch_emb=pred_avg_pitch_emb if self.args.use_pitch and self.args.use_pitch_on_enc_input else None,
+        )
+
         if self.args.use_prosody_encoder:
             if g_dp is None:
                 g_dp = pros_emb
@@ -1673,7 +1713,6 @@ class Vits(BaseTTS):
         attn_mask = x_mask * y_mask.transpose(1, 2)  # [B, 1, T_enc] * [B, T_dec, 1]
         attn = generate_path(w_ceil.squeeze(1), attn_mask.squeeze(1).transpose(1, 2))
 
-        pred_avg_pitch_emb = None
         if self.args.use_pitch and not self.args.use_pitch_on_enc_input:
             _, _, pred_avg_pitch_emb = self.forward_pitch_predictor(m_p, x_lengths, g_pp=g_dp)
             m_p = m_p + pred_avg_pitch_emb
@@ -1819,6 +1858,7 @@ class Vits(BaseTTS):
         emotion_ids = batch["emotion_ids"]
         waveform = batch["waveform"]
         pitch = batch["pitch"]
+        alignments = batch["alignments"]
 
         # generator pass
         outputs = self.forward(
@@ -1828,6 +1868,7 @@ class Vits(BaseTTS):
             spec_lens,
             waveform,
             pitch,
+            alignments,
             aux_input={
                 "d_vectors": d_vectors,
                 "speaker_ids": speaker_ids,
diff --git a/tests/data/ljspeech/mas_alignments/alignments/LJ001-0001.npy b/tests/data/ljspeech/mas_alignments/alignments/LJ001-0001.npy
new file mode 100644
index 0000000000000000000000000000000000000000..c2b961bb626f39bc02f4cc9eda9d4319d9a0da55
Binary files /dev/null and b/tests/data/ljspeech/mas_alignments/alignments/LJ001-0001.npy differ
diff --git a/tests/data/ljspeech/mas_alignments/alignments/LJ001-0002.npy b/tests/data/ljspeech/mas_alignments/alignments/LJ001-0002.npy
new file mode 100644
index 0000000000000000000000000000000000000000..0a227d871c2306ce097ad8a989b1b52b430966fd
Binary files /dev/null and b/tests/data/ljspeech/mas_alignments/alignments/LJ001-0002.npy differ
diff --git a/tests/data/ljspeech/mas_alignments/alignments/LJ001-0003.npy b/tests/data/ljspeech/mas_alignments/alignments/LJ001-0003.npy
new file mode 100644
index 0000000000000000000000000000000000000000..d267ffac1161fed5a7f0a13b97655ea7eb3a1c9b
Binary files /dev/null and b/tests/data/ljspeech/mas_alignments/alignments/LJ001-0003.npy differ
diff --git a/tests/data/ljspeech/mas_alignments/alignments/LJ001-0004.npy b/tests/data/ljspeech/mas_alignments/alignments/LJ001-0004.npy
new file mode 100644
index 0000000000000000000000000000000000000000..6becf176fdd74f4ddc0715cd8c1ac158f7badedc
Binary files /dev/null and b/tests/data/ljspeech/mas_alignments/alignments/LJ001-0004.npy differ
diff --git a/tests/data/ljspeech/mas_alignments/alignments/LJ001-0005.npy b/tests/data/ljspeech/mas_alignments/alignments/LJ001-0005.npy
new file mode 100644
index 0000000000000000000000000000000000000000..99d27e860d0af023215c282eb69e669898c5e4fe
Binary files /dev/null and b/tests/data/ljspeech/mas_alignments/alignments/LJ001-0005.npy differ
diff --git a/tests/data/ljspeech/mas_alignments/alignments/LJ001-0006.npy b/tests/data/ljspeech/mas_alignments/alignments/LJ001-0006.npy
new file mode 100644
index 0000000000000000000000000000000000000000..16da9744b486fe3b49a5adda3375d1ad73ac87e3
Binary files /dev/null and b/tests/data/ljspeech/mas_alignments/alignments/LJ001-0006.npy differ
diff --git a/tests/data/ljspeech/mas_alignments/alignments/LJ001-0007.npy b/tests/data/ljspeech/mas_alignments/alignments/LJ001-0007.npy
new file mode 100644
index 0000000000000000000000000000000000000000..89899435cb8b494e4fbe62d7149adf69d2089d64
Binary files /dev/null and b/tests/data/ljspeech/mas_alignments/alignments/LJ001-0007.npy differ
diff --git a/tests/data/ljspeech/mas_alignments/alignments/LJ001-0008.npy b/tests/data/ljspeech/mas_alignments/alignments/LJ001-0008.npy
new file mode 100644
index 0000000000000000000000000000000000000000..b545ad08dadf18f10d8e853a73b821312ba30a46
Binary files /dev/null and b/tests/data/ljspeech/mas_alignments/alignments/LJ001-0008.npy differ
diff --git a/tests/tts_tests/test_vits_speaker_emb_with_prosody_encoder_with_pitch_predictor.py b/tests/tts_tests/test_vits_speaker_emb_with_pitch_predictor.py
similarity index 89%
rename from tests/tts_tests/test_vits_speaker_emb_with_prosody_encoder_with_pitch_predictor.py
rename to tests/tts_tests/test_vits_speaker_emb_with_pitch_predictor.py
index 30d9f0f6..f3fe2bd5 100644
--- a/tests/tts_tests/test_vits_speaker_emb_with_prosody_encoder_with_pitch_predictor.py
+++ b/tests/tts_tests/test_vits_speaker_emb_with_pitch_predictor.py
@@ -28,7 +28,7 @@ config = VitsConfig(
     compute_pitch=True,
     f0_cache_path="tests/data/ljspeech/f0_cache/",
     test_sentences=[
-        ["Be a voice, not an echo.", "ljspeech-1", "tests/data/ljspeech/wavs/LJ001-0001.wav", None, None],
+        ["Be a voice, not an echo.", "ljspeech-1", None, None, None],
     ],
 )
 # set audio config
@@ -42,17 +42,18 @@ config.model_args.d_vector_file = "tests/data/ljspeech/speakers.json"
 config.model_args.speaker_embedding_channels = 128
 config.model_args.d_vector_dim = 128
 
-# prosody embedding
-config.model_args.use_prosody_encoder = True
-config.model_args.prosody_embedding_dim = 64
+
+config.model_args.use_precomputed_alignments = True
+config.model_args.alignments_cache_path = "tests/data/ljspeech/mas_alignments/alignments/"
 
 # pitch predictor
 config.model_args.use_pitch = True
+config.model_args.use_pitch_on_enc_input = True
+config.model_args.pitch_embedding_dim = 2
 config.model_args.condition_dp_on_speaker = True
 
 config.save_json(config_path)
 
-
 # train the model for one epoch
 command_train = (
     f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} "
@@ -74,11 +75,9 @@ continue_config_path = os.path.join(continue_path, "config.json")
 continue_restore_path, _ = get_last_checkpoint(continue_path)
 out_wav_path = os.path.join(get_tests_output_path(), "output.wav")
 speaker_id = "ljspeech-1"
-style_wav_path = "tests/data/ljspeech/wavs/LJ001-0001.wav"
 continue_speakers_path = os.path.join(continue_path, "speakers.json")
 
-
-inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path} --gst_style {style_wav_path}"
+inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path} "
 run_cli(inference_command)
 
 # restore the model and continue training for one more epoch