diff --git a/TTS/bin/extract_tts_spectrograms.py b/TTS/bin/extract_tts_spectrograms.py index 64abc719..5137d48a 100755 --- a/TTS/bin/extract_tts_spectrograms.py +++ b/TTS/bin/extract_tts_spectrograms.py @@ -108,9 +108,8 @@ def format_data(data): mel_lengths = mel_lengths.cuda(non_blocking=True) if speaker_ids is not None: speaker_ids = speaker_ids.cuda(non_blocking=True) - if speaker_embeddings is not None: - speaker_embeddings = speaker_embeddings.cuda(non_blocking=True) - + if d_vectors is not None: + d_vectors = d_vectors.cuda(non_blocking=True) if attn_mask is not None: attn_mask = attn_mask.cuda(non_blocking=True) return ( @@ -119,7 +118,7 @@ def format_data(data): mel_input, mel_lengths, speaker_ids, - speaker_embeddings, + d_vectors, avg_text_length, avg_spec_length, attn_mask, @@ -137,23 +136,23 @@ def inference( mel_input, mel_lengths, speaker_ids=None, - speaker_embeddings=None, + d_vectors=None, ): if model_name == "glow_tts": speaker_c = None if speaker_ids is not None: speaker_c = speaker_ids - elif speaker_embeddings is not None: - speaker_c = speaker_embeddings + elif d_vectors is not None: + speaker_c = d_vectors outputs = model.inference_with_MAS( - text_input, text_lengths, mel_input, mel_lengths, cond_input={"x_vectors": speaker_c} + text_input, text_lengths, mel_input, mel_lengths, cond_input={"d_vectors": speaker_c} ) model_output = outputs["model_outputs"] model_output = model_output.transpose(1, 2).detach().cpu().numpy() elif "tacotron" in model_name: - cond_input = {"speaker_ids": speaker_ids, "x_vectors": speaker_embeddings} + cond_input = {"speaker_ids": speaker_ids, "d_vectors": d_vectors} outputs = model(text_input, text_lengths, mel_input, mel_lengths, cond_input) postnet_outputs = outputs["model_outputs"] # normalize tacotron output @@ -184,7 +183,7 @@ def extract_spectrograms( mel_input, mel_lengths, speaker_ids, - speaker_embeddings, + d_vectors, _, _, _, @@ -200,7 +199,7 @@ def extract_spectrograms( mel_input, mel_lengths, speaker_ids, - speaker_embeddings, + d_vectors, ) for idx in range(text_input.shape[0]): @@ -256,7 +255,7 @@ def main(args): # pylint: disable=redefined-outer-name speaker_manager = get_speaker_manager(c, args, meta_data_train) # setup model - model = setup_model(num_chars, speaker_manager.num_speakers, c, speaker_embedding_dim=speaker_manager.x_vector_dim) + model = setup_model(num_chars, speaker_manager.num_speakers, c, d_vector_dim=speaker_manager.d_vector_dim) # restore model checkpoint = torch.load(args.checkpoint_path, map_location="cpu") diff --git a/TTS/bin/synthesize.py b/TTS/bin/synthesize.py index a5066e3d..3cde5612 100755 --- a/TTS/bin/synthesize.py +++ b/TTS/bin/synthesize.py @@ -157,7 +157,7 @@ def main(): parser.add_argument( "--speaker_wav", nargs="+", - help="wav file(s) to condition a multi-speaker TTS model with a Speaker Encoder. You can give multiple file paths. The x_vectors is computed as their average.", + help="wav file(s) to condition a multi-speaker TTS model with a Speaker Encoder. You can give multiple file paths. 
The d_vectors is computed as their average.", default=None, ) parser.add_argument("--gst_style", help="Wav path file for GST stylereference.", default=None) diff --git a/TTS/trainer.py b/TTS/trainer.py index 8ec59f55..55560624 100644 --- a/TTS/trainer.py +++ b/TTS/trainer.py @@ -113,7 +113,7 @@ class TrainerTTS: len(self.model_characters), self.speaker_manager.num_speakers, self.config, - self.speaker_manager.x_vector_dim if self.speaker_manager.x_vectors else None, + self.speaker_manager.d_vector_dim if self.speaker_manager.d_vectors else None, ) # setup criterion @@ -156,8 +156,8 @@ class TrainerTTS: print("\n > Model has {} parameters".format(num_params)) @staticmethod - def get_model(num_chars: int, num_speakers: int, config: Coqpit, x_vector_dim: int) -> nn.Module: - model = setup_model(num_chars, num_speakers, config, x_vector_dim) + def get_model(num_chars: int, num_speakers: int, config: Coqpit, d_vector_dim: int) -> nn.Module: + model = setup_model(num_chars, num_speakers, config, d_vector_dim) return model @staticmethod @@ -196,11 +196,11 @@ class TrainerTTS: speakers_file = config.external_speaker_embedding_file if config.use_external_speaker_embedding_file: - speaker_manager.load_x_vectors_file(speakers_file) + speaker_manager.load_d_vectors_file(speakers_file) else: speaker_manager.load_ids_file(speakers_file) elif config.use_external_speaker_embedding_file and config.external_speaker_embedding_file: - speaker_manager.load_x_vectors_file(config.external_speaker_embedding_file) + speaker_manager.load_d_vectors_file(config.external_speaker_embedding_file) else: speaker_manager.parse_speakers_from_items(data_train) file_path = os.path.join(out_path, "speakers.json") @@ -387,8 +387,8 @@ class TrainerTTS: durations = to_cuda(durations) if attn_mask is not None else None if speaker_ids is not None: speaker_ids = to_cuda(speaker_ids) - if speaker_embeddings is not None: - speaker_embeddings = to_cuda(speaker_embeddings) + if d_vectors is not None: + d_vectors = to_cuda(d_vectors) return { "text_input": text_input, @@ -400,7 +400,7 @@ class TrainerTTS: "attn_mask": attn_mask, "durations": durations, "speaker_ids": speaker_ids, - "x_vectors": speaker_embeddings, + "d_vectors": d_vectors, "max_text_length": max_text_length, "max_spec_length": max_spec_length, "item_idx": item_idx, @@ -591,7 +591,7 @@ class TrainerTTS: self.use_cuda, self.ap, speaker_id=cond_inputs["speaker_id"], - x_vector=cond_inputs["x_vector"], + d_vector=cond_inputs["d_vector"], style_wav=cond_inputs["style_wav"], enable_eos_bos_chars=self.config.enable_eos_bos_chars, use_griffin_lim=True, @@ -612,9 +612,9 @@ class TrainerTTS: def _get_cond_inputs(self) -> Dict: # setup speaker_id speaker_id = 0 if self.config.use_speaker_embedding else None - # setup x_vector - x_vector = ( - self.speaker_manager.get_x_vectors_by_speaker(self.speaker_manager.speaker_ids[0]) + # setup d_vector + d_vector = ( + self.speaker_manager.get_d_vectors_by_speaker(self.speaker_manager.speaker_ids[0]) if self.config.use_external_speaker_embedding_file and self.config.use_speaker_embedding else None ) @@ -629,7 +629,7 @@ class TrainerTTS: print("WARNING: You don't provided a gst style wav, for this reason we use a zero tensor!") for i in range(self.config.gst["gst_num_style_tokens"]): style_wav[str(i)] = 0 - cond_inputs = {"speaker_id": speaker_id, "style_wav": style_wav, "x_vector": x_vector} + cond_inputs = {"speaker_id": speaker_id, "style_wav": style_wav, "d_vector": d_vector} return cond_inputs def fit(self) -> None: diff --git 
a/TTS/tts/layers/tacotron/gst_layers.py b/TTS/tts/layers/tacotron/gst_layers.py index e2784e5d..02154093 100644 --- a/TTS/tts/layers/tacotron/gst_layers.py +++ b/TTS/tts/layers/tacotron/gst_layers.py @@ -8,10 +8,10 @@ class GST(nn.Module): See https://arxiv.org/pdf/1803.09017""" - def __init__(self, num_mel, num_heads, num_style_tokens, gst_embedding_dim, speaker_embedding_dim=None): + def __init__(self, num_mel, num_heads, num_style_tokens, gst_embedding_dim, d_vector_dim=None): super().__init__() self.encoder = ReferenceEncoder(num_mel, gst_embedding_dim) - self.style_token_layer = StyleTokenLayer(num_heads, num_style_tokens, gst_embedding_dim, speaker_embedding_dim) + self.style_token_layer = StyleTokenLayer(num_heads, num_style_tokens, gst_embedding_dim, d_vector_dim) def forward(self, inputs, speaker_embedding=None): enc_out = self.encoder(inputs) @@ -83,13 +83,13 @@ class ReferenceEncoder(nn.Module): class StyleTokenLayer(nn.Module): """NN Module attending to style tokens based on prosody encodings.""" - def __init__(self, num_heads, num_style_tokens, embedding_dim, speaker_embedding_dim=None): + def __init__(self, num_heads, num_style_tokens, embedding_dim, d_vector_dim=None): super().__init__() self.query_dim = embedding_dim // 2 - if speaker_embedding_dim: - self.query_dim += speaker_embedding_dim + if d_vector_dim: + self.query_dim += d_vector_dim self.key_dim = embedding_dim // num_heads self.style_tokens = nn.Parameter(torch.FloatTensor(num_style_tokens, self.key_dim)) diff --git a/TTS/tts/layers/tacotron/tacotron.py b/TTS/tts/layers/tacotron/tacotron.py index dc38173f..2f94db88 100644 --- a/TTS/tts/layers/tacotron/tacotron.py +++ b/TTS/tts/layers/tacotron/tacotron.py @@ -266,7 +266,7 @@ class Decoder(nn.Module): location_attn (bool): if true, use location sensitive attention. attn_K (int): number of attention heads for GravesAttention. separate_stopnet (bool): if true, detach stopnet input to prevent gradient flow. - speaker_embedding_dim (int): size of speaker embedding vector, for multi-speaker training. + d_vector_dim (int): size of speaker embedding vector, for multi-speaker training. 
""" # Pylint gets confused by PyTorch conventions here diff --git a/TTS/tts/models/__init__.py b/TTS/tts/models/__init__.py index 153f8d43..026f5c85 100644 --- a/TTS/tts/models/__init__.py +++ b/TTS/tts/models/__init__.py @@ -1,7 +1,7 @@ from TTS.utils.generic_utils import find_module -def setup_model(num_chars, num_speakers, c, speaker_embedding_dim=None): +def setup_model(num_chars, num_speakers, c, d_vector_dim=None): print(" > Using model: {}".format(c.model)) MyModel = find_module("TTS.tts.models", c.model.lower()) if c.model.lower() in "tacotron": @@ -29,7 +29,7 @@ def setup_model(num_chars, num_speakers, c, speaker_embedding_dim=None): bidirectional_decoder=c.bidirectional_decoder, double_decoder_consistency=c.double_decoder_consistency, ddc_r=c.ddc_r, - speaker_embedding_dim=speaker_embedding_dim, + d_vector_dim=d_vector_dim, ) elif c.model.lower() == "tacotron2": model = MyModel( @@ -55,7 +55,7 @@ def setup_model(num_chars, num_speakers, c, speaker_embedding_dim=None): bidirectional_decoder=c.bidirectional_decoder, double_decoder_consistency=c.double_decoder_consistency, ddc_r=c.ddc_r, - speaker_embedding_dim=speaker_embedding_dim, + d_vector_dim=d_vector_dim, ) elif c.model.lower() == "glow_tts": model = MyModel( @@ -79,7 +79,7 @@ def setup_model(num_chars, num_speakers, c, speaker_embedding_dim=None): num_squeeze=2, sigmoid_scale=False, mean_only=True, - speaker_embedding_dim=speaker_embedding_dim, + d_vector_dim=d_vector_dim, ) elif c.model.lower() == "speedy_speech": model = MyModel( diff --git a/TTS/tts/models/align_tts.py b/TTS/tts/models/align_tts.py index 3e8d4adc..20b0cdf7 100644 --- a/TTS/tts/models/align_tts.py +++ b/TTS/tts/models/align_tts.py @@ -212,7 +212,7 @@ class AlignTTS(nn.Module): return dr_mas, mu, log_sigma, logp def forward( - self, x, x_lengths, y, y_lengths, cond_input={"x_vectors": None}, phase=None + self, x, x_lengths, y, y_lengths, cond_input={"d_vectors": None}, phase=None ): # pylint: disable=unused-argument """ Shapes: @@ -223,7 +223,7 @@ class AlignTTS(nn.Module): g: [B, C] """ y = y.transpose(1, 2) - g = cond_input["x_vectors"] if "x_vectors" in cond_input else None + g = cond_input["d_vectors"] if "d_vectors" in cond_input else None o_de, o_dr_log, dr_mas_log, attn, mu, log_sigma, logp = None, None, None, None, None, None, None if phase == 0: # train encoder and MDN @@ -267,14 +267,14 @@ class AlignTTS(nn.Module): return outputs @torch.no_grad() - def inference(self, x, cond_input={"x_vectors": None}): # pylint: disable=unused-argument + def inference(self, x, cond_input={"d_vectors": None}): # pylint: disable=unused-argument """ Shapes: x: [B, T_max] x_lengths: [B] g: [B, C] """ - g = cond_input["x_vectors"] if "x_vectors" in cond_input else None + g = cond_input["d_vectors"] if "d_vectors" in cond_input else None x_lengths = torch.tensor(x.shape[1:2]).to(x.device) # pad input to prevent dropping the last word # x = torch.nn.functional.pad(x, pad=(0, 5), mode='constant', value=0) @@ -293,10 +293,10 @@ class AlignTTS(nn.Module): text_lengths = batch["text_lengths"] mel_input = batch["mel_input"] mel_lengths = batch["mel_lengths"] - x_vectors = batch["x_vectors"] + d_vectors = batch["d_vectors"] speaker_ids = batch["speaker_ids"] - cond_input = {"x_vectors": x_vectors, "speaker_ids": speaker_ids} + cond_input = {"d_vectors": d_vectors, "speaker_ids": speaker_ids} outputs = self.forward(text_input, text_lengths, mel_input, mel_lengths, cond_input, self.phase) loss_dict = criterion( outputs["logp"], diff --git a/TTS/tts/models/glow_tts.py 
b/TTS/tts/models/glow_tts.py index af52ba1c..9c928a67 100755 --- a/TTS/tts/models/glow_tts.py +++ b/TTS/tts/models/glow_tts.py @@ -36,7 +36,7 @@ class GlowTTS(nn.Module): mean_only (bool): if True, encoder only computes mean value and uses constant variance for each time step. encoder_type (str): encoder module type. encoder_params (dict): encoder module parameters. - speaker_embedding_dim (int): channels of external speaker embedding vectors. + d_vector_dim (int): channels of external speaker embedding vectors. """ def __init__( @@ -62,7 +62,7 @@ class GlowTTS(nn.Module): mean_only=False, encoder_type="transformer", encoder_params=None, - speaker_embedding_dim=None, + d_vector_dim=None, ): super().__init__() @@ -88,15 +88,15 @@ class GlowTTS(nn.Module): # model constants. self.noise_scale = 0.33 # defines the noise variance applied to the random z vector at inference. self.length_scale = 1.0 # scaler for the duration predictor. The larger it is, the slower the speech. - self.speaker_embedding_dim = speaker_embedding_dim + self.d_vector_dim = d_vector_dim # if is a multispeaker and c_in_channels is 0, set to 256 if num_speakers > 1: - if self.c_in_channels == 0 and not self.speaker_embedding_dim: + if self.c_in_channels == 0 and not self.d_vector_dim: # TODO: make this adjustable self.c_in_channels = 256 - elif self.speaker_embedding_dim: - self.c_in_channels = self.speaker_embedding_dim + elif self.d_vector_dim: + self.c_in_channels = self.d_vector_dim self.encoder = Encoder( num_chars, @@ -125,7 +125,7 @@ class GlowTTS(nn.Module): c_in_channels=self.c_in_channels, ) - if num_speakers > 1 and not speaker_embedding_dim: + if num_speakers > 1 and not d_vector_dim: # speaker embedding layer self.emb_g = nn.Embedding(num_speakers, self.c_in_channels) nn.init.uniform_(self.emb_g.weight, -0.1, 0.1) @@ -144,7 +144,7 @@ class GlowTTS(nn.Module): return y_mean, y_log_scale, o_attn_dur def forward( - self, x, x_lengths, y, y_lengths=None, cond_input={"x_vectors": None} + self, x, x_lengths, y, y_lengths=None, cond_input={"d_vectors": None} ): # pylint: disable=dangerous-default-value """ Shapes: @@ -157,9 +157,9 @@ class GlowTTS(nn.Module): y = y.transpose(1, 2) y_max_length = y.size(2) # norm speaker embeddings - g = cond_input["x_vectors"] if cond_input is not None and "x_vectors" in cond_input else None + g = cond_input["d_vectors"] if cond_input is not None and "d_vectors" in cond_input else None if g is not None: - if self.speaker_embedding_dim: + if self.d_vector_dim: g = F.normalize(g).unsqueeze(-1) else: g = F.normalize(self.emb_g(g)).unsqueeze(-1) # [b, h, 1] @@ -197,7 +197,7 @@ class GlowTTS(nn.Module): @torch.no_grad() def inference_with_MAS( - self, x, x_lengths, y=None, y_lengths=None, cond_input={"x_vectors": None} + self, x, x_lengths, y=None, y_lengths=None, cond_input={"d_vectors": None} ): # pylint: disable=dangerous-default-value """ It's similar to the teacher forcing in Tacotron. 
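Aside on the renamed conditioning input used throughout glow_tts.py: a minimal, illustrative sketch (not taken from the patch) of passing an external d-vector at inference time. `model` is assumed to be an already constructed GlowTTS built with `d_vector_dim=256`; the output key follows the `model_outputs` convention used by the other models in this patch.

```python
import torch

text = torch.randint(0, 50, (1, 30))   # [B, T_max] dummy token ids
text_lengths = torch.tensor([30])      # [B]
d_vector = torch.rand(1, 256)          # [B, C] externally computed speaker embedding

# `model` is assumed: a GlowTTS instance constructed with d_vector_dim=256
outputs = model.inference(text, text_lengths, cond_input={"d_vectors": d_vector})
mel = outputs["model_outputs"]         # [B, T, num_mels]
```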
@@ -212,9 +212,9 @@ class GlowTTS(nn.Module): y = y.transpose(1, 2) y_max_length = y.size(2) # norm speaker embeddings - g = cond_input["x_vectors"] if cond_input is not None and "x_vectors" in cond_input else None + g = cond_input["d_vectors"] if cond_input is not None and "d_vectors" in cond_input else None if g is not None: - if self.external_speaker_embedding_dim: + if self.external_d_vector_dim: g = F.normalize(g).unsqueeze(-1) else: g = F.normalize(self.emb_g(g)).unsqueeze(-1) # [b, h, 1] @@ -258,7 +258,7 @@ class GlowTTS(nn.Module): @torch.no_grad() def decoder_inference( - self, y, y_lengths=None, cond_input={"x_vectors": None} + self, y, y_lengths=None, cond_input={"d_vectors": None} ): # pylint: disable=dangerous-default-value """ Shapes: @@ -268,10 +268,10 @@ class GlowTTS(nn.Module): """ y = y.transpose(1, 2) y_max_length = y.size(2) - g = cond_input["x_vectors"] if cond_input is not None and "x_vectors" in cond_input else None + g = cond_input["d_vectors"] if cond_input is not None and "d_vectors" in cond_input else None # norm speaker embeddings if g is not None: - if self.external_speaker_embedding_dim: + if self.external_d_vector_dim: g = F.normalize(g).unsqueeze(-1) else: g = F.normalize(self.emb_g(g)).unsqueeze(-1) # [b, h, 1] @@ -290,10 +290,10 @@ class GlowTTS(nn.Module): return outputs @torch.no_grad() - def inference(self, x, x_lengths, cond_input={"x_vectors": None}): # pylint: disable=dangerous-default-value - g = cond_input["x_vectors"] if cond_input is not None and "x_vectors" in cond_input else None + def inference(self, x, x_lengths, cond_input={"d_vectors": None}): # pylint: disable=dangerous-default-value + g = cond_input["d_vectors"] if cond_input is not None and "d_vectors" in cond_input else None if g is not None: - if self.speaker_embedding_dim: + if self.d_vector_dim: g = F.normalize(g).unsqueeze(-1) else: g = F.normalize(self.emb_g(g)).unsqueeze(-1) # [b, h] @@ -338,9 +338,9 @@ class GlowTTS(nn.Module): text_lengths = batch["text_lengths"] mel_input = batch["mel_input"] mel_lengths = batch["mel_lengths"] - x_vectors = batch["x_vectors"] + d_vectors = batch["d_vectors"] - outputs = self.forward(text_input, text_lengths, mel_input, mel_lengths, cond_input={"x_vectors": x_vectors}) + outputs = self.forward(text_input, text_lengths, mel_input, mel_lengths, cond_input={"d_vectors": d_vectors}) loss_dict = criterion( outputs["model_outputs"], diff --git a/TTS/tts/models/speedy_speech.py b/TTS/tts/models/speedy_speech.py index 455dbf38..53f7bbaa 100644 --- a/TTS/tts/models/speedy_speech.py +++ b/TTS/tts/models/speedy_speech.py @@ -157,7 +157,7 @@ class SpeedySpeech(nn.Module): return o_de, attn.transpose(1, 2) def forward( - self, x, x_lengths, y_lengths, dr, cond_input={"x_vectors": None, "speaker_ids": None} + self, x, x_lengths, y_lengths, dr, cond_input={"d_vectors": None, "speaker_ids": None} ): # pylint: disable=unused-argument """ TODO: speaker embedding for speaker_ids @@ -168,21 +168,21 @@ class SpeedySpeech(nn.Module): dr: [B, T_max] g: [B, C] """ - g = cond_input["x_vectors"] if "x_vectors" in cond_input else None + g = cond_input["d_vectors"] if "d_vectors" in cond_input else None o_en, o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g) o_dr_log = self.duration_predictor(o_en_dp.detach(), x_mask) o_de, attn = self._forward_decoder(o_en, o_en_dp, dr, x_mask, y_lengths, g=g) outputs = {"model_outputs": o_de.transpose(1, 2), "durations_log": o_dr_log.squeeze(1), "alignments": attn} return outputs - def inference(self, x, 
cond_input={"x_vectors": None, "speaker_ids": None}): # pylint: disable=unused-argument + def inference(self, x, cond_input={"d_vectors": None, "speaker_ids": None}): # pylint: disable=unused-argument """ Shapes: x: [B, T_max] x_lengths: [B] g: [B, C] """ - g = cond_input["x_vectors"] if "x_vectors" in cond_input else None + g = cond_input["d_vectors"] if "d_vectors" in cond_input else None x_lengths = torch.tensor(x.shape[1:2]).to(x.device) # input sequence should be greated than the max convolution size inference_padding = 5 @@ -204,11 +204,11 @@ class SpeedySpeech(nn.Module): text_lengths = batch["text_lengths"] mel_input = batch["mel_input"] mel_lengths = batch["mel_lengths"] - x_vectors = batch["x_vectors"] + d_vectors = batch["d_vectors"] speaker_ids = batch["speaker_ids"] durations = batch["durations"] - cond_input = {"x_vectors": x_vectors, "speaker_ids": speaker_ids} + cond_input = {"d_vectors": d_vectors, "speaker_ids": speaker_ids} outputs = self.forward(text_input, text_lengths, mel_lengths, durations, cond_input) # compute loss diff --git a/TTS/tts/models/tacotron.py b/TTS/tts/models/tacotron.py index 12c3e5f9..123b69a7 100644 --- a/TTS/tts/models/tacotron.py +++ b/TTS/tts/models/tacotron.py @@ -42,7 +42,7 @@ class Tacotron(TacotronAbstract): ddc_r (int, optional): reduction rate for the coarse decoder of double decoder consistency. Defaults to None. encoder_in_features (int, optional): input channels for the encoder. Defaults to 512. decoder_in_features (int, optional): input channels for the decoder. Defaults to 512. - speaker_embedding_dim (int, optional): external speaker conditioning vector channels. Defaults to None. + d_vector_dim (int, optional): external speaker conditioning vector channels. Defaults to None. use_gst (bool, optional): enable/disable Global style token module. gst (Coqpit, optional): Coqpit to initialize the GST module. If `None`, GST is disabled. Defaults to None. memory_size (int, optional): size of the history queue fed to the prenet. 
Model feeds the last ```memory_size``` @@ -75,7 +75,7 @@ class Tacotron(TacotronAbstract): ddc_r=None, encoder_in_features=256, decoder_in_features=256, - speaker_embedding_dim=None, + d_vector_dim=None, use_gst=False, gst=None, memory_size=5, @@ -104,7 +104,7 @@ class Tacotron(TacotronAbstract): ddc_r, encoder_in_features, decoder_in_features, - speaker_embedding_dim, + d_vector_dim, use_gst, gst, gradual_training, @@ -112,14 +112,14 @@ class Tacotron(TacotronAbstract): # speaker embedding layers if self.num_speakers > 1: - if not self.embeddings_per_sample: - speaker_embedding_dim = 256 - self.speaker_embedding = nn.Embedding(self.num_speakers, speaker_embedding_dim) + if not self.use_d_vectors: + d_vector_dim = 256 + self.speaker_embedding = nn.Embedding(self.num_speakers, d_vector_dim) self.speaker_embedding.weight.data.normal_(0, 0.3) # speaker and gst embeddings is concat in decoder input if self.num_speakers > 1: - self.decoder_in_features += speaker_embedding_dim # add speaker embedding dim + self.decoder_in_features += d_vector_dim # add speaker embedding dim # embedding layer self.embedding = nn.Embedding(num_chars, 256, padding_idx=0) @@ -154,7 +154,7 @@ class Tacotron(TacotronAbstract): if self.gst and self.use_gst: self.gst_layer = GST( num_mel=decoder_output_dim, - speaker_embedding_dim=speaker_embedding_dim, + d_vector_dim=d_vector_dim, num_heads=gst.gst_num_heads, num_style_tokens=gst.gst_num_style_tokens, gst_embedding_dim=gst.gst_embedding_dim, @@ -189,7 +189,7 @@ class Tacotron(TacotronAbstract): text_lengths: [B] mel_specs: [B, T_out, C] mel_lengths: [B] - cond_input: 'speaker_ids': [B, 1] and 'x_vectors':[B, C] + cond_input: 'speaker_ids': [B, 1] and 'd_vectors':[B, C] """ outputs = {"alignments_backward": None, "decoder_outputs_backward": None} inputs = self.embedding(text) @@ -201,16 +201,16 @@ class Tacotron(TacotronAbstract): # global style token if self.gst and self.use_gst: # B x gst_dim - encoder_outputs = self.compute_gst(encoder_outputs, mel_specs, cond_input["x_vectors"]) + encoder_outputs = self.compute_gst(encoder_outputs, mel_specs, cond_input["d_vectors"]) # speaker embedding if self.num_speakers > 1: - if not self.embeddings_per_sample: + if not self.use_d_vectors: # B x 1 x speaker_embed_dim - speaker_embeddings = self.speaker_embedding(cond_input["speaker_ids"])[:, None] + embedded_speakers = self.speaker_embedding(cond_input["speaker_ids"])[:, None] else: # B x 1 x speaker_embed_dim - speaker_embeddings = torch.unsqueeze(cond_input["x_vectors"], 1) - encoder_outputs = self._concat_speaker_embedding(encoder_outputs, speaker_embeddings) + embedded_speakers = torch.unsqueeze(cond_input["d_vectors"], 1) + encoder_outputs = self._concat_speaker_embedding(encoder_outputs, embedded_speakers) # decoder_outputs: B x decoder_in_features x T_out # alignments: B x T_in x encoder_in_features # stop_tokens: B x T_in @@ -254,15 +254,15 @@ class Tacotron(TacotronAbstract): encoder_outputs = self.encoder(inputs) if self.gst and self.use_gst: # B x gst_dim - encoder_outputs = self.compute_gst(encoder_outputs, cond_input["style_mel"], cond_input["x_vectors"]) + encoder_outputs = self.compute_gst(encoder_outputs, cond_input["style_mel"], cond_input["d_vectors"]) if self.num_speakers > 1: - if not self.embeddings_per_sample: + if not self.use_d_vectors: # B x 1 x speaker_embed_dim - speaker_embeddings = self.speaker_embedding(cond_input["speaker_ids"])[:, None] + embedded_speakers = self.speaker_embedding(cond_input["speaker_ids"])[:, None] else: # B x 1 x 
speaker_embed_dim - speaker_embeddings = torch.unsqueeze(cond_input["x_vectors"], 1) - encoder_outputs = self._concat_speaker_embedding(encoder_outputs, speaker_embeddings) + embedded_speakers = torch.unsqueeze(cond_input["d_vectors"], 1) + encoder_outputs = self._concat_speaker_embedding(encoder_outputs, embedded_speakers) decoder_outputs, alignments, stop_tokens = self.decoder.inference(encoder_outputs) postnet_outputs = self.postnet(decoder_outputs) postnet_outputs = self.last_linear(postnet_outputs) @@ -289,7 +289,7 @@ class Tacotron(TacotronAbstract): linear_input = batch["linear_input"] stop_targets = batch["stop_targets"] speaker_ids = batch["speaker_ids"] - x_vectors = batch["x_vectors"] + d_vectors = batch["d_vectors"] # forward pass model outputs = self.forward( @@ -297,7 +297,7 @@ class Tacotron(TacotronAbstract): text_lengths, mel_input, mel_lengths, - cond_input={"speaker_ids": speaker_ids, "x_vectors": x_vectors}, + cond_input={"speaker_ids": speaker_ids, "d_vectors": d_vectors}, ) # set the [alignment] lengths wrt reduction factor for guided attention @@ -308,7 +308,7 @@ class Tacotron(TacotronAbstract): else: alignment_lengths = mel_lengths // self.decoder.r - cond_input = {"speaker_ids": speaker_ids, "x_vectors": x_vectors} + cond_input = {"speaker_ids": speaker_ids, "d_vectors": d_vectors} outputs = self.forward(text_input, text_lengths, mel_input, mel_lengths, cond_input) # compute loss diff --git a/TTS/tts/models/tacotron2.py b/TTS/tts/models/tacotron2.py index 68867ec8..4628c64e 100644 --- a/TTS/tts/models/tacotron2.py +++ b/TTS/tts/models/tacotron2.py @@ -42,7 +42,7 @@ class Tacotron2(TacotronAbstract): ddc_r (int, optional): reduction rate for the coarse decoder of double decoder consistency. Defaults to None. encoder_in_features (int, optional): input channels for the encoder. Defaults to 512. decoder_in_features (int, optional): input channels for the decoder. Defaults to 512. - speaker_embedding_dim (int, optional): external speaker conditioning vector channels. Defaults to None. + d_vector_dim (int, optional): external speaker conditioning vector channels. Defaults to None. use_gst (bool, optional): enable/disable Global style token module. gst (Coqpit, optional): Coqpit to initialize the GST module. If `None`, GST is disabled. Defaults to None. gradual_training (List): Gradual training schedule. If None or `[]`, no gradual training is used. 
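Aside on the Tacotron/Tacotron2 changes above: they reduce to one branch, either a learned `nn.Embedding` table indexed by `speaker_ids` (when the model was built without `d_vector_dim`) or the `d_vectors` passed via `cond_input`, broadcast over time and concatenated onto the encoder outputs. A small self-contained sketch of that logic with toy dimensions (not taken from the patch):

```python
import torch
import torch.nn as nn

num_speakers, d_vector_dim = 5, 64
use_d_vectors = False  # True when the model was constructed with d_vector_dim set
speaker_embedding = nn.Embedding(num_speakers, d_vector_dim)

cond_input = {"speaker_ids": torch.tensor([2]), "d_vectors": torch.rand(1, d_vector_dim)}
if not use_d_vectors:
    embedded_speakers = speaker_embedding(cond_input["speaker_ids"])[:, None]  # [B, 1, d_vector_dim]
else:
    embedded_speakers = torch.unsqueeze(cond_input["d_vectors"], 1)            # [B, 1, d_vector_dim]

# _concat_speaker_embedding: broadcast over time, concatenate on channels
encoder_outputs = torch.rand(1, 37, 256)                                       # [B, T_in, C]
embedded_speakers_ = embedded_speakers.expand(encoder_outputs.size(0), encoder_outputs.size(1), -1)
encoder_outputs = torch.cat([encoder_outputs, embedded_speakers_], dim=-1)     # [B, T_in, C + d_vector_dim]
```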
@@ -73,7 +73,7 @@ class Tacotron2(TacotronAbstract): ddc_r=None, encoder_in_features=512, decoder_in_features=512, - speaker_embedding_dim=None, + d_vector_dim=None, use_gst=False, gst=None, gradual_training=None, @@ -101,7 +101,7 @@ class Tacotron2(TacotronAbstract): ddc_r, encoder_in_features, decoder_in_features, - speaker_embedding_dim, + d_vector_dim, use_gst, gst, gradual_training, @@ -109,14 +109,14 @@ class Tacotron2(TacotronAbstract): # speaker embedding layer if self.num_speakers > 1: - if not self.embeddings_per_sample: - speaker_embedding_dim = 512 - self.speaker_embedding = nn.Embedding(self.num_speakers, speaker_embedding_dim) + if not self.use_d_vectors: + d_vector_dim = 512 + self.speaker_embedding = nn.Embedding(self.num_speakers, d_vector_dim) self.speaker_embedding.weight.data.normal_(0, 0.3) # speaker and gst embeddings is concat in decoder input if self.num_speakers > 1: - self.decoder_in_features += speaker_embedding_dim # add speaker embedding dim + self.decoder_in_features += d_vector_dim # add speaker embedding dim # embedding layer self.embedding = nn.Embedding(num_chars, 512, padding_idx=0) @@ -142,13 +142,13 @@ class Tacotron2(TacotronAbstract): self.postnet = Postnet(self.postnet_output_dim) # setup prenet dropout - self.decoder.prenet.dropout_at_inference = prenet_dropout_at_inference + self.decoder.prenet.dropout_at_g = prenet_dropout_at_inference # global style token layers if self.gst and use_gst: self.gst_layer = GST( num_mel=decoder_output_dim, - speaker_embedding_dim=speaker_embedding_dim, + d_vector_dim=d_vector_dim, num_heads=gst.gst_num_heads, num_style_tokens=gst.gst_num_style_tokens, gst_embedding_dim=gst.gst_embedding_dim, @@ -189,7 +189,7 @@ class Tacotron2(TacotronAbstract): text_lengths: [B] mel_specs: [B, T_out, C] mel_lengths: [B] - cond_input: 'speaker_ids': [B, 1] and 'x_vectors':[B, C] + cond_input: 'speaker_ids': [B, 1] and 'd_vectors':[B, C] """ cond_input = self._format_cond_input(cond_input) outputs = {"alignments_backward": None, "decoder_outputs_backward": None} @@ -202,15 +202,15 @@ class Tacotron2(TacotronAbstract): encoder_outputs = self.encoder(embedded_inputs, text_lengths) if self.gst and self.use_gst: # B x gst_dim - encoder_outputs = self.compute_gst(encoder_outputs, mel_specs, cond_input["x_vectors"]) + encoder_outputs = self.compute_gst(encoder_outputs, mel_specs, cond_input["d_vectors"]) if self.num_speakers > 1: - if not self.embeddings_per_sample: + if not self.use_d_vectors: # B x 1 x speaker_embed_dim - speaker_embeddings = self.speaker_embedding(cond_input["speaker_ids"])[:, None] + embedded_speakers = self.speaker_embedding(cond_input["speaker_ids"])[:, None] else: # B x 1 x speaker_embed_dim - speaker_embeddings = torch.unsqueeze(cond_input["x_vectors"], 1) - encoder_outputs = self._concat_speaker_embedding(encoder_outputs, speaker_embeddings) + embedded_speakers = torch.unsqueeze(cond_input["d_vectors"], 1) + encoder_outputs = self._concat_speaker_embedding(encoder_outputs, embedded_speakers) encoder_outputs = encoder_outputs * input_mask.unsqueeze(2).expand_as(encoder_outputs) @@ -255,15 +255,15 @@ class Tacotron2(TacotronAbstract): if self.gst and self.use_gst: # B x gst_dim - encoder_outputs = self.compute_gst(encoder_outputs, cond_input["style_mel"], cond_input["x_vectors"]) + encoder_outputs = self.compute_gst(encoder_outputs, cond_input["style_mel"], cond_input["d_vectors"]) if self.num_speakers > 1: if not self.embeddings_per_sample: x_vector = self.speaker_embedding(cond_input['speaker_ids'])[:, None] 
x_vector = torch.unsqueeze(x_vector, 0).transpose(1, 2) else: - x_vector = cond_input["x_vectors"] + embedded_speakers = cond_input["d_vectors"] - encoder_outputs = self._concat_speaker_embedding(encoder_outputs, x_vector) + encoder_outputs = self._concat_speaker_embedding(encoder_outputs, embedded_speakers) decoder_outputs, alignments, stop_tokens = self.decoder.inference(encoder_outputs) postnet_outputs = self.postnet(decoder_outputs) @@ -291,7 +291,7 @@ class Tacotron2(TacotronAbstract): linear_input = batch["linear_input"] stop_targets = batch["stop_targets"] speaker_ids = batch["speaker_ids"] - x_vectors = batch["x_vectors"] + d_vectors = batch["d_vectors"] # forward pass model outputs = self.forward( @@ -299,7 +299,7 @@ class Tacotron2(TacotronAbstract): text_lengths, mel_input, mel_lengths, - cond_input={"speaker_ids": speaker_ids, "x_vectors": x_vectors}, + cond_input={"speaker_ids": speaker_ids, "d_vectors": d_vectors}, ) # set the [alignment] lengths wrt reduction factor for guided attention @@ -310,7 +310,7 @@ class Tacotron2(TacotronAbstract): else: alignment_lengths = mel_lengths // self.decoder.r - cond_input = {"speaker_ids": speaker_ids, "x_vectors": x_vectors} + cond_input = {"speaker_ids": speaker_ids, "d_vectors": d_vectors} outputs = self.forward(text_input, text_lengths, mel_input, mel_lengths, cond_input) # compute loss diff --git a/TTS/tts/models/tacotron_abstract.py b/TTS/tts/models/tacotron_abstract.py index fe43d81f..e480e2e0 100644 --- a/TTS/tts/models/tacotron_abstract.py +++ b/TTS/tts/models/tacotron_abstract.py @@ -35,7 +35,7 @@ class TacotronAbstract(ABC, nn.Module): ddc_r=None, encoder_in_features=512, decoder_in_features=512, - speaker_embedding_dim=None, + d_vector_dim=None, use_gst=False, gst=None, gradual_training=None, @@ -66,7 +66,7 @@ class TacotronAbstract(ABC, nn.Module): self.separate_stopnet = separate_stopnet self.encoder_in_features = encoder_in_features self.decoder_in_features = decoder_in_features - self.speaker_embedding_dim = speaker_embedding_dim + self.d_vector_dim = d_vector_dim self.gradual_training = gradual_training # layers @@ -76,12 +76,12 @@ class TacotronAbstract(ABC, nn.Module): self.postnet = None # multispeaker - if self.speaker_embedding_dim is None: - # if speaker_embedding_dim is None we need use the nn.Embedding, with default speaker_embedding_dim - self.embeddings_per_sample = False + if self.d_vector_dim is None: + # if d_vector_dim is None we need use the nn.Embedding, with default d_vector_dim + self.use_d_vectors = False else: - # if speaker_embedding_dim is not None we need use speaker embedding per sample - self.embeddings_per_sample = True + # if d_vector_dim is not None we need use speaker embedding per sample + self.use_d_vectors = True # global style token if self.gst and use_gst: @@ -89,8 +89,8 @@ class TacotronAbstract(ABC, nn.Module): self.gst_layer = None # model states - self.speaker_embeddings = None - self.speaker_embeddings_projected = None + self.embedded_speakers = None + self.embedded_speakers_projected = None # additional layers self.decoder_backward = None @@ -98,15 +98,15 @@ class TacotronAbstract(ABC, nn.Module): @staticmethod def _format_cond_input(cond_input: Dict) -> Dict: - return format_cond_input({"x_vectors": None, "speaker_ids": None}, cond_input) + return format_cond_input({"d_vectors": None, "speaker_ids": None}, cond_input) ############################# # INIT FUNCTIONS ############################# def _init_states(self): - self.speaker_embeddings = None - 
self.speaker_embeddings_projected = None + self.embedded_speakers = None + self.embedded_speakers_projected = None def _init_backward_decoder(self): self.decoder_backward = copy.deepcopy(self.decoder) @@ -188,9 +188,9 @@ class TacotronAbstract(ABC, nn.Module): if hasattr(self, "speaker_embedding") and speaker_ids is None: raise RuntimeError(" [!] Model has speaker embedding layer but speaker_id is not provided") if hasattr(self, "speaker_embedding") and speaker_ids is not None: - self.speaker_embeddings = self.speaker_embedding(speaker_ids).unsqueeze(1) + self.embedded_speakers = self.speaker_embedding(speaker_ids).unsqueeze(1) if hasattr(self, "speaker_project_mel") and speaker_ids is not None: - self.speaker_embeddings_projected = self.speaker_project_mel(self.speaker_embeddings).squeeze(1) + self.embedded_speakers_projected = self.speaker_project_mel(self.embedded_speakers).squeeze(1) def compute_gst(self, inputs, style_input, speaker_embedding=None): """Compute global style token""" @@ -213,15 +213,15 @@ class TacotronAbstract(ABC, nn.Module): return inputs @staticmethod - def _add_speaker_embedding(outputs, speaker_embeddings): - speaker_embeddings_ = speaker_embeddings.expand(outputs.size(0), outputs.size(1), -1) - outputs = outputs + speaker_embeddings_ + def _add_speaker_embedding(outputs, embedded_speakers): + embedded_speakers_ = embedded_speakers.expand(outputs.size(0), outputs.size(1), -1) + outputs = outputs + embedded_speakers_ return outputs @staticmethod - def _concat_speaker_embedding(outputs, speaker_embeddings): - speaker_embeddings_ = speaker_embeddings.expand(outputs.size(0), outputs.size(1), -1) - outputs = torch.cat([outputs, speaker_embeddings_], dim=-1) + def _concat_speaker_embedding(outputs, embedded_speakers): + embedded_speakers_ = embedded_speakers.expand(outputs.size(0), outputs.size(1), -1) + outputs = torch.cat([outputs, embedded_speakers_], dim=-1) return outputs ############################# diff --git a/TTS/tts/utils/speakers.py b/TTS/tts/utils/speakers.py index cebf0dca..546d483d 100755 --- a/TTS/tts/utils/speakers.py +++ b/TTS/tts/utils/speakers.py @@ -52,8 +52,8 @@ def get_speaker_manager(c, args, meta_data_train): raise RuntimeError( "You must copy the file speakers.json to restore_path, or set a valid file in CONFIG.external_speaker_embedding_file" ) - speaker_manager.load_x_vectors_file(c.external_speaker_embedding_file) - speaker_manager.set_x_vectors_from_file(speakers_file) + speaker_manager.load_d_vectors_file(c.external_speaker_embedding_file) + speaker_manager.set_d_vectors_from_file(speakers_file) elif not c.use_external_speaker_embedding_file: # restor speaker manager with speaker ID file. speakers_file = os.path.dirname(args.restore_path) speaker_ids_from_data = speaker_manager.speaker_ids @@ -63,7 +63,7 @@ def get_speaker_manager(c, args, meta_data_train): ), " [!] You cannot introduce new speakers to a pre-trained model." elif c.use_external_speaker_embedding_file and c.external_speaker_embedding_file: # new speaker manager with external speaker embeddings. - speaker_manager.set_x_vectors_from_file(c.external_speaker_embedding_file) + speaker_manager.set_d_vectors_from_file(c.external_speaker_embedding_file) elif ( c.use_external_speaker_embedding_file and not c.external_speaker_embedding_file ): # new speaker manager with speaker IDs file. @@ -88,7 +88,7 @@ class SpeakerManager: { 'clip_name.wav':{ 'name': 'speakerA', - 'embedding'[] + 'embedding'[] }, ... 
} @@ -103,10 +103,10 @@ class SpeakerManager: >>> # load a sample audio and compute embedding >>> waveform = ap.load_wav(sample_wav_path) >>> mel = ap.melspectrogram(waveform) - >>> x_vector = manager.compute_x_vector(mel.T) + >>> d_vector = manager.compute_d_vector(mel.T) Args: - x_vectors_file_path (str, optional): Path to the metafile including x vectors. Defaults to "". + d_vectors_file_path (str, optional): Path to the metafile including x vectors. Defaults to "". speaker_id_file_path (str, optional): Path to the metafile that maps speaker names to ids used by TTS models. Defaults to "". encoder_model_path (str, optional): Path to the speaker encoder model file. Defaults to "". @@ -116,15 +116,15 @@ class SpeakerManager: def __init__( self, data_items: List[List[Any]] = None, - x_vectors_file_path: str = "", + d_vectors_file_path: str = "", speaker_id_file_path: str = "", encoder_model_path: str = "", encoder_config_path: str = "", ): self.data_items = [] - self.x_vectors = {} - self.speaker_ids = [] + self.d_vectors = {} + self.speaker_ids = {} self.clip_ids = [] self.speaker_encoder = None self.speaker_encoder_ap = None @@ -132,8 +132,8 @@ class SpeakerManager: if data_items: self.speaker_ids, _ = self.parse_speakers_from_data(self.data_items) - if x_vectors_file_path: - self.set_x_vectors_from_file(x_vectors_file_path) + if d_vectors_file_path: + self.set_d_vectors_from_file(d_vectors_file_path) if speaker_id_file_path: self.set_speaker_ids_from_file(speaker_id_file_path) @@ -156,10 +156,10 @@ class SpeakerManager: return len(self.speaker_ids) @property - def x_vector_dim(self): - """Dimensionality of x_vectors. If x_vectors are not loaded, returns zero.""" - if self.x_vectors: - return len(self.x_vectors[list(self.x_vectors.keys())[0]]["embedding"]) + def d_vector_dim(self): + """Dimensionality of d_vectors. If d_vectors are not loaded, returns zero.""" + if self.d_vectors: + return len(self.d_vectors[list(self.d_vectors.keys())[0]]["embedding"]) return 0 @staticmethod @@ -201,73 +201,73 @@ class SpeakerManager: """ self._save_json(file_path, self.speaker_ids) - def save_x_vectors_to_file(self, file_path: str) -> None: - """Save x_vectors to a json file. + def save_d_vectors_to_file(self, file_path: str) -> None: + """Save d_vectors to a json file. Args: file_path (str): Path to the output file. """ - self._save_json(file_path, self.x_vectors) + self._save_json(file_path, self.d_vectors) - def set_x_vectors_from_file(self, file_path: str) -> None: - """Load x_vectors from a json file. + def set_d_vectors_from_file(self, file_path: str) -> None: + """Load d_vectors from a json file. Args: file_path (str): Path to the target json file. """ - self.x_vectors = self._load_json(file_path) - self.speaker_ids = list(set(sorted(x["name"] for x in self.x_vectors.values()))) - self.clip_ids = list(set(sorted(clip_name for clip_name in self.x_vectors.keys()))) + self.d_vectors = self._load_json(file_path) + self.speaker_ids = list(set(sorted(x["name"] for x in self.d_vectors.values()))) + self.clip_ids = list(set(sorted(clip_name for clip_name in self.d_vectors.keys()))) - def get_x_vector_by_clip(self, clip_idx: str) -> List: - """Get x_vector by clip ID. + def get_d_vector_by_clip(self, clip_idx: str) -> List: + """Get d_vector by clip ID. Args: clip_idx (str): Target clip ID. Returns: - List: x_vector as a list. + List: d_vector as a list. 
""" - return self.x_vectors[clip_idx]["embedding"] + return self.d_vectors[clip_idx]["embedding"] - def get_x_vectors_by_speaker(self, speaker_idx: str) -> List[List]: - """Get all x_vectors of a speaker. + def get_d_vectors_by_speaker(self, speaker_idx: str) -> List[List]: + """Get all d_vectors of a speaker. Args: speaker_idx (str): Target speaker ID. Returns: - List[List]: all the x_vectors of the given speaker. + List[List]: all the d_vectors of the given speaker. """ - return [x["embedding"] for x in self.x_vectors.values() if x["name"] == speaker_idx] + return [x["embedding"] for x in self.d_vectors.values() if x["name"] == speaker_idx] - def get_mean_x_vector(self, speaker_idx: str, num_samples: int = None, randomize: bool = False) -> np.ndarray: - """Get mean x_vector of a speaker ID. + def get_mean_d_vector(self, speaker_idx: str, num_samples: int = None, randomize: bool = False) -> np.ndarray: + """Get mean d_vector of a speaker ID. Args: speaker_idx (str): Target speaker ID. num_samples (int, optional): Number of samples to be averaged. Defaults to None. - randomize (bool, optional): Pick random `num_samples`of x_vectors. Defaults to False. + randomize (bool, optional): Pick random `num_samples`of d_vectors. Defaults to False. Returns: - np.ndarray: Mean x_vector. + np.ndarray: Mean d_vector. """ - x_vectors = self.get_x_vectors_by_speaker(speaker_idx) + d_vectors = self.get_d_vectors_by_speaker(speaker_idx) if num_samples is None: - x_vectors = np.stack(x_vectors).mean(0) + d_vectors = np.stack(d_vectors).mean(0) else: - assert len(x_vectors) >= num_samples, f" [!] speaker {speaker_idx} has number of samples < {num_samples}" + assert len(d_vectors) >= num_samples, f" [!] speaker {speaker_idx} has number of samples < {num_samples}" if randomize: - x_vectors = np.stack(random.choices(x_vectors, k=num_samples)).mean(0) + d_vectors = np.stack(random.choices(d_vectors, k=num_samples)).mean(0) else: - x_vectors = np.stack(x_vectors[:num_samples]).mean(0) - return x_vectors + d_vectors = np.stack(d_vectors[:num_samples]).mean(0) + return d_vectors def get_speakers(self) -> List: return self.speaker_ids def get_clips(self) -> List: - return sorted(self.x_vectors.keys()) + return sorted(self.d_vectors.keys()) def init_speaker_encoder(self, model_path: str, config_path: str) -> None: """Initialize a speaker encoder model. @@ -284,14 +284,14 @@ class SpeakerManager: self.speaker_encoder_ap.do_sound_norm = True self.speaker_encoder_ap.do_trim_silence = True - def compute_x_vector_from_clip(self, wav_file: Union[str, list]) -> list: - """Compute a x_vector from a given audio file. + def compute_d_vector_from_clip(self, wav_file: Union[str, list]) -> list: + """Compute a d_vector from a given audio file. Args: wav_file (Union[str, list]): Target file path. Returns: - list: Computed x_vector. + list: Computed d_vector. 
""" def _compute(wav_file: str): @@ -299,30 +299,30 @@ class SpeakerManager: spec = self.speaker_encoder_ap.melspectrogram(waveform) spec = torch.from_numpy(spec.T) spec = spec.unsqueeze(0) - x_vector = self.speaker_encoder.compute_embedding(spec) - return x_vector + d_vector = self.speaker_encoder.compute_embedding(spec) + return d_vector if isinstance(wav_file, list): - # compute the mean x_vector - x_vectors = None + # compute the mean d_vector + d_vectors = None for wf in wav_file: - x_vector = _compute(wf) - if x_vectors is None: - x_vectors = x_vector + d_vector = _compute(wf) + if d_vectors is None: + d_vectors = d_vector else: - x_vectors += x_vector - return (x_vectors / len(wav_file))[0].tolist() - x_vector = _compute(wav_file) - return x_vector[0].tolist() + d_vectors += d_vector + return (d_vectors / len(wav_file))[0].tolist() + d_vector = _compute(wav_file) + return d_vector[0].tolist() - def compute_x_vector(self, feats: Union[torch.Tensor, np.ndarray]) -> List: - """Compute x_vector from features. + def compute_d_vector(self, feats: Union[torch.Tensor, np.ndarray]) -> List: + """Compute d_vector from features. Args: feats (Union[torch.Tensor, np.ndarray]): Input features. Returns: - List: computed x_vector. + List: computed d_vector. """ if isinstance(feats, np.ndarray): feats = torch.from_numpy(feats) diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py index 35b7d818..0cb8df38 100644 --- a/TTS/tts/utils/synthesis.py +++ b/TTS/tts/utils/synthesis.py @@ -64,9 +64,9 @@ def compute_style_mel(style_wav, ap, cuda=False): return style_mel -def run_model_torch(model, inputs, speaker_id=None, style_mel=None, x_vector=None): +def run_model_torch(model, inputs, speaker_id=None, style_mel=None, d_vector=None): outputs = model.inference( - inputs, cond_input={"speaker_ids": speaker_id, "x_vector": x_vector, "style_mel": style_mel} + inputs, cond_input={"speaker_ids": speaker_id, "d_vector": d_vector, "style_mel": style_mel} ) return outputs @@ -139,13 +139,13 @@ def speaker_id_to_torch(speaker_id, cuda=False): return speaker_id -def embedding_to_torch(x_vector, cuda=False): - if x_vector is not None: - x_vector = np.asarray(x_vector) - x_vector = torch.from_numpy(x_vector).unsqueeze(0).type(torch.FloatTensor) +def embedding_to_torch(d_vector, cuda=False): + if d_vector is not None: + d_vector = np.asarray(d_vector) + d_vector = torch.from_numpy(d_vector).unsqueeze(0).type(torch.FloatTensor) if cuda: - return x_vector.cuda() - return x_vector + return d_vector.cuda() + return d_vector # TODO: perform GL with pytorch for batching @@ -177,7 +177,7 @@ def synthesis( enable_eos_bos_chars=False, # pylint: disable=unused-argument use_griffin_lim=False, do_trim_silence=False, - x_vector=None, + d_vector=None, backend="torch", ): """Synthesize voice for the given text. 
@@ -209,8 +209,8 @@ def synthesis( if speaker_id is not None: speaker_id = speaker_id_to_torch(speaker_id, cuda=use_cuda) - if x_vector is not None: - x_vector = embedding_to_torch(x_vector, cuda=use_cuda) + if d_vector is not None: + d_vector = embedding_to_torch(d_vector, cuda=use_cuda) if not isinstance(style_mel, dict): style_mel = numpy_to_torch(style_mel, torch.float, cuda=use_cuda) @@ -227,7 +227,7 @@ def synthesis( text_inputs = tf.expand_dims(text_inputs, 0) # synthesize voice if backend == "torch": - outputs = run_model_torch(model, text_inputs, speaker_id, style_mel, x_vector=x_vector) + outputs = run_model_torch(model, text_inputs, speaker_id, style_mel, d_vector=d_vector) model_outputs = outputs["model_outputs"] model_outputs = model_outputs[0].data.cpu().numpy() alignments = outputs["alignments"] diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index a31436d4..8f510f20 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -63,7 +63,7 @@ class Synthesizer(object): self.speaker_manager = None self.num_speakers = 0 self.tts_speakers = {} - self.speaker_embedding_dim = 0 + self.d_vector_dim = 0 self.seg = self._get_segmenter("en") self.use_cuda = use_cuda @@ -98,9 +98,9 @@ class Synthesizer(object): self.speaker_manager = SpeakerManager( encoder_model_path=self.encoder_checkpoint, encoder_config_path=self.encoder_config ) - self.speaker_manager.load_x_vectors_file(self.tts_config.get("external_speaker_embedding_file", speaker_file)) + self.speaker_manager.load_d_vectors_file(self.tts_config.get("external_speaker_embedding_file", speaker_file)) self.num_speakers = self.speaker_manager.num_speakers - self.speaker_embedding_dim = self.speaker_manager.x_vector_dim + self.d_vector_dim = self.speaker_manager.d_vector_dim def _load_tts(self, tts_checkpoint: str, tts_config_path: str, use_cuda: bool) -> None: """Load the TTS model. @@ -135,7 +135,7 @@ class Synthesizer(object): self.input_size, num_speakers=self.num_speakers, c=self.tts_config, - speaker_embedding_dim=self.speaker_embedding_dim, + d_vector_dim=self.d_vector_dim, ) self.tts_model.load_checkpoint(self.tts_config, tts_checkpoint, eval=True) if use_cuda: @@ -197,9 +197,9 @@ class Synthesizer(object): print(sens) if self.tts_speakers_file: - # get the speaker embedding from the saved x_vectors. + # get the speaker embedding from the saved d_vectors. if speaker_idx and isinstance(speaker_idx, str): - speaker_embedding = self.speaker_manager.get_x_vectors_by_speaker(speaker_idx)[0] + speaker_embedding = self.speaker_manager.get_d_vectors_by_speaker(speaker_idx)[0] elif not speaker_idx and not speaker_wav: raise ValueError( " [!] Look like you use a multi-speaker model. " @@ -214,9 +214,9 @@ class Synthesizer(object): "Define path for speaker.json if it is a multi-speaker model or remove defined speaker idx. " ) - # compute a new x_vector from the given clip. + # compute a new d_vector from the given clip. 
if speaker_wav is not None: - speaker_embedding = self.speaker_manager.compute_x_vector_from_clip(speaker_wav) + speaker_embedding = self.speaker_manager.compute_d_vector_from_clip(speaker_wav) use_gl = self.vocoder_model is None @@ -232,7 +232,7 @@ class Synthesizer(object): style_wav=style_wav, enable_eos_bos_chars=self.tts_config.enable_eos_bos_chars, use_griffin_lim=use_gl, - x_vector=speaker_embedding, + d_vector=speaker_embedding, ) waveform = outputs["wav"] mel_postnet_spec = outputs["model_outputs"] diff --git a/tests/test_extract_tts_spectrograms.py b/tests/test_extract_tts_spectrograms.py index ddc7e4da..d16167ed 100644 --- a/tests/test_extract_tts_spectrograms.py +++ b/tests/test_extract_tts_spectrograms.py @@ -22,7 +22,7 @@ class TestExtractTTSSpectrograms(unittest.TestCase): c = load_config(config_path) # create model num_chars = len(phonemes if c.use_phonemes else symbols) - model = setup_model(num_chars, 1, c, speaker_embedding_dim=None) + model = setup_model(num_chars, 1, c, d_vector_dim=None) # save model torch.save({"model": model.state_dict()}, checkpoint_path) # run test @@ -41,7 +41,7 @@ class TestExtractTTSSpectrograms(unittest.TestCase): c = load_config(config_path) # create model num_chars = len(phonemes if c.use_phonemes else symbols) - model = setup_model(num_chars, 1, c, speaker_embedding_dim=None) + model = setup_model(num_chars, 1, c, d_vector_dim=None) # save model torch.save({"model": model.state_dict()}, checkpoint_path) # run test @@ -60,7 +60,7 @@ class TestExtractTTSSpectrograms(unittest.TestCase): c = load_config(config_path) # create model num_chars = len(phonemes if c.use_phonemes else symbols) - model = setup_model(num_chars, 1, c, speaker_embedding_dim=None) + model = setup_model(num_chars, 1, c, d_vector_dim=None) # save model torch.save({"model": model.state_dict()}, checkpoint_path) # run test diff --git a/tests/test_speaker_manager.py b/tests/test_speaker_manager.py index f80e56fc..a695fe61 100644 --- a/tests/test_speaker_manager.py +++ b/tests/test_speaker_manager.py @@ -15,11 +15,11 @@ encoder_config_path = os.path.join(get_tests_input_path(), "test_speaker_encoder encoder_model_path = os.path.join(get_tests_input_path(), "checkpoint_0.pth.tar") sample_wav_path = os.path.join(get_tests_input_path(), "../data/ljspeech/wavs/LJ001-0001.wav") sample_wav_path2 = os.path.join(get_tests_input_path(), "../data/ljspeech/wavs/LJ001-0002.wav") -x_vectors_file_path = os.path.join(get_tests_input_path(), "../data/dummy_speakers.json") +d_vectors_file_path = os.path.join(get_tests_input_path(), "../data/dummy_speakers.json") class SpeakerManagerTest(unittest.TestCase): - """Test SpeakerManager for loading embedding files and computing x_vectors from waveforms""" + """Test SpeakerManager for loading embedding files and computing d_vectors from waveforms""" @staticmethod def test_speaker_embedding(): @@ -38,38 +38,38 @@ class SpeakerManagerTest(unittest.TestCase): # load a sample audio and compute embedding waveform = ap.load_wav(sample_wav_path) mel = ap.melspectrogram(waveform) - x_vector = manager.compute_x_vector(mel.T) - assert x_vector.shape[1] == 256 + d_vector = manager.compute_d_vector(mel.T) + assert d_vector.shape[1] == 256 - # compute x_vector directly from an input file - x_vector = manager.compute_x_vector_from_clip(sample_wav_path) - x_vector2 = manager.compute_x_vector_from_clip(sample_wav_path) - x_vector = torch.FloatTensor(x_vector) - x_vector2 = torch.FloatTensor(x_vector2) - assert x_vector.shape[0] == 256 - assert (x_vector - 
x_vector2).sum() == 0.0 + # compute d_vector directly from an input file + d_vector = manager.compute_d_vector_from_clip(sample_wav_path) + d_vector2 = manager.compute_d_vector_from_clip(sample_wav_path) + d_vector = torch.FloatTensor(d_vector) + d_vector2 = torch.FloatTensor(d_vector2) + assert d_vector.shape[0] == 256 + assert (d_vector - d_vector2).sum() == 0.0 - # compute x_vector from a list of wav files. - x_vector3 = manager.compute_x_vector_from_clip([sample_wav_path, sample_wav_path2]) - x_vector3 = torch.FloatTensor(x_vector3) - assert x_vector3.shape[0] == 256 - assert (x_vector - x_vector3).sum() != 0.0 + # compute d_vector from a list of wav files. + d_vector3 = manager.compute_d_vector_from_clip([sample_wav_path, sample_wav_path2]) + d_vector3 = torch.FloatTensor(d_vector3) + assert d_vector3.shape[0] == 256 + assert (d_vector - d_vector3).sum() != 0.0 # remove dummy model os.remove(encoder_model_path) @staticmethod def test_speakers_file_processing(): - manager = SpeakerManager(x_vectors_file_path=x_vectors_file_path) + manager = SpeakerManager(d_vectors_file_path=d_vectors_file_path) print(manager.num_speakers) - print(manager.x_vector_dim) + print(manager.d_vector_dim) print(manager.clip_ids) - x_vector = manager.get_x_vector_by_clip(manager.clip_ids[0]) - assert len(x_vector) == 256 - x_vectors = manager.get_x_vectors_by_speaker(manager.speaker_ids[0]) - assert len(x_vectors[0]) == 256 - x_vector1 = manager.get_mean_x_vector(manager.speaker_ids[0], num_samples=2, randomize=True) - assert len(x_vector1) == 256 - x_vector2 = manager.get_mean_x_vector(manager.speaker_ids[0], num_samples=2, randomize=False) - assert len(x_vector2) == 256 - assert np.sum(np.array(x_vector1) - np.array(x_vector2)) != 0 + d_vector = manager.get_d_vector_by_clip(manager.clip_ids[0]) + assert len(d_vector) == 256 + d_vectors = manager.get_d_vectors_by_speaker(manager.speaker_ids[0]) + assert len(d_vectors[0]) == 256 + d_vector1 = manager.get_mean_d_vector(manager.speaker_ids[0], num_samples=2, randomize=True) + assert len(d_vector1) == 256 + d_vector2 = manager.get_mean_d_vector(manager.speaker_ids[0], num_samples=2, randomize=False) + assert len(d_vector2) == 256 + assert np.sum(np.array(d_vector1) - np.array(d_vector2)) != 0 diff --git a/tests/tts_tests/test_speedy_speech_layers.py b/tests/tts_tests/test_speedy_speech_layers.py index 66339a82..7c4f0adf 100644 --- a/tests/tts_tests/test_speedy_speech_layers.py +++ b/tests/tts_tests/test_speedy_speech_layers.py @@ -57,7 +57,7 @@ def test_speedy_speech(): # with speaker embedding model = SpeedySpeech(num_chars, out_channels=80, hidden_channels=128, num_speakers=10, c_in_channels=256).to(device) model.forward( - x_dummy, x_lengths, y_lengths, durations, cond_input={"x_vectors": torch.randint(0, 10, (B,)).to(device)} + x_dummy, x_lengths, y_lengths, durations, cond_input={"d_vectors": torch.randint(0, 10, (B,)).to(device)} ) o_de = outputs["model_outputs"] attn = outputs["alignments"] @@ -71,7 +71,7 @@ def test_speedy_speech(): model = SpeedySpeech( num_chars, out_channels=80, hidden_channels=128, num_speakers=10, external_c=True, c_in_channels=256 ).to(device) - model.forward(x_dummy, x_lengths, y_lengths, durations, cond_input={"x_vectors": torch.rand((B, 256)).to(device)}) + model.forward(x_dummy, x_lengths, y_lengths, durations, cond_input={"d_vectors": torch.rand((B, 256)).to(device)}) o_de = outputs["model_outputs"] attn = outputs["alignments"] o_dr = outputs["durations_log"] diff --git a/tests/tts_tests/test_tacotron2_model.py 
b/tests/tts_tests/test_tacotron2_model.py index 0933ec70..b77f7cc5 100644 --- a/tests/tts_tests/test_tacotron2_model.py +++ b/tests/tts_tests/test_tacotron2_model.py @@ -95,7 +95,7 @@ class MultiSpeakeTacotronTrainTest(unittest.TestCase): criterion = MSELossMasked(seq_len_norm=False).to(device) criterion_st = nn.BCEWithLogitsLoss().to(device) - model = Tacotron2(num_chars=24, r=c.r, num_speakers=5, speaker_embedding_dim=55).to(device) + model = Tacotron2(num_chars=24, r=c.r, num_speakers=5, d_vector_dim=55).to(device) model.train() model_ref = copy.deepcopy(model) count = 0 @@ -105,7 +105,7 @@ class MultiSpeakeTacotronTrainTest(unittest.TestCase): optimizer = optim.Adam(model.parameters(), lr=c.lr) for i in range(5): outputs = model.forward( - input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"x_vectors": speaker_ids} + input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"d_vectors": speaker_ids} ) assert torch.sigmoid(outputs["stop_tokens"]).data.max() <= 1.0 assert torch.sigmoid(outputs["stop_tokens"]).data.min() >= 0.0 @@ -259,7 +259,7 @@ class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase): stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze() criterion = MSELossMasked(seq_len_norm=False).to(device) criterion_st = nn.BCEWithLogitsLoss().to(device) - model = Tacotron2(num_chars=24, r=c.r, num_speakers=5, speaker_embedding_dim=55, use_gst=True, gst=c.gst).to( + model = Tacotron2(num_chars=24, r=c.r, num_speakers=5, d_vector_dim=55, use_gst=True, gst=c.gst).to( device ) model.train() @@ -271,7 +271,7 @@ class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase): optimizer = optim.Adam(model.parameters(), lr=c.lr) for i in range(5): outputs = model.forward( - input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"x_vectors": speaker_embeddings} + input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"d_vectors": speaker_embeddings} ) assert torch.sigmoid(outputs["stop_tokens"]).data.max() <= 1.0 assert torch.sigmoid(outputs["stop_tokens"]).data.min() >= 0.0 diff --git a/tests/tts_tests/test_tacotron_model.py b/tests/tts_tests/test_tacotron_model.py index 86de5d16..31682d7a 100644 --- a/tests/tts_tests/test_tacotron_model.py +++ b/tests/tts_tests/test_tacotron_model.py @@ -116,7 +116,7 @@ class MultiSpeakeTacotronTrainTest(unittest.TestCase): decoder_output_dim=c.audio["num_mels"], r=c.r, memory_size=c.memory_size, - speaker_embedding_dim=55, + d_vector_dim=55, ).to( device ) # FIXME: missing num_speakers parameter to Tacotron ctor @@ -130,7 +130,7 @@ class MultiSpeakeTacotronTrainTest(unittest.TestCase): optimizer = optim.Adam(model.parameters(), lr=c.lr) for _ in range(5): outputs = model.forward( - input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"x_vectors": speaker_embeddings} + input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"d_vectors": speaker_embeddings} ) optimizer.zero_grad() loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths) @@ -305,7 +305,7 @@ class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase): gst=c.gst, r=c.r, memory_size=c.memory_size, - speaker_embedding_dim=55, + d_vector_dim=55, ).to( device ) # FIXME: missing num_speakers parameter to Tacotron ctor @@ -319,7 +319,7 @@ class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase): optimizer = optim.Adam(model.parameters(), lr=c.lr) for _ in range(5): outputs = model.forward( - input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"x_vectors": speaker_embeddings} + input_dummy, input_lengths, 
mel_spec, mel_lengths, cond_input={"d_vectors": speaker_embeddings} ) optimizer.zero_grad() loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths)
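As a quick sanity check of the renamed helper in TTS/tts/utils/synthesis.py: a d-vector stored as a plain list (the format kept in speakers.json) becomes a `[1, C]` float tensor ready to be passed as conditioning.

```python
from TTS.tts.utils.synthesis import embedding_to_torch

d_vector = embedding_to_torch([0.1] * 256)
print(type(d_vector), tuple(d_vector.shape))  # <class 'torch.Tensor'> (1, 256)
```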