Rename external speaker embedding arguments to `d_vectors`

Eren Gölge 2021-06-03 11:42:40 +02:00
parent e7b7268c43
commit f00ef90ce6
20 changed files with 251 additions and 252 deletions
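
For orientation, a minimal, illustrative sketch of what the rename means for callers. The key and argument names follow this commit; the tensors and comments are placeholders, not any specific model's exact signature.

```python
import torch

# Illustrative shapes only; a d-vector is an external speaker embedding from a speaker encoder.
batch_size, d_vector_dim = 2, 256
d_vectors = torch.randn(batch_size, d_vector_dim)
speaker_ids = torch.tensor([0, 1])

old_cond_input = {"speaker_ids": speaker_ids, "x_vectors": d_vectors}  # pre-rename keys
new_cond_input = {"speaker_ids": speaker_ids, "d_vectors": d_vectors}  # post-rename keys
# Models in this commit read cond_input["d_vectors"], and setup_model() takes d_vector_dim
# instead of speaker_embedding_dim; the tensors themselves are unchanged.
```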


@@ -108,9 +108,8 @@ def format_data(data):
mel_lengths = mel_lengths.cuda(non_blocking=True)
if speaker_ids is not None:
speaker_ids = speaker_ids.cuda(non_blocking=True)
-if speaker_embeddings is not None:
-speaker_embeddings = speaker_embeddings.cuda(non_blocking=True)
+if d_vectors is not None:
+d_vectors = d_vectors.cuda(non_blocking=True)
if attn_mask is not None:
attn_mask = attn_mask.cuda(non_blocking=True)
return (
@@ -119,7 +118,7 @@ def format_data(data):
mel_input,
mel_lengths,
speaker_ids,
-speaker_embeddings,
+d_vectors,
avg_text_length,
avg_spec_length,
attn_mask,
@@ -137,23 +136,23 @@ def inference(
mel_input,
mel_lengths,
speaker_ids=None,
-speaker_embeddings=None,
+d_vectors=None,
):
if model_name == "glow_tts":
speaker_c = None
if speaker_ids is not None:
speaker_c = speaker_ids
-elif speaker_embeddings is not None:
-speaker_c = speaker_embeddings
+elif d_vectors is not None:
+speaker_c = d_vectors
outputs = model.inference_with_MAS(
-text_input, text_lengths, mel_input, mel_lengths, cond_input={"x_vectors": speaker_c}
+text_input, text_lengths, mel_input, mel_lengths, cond_input={"d_vectors": speaker_c}
)
model_output = outputs["model_outputs"]
model_output = model_output.transpose(1, 2).detach().cpu().numpy()
elif "tacotron" in model_name:
-cond_input = {"speaker_ids": speaker_ids, "x_vectors": speaker_embeddings}
+cond_input = {"speaker_ids": speaker_ids, "d_vectors": d_vectors}
outputs = model(text_input, text_lengths, mel_input, mel_lengths, cond_input)
postnet_outputs = outputs["model_outputs"]
# normalize tacotron output
@@ -184,7 +183,7 @@ def extract_spectrograms(
mel_input,
mel_lengths,
speaker_ids,
-speaker_embeddings,
+d_vectors,
_,
_,
_,
@@ -200,7 +199,7 @@ def extract_spectrograms(
mel_input,
mel_lengths,
speaker_ids,
-speaker_embeddings,
+d_vectors,
)
for idx in range(text_input.shape[0]):
@@ -256,7 +255,7 @@ def main(args): # pylint: disable=redefined-outer-name
speaker_manager = get_speaker_manager(c, args, meta_data_train)
# setup model
-model = setup_model(num_chars, speaker_manager.num_speakers, c, speaker_embedding_dim=speaker_manager.x_vector_dim)
+model = setup_model(num_chars, speaker_manager.num_speakers, c, d_vector_dim=speaker_manager.d_vector_dim)
# restore model
checkpoint = torch.load(args.checkpoint_path, map_location="cpu")


@@ -157,7 +157,7 @@ def main():
parser.add_argument(
"--speaker_wav",
nargs="+",
-help="wav file(s) to condition a multi-speaker TTS model with a Speaker Encoder. You can give multiple file paths. The x_vectors is computed as their average.",
+help="wav file(s) to condition a multi-speaker TTS model with a Speaker Encoder. You can give multiple file paths. The d_vectors is computed as their average.",
default=None,
)
parser.add_argument("--gst_style", help="Wav path file for GST stylereference.", default=None)


@@ -113,7 +113,7 @@ class TrainerTTS:
len(self.model_characters),
self.speaker_manager.num_speakers,
self.config,
-self.speaker_manager.x_vector_dim if self.speaker_manager.x_vectors else None,
+self.speaker_manager.d_vector_dim if self.speaker_manager.d_vectors else None,
)
# setup criterion
@@ -156,8 +156,8 @@ class TrainerTTS:
print("\n > Model has {} parameters".format(num_params))
@staticmethod
-def get_model(num_chars: int, num_speakers: int, config: Coqpit, x_vector_dim: int) -> nn.Module:
-model = setup_model(num_chars, num_speakers, config, x_vector_dim)
+def get_model(num_chars: int, num_speakers: int, config: Coqpit, d_vector_dim: int) -> nn.Module:
+model = setup_model(num_chars, num_speakers, config, d_vector_dim)
return model
@staticmethod
@@ -196,11 +196,11 @@ class TrainerTTS:
speakers_file = config.external_speaker_embedding_file
if config.use_external_speaker_embedding_file:
-speaker_manager.load_x_vectors_file(speakers_file)
+speaker_manager.load_d_vectors_file(speakers_file)
else:
speaker_manager.load_ids_file(speakers_file)
elif config.use_external_speaker_embedding_file and config.external_speaker_embedding_file:
-speaker_manager.load_x_vectors_file(config.external_speaker_embedding_file)
+speaker_manager.load_d_vectors_file(config.external_speaker_embedding_file)
else:
speaker_manager.parse_speakers_from_items(data_train)
file_path = os.path.join(out_path, "speakers.json")
@@ -387,8 +387,8 @@ class TrainerTTS:
durations = to_cuda(durations) if attn_mask is not None else None
if speaker_ids is not None:
speaker_ids = to_cuda(speaker_ids)
-if speaker_embeddings is not None:
-speaker_embeddings = to_cuda(speaker_embeddings)
+if d_vectors is not None:
+d_vectors = to_cuda(d_vectors)
return {
"text_input": text_input,
@@ -400,7 +400,7 @@ class TrainerTTS:
"attn_mask": attn_mask,
"durations": durations,
"speaker_ids": speaker_ids,
-"x_vectors": speaker_embeddings,
+"d_vectors": d_vectors,
"max_text_length": max_text_length,
"max_spec_length": max_spec_length,
"item_idx": item_idx,
@@ -591,7 +591,7 @@ class TrainerTTS:
self.use_cuda,
self.ap,
speaker_id=cond_inputs["speaker_id"],
-x_vector=cond_inputs["x_vector"],
+d_vector=cond_inputs["d_vector"],
style_wav=cond_inputs["style_wav"],
enable_eos_bos_chars=self.config.enable_eos_bos_chars,
use_griffin_lim=True,
@@ -612,9 +612,9 @@ class TrainerTTS:
def _get_cond_inputs(self) -> Dict:
# setup speaker_id
speaker_id = 0 if self.config.use_speaker_embedding else None
-# setup x_vector
-x_vector = (
-self.speaker_manager.get_x_vectors_by_speaker(self.speaker_manager.speaker_ids[0])
+# setup d_vector
+d_vector = (
+self.speaker_manager.get_d_vectors_by_speaker(self.speaker_manager.speaker_ids[0])
if self.config.use_external_speaker_embedding_file and self.config.use_speaker_embedding
else None
)
@@ -629,7 +629,7 @@ class TrainerTTS:
print("WARNING: You don't provided a gst style wav, for this reason we use a zero tensor!")
for i in range(self.config.gst["gst_num_style_tokens"]):
style_wav[str(i)] = 0
-cond_inputs = {"speaker_id": speaker_id, "style_wav": style_wav, "x_vector": x_vector}
+cond_inputs = {"speaker_id": speaker_id, "style_wav": style_wav, "d_vector": d_vector}
return cond_inputs
def fit(self) -> None:


@@ -8,10 +8,10 @@ class GST(nn.Module):
See https://arxiv.org/pdf/1803.09017"""
-def __init__(self, num_mel, num_heads, num_style_tokens, gst_embedding_dim, speaker_embedding_dim=None):
+def __init__(self, num_mel, num_heads, num_style_tokens, gst_embedding_dim, d_vector_dim=None):
super().__init__()
self.encoder = ReferenceEncoder(num_mel, gst_embedding_dim)
-self.style_token_layer = StyleTokenLayer(num_heads, num_style_tokens, gst_embedding_dim, speaker_embedding_dim)
+self.style_token_layer = StyleTokenLayer(num_heads, num_style_tokens, gst_embedding_dim, d_vector_dim)
def forward(self, inputs, speaker_embedding=None):
enc_out = self.encoder(inputs)
@@ -83,13 +83,13 @@ class ReferenceEncoder(nn.Module):
class StyleTokenLayer(nn.Module):
"""NN Module attending to style tokens based on prosody encodings."""
-def __init__(self, num_heads, num_style_tokens, embedding_dim, speaker_embedding_dim=None):
+def __init__(self, num_heads, num_style_tokens, embedding_dim, d_vector_dim=None):
super().__init__()
self.query_dim = embedding_dim // 2
-if speaker_embedding_dim:
-self.query_dim += speaker_embedding_dim
+if d_vector_dim:
+self.query_dim += d_vector_dim
self.key_dim = embedding_dim // num_heads
self.style_tokens = nn.Parameter(torch.FloatTensor(num_style_tokens, self.key_dim))
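
As a sanity check on how `d_vector_dim` enters the style-token attention above, a hypothetical instantiation with made-up dimensions:

```python
import torch
from torch import nn

# Mirrors StyleTokenLayer above: query_dim = gst_embedding_dim // 2, plus d_vector_dim when given.
gst_embedding_dim, d_vector_dim, num_heads, num_style_tokens = 256, 128, 4, 10
query_dim = gst_embedding_dim // 2 + d_vector_dim  # 128 + 128 = 256
key_dim = gst_embedding_dim // num_heads           # 64
style_tokens = nn.Parameter(torch.FloatTensor(num_style_tokens, key_dim))
print(query_dim, style_tokens.shape)               # 256 torch.Size([10, 64])
```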


@@ -266,7 +266,7 @@ class Decoder(nn.Module):
location_attn (bool): if true, use location sensitive attention.
attn_K (int): number of attention heads for GravesAttention.
separate_stopnet (bool): if true, detach stopnet input to prevent gradient flow.
-speaker_embedding_dim (int): size of speaker embedding vector, for multi-speaker training.
+d_vector_dim (int): size of speaker embedding vector, for multi-speaker training.
"""
# Pylint gets confused by PyTorch conventions here


@@ -1,7 +1,7 @@
from TTS.utils.generic_utils import find_module
-def setup_model(num_chars, num_speakers, c, speaker_embedding_dim=None):
+def setup_model(num_chars, num_speakers, c, d_vector_dim=None):
print(" > Using model: {}".format(c.model))
MyModel = find_module("TTS.tts.models", c.model.lower())
if c.model.lower() in "tacotron":
@@ -29,7 +29,7 @@ def setup_model(num_chars, num_speakers, c, speaker_embedding_dim=None):
bidirectional_decoder=c.bidirectional_decoder,
double_decoder_consistency=c.double_decoder_consistency,
ddc_r=c.ddc_r,
-speaker_embedding_dim=speaker_embedding_dim,
+d_vector_dim=d_vector_dim,
)
elif c.model.lower() == "tacotron2":
model = MyModel(
@@ -55,7 +55,7 @@ def setup_model(num_chars, num_speakers, c, speaker_embedding_dim=None):
bidirectional_decoder=c.bidirectional_decoder,
double_decoder_consistency=c.double_decoder_consistency,
ddc_r=c.ddc_r,
-speaker_embedding_dim=speaker_embedding_dim,
+d_vector_dim=d_vector_dim,
)
elif c.model.lower() == "glow_tts":
model = MyModel(
@@ -79,7 +79,7 @@ def setup_model(num_chars, num_speakers, c, speaker_embedding_dim=None):
num_squeeze=2,
sigmoid_scale=False,
mean_only=True,
-speaker_embedding_dim=speaker_embedding_dim,
+d_vector_dim=d_vector_dim,
)
elif c.model.lower() == "speedy_speech":
model = MyModel(


@@ -212,7 +212,7 @@ class AlignTTS(nn.Module):
return dr_mas, mu, log_sigma, logp
def forward(
-self, x, x_lengths, y, y_lengths, cond_input={"x_vectors": None}, phase=None
+self, x, x_lengths, y, y_lengths, cond_input={"d_vectors": None}, phase=None
): # pylint: disable=unused-argument
"""
Shapes:
@@ -223,7 +223,7 @@ class AlignTTS(nn.Module):
g: [B, C]
"""
y = y.transpose(1, 2)
-g = cond_input["x_vectors"] if "x_vectors" in cond_input else None
+g = cond_input["d_vectors"] if "d_vectors" in cond_input else None
o_de, o_dr_log, dr_mas_log, attn, mu, log_sigma, logp = None, None, None, None, None, None, None
if phase == 0:
# train encoder and MDN
@@ -267,14 +267,14 @@ class AlignTTS(nn.Module):
return outputs
@torch.no_grad()
-def inference(self, x, cond_input={"x_vectors": None}): # pylint: disable=unused-argument
+def inference(self, x, cond_input={"d_vectors": None}): # pylint: disable=unused-argument
"""
Shapes:
x: [B, T_max]
x_lengths: [B]
g: [B, C]
"""
-g = cond_input["x_vectors"] if "x_vectors" in cond_input else None
+g = cond_input["d_vectors"] if "d_vectors" in cond_input else None
x_lengths = torch.tensor(x.shape[1:2]).to(x.device)
# pad input to prevent dropping the last word
# x = torch.nn.functional.pad(x, pad=(0, 5), mode='constant', value=0)
@@ -293,10 +293,10 @@ class AlignTTS(nn.Module):
text_lengths = batch["text_lengths"]
mel_input = batch["mel_input"]
mel_lengths = batch["mel_lengths"]
-x_vectors = batch["x_vectors"]
+d_vectors = batch["d_vectors"]
speaker_ids = batch["speaker_ids"]
-cond_input = {"x_vectors": x_vectors, "speaker_ids": speaker_ids}
+cond_input = {"d_vectors": d_vectors, "speaker_ids": speaker_ids}
outputs = self.forward(text_input, text_lengths, mel_input, mel_lengths, cond_input, self.phase)
loss_dict = criterion(
outputs["logp"],


@@ -36,7 +36,7 @@ class GlowTTS(nn.Module):
mean_only (bool): if True, encoder only computes mean value and uses constant variance for each time step.
encoder_type (str): encoder module type.
encoder_params (dict): encoder module parameters.
-speaker_embedding_dim (int): channels of external speaker embedding vectors.
+d_vector_dim (int): channels of external speaker embedding vectors.
"""
def __init__(
@@ -62,7 +62,7 @@ class GlowTTS(nn.Module):
mean_only=False,
encoder_type="transformer",
encoder_params=None,
-speaker_embedding_dim=None,
+d_vector_dim=None,
):
super().__init__()
@@ -88,15 +88,15 @@ class GlowTTS(nn.Module):
# model constants.
self.noise_scale = 0.33 # defines the noise variance applied to the random z vector at inference.
self.length_scale = 1.0 # scaler for the duration predictor. The larger it is, the slower the speech.
-self.speaker_embedding_dim = speaker_embedding_dim
+self.d_vector_dim = d_vector_dim
# if is a multispeaker and c_in_channels is 0, set to 256
if num_speakers > 1:
-if self.c_in_channels == 0 and not self.speaker_embedding_dim:
+if self.c_in_channels == 0 and not self.d_vector_dim:
# TODO: make this adjustable
self.c_in_channels = 256
-elif self.speaker_embedding_dim:
-self.c_in_channels = self.speaker_embedding_dim
+elif self.d_vector_dim:
+self.c_in_channels = self.d_vector_dim
self.encoder = Encoder(
num_chars,
@@ -125,7 +125,7 @@ class GlowTTS(nn.Module):
c_in_channels=self.c_in_channels,
)
-if num_speakers > 1 and not speaker_embedding_dim:
+if num_speakers > 1 and not d_vector_dim:
# speaker embedding layer
self.emb_g = nn.Embedding(num_speakers, self.c_in_channels)
nn.init.uniform_(self.emb_g.weight, -0.1, 0.1)
@@ -144,7 +144,7 @@ class GlowTTS(nn.Module):
return y_mean, y_log_scale, o_attn_dur
def forward(
-self, x, x_lengths, y, y_lengths=None, cond_input={"x_vectors": None}
+self, x, x_lengths, y, y_lengths=None, cond_input={"d_vectors": None}
): # pylint: disable=dangerous-default-value
"""
Shapes:
@@ -157,9 +157,9 @@ class GlowTTS(nn.Module):
y = y.transpose(1, 2)
y_max_length = y.size(2)
# norm speaker embeddings
-g = cond_input["x_vectors"] if cond_input is not None and "x_vectors" in cond_input else None
+g = cond_input["d_vectors"] if cond_input is not None and "d_vectors" in cond_input else None
if g is not None:
-if self.speaker_embedding_dim:
+if self.d_vector_dim:
g = F.normalize(g).unsqueeze(-1)
else:
g = F.normalize(self.emb_g(g)).unsqueeze(-1) # [b, h, 1]
@@ -197,7 +197,7 @@ class GlowTTS(nn.Module):
@torch.no_grad()
def inference_with_MAS(
-self, x, x_lengths, y=None, y_lengths=None, cond_input={"x_vectors": None}
+self, x, x_lengths, y=None, y_lengths=None, cond_input={"d_vectors": None}
): # pylint: disable=dangerous-default-value
"""
It's similar to the teacher forcing in Tacotron.
@@ -212,9 +212,9 @@ class GlowTTS(nn.Module):
y = y.transpose(1, 2)
y_max_length = y.size(2)
# norm speaker embeddings
-g = cond_input["x_vectors"] if cond_input is not None and "x_vectors" in cond_input else None
+g = cond_input["d_vectors"] if cond_input is not None and "d_vectors" in cond_input else None
if g is not None:
-if self.external_speaker_embedding_dim:
+if self.external_d_vector_dim:
g = F.normalize(g).unsqueeze(-1)
else:
g = F.normalize(self.emb_g(g)).unsqueeze(-1) # [b, h, 1]
@@ -258,7 +258,7 @@ class GlowTTS(nn.Module):
@torch.no_grad()
def decoder_inference(
-self, y, y_lengths=None, cond_input={"x_vectors": None}
+self, y, y_lengths=None, cond_input={"d_vectors": None}
): # pylint: disable=dangerous-default-value
"""
Shapes:
@@ -268,10 +268,10 @@ class GlowTTS(nn.Module):
"""
y = y.transpose(1, 2)
y_max_length = y.size(2)
-g = cond_input["x_vectors"] if cond_input is not None and "x_vectors" in cond_input else None
+g = cond_input["d_vectors"] if cond_input is not None and "d_vectors" in cond_input else None
# norm speaker embeddings
if g is not None:
-if self.external_speaker_embedding_dim:
+if self.external_d_vector_dim:
g = F.normalize(g).unsqueeze(-1)
else:
g = F.normalize(self.emb_g(g)).unsqueeze(-1) # [b, h, 1]
@@ -290,10 +290,10 @@ class GlowTTS(nn.Module):
return outputs
@torch.no_grad()
-def inference(self, x, x_lengths, cond_input={"x_vectors": None}): # pylint: disable=dangerous-default-value
-g = cond_input["x_vectors"] if cond_input is not None and "x_vectors" in cond_input else None
+def inference(self, x, x_lengths, cond_input={"d_vectors": None}): # pylint: disable=dangerous-default-value
+g = cond_input["d_vectors"] if cond_input is not None and "d_vectors" in cond_input else None
if g is not None:
-if self.speaker_embedding_dim:
+if self.d_vector_dim:
g = F.normalize(g).unsqueeze(-1)
else:
g = F.normalize(self.emb_g(g)).unsqueeze(-1) # [b, h]
@@ -338,9 +338,9 @@ class GlowTTS(nn.Module):
text_lengths = batch["text_lengths"]
mel_input = batch["mel_input"]
mel_lengths = batch["mel_lengths"]
-x_vectors = batch["x_vectors"]
-outputs = self.forward(text_input, text_lengths, mel_input, mel_lengths, cond_input={"x_vectors": x_vectors})
+d_vectors = batch["d_vectors"]
+outputs = self.forward(text_input, text_lengths, mel_input, mel_lengths, cond_input={"d_vectors": d_vectors})
loss_dict = criterion(
outputs["model_outputs"],


@@ -157,7 +157,7 @@ class SpeedySpeech(nn.Module):
return o_de, attn.transpose(1, 2)
def forward(
-self, x, x_lengths, y_lengths, dr, cond_input={"x_vectors": None, "speaker_ids": None}
+self, x, x_lengths, y_lengths, dr, cond_input={"d_vectors": None, "speaker_ids": None}
): # pylint: disable=unused-argument
"""
TODO: speaker embedding for speaker_ids
@@ -168,21 +168,21 @@ class SpeedySpeech(nn.Module):
dr: [B, T_max]
g: [B, C]
"""
-g = cond_input["x_vectors"] if "x_vectors" in cond_input else None
+g = cond_input["d_vectors"] if "d_vectors" in cond_input else None
o_en, o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g)
o_dr_log = self.duration_predictor(o_en_dp.detach(), x_mask)
o_de, attn = self._forward_decoder(o_en, o_en_dp, dr, x_mask, y_lengths, g=g)
outputs = {"model_outputs": o_de.transpose(1, 2), "durations_log": o_dr_log.squeeze(1), "alignments": attn}
return outputs
-def inference(self, x, cond_input={"x_vectors": None, "speaker_ids": None}): # pylint: disable=unused-argument
+def inference(self, x, cond_input={"d_vectors": None, "speaker_ids": None}): # pylint: disable=unused-argument
"""
Shapes:
x: [B, T_max]
x_lengths: [B]
g: [B, C]
"""
-g = cond_input["x_vectors"] if "x_vectors" in cond_input else None
+g = cond_input["d_vectors"] if "d_vectors" in cond_input else None
x_lengths = torch.tensor(x.shape[1:2]).to(x.device)
# input sequence should be greated than the max convolution size
inference_padding = 5
@@ -204,11 +204,11 @@ class SpeedySpeech(nn.Module):
text_lengths = batch["text_lengths"]
mel_input = batch["mel_input"]
mel_lengths = batch["mel_lengths"]
-x_vectors = batch["x_vectors"]
+d_vectors = batch["d_vectors"]
speaker_ids = batch["speaker_ids"]
durations = batch["durations"]
-cond_input = {"x_vectors": x_vectors, "speaker_ids": speaker_ids}
+cond_input = {"d_vectors": d_vectors, "speaker_ids": speaker_ids}
outputs = self.forward(text_input, text_lengths, mel_lengths, durations, cond_input)
# compute loss


@@ -42,7 +42,7 @@ class Tacotron(TacotronAbstract):
ddc_r (int, optional): reduction rate for the coarse decoder of double decoder consistency. Defaults to None.
encoder_in_features (int, optional): input channels for the encoder. Defaults to 512.
decoder_in_features (int, optional): input channels for the decoder. Defaults to 512.
-speaker_embedding_dim (int, optional): external speaker conditioning vector channels. Defaults to None.
+d_vector_dim (int, optional): external speaker conditioning vector channels. Defaults to None.
use_gst (bool, optional): enable/disable Global style token module.
gst (Coqpit, optional): Coqpit to initialize the GST module. If `None`, GST is disabled. Defaults to None.
memory_size (int, optional): size of the history queue fed to the prenet. Model feeds the last ```memory_size```
@@ -75,7 +75,7 @@ class Tacotron(TacotronAbstract):
ddc_r=None,
encoder_in_features=256,
decoder_in_features=256,
-speaker_embedding_dim=None,
+d_vector_dim=None,
use_gst=False,
gst=None,
memory_size=5,
@@ -104,7 +104,7 @@ class Tacotron(TacotronAbstract):
ddc_r,
encoder_in_features,
decoder_in_features,
-speaker_embedding_dim,
+d_vector_dim,
use_gst,
gst,
gradual_training,
@@ -112,14 +112,14 @@ class Tacotron(TacotronAbstract):
# speaker embedding layers
if self.num_speakers > 1:
-if not self.embeddings_per_sample:
-speaker_embedding_dim = 256
-self.speaker_embedding = nn.Embedding(self.num_speakers, speaker_embedding_dim)
+if not self.use_d_vectors:
+d_vector_dim = 256
+self.speaker_embedding = nn.Embedding(self.num_speakers, d_vector_dim)
self.speaker_embedding.weight.data.normal_(0, 0.3)
# speaker and gst embeddings is concat in decoder input
if self.num_speakers > 1:
-self.decoder_in_features += speaker_embedding_dim # add speaker embedding dim
+self.decoder_in_features += d_vector_dim # add speaker embedding dim
# embedding layer
self.embedding = nn.Embedding(num_chars, 256, padding_idx=0)
@@ -154,7 +154,7 @@ class Tacotron(TacotronAbstract):
if self.gst and self.use_gst:
self.gst_layer = GST(
num_mel=decoder_output_dim,
-speaker_embedding_dim=speaker_embedding_dim,
+d_vector_dim=d_vector_dim,
num_heads=gst.gst_num_heads,
num_style_tokens=gst.gst_num_style_tokens,
gst_embedding_dim=gst.gst_embedding_dim,
@@ -189,7 +189,7 @@ class Tacotron(TacotronAbstract):
text_lengths: [B]
mel_specs: [B, T_out, C]
mel_lengths: [B]
-cond_input: 'speaker_ids': [B, 1] and 'x_vectors':[B, C]
+cond_input: 'speaker_ids': [B, 1] and 'd_vectors':[B, C]
"""
outputs = {"alignments_backward": None, "decoder_outputs_backward": None}
inputs = self.embedding(text)
@@ -201,16 +201,16 @@ class Tacotron(TacotronAbstract):
# global style token
if self.gst and self.use_gst:
# B x gst_dim
-encoder_outputs = self.compute_gst(encoder_outputs, mel_specs, cond_input["x_vectors"])
+encoder_outputs = self.compute_gst(encoder_outputs, mel_specs, cond_input["d_vectors"])
# speaker embedding
if self.num_speakers > 1:
-if not self.embeddings_per_sample:
+if not self.use_d_vectors:
# B x 1 x speaker_embed_dim
-speaker_embeddings = self.speaker_embedding(cond_input["speaker_ids"])[:, None]
+embedded_speakers = self.speaker_embedding(cond_input["speaker_ids"])[:, None]
else:
# B x 1 x speaker_embed_dim
-speaker_embeddings = torch.unsqueeze(cond_input["x_vectors"], 1)
-encoder_outputs = self._concat_speaker_embedding(encoder_outputs, speaker_embeddings)
+embedded_speakers = torch.unsqueeze(cond_input["d_vectors"], 1)
+encoder_outputs = self._concat_speaker_embedding(encoder_outputs, embedded_speakers)
# decoder_outputs: B x decoder_in_features x T_out
# alignments: B x T_in x encoder_in_features
# stop_tokens: B x T_in
@@ -254,15 +254,15 @@ class Tacotron(TacotronAbstract):
encoder_outputs = self.encoder(inputs)
if self.gst and self.use_gst:
# B x gst_dim
-encoder_outputs = self.compute_gst(encoder_outputs, cond_input["style_mel"], cond_input["x_vectors"])
+encoder_outputs = self.compute_gst(encoder_outputs, cond_input["style_mel"], cond_input["d_vectors"])
if self.num_speakers > 1:
-if not self.embeddings_per_sample:
+if not self.use_d_vectors:
# B x 1 x speaker_embed_dim
-speaker_embeddings = self.speaker_embedding(cond_input["speaker_ids"])[:, None]
+embedded_speakers = self.speaker_embedding(cond_input["speaker_ids"])[:, None]
else:
# B x 1 x speaker_embed_dim
-speaker_embeddings = torch.unsqueeze(cond_input["x_vectors"], 1)
-encoder_outputs = self._concat_speaker_embedding(encoder_outputs, speaker_embeddings)
+embedded_speakers = torch.unsqueeze(cond_input["d_vectors"], 1)
+encoder_outputs = self._concat_speaker_embedding(encoder_outputs, embedded_speakers)
decoder_outputs, alignments, stop_tokens = self.decoder.inference(encoder_outputs)
postnet_outputs = self.postnet(decoder_outputs)
postnet_outputs = self.last_linear(postnet_outputs)
@@ -289,7 +289,7 @@ class Tacotron(TacotronAbstract):
linear_input = batch["linear_input"]
stop_targets = batch["stop_targets"]
speaker_ids = batch["speaker_ids"]
-x_vectors = batch["x_vectors"]
+d_vectors = batch["d_vectors"]
# forward pass model
outputs = self.forward(
@@ -297,7 +297,7 @@ class Tacotron(TacotronAbstract):
text_lengths,
mel_input,
mel_lengths,
-cond_input={"speaker_ids": speaker_ids, "x_vectors": x_vectors},
+cond_input={"speaker_ids": speaker_ids, "d_vectors": d_vectors},
)
# set the [alignment] lengths wrt reduction factor for guided attention
@@ -308,7 +308,7 @@ class Tacotron(TacotronAbstract):
else:
alignment_lengths = mel_lengths // self.decoder.r
-cond_input = {"speaker_ids": speaker_ids, "x_vectors": x_vectors}
+cond_input = {"speaker_ids": speaker_ids, "d_vectors": d_vectors}
outputs = self.forward(text_input, text_lengths, mel_input, mel_lengths, cond_input)
# compute loss


@@ -42,7 +42,7 @@ class Tacotron2(TacotronAbstract):
ddc_r (int, optional): reduction rate for the coarse decoder of double decoder consistency. Defaults to None.
encoder_in_features (int, optional): input channels for the encoder. Defaults to 512.
decoder_in_features (int, optional): input channels for the decoder. Defaults to 512.
-speaker_embedding_dim (int, optional): external speaker conditioning vector channels. Defaults to None.
+d_vector_dim (int, optional): external speaker conditioning vector channels. Defaults to None.
use_gst (bool, optional): enable/disable Global style token module.
gst (Coqpit, optional): Coqpit to initialize the GST module. If `None`, GST is disabled. Defaults to None.
gradual_training (List): Gradual training schedule. If None or `[]`, no gradual training is used.
@@ -73,7 +73,7 @@ class Tacotron2(TacotronAbstract):
ddc_r=None,
encoder_in_features=512,
decoder_in_features=512,
-speaker_embedding_dim=None,
+d_vector_dim=None,
use_gst=False,
gst=None,
gradual_training=None,
@@ -101,7 +101,7 @@ class Tacotron2(TacotronAbstract):
ddc_r,
encoder_in_features,
decoder_in_features,
-speaker_embedding_dim,
+d_vector_dim,
use_gst,
gst,
gradual_training,
@@ -109,14 +109,14 @@ class Tacotron2(TacotronAbstract):
# speaker embedding layer
if self.num_speakers > 1:
-if not self.embeddings_per_sample:
-speaker_embedding_dim = 512
-self.speaker_embedding = nn.Embedding(self.num_speakers, speaker_embedding_dim)
+if not self.use_d_vectors:
+d_vector_dim = 512
+self.speaker_embedding = nn.Embedding(self.num_speakers, d_vector_dim)
self.speaker_embedding.weight.data.normal_(0, 0.3)
# speaker and gst embeddings is concat in decoder input
if self.num_speakers > 1:
-self.decoder_in_features += speaker_embedding_dim # add speaker embedding dim
+self.decoder_in_features += d_vector_dim # add speaker embedding dim
# embedding layer
self.embedding = nn.Embedding(num_chars, 512, padding_idx=0)
@@ -142,13 +142,13 @@ class Tacotron2(TacotronAbstract):
self.postnet = Postnet(self.postnet_output_dim)
# setup prenet dropout
-self.decoder.prenet.dropout_at_inference = prenet_dropout_at_inference
+self.decoder.prenet.dropout_at_g = prenet_dropout_at_inference
# global style token layers
if self.gst and use_gst:
self.gst_layer = GST(
num_mel=decoder_output_dim,
-speaker_embedding_dim=speaker_embedding_dim,
+d_vector_dim=d_vector_dim,
num_heads=gst.gst_num_heads,
num_style_tokens=gst.gst_num_style_tokens,
gst_embedding_dim=gst.gst_embedding_dim,
@@ -189,7 +189,7 @@ class Tacotron2(TacotronAbstract):
text_lengths: [B]
mel_specs: [B, T_out, C]
mel_lengths: [B]
-cond_input: 'speaker_ids': [B, 1] and 'x_vectors':[B, C]
+cond_input: 'speaker_ids': [B, 1] and 'd_vectors':[B, C]
"""
cond_input = self._format_cond_input(cond_input)
outputs = {"alignments_backward": None, "decoder_outputs_backward": None}
@@ -202,15 +202,15 @@ class Tacotron2(TacotronAbstract):
encoder_outputs = self.encoder(embedded_inputs, text_lengths)
if self.gst and self.use_gst:
# B x gst_dim
-encoder_outputs = self.compute_gst(encoder_outputs, mel_specs, cond_input["x_vectors"])
+encoder_outputs = self.compute_gst(encoder_outputs, mel_specs, cond_input["d_vectors"])
if self.num_speakers > 1:
-if not self.embeddings_per_sample:
+if not self.use_d_vectors:
# B x 1 x speaker_embed_dim
-speaker_embeddings = self.speaker_embedding(cond_input["speaker_ids"])[:, None]
+embedded_speakers = self.speaker_embedding(cond_input["speaker_ids"])[:, None]
else:
# B x 1 x speaker_embed_dim
-speaker_embeddings = torch.unsqueeze(cond_input["x_vectors"], 1)
-encoder_outputs = self._concat_speaker_embedding(encoder_outputs, speaker_embeddings)
+embedded_speakers = torch.unsqueeze(cond_input["d_vectors"], 1)
+encoder_outputs = self._concat_speaker_embedding(encoder_outputs, embedded_speakers)
encoder_outputs = encoder_outputs * input_mask.unsqueeze(2).expand_as(encoder_outputs)
@@ -255,15 +255,15 @@ class Tacotron2(TacotronAbstract):
if self.gst and self.use_gst:
# B x gst_dim
-encoder_outputs = self.compute_gst(encoder_outputs, cond_input["style_mel"], cond_input["x_vectors"])
+encoder_outputs = self.compute_gst(encoder_outputs, cond_input["style_mel"], cond_input["d_vectors"])
if self.num_speakers > 1:
if not self.embeddings_per_sample:
x_vector = self.speaker_embedding(cond_input['speaker_ids'])[:, None]
x_vector = torch.unsqueeze(x_vector, 0).transpose(1, 2)
else:
-x_vector = cond_input["x_vectors"]
-encoder_outputs = self._concat_speaker_embedding(encoder_outputs, x_vector)
+embedded_speakers = cond_input["d_vectors"]
+encoder_outputs = self._concat_speaker_embedding(encoder_outputs, embedded_speakers)
decoder_outputs, alignments, stop_tokens = self.decoder.inference(encoder_outputs)
postnet_outputs = self.postnet(decoder_outputs)
@@ -291,7 +291,7 @@ class Tacotron2(TacotronAbstract):
linear_input = batch["linear_input"]
stop_targets = batch["stop_targets"]
speaker_ids = batch["speaker_ids"]
-x_vectors = batch["x_vectors"]
+d_vectors = batch["d_vectors"]
# forward pass model
outputs = self.forward(
@@ -299,7 +299,7 @@ class Tacotron2(TacotronAbstract):
text_lengths,
mel_input,
mel_lengths,
-cond_input={"speaker_ids": speaker_ids, "x_vectors": x_vectors},
+cond_input={"speaker_ids": speaker_ids, "d_vectors": d_vectors},
)
# set the [alignment] lengths wrt reduction factor for guided attention
@@ -310,7 +310,7 @@ class Tacotron2(TacotronAbstract):
else:
alignment_lengths = mel_lengths // self.decoder.r
-cond_input = {"speaker_ids": speaker_ids, "x_vectors": x_vectors}
+cond_input = {"speaker_ids": speaker_ids, "d_vectors": d_vectors}
outputs = self.forward(text_input, text_lengths, mel_input, mel_lengths, cond_input)
# compute loss


@@ -35,7 +35,7 @@ class TacotronAbstract(ABC, nn.Module):
ddc_r=None,
encoder_in_features=512,
decoder_in_features=512,
-speaker_embedding_dim=None,
+d_vector_dim=None,
use_gst=False,
gst=None,
gradual_training=None,
@@ -66,7 +66,7 @@ class TacotronAbstract(ABC, nn.Module):
self.separate_stopnet = separate_stopnet
self.encoder_in_features = encoder_in_features
self.decoder_in_features = decoder_in_features
-self.speaker_embedding_dim = speaker_embedding_dim
+self.d_vector_dim = d_vector_dim
self.gradual_training = gradual_training
# layers
@@ -76,12 +76,12 @@ class TacotronAbstract(ABC, nn.Module):
self.postnet = None
# multispeaker
-if self.speaker_embedding_dim is None:
-# if speaker_embedding_dim is None we need use the nn.Embedding, with default speaker_embedding_dim
-self.embeddings_per_sample = False
+if self.d_vector_dim is None:
+# if d_vector_dim is None we need use the nn.Embedding, with default d_vector_dim
+self.use_d_vectors = False
else:
-# if speaker_embedding_dim is not None we need use speaker embedding per sample
-self.embeddings_per_sample = True
+# if d_vector_dim is not None we need use speaker embedding per sample
+self.use_d_vectors = True
# global style token
if self.gst and use_gst:
@@ -89,8 +89,8 @@ class TacotronAbstract(ABC, nn.Module):
self.gst_layer = None
# model states
-self.speaker_embeddings = None
-self.speaker_embeddings_projected = None
+self.embedded_speakers = None
+self.embedded_speakers_projected = None
# additional layers
self.decoder_backward = None
@@ -98,15 +98,15 @@ class TacotronAbstract(ABC, nn.Module):
@staticmethod
def _format_cond_input(cond_input: Dict) -> Dict:
-return format_cond_input({"x_vectors": None, "speaker_ids": None}, cond_input)
+return format_cond_input({"d_vectors": None, "speaker_ids": None}, cond_input)
#############################
# INIT FUNCTIONS
#############################
def _init_states(self):
-self.speaker_embeddings = None
-self.speaker_embeddings_projected = None
+self.embedded_speakers = None
+self.embedded_speakers_projected = None
def _init_backward_decoder(self):
self.decoder_backward = copy.deepcopy(self.decoder)
@@ -188,9 +188,9 @@ class TacotronAbstract(ABC, nn.Module):
if hasattr(self, "speaker_embedding") and speaker_ids is None:
raise RuntimeError(" [!] Model has speaker embedding layer but speaker_id is not provided")
if hasattr(self, "speaker_embedding") and speaker_ids is not None:
-self.speaker_embeddings = self.speaker_embedding(speaker_ids).unsqueeze(1)
+self.embedded_speakers = self.speaker_embedding(speaker_ids).unsqueeze(1)
if hasattr(self, "speaker_project_mel") and speaker_ids is not None:
-self.speaker_embeddings_projected = self.speaker_project_mel(self.speaker_embeddings).squeeze(1)
+self.embedded_speakers_projected = self.speaker_project_mel(self.embedded_speakers).squeeze(1)
def compute_gst(self, inputs, style_input, speaker_embedding=None):
"""Compute global style token"""
@@ -213,15 +213,15 @@ class TacotronAbstract(ABC, nn.Module):
return inputs
@staticmethod
-def _add_speaker_embedding(outputs, speaker_embeddings):
-speaker_embeddings_ = speaker_embeddings.expand(outputs.size(0), outputs.size(1), -1)
-outputs = outputs + speaker_embeddings_
+def _add_speaker_embedding(outputs, embedded_speakers):
+embedded_speakers_ = embedded_speakers.expand(outputs.size(0), outputs.size(1), -1)
+outputs = outputs + embedded_speakers_
return outputs
@staticmethod
-def _concat_speaker_embedding(outputs, speaker_embeddings):
-speaker_embeddings_ = speaker_embeddings.expand(outputs.size(0), outputs.size(1), -1)
-outputs = torch.cat([outputs, speaker_embeddings_], dim=-1)
+def _concat_speaker_embedding(outputs, embedded_speakers):
+embedded_speakers_ = embedded_speakers.expand(outputs.size(0), outputs.size(1), -1)
+outputs = torch.cat([outputs, embedded_speakers_], dim=-1)
return outputs
#############################
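
The renamed `_concat_speaker_embedding` helper is the piece the Tacotron models above rely on; a self-contained version of the same idea, with example shapes:

```python
import torch

def concat_speaker_embedding(outputs, embedded_speakers):
    # outputs: [B, T, C_enc]; embedded_speakers: [B, 1, C_spk] (one vector per batch item)
    embedded_speakers_ = embedded_speakers.expand(outputs.size(0), outputs.size(1), -1)
    return torch.cat([outputs, embedded_speakers_], dim=-1)  # [B, T, C_enc + C_spk]

encoder_outputs = torch.randn(2, 50, 256)
embedded_speakers = torch.randn(2, 1, 512)
print(concat_speaker_embedding(encoder_outputs, embedded_speakers).shape)  # torch.Size([2, 50, 768])
```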


@ -52,8 +52,8 @@ def get_speaker_manager(c, args, meta_data_train):
raise RuntimeError( raise RuntimeError(
"You must copy the file speakers.json to restore_path, or set a valid file in CONFIG.external_speaker_embedding_file" "You must copy the file speakers.json to restore_path, or set a valid file in CONFIG.external_speaker_embedding_file"
) )
speaker_manager.load_x_vectors_file(c.external_speaker_embedding_file) speaker_manager.load_d_vectors_file(c.external_speaker_embedding_file)
speaker_manager.set_x_vectors_from_file(speakers_file) speaker_manager.set_d_vectors_from_file(speakers_file)
elif not c.use_external_speaker_embedding_file: # restor speaker manager with speaker ID file. elif not c.use_external_speaker_embedding_file: # restor speaker manager with speaker ID file.
speakers_file = os.path.dirname(args.restore_path) speakers_file = os.path.dirname(args.restore_path)
speaker_ids_from_data = speaker_manager.speaker_ids speaker_ids_from_data = speaker_manager.speaker_ids
@ -63,7 +63,7 @@ def get_speaker_manager(c, args, meta_data_train):
), " [!] You cannot introduce new speakers to a pre-trained model." ), " [!] You cannot introduce new speakers to a pre-trained model."
elif c.use_external_speaker_embedding_file and c.external_speaker_embedding_file: elif c.use_external_speaker_embedding_file and c.external_speaker_embedding_file:
# new speaker manager with external speaker embeddings. # new speaker manager with external speaker embeddings.
speaker_manager.set_x_vectors_from_file(c.external_speaker_embedding_file) speaker_manager.set_d_vectors_from_file(c.external_speaker_embedding_file)
elif ( elif (
c.use_external_speaker_embedding_file and not c.external_speaker_embedding_file c.use_external_speaker_embedding_file and not c.external_speaker_embedding_file
): # new speaker manager with speaker IDs file. ): # new speaker manager with speaker IDs file.
@ -88,7 +88,7 @@ class SpeakerManager:
{ {
'clip_name.wav':{ 'clip_name.wav':{
'name': 'speakerA', 'name': 'speakerA',
'embedding'[<x_vector_values>] 'embedding'[<d_vector_values>]
}, },
... ...
} }
@ -103,10 +103,10 @@ class SpeakerManager:
>>> # load a sample audio and compute embedding >>> # load a sample audio and compute embedding
>>> waveform = ap.load_wav(sample_wav_path) >>> waveform = ap.load_wav(sample_wav_path)
>>> mel = ap.melspectrogram(waveform) >>> mel = ap.melspectrogram(waveform)
>>> x_vector = manager.compute_x_vector(mel.T) >>> d_vector = manager.compute_d_vector(mel.T)
Args: Args:
x_vectors_file_path (str, optional): Path to the metafile including x vectors. Defaults to "". d_vectors_file_path (str, optional): Path to the metafile including x vectors. Defaults to "".
speaker_id_file_path (str, optional): Path to the metafile that maps speaker names to ids used by speaker_id_file_path (str, optional): Path to the metafile that maps speaker names to ids used by
TTS models. Defaults to "". TTS models. Defaults to "".
encoder_model_path (str, optional): Path to the speaker encoder model file. Defaults to "". encoder_model_path (str, optional): Path to the speaker encoder model file. Defaults to "".
@ -116,15 +116,15 @@ class SpeakerManager:
def __init__( def __init__(
self, self,
data_items: List[List[Any]] = None, data_items: List[List[Any]] = None,
x_vectors_file_path: str = "", d_vectors_file_path: str = "",
speaker_id_file_path: str = "", speaker_id_file_path: str = "",
encoder_model_path: str = "", encoder_model_path: str = "",
encoder_config_path: str = "", encoder_config_path: str = "",
): ):
self.data_items = [] self.data_items = []
self.x_vectors = {} self.d_vectors = {}
self.speaker_ids = [] self.speaker_ids = {}
self.clip_ids = [] self.clip_ids = []
self.speaker_encoder = None self.speaker_encoder = None
self.speaker_encoder_ap = None self.speaker_encoder_ap = None
@ -132,8 +132,8 @@ class SpeakerManager:
if data_items: if data_items:
self.speaker_ids, _ = self.parse_speakers_from_data(self.data_items) self.speaker_ids, _ = self.parse_speakers_from_data(self.data_items)
if x_vectors_file_path: if d_vectors_file_path:
self.set_x_vectors_from_file(x_vectors_file_path) self.set_d_vectors_from_file(d_vectors_file_path)
if speaker_id_file_path: if speaker_id_file_path:
self.set_speaker_ids_from_file(speaker_id_file_path) self.set_speaker_ids_from_file(speaker_id_file_path)
@ -156,10 +156,10 @@ class SpeakerManager:
return len(self.speaker_ids) return len(self.speaker_ids)
@property @property
def x_vector_dim(self): def d_vector_dim(self):
"""Dimensionality of x_vectors. If x_vectors are not loaded, returns zero.""" """Dimensionality of d_vectors. If d_vectors are not loaded, returns zero."""
if self.x_vectors: if self.d_vectors:
return len(self.x_vectors[list(self.x_vectors.keys())[0]]["embedding"]) return len(self.d_vectors[list(self.d_vectors.keys())[0]]["embedding"])
return 0 return 0
@staticmethod @staticmethod
@ -201,73 +201,73 @@ class SpeakerManager:
""" """
self._save_json(file_path, self.speaker_ids) self._save_json(file_path, self.speaker_ids)
def save_x_vectors_to_file(self, file_path: str) -> None: def save_d_vectors_to_file(self, file_path: str) -> None:
"""Save x_vectors to a json file. """Save d_vectors to a json file.
Args: Args:
file_path (str): Path to the output file. file_path (str): Path to the output file.
""" """
self._save_json(file_path, self.x_vectors) self._save_json(file_path, self.d_vectors)
def set_x_vectors_from_file(self, file_path: str) -> None: def set_d_vectors_from_file(self, file_path: str) -> None:
"""Load x_vectors from a json file. """Load d_vectors from a json file.
Args: Args:
file_path (str): Path to the target json file. file_path (str): Path to the target json file.
""" """
self.x_vectors = self._load_json(file_path) self.d_vectors = self._load_json(file_path)
self.speaker_ids = list(set(sorted(x["name"] for x in self.x_vectors.values()))) self.speaker_ids = list(set(sorted(x["name"] for x in self.d_vectors.values())))
self.clip_ids = list(set(sorted(clip_name for clip_name in self.x_vectors.keys()))) self.clip_ids = list(set(sorted(clip_name for clip_name in self.d_vectors.keys())))
def get_x_vector_by_clip(self, clip_idx: str) -> List: def get_d_vector_by_clip(self, clip_idx: str) -> List:
"""Get x_vector by clip ID. """Get d_vector by clip ID.
Args: Args:
clip_idx (str): Target clip ID. clip_idx (str): Target clip ID.
Returns: Returns:
List: x_vector as a list. List: d_vector as a list.
""" """
return self.x_vectors[clip_idx]["embedding"] return self.d_vectors[clip_idx]["embedding"]
def get_x_vectors_by_speaker(self, speaker_idx: str) -> List[List]: def get_d_vectors_by_speaker(self, speaker_idx: str) -> List[List]:
"""Get all x_vectors of a speaker. """Get all d_vectors of a speaker.
Args: Args:
speaker_idx (str): Target speaker ID. speaker_idx (str): Target speaker ID.
Returns: Returns:
List[List]: all the x_vectors of the given speaker. List[List]: all the d_vectors of the given speaker.
""" """
return [x["embedding"] for x in self.x_vectors.values() if x["name"] == speaker_idx] return [x["embedding"] for x in self.d_vectors.values() if x["name"] == speaker_idx]
def get_mean_x_vector(self, speaker_idx: str, num_samples: int = None, randomize: bool = False) -> np.ndarray: def get_mean_d_vector(self, speaker_idx: str, num_samples: int = None, randomize: bool = False) -> np.ndarray:
"""Get mean x_vector of a speaker ID. """Get mean d_vector of a speaker ID.
Args: Args:
speaker_idx (str): Target speaker ID. speaker_idx (str): Target speaker ID.
num_samples (int, optional): Number of samples to be averaged. Defaults to None. num_samples (int, optional): Number of samples to be averaged. Defaults to None.
randomize (bool, optional): Pick random `num_samples`of x_vectors. Defaults to False. randomize (bool, optional): Pick random `num_samples`of d_vectors. Defaults to False.
Returns: Returns:
np.ndarray: Mean x_vector. np.ndarray: Mean d_vector.
""" """
x_vectors = self.get_x_vectors_by_speaker(speaker_idx) d_vectors = self.get_d_vectors_by_speaker(speaker_idx)
if num_samples is None: if num_samples is None:
x_vectors = np.stack(x_vectors).mean(0) d_vectors = np.stack(d_vectors).mean(0)
else: else:
assert len(x_vectors) >= num_samples, f" [!] speaker {speaker_idx} has number of samples < {num_samples}" assert len(d_vectors) >= num_samples, f" [!] speaker {speaker_idx} has number of samples < {num_samples}"
if randomize: if randomize:
x_vectors = np.stack(random.choices(x_vectors, k=num_samples)).mean(0) d_vectors = np.stack(random.choices(d_vectors, k=num_samples)).mean(0)
else: else:
x_vectors = np.stack(x_vectors[:num_samples]).mean(0) d_vectors = np.stack(d_vectors[:num_samples]).mean(0)
return x_vectors return d_vectors
def get_speakers(self) -> List: def get_speakers(self) -> List:
return self.speaker_ids return self.speaker_ids
def get_clips(self) -> List: def get_clips(self) -> List:
return sorted(self.x_vectors.keys()) return sorted(self.d_vectors.keys())
def init_speaker_encoder(self, model_path: str, config_path: str) -> None: def init_speaker_encoder(self, model_path: str, config_path: str) -> None:
"""Initialize a speaker encoder model. """Initialize a speaker encoder model.
@ -284,14 +284,14 @@ class SpeakerManager:
self.speaker_encoder_ap.do_sound_norm = True self.speaker_encoder_ap.do_sound_norm = True
self.speaker_encoder_ap.do_trim_silence = True self.speaker_encoder_ap.do_trim_silence = True
def compute_x_vector_from_clip(self, wav_file: Union[str, list]) -> list: def compute_d_vector_from_clip(self, wav_file: Union[str, list]) -> list:
"""Compute a x_vector from a given audio file. """Compute a d_vector from a given audio file.
Args: Args:
wav_file (Union[str, list]): Target file path. wav_file (Union[str, list]): Target file path.
Returns: Returns:
list: Computed x_vector. list: Computed d_vector.
""" """
def _compute(wav_file: str): def _compute(wav_file: str):
@ -299,30 +299,30 @@ class SpeakerManager:
spec = self.speaker_encoder_ap.melspectrogram(waveform) spec = self.speaker_encoder_ap.melspectrogram(waveform)
spec = torch.from_numpy(spec.T) spec = torch.from_numpy(spec.T)
spec = spec.unsqueeze(0) spec = spec.unsqueeze(0)
x_vector = self.speaker_encoder.compute_embedding(spec) d_vector = self.speaker_encoder.compute_embedding(spec)
return x_vector return d_vector
if isinstance(wav_file, list): if isinstance(wav_file, list):
# compute the mean x_vector # compute the mean d_vector
x_vectors = None d_vectors = None
for wf in wav_file: for wf in wav_file:
x_vector = _compute(wf) d_vector = _compute(wf)
if x_vectors is None: if d_vectors is None:
x_vectors = x_vector d_vectors = d_vector
else: else:
x_vectors += x_vector d_vectors += d_vector
return (x_vectors / len(wav_file))[0].tolist() return (d_vectors / len(wav_file))[0].tolist()
x_vector = _compute(wav_file) d_vector = _compute(wav_file)
return x_vector[0].tolist() return d_vector[0].tolist()
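A sketch of on-the-fly d_vector extraction with the speaker encoder; the checkpoint, config, and wav paths are placeholders:

    from TTS.tts.utils.speakers import SpeakerManager

    manager = SpeakerManager(
        encoder_model_path="encoder_checkpoint.pth.tar",  # hypothetical paths
        encoder_config_path="encoder_config.json",
    )
    single = manager.compute_d_vector_from_clip("clip_0.wav")                  # one utterance -> one d_vector
    pooled = manager.compute_d_vector_from_clip(["clip_0.wav", "clip_1.wav"])  # list -> mean of per-clip d_vectors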
def compute_x_vector(self, feats: Union[torch.Tensor, np.ndarray]) -> List: def compute_d_vector(self, feats: Union[torch.Tensor, np.ndarray]) -> List:
"""Compute x_vector from features. """Compute d_vector from features.
Args: Args:
feats (Union[torch.Tensor, np.ndarray]): Input features. feats (Union[torch.Tensor, np.ndarray]): Input features.
Returns: Returns:
List: computed x_vector. List: computed d_vector.
""" """
if isinstance(feats, np.ndarray): if isinstance(feats, np.ndarray):
feats = torch.from_numpy(feats) feats = torch.from_numpy(feats)
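`compute_d_vector` accepts precomputed features instead of a file; a sketch mirroring the unit test further down in this diff, assuming the encoder-initialised manager above and an AudioProcessor instance `ap` configured for the encoder:

    waveform = ap.load_wav("clip_0.wav")        # hypothetical file
    mel = ap.melspectrogram(waveform)
    d_vector = manager.compute_d_vector(mel.T)  # features shaped (frames, n_mels)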

View File

@ -64,9 +64,9 @@ def compute_style_mel(style_wav, ap, cuda=False):
return style_mel return style_mel
def run_model_torch(model, inputs, speaker_id=None, style_mel=None, x_vector=None): def run_model_torch(model, inputs, speaker_id=None, style_mel=None, d_vector=None):
outputs = model.inference( outputs = model.inference(
inputs, cond_input={"speaker_ids": speaker_id, "x_vector": x_vector, "style_mel": style_mel} inputs, cond_input={"speaker_ids": speaker_id, "d_vector": d_vector, "style_mel": style_mel}
) )
return outputs return outputs
@ -139,13 +139,13 @@ def speaker_id_to_torch(speaker_id, cuda=False):
return speaker_id return speaker_id
def embedding_to_torch(x_vector, cuda=False): def embedding_to_torch(d_vector, cuda=False):
if x_vector is not None: if d_vector is not None:
x_vector = np.asarray(x_vector) d_vector = np.asarray(d_vector)
x_vector = torch.from_numpy(x_vector).unsqueeze(0).type(torch.FloatTensor) d_vector = torch.from_numpy(d_vector).unsqueeze(0).type(torch.FloatTensor)
if cuda: if cuda:
return x_vector.cuda() return d_vector.cuda()
return x_vector return d_vector
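Together these helpers feed an external d_vector into inference; a sketch assuming `model` and `text_inputs` are already prepared, the two functions above are in scope, and `manager` holds stored embeddings:

    d_vector = manager.get_d_vector_by_clip(manager.clip_ids[0])
    d_vector = embedding_to_torch(d_vector, cuda=False)  # -> FloatTensor of shape (1, D)
    outputs = run_model_torch(model, text_inputs, speaker_id=None, d_vector=d_vector)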
# TODO: perform GL with pytorch for batching # TODO: perform GL with pytorch for batching
@ -177,7 +177,7 @@ def synthesis(
enable_eos_bos_chars=False, # pylint: disable=unused-argument enable_eos_bos_chars=False, # pylint: disable=unused-argument
use_griffin_lim=False, use_griffin_lim=False,
do_trim_silence=False, do_trim_silence=False,
x_vector=None, d_vector=None,
backend="torch", backend="torch",
): ):
"""Synthesize voice for the given text. """Synthesize voice for the given text.
@ -209,8 +209,8 @@ def synthesis(
if speaker_id is not None: if speaker_id is not None:
speaker_id = speaker_id_to_torch(speaker_id, cuda=use_cuda) speaker_id = speaker_id_to_torch(speaker_id, cuda=use_cuda)
if x_vector is not None: if d_vector is not None:
x_vector = embedding_to_torch(x_vector, cuda=use_cuda) d_vector = embedding_to_torch(d_vector, cuda=use_cuda)
if not isinstance(style_mel, dict): if not isinstance(style_mel, dict):
style_mel = numpy_to_torch(style_mel, torch.float, cuda=use_cuda) style_mel = numpy_to_torch(style_mel, torch.float, cuda=use_cuda)
@ -227,7 +227,7 @@ def synthesis(
text_inputs = tf.expand_dims(text_inputs, 0) text_inputs = tf.expand_dims(text_inputs, 0)
# synthesize voice # synthesize voice
if backend == "torch": if backend == "torch":
outputs = run_model_torch(model, text_inputs, speaker_id, style_mel, x_vector=x_vector) outputs = run_model_torch(model, text_inputs, speaker_id, style_mel, d_vector=d_vector)
model_outputs = outputs["model_outputs"] model_outputs = outputs["model_outputs"]
model_outputs = model_outputs[0].data.cpu().numpy() model_outputs = model_outputs[0].data.cpu().numpy()
alignments = outputs["alignments"] alignments = outputs["alignments"]

View File

@ -63,7 +63,7 @@ class Synthesizer(object):
self.speaker_manager = None self.speaker_manager = None
self.num_speakers = 0 self.num_speakers = 0
self.tts_speakers = {} self.tts_speakers = {}
self.speaker_embedding_dim = 0 self.d_vector_dim = 0
self.seg = self._get_segmenter("en") self.seg = self._get_segmenter("en")
self.use_cuda = use_cuda self.use_cuda = use_cuda
@ -98,9 +98,9 @@ class Synthesizer(object):
self.speaker_manager = SpeakerManager( self.speaker_manager = SpeakerManager(
encoder_model_path=self.encoder_checkpoint, encoder_config_path=self.encoder_config encoder_model_path=self.encoder_checkpoint, encoder_config_path=self.encoder_config
) )
self.speaker_manager.load_x_vectors_file(self.tts_config.get("external_speaker_embedding_file", speaker_file)) self.speaker_manager.load_d_vectors_file(self.tts_config.get("external_speaker_embedding_file", speaker_file))
self.num_speakers = self.speaker_manager.num_speakers self.num_speakers = self.speaker_manager.num_speakers
self.speaker_embedding_dim = self.speaker_manager.x_vector_dim self.d_vector_dim = self.speaker_manager.d_vector_dim
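The dimensionality read here is what model construction consumes; a sketch assuming `num_chars` and the loaded config `c` are already available, using the project's `setup_model` factory and an illustrative file path:

    manager = SpeakerManager(d_vectors_file_path="speakers.json")  # hypothetical path
    model = setup_model(num_chars, manager.num_speakers, c, d_vector_dim=manager.d_vector_dim)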
def _load_tts(self, tts_checkpoint: str, tts_config_path: str, use_cuda: bool) -> None: def _load_tts(self, tts_checkpoint: str, tts_config_path: str, use_cuda: bool) -> None:
"""Load the TTS model. """Load the TTS model.
@ -135,7 +135,7 @@ class Synthesizer(object):
self.input_size, self.input_size,
num_speakers=self.num_speakers, num_speakers=self.num_speakers,
c=self.tts_config, c=self.tts_config,
speaker_embedding_dim=self.speaker_embedding_dim, d_vector_dim=self.d_vector_dim,
) )
self.tts_model.load_checkpoint(self.tts_config, tts_checkpoint, eval=True) self.tts_model.load_checkpoint(self.tts_config, tts_checkpoint, eval=True)
if use_cuda: if use_cuda:
@ -197,9 +197,9 @@ class Synthesizer(object):
print(sens) print(sens)
if self.tts_speakers_file: if self.tts_speakers_file:
# get the speaker embedding from the saved x_vectors. # get the speaker embedding from the saved d_vectors.
if speaker_idx and isinstance(speaker_idx, str): if speaker_idx and isinstance(speaker_idx, str):
speaker_embedding = self.speaker_manager.get_x_vectors_by_speaker(speaker_idx)[0] speaker_embedding = self.speaker_manager.get_d_vectors_by_speaker(speaker_idx)[0]
elif not speaker_idx and not speaker_wav: elif not speaker_idx and not speaker_wav:
raise ValueError( raise ValueError(
" [!] Look like you use a multi-speaker model. " " [!] Look like you use a multi-speaker model. "
@ -214,9 +214,9 @@ class Synthesizer(object):
"Define path for speaker.json if it is a multi-speaker model or remove defined speaker idx. " "Define path for speaker.json if it is a multi-speaker model or remove defined speaker idx. "
) )
# compute a new x_vector from the given clip. # compute a new d_vector from the given clip.
if speaker_wav is not None: if speaker_wav is not None:
speaker_embedding = self.speaker_manager.compute_x_vector_from_clip(speaker_wav) speaker_embedding = self.speaker_manager.compute_d_vector_from_clip(speaker_wav)
use_gl = self.vocoder_model is None use_gl = self.vocoder_model is None
@ -232,7 +232,7 @@ class Synthesizer(object):
style_wav=style_wav, style_wav=style_wav,
enable_eos_bos_chars=self.tts_config.enable_eos_bos_chars, enable_eos_bos_chars=self.tts_config.enable_eos_bos_chars,
use_griffin_lim=use_gl, use_griffin_lim=use_gl,
x_vector=speaker_embedding, d_vector=speaker_embedding,
) )
waveform = outputs["wav"] waveform = outputs["wav"]
mel_postnet_spec = outputs["model_outputs"] mel_postnet_spec = outputs["model_outputs"]
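In short, the embedding passed as `d_vector` comes from one of two places; a standalone sketch with illustrative names:

    # Known speaker: take a stored d_vector from the speakers file.
    speaker_embedding = manager.get_d_vectors_by_speaker("speaker_a")[0]
    # Unseen speaker: derive a d_vector from a reference recording.
    speaker_embedding = manager.compute_d_vector_from_clip("reference.wav")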

View File

@ -22,7 +22,7 @@ class TestExtractTTSSpectrograms(unittest.TestCase):
c = load_config(config_path) c = load_config(config_path)
# create model # create model
num_chars = len(phonemes if c.use_phonemes else symbols) num_chars = len(phonemes if c.use_phonemes else symbols)
model = setup_model(num_chars, 1, c, speaker_embedding_dim=None) model = setup_model(num_chars, 1, c, d_vector_dim=None)
# save model # save model
torch.save({"model": model.state_dict()}, checkpoint_path) torch.save({"model": model.state_dict()}, checkpoint_path)
# run test # run test
@ -41,7 +41,7 @@ class TestExtractTTSSpectrograms(unittest.TestCase):
c = load_config(config_path) c = load_config(config_path)
# create model # create model
num_chars = len(phonemes if c.use_phonemes else symbols) num_chars = len(phonemes if c.use_phonemes else symbols)
model = setup_model(num_chars, 1, c, speaker_embedding_dim=None) model = setup_model(num_chars, 1, c, d_vector_dim=None)
# save model # save model
torch.save({"model": model.state_dict()}, checkpoint_path) torch.save({"model": model.state_dict()}, checkpoint_path)
# run test # run test
@ -60,7 +60,7 @@ class TestExtractTTSSpectrograms(unittest.TestCase):
c = load_config(config_path) c = load_config(config_path)
# create model # create model
num_chars = len(phonemes if c.use_phonemes else symbols) num_chars = len(phonemes if c.use_phonemes else symbols)
model = setup_model(num_chars, 1, c, speaker_embedding_dim=None) model = setup_model(num_chars, 1, c, d_vector_dim=None)
# save model # save model
torch.save({"model": model.state_dict()}, checkpoint_path) torch.save({"model": model.state_dict()}, checkpoint_path)
# run test # run test

View File

@ -15,11 +15,11 @@ encoder_config_path = os.path.join(get_tests_input_path(), "test_speaker_encoder
encoder_model_path = os.path.join(get_tests_input_path(), "checkpoint_0.pth.tar") encoder_model_path = os.path.join(get_tests_input_path(), "checkpoint_0.pth.tar")
sample_wav_path = os.path.join(get_tests_input_path(), "../data/ljspeech/wavs/LJ001-0001.wav") sample_wav_path = os.path.join(get_tests_input_path(), "../data/ljspeech/wavs/LJ001-0001.wav")
sample_wav_path2 = os.path.join(get_tests_input_path(), "../data/ljspeech/wavs/LJ001-0002.wav") sample_wav_path2 = os.path.join(get_tests_input_path(), "../data/ljspeech/wavs/LJ001-0002.wav")
x_vectors_file_path = os.path.join(get_tests_input_path(), "../data/dummy_speakers.json") d_vectors_file_path = os.path.join(get_tests_input_path(), "../data/dummy_speakers.json")
class SpeakerManagerTest(unittest.TestCase): class SpeakerManagerTest(unittest.TestCase):
"""Test SpeakerManager for loading embedding files and computing x_vectors from waveforms""" """Test SpeakerManager for loading embedding files and computing d_vectors from waveforms"""
@staticmethod @staticmethod
def test_speaker_embedding(): def test_speaker_embedding():
@ -38,38 +38,38 @@ class SpeakerManagerTest(unittest.TestCase):
# load a sample audio and compute embedding # load a sample audio and compute embedding
waveform = ap.load_wav(sample_wav_path) waveform = ap.load_wav(sample_wav_path)
mel = ap.melspectrogram(waveform) mel = ap.melspectrogram(waveform)
x_vector = manager.compute_x_vector(mel.T) d_vector = manager.compute_d_vector(mel.T)
assert x_vector.shape[1] == 256 assert d_vector.shape[1] == 256
# compute x_vector directly from an input file # compute d_vector directly from an input file
x_vector = manager.compute_x_vector_from_clip(sample_wav_path) d_vector = manager.compute_d_vector_from_clip(sample_wav_path)
x_vector2 = manager.compute_x_vector_from_clip(sample_wav_path) d_vector2 = manager.compute_d_vector_from_clip(sample_wav_path)
x_vector = torch.FloatTensor(x_vector) d_vector = torch.FloatTensor(d_vector)
x_vector2 = torch.FloatTensor(x_vector2) d_vector2 = torch.FloatTensor(d_vector2)
assert x_vector.shape[0] == 256 assert d_vector.shape[0] == 256
assert (x_vector - x_vector2).sum() == 0.0 assert (d_vector - d_vector2).sum() == 0.0
# compute x_vector from a list of wav files. # compute d_vector from a list of wav files.
x_vector3 = manager.compute_x_vector_from_clip([sample_wav_path, sample_wav_path2]) d_vector3 = manager.compute_d_vector_from_clip([sample_wav_path, sample_wav_path2])
x_vector3 = torch.FloatTensor(x_vector3) d_vector3 = torch.FloatTensor(d_vector3)
assert x_vector3.shape[0] == 256 assert d_vector3.shape[0] == 256
assert (x_vector - x_vector3).sum() != 0.0 assert (d_vector - d_vector3).sum() != 0.0
# remove dummy model # remove dummy model
os.remove(encoder_model_path) os.remove(encoder_model_path)
@staticmethod @staticmethod
def test_speakers_file_processing(): def test_speakers_file_processing():
manager = SpeakerManager(x_vectors_file_path=x_vectors_file_path) manager = SpeakerManager(d_vectors_file_path=d_vectors_file_path)
print(manager.num_speakers) print(manager.num_speakers)
print(manager.x_vector_dim) print(manager.d_vector_dim)
print(manager.clip_ids) print(manager.clip_ids)
x_vector = manager.get_x_vector_by_clip(manager.clip_ids[0]) d_vector = manager.get_d_vector_by_clip(manager.clip_ids[0])
assert len(x_vector) == 256 assert len(d_vector) == 256
x_vectors = manager.get_x_vectors_by_speaker(manager.speaker_ids[0]) d_vectors = manager.get_d_vectors_by_speaker(manager.speaker_ids[0])
assert len(x_vectors[0]) == 256 assert len(d_vectors[0]) == 256
x_vector1 = manager.get_mean_x_vector(manager.speaker_ids[0], num_samples=2, randomize=True) d_vector1 = manager.get_mean_d_vector(manager.speaker_ids[0], num_samples=2, randomize=True)
assert len(x_vector1) == 256 assert len(d_vector1) == 256
x_vector2 = manager.get_mean_x_vector(manager.speaker_ids[0], num_samples=2, randomize=False) d_vector2 = manager.get_mean_d_vector(manager.speaker_ids[0], num_samples=2, randomize=False)
assert len(x_vector2) == 256 assert len(d_vector2) == 256
assert np.sum(np.array(x_vector1) - np.array(x_vector2)) != 0 assert np.sum(np.array(d_vector1) - np.array(d_vector2)) != 0
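The `dummy_speakers.json` fixture itself is not part of this diff; judging from how the accessors index it (clip-keyed entries with `name` and `embedding` fields), a d_vectors file is assumed to look roughly like this, written as a Python literal with truncated embeddings (the fixture's vectors are 256-dimensional, as the assertions above check):

    d_vectors = {
        "clip_0001.wav": {"name": "speaker_a", "embedding": [0.12, -0.03, 0.27]},
        "clip_0002.wav": {"name": "speaker_a", "embedding": [0.08, 0.11, -0.19]},
        "clip_0003.wav": {"name": "speaker_b", "embedding": [-0.21, 0.05, 0.33]},
    }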

View File

@ -57,7 +57,7 @@ def test_speedy_speech():
# with speaker embedding # with speaker embedding
model = SpeedySpeech(num_chars, out_channels=80, hidden_channels=128, num_speakers=10, c_in_channels=256).to(device) model = SpeedySpeech(num_chars, out_channels=80, hidden_channels=128, num_speakers=10, c_in_channels=256).to(device)
model.forward( model.forward(
x_dummy, x_lengths, y_lengths, durations, cond_input={"x_vectors": torch.randint(0, 10, (B,)).to(device)} x_dummy, x_lengths, y_lengths, durations, cond_input={"d_vectors": torch.randint(0, 10, (B,)).to(device)}
) )
o_de = outputs["model_outputs"] o_de = outputs["model_outputs"]
attn = outputs["alignments"] attn = outputs["alignments"]
@ -71,7 +71,7 @@ def test_speedy_speech():
model = SpeedySpeech( model = SpeedySpeech(
num_chars, out_channels=80, hidden_channels=128, num_speakers=10, external_c=True, c_in_channels=256 num_chars, out_channels=80, hidden_channels=128, num_speakers=10, external_c=True, c_in_channels=256
).to(device) ).to(device)
model.forward(x_dummy, x_lengths, y_lengths, durations, cond_input={"x_vectors": torch.rand((B, 256)).to(device)}) model.forward(x_dummy, x_lengths, y_lengths, durations, cond_input={"d_vectors": torch.rand((B, 256)).to(device)})
o_de = outputs["model_outputs"] o_de = outputs["model_outputs"]
attn = outputs["alignments"] attn = outputs["alignments"]
o_dr = outputs["durations_log"] o_dr = outputs["durations_log"]

View File

@ -95,7 +95,7 @@ class MultiSpeakeTacotronTrainTest(unittest.TestCase):
criterion = MSELossMasked(seq_len_norm=False).to(device) criterion = MSELossMasked(seq_len_norm=False).to(device)
criterion_st = nn.BCEWithLogitsLoss().to(device) criterion_st = nn.BCEWithLogitsLoss().to(device)
model = Tacotron2(num_chars=24, r=c.r, num_speakers=5, speaker_embedding_dim=55).to(device) model = Tacotron2(num_chars=24, r=c.r, num_speakers=5, d_vector_dim=55).to(device)
model.train() model.train()
model_ref = copy.deepcopy(model) model_ref = copy.deepcopy(model)
count = 0 count = 0
@ -105,7 +105,7 @@ class MultiSpeakeTacotronTrainTest(unittest.TestCase):
optimizer = optim.Adam(model.parameters(), lr=c.lr) optimizer = optim.Adam(model.parameters(), lr=c.lr)
for i in range(5): for i in range(5):
outputs = model.forward( outputs = model.forward(
input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"x_vectors": speaker_ids} input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"d_vectors": speaker_ids}
) )
assert torch.sigmoid(outputs["stop_tokens"]).data.max() <= 1.0 assert torch.sigmoid(outputs["stop_tokens"]).data.max() <= 1.0
assert torch.sigmoid(outputs["stop_tokens"]).data.min() >= 0.0 assert torch.sigmoid(outputs["stop_tokens"]).data.min() >= 0.0
@ -259,7 +259,7 @@ class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase):
stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze() stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()
criterion = MSELossMasked(seq_len_norm=False).to(device) criterion = MSELossMasked(seq_len_norm=False).to(device)
criterion_st = nn.BCEWithLogitsLoss().to(device) criterion_st = nn.BCEWithLogitsLoss().to(device)
model = Tacotron2(num_chars=24, r=c.r, num_speakers=5, speaker_embedding_dim=55, use_gst=True, gst=c.gst).to( model = Tacotron2(num_chars=24, r=c.r, num_speakers=5, d_vector_dim=55, use_gst=True, gst=c.gst).to(
device device
) )
model.train() model.train()
@ -271,7 +271,7 @@ class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase):
optimizer = optim.Adam(model.parameters(), lr=c.lr) optimizer = optim.Adam(model.parameters(), lr=c.lr)
for i in range(5): for i in range(5):
outputs = model.forward( outputs = model.forward(
input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"x_vectors": speaker_embeddings} input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"d_vectors": speaker_embeddings}
) )
assert torch.sigmoid(outputs["stop_tokens"]).data.max() <= 1.0 assert torch.sigmoid(outputs["stop_tokens"]).data.max() <= 1.0
assert torch.sigmoid(outputs["stop_tokens"]).data.min() >= 0.0 assert torch.sigmoid(outputs["stop_tokens"]).data.min() >= 0.0

View File

@ -116,7 +116,7 @@ class MultiSpeakeTacotronTrainTest(unittest.TestCase):
decoder_output_dim=c.audio["num_mels"], decoder_output_dim=c.audio["num_mels"],
r=c.r, r=c.r,
memory_size=c.memory_size, memory_size=c.memory_size,
speaker_embedding_dim=55, d_vector_dim=55,
).to( ).to(
device device
) # FIXME: missing num_speakers parameter to Tacotron ctor ) # FIXME: missing num_speakers parameter to Tacotron ctor
@ -130,7 +130,7 @@ class MultiSpeakeTacotronTrainTest(unittest.TestCase):
optimizer = optim.Adam(model.parameters(), lr=c.lr) optimizer = optim.Adam(model.parameters(), lr=c.lr)
for _ in range(5): for _ in range(5):
outputs = model.forward( outputs = model.forward(
input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"x_vectors": speaker_embeddings} input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"d_vectors": speaker_embeddings}
) )
optimizer.zero_grad() optimizer.zero_grad()
loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths) loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths)
@ -305,7 +305,7 @@ class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase):
gst=c.gst, gst=c.gst,
r=c.r, r=c.r,
memory_size=c.memory_size, memory_size=c.memory_size,
speaker_embedding_dim=55, d_vector_dim=55,
).to( ).to(
device device
) # FIXME: missing num_speakers parameter to Tacotron ctor ) # FIXME: missing num_speakers parameter to Tacotron ctor
@ -319,7 +319,7 @@ class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase):
optimizer = optim.Adam(model.parameters(), lr=c.lr) optimizer = optim.Adam(model.parameters(), lr=c.lr)
for _ in range(5): for _ in range(5):
outputs = model.forward( outputs = model.forward(
input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"x_vectors": speaker_embeddings} input_dummy, input_lengths, mel_spec, mel_lengths, cond_input={"d_vectors": speaker_embeddings}
) )
optimizer.zero_grad() optimizer.zero_grad()
loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths) loss = criterion(outputs["decoder_outputs"], mel_spec, mel_lengths)