diff --git a/train.py b/train.py index 7ab213a6..6ad0835b 100644 --- a/train.py +++ b/train.py @@ -81,7 +81,7 @@ def format_data(data): text_input = data[0] text_lengths = data[1] speaker_names = data[2] - linear_input = data[3] if c.model in ["Tacotron", "TacotronGST"] else None + linear_input = data[3] if c.model in ["Tacotron"] else None mel_input = data[4] mel_lengths = data[5] stop_targets = data[6] @@ -96,7 +96,7 @@ def format_data(data): else: speaker_ids = None - # set stop targets view, we predict a single stop token per r frames prediction + # set stop targets view, we predict a single stop token per iteration. stop_targets = stop_targets.view(text_input.shape[0], stop_targets.size(1) // c.r, -1) stop_targets = (stop_targets.sum(2) > @@ -108,7 +108,7 @@ def format_data(data): text_lengths = text_lengths.cuda(non_blocking=True) mel_input = mel_input.cuda(non_blocking=True) mel_lengths = mel_lengths.cuda(non_blocking=True) - linear_input = linear_input.cuda(non_blocking=True) if c.model in ["Tacotron", "TacotronGST"] else None + linear_input = linear_input.cuda(non_blocking=True) if c.model in ["Tacotron"] else None stop_targets = stop_targets.cuda(non_blocking=True) if speaker_ids is not None: speaker_ids = speaker_ids.cuda(non_blocking=True) @@ -171,7 +171,7 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler, stop_targets) if c.stopnet else torch.zeros(1) if c.loss_masking: decoder_loss = criterion(decoder_output, mel_input, mel_lengths) - if c.model in ["Tacotron", "TacotronGST"]: + if c.model in ["Tacotron"]: postnet_loss = criterion(postnet_output, linear_input, mel_lengths) else: @@ -179,7 +179,7 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler, mel_lengths) else: decoder_loss = criterion(decoder_output, mel_input) - if c.model in ["Tacotron", "TacotronGST"]: + if c.model in ["Tacotron"]: postnet_loss = criterion(postnet_output, linear_input) else: postnet_loss = criterion(postnet_output, mel_input) @@ -277,7 +277,7 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler, # Diagnostic visualizations const_spec = postnet_output[0].data.cpu().numpy() gt_spec = linear_input[0].data.cpu().numpy() if c.model in [ - "Tacotron", "TacotronGST" + "Tacotron" ] else mel_input[0].data.cpu().numpy() align_img = alignments[0].data.cpu().numpy() @@ -293,7 +293,7 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler, tb_logger.tb_train_figures(global_step, figures) # Sample audio - if c.model in ["Tacotron", "TacotronGST"]: + if c.model in ["Tacotron"]: train_audio = ap.inv_spectrogram(const_spec.T) else: train_audio = ap.inv_mel_spectrogram(const_spec.T) @@ -370,7 +370,7 @@ def evaluate(model, criterion, criterion_st, ap, global_step, epoch): if c.loss_masking: decoder_loss = criterion(decoder_output, mel_input, mel_lengths) - if c.model in ["Tacotron", "TacotronGST"]: + if c.model in ["Tacotron"]: postnet_loss = criterion(postnet_output, linear_input, mel_lengths) else: @@ -378,7 +378,7 @@ def evaluate(model, criterion, criterion_st, ap, global_step, epoch): mel_lengths) else: decoder_loss = criterion(decoder_output, mel_input) - if c.model in ["Tacotron", "TacotronGST"]: + if c.model in ["Tacotron"]: postnet_loss = criterion(postnet_output, linear_input) else: postnet_loss = criterion(postnet_output, mel_input) @@ -434,7 +434,7 @@ def evaluate(model, criterion, criterion_st, ap, global_step, epoch): idx = np.random.randint(mel_input.shape[0]) const_spec = postnet_output[idx].data.cpu().numpy() gt_spec = linear_input[idx].data.cpu().numpy() if c.model in [ - "Tacotron", "TacotronGST" + "Tacotron" ] else mel_input[idx].data.cpu().numpy() align_img = alignments[idx].data.cpu().numpy() @@ -445,7 +445,7 @@ def evaluate(model, criterion, criterion_st, ap, global_step, epoch): } # Sample audio - if c.model in ["Tacotron", "TacotronGST"]: + if c.model in ["Tacotron"]: eval_audio = ap.inv_spectrogram(const_spec.T) else: eval_audio = ap.inv_mel_spectrogram(const_spec.T) @@ -562,10 +562,10 @@ def main(args): # pylint: disable=redefined-outer-name optimizer_st = None if c.loss_masking: - criterion = L1LossMasked() if c.model in ["Tacotron", "TacotronGST" + criterion = L1LossMasked() if c.model in ["Tacotron" ] else MSELossMasked() else: - criterion = nn.L1Loss() if c.model in ["Tacotron", "TacotronGST" + criterion = nn.L1Loss() if c.model in ["Tacotron" ] else nn.MSELoss() criterion_st = nn.BCEWithLogitsLoss( pos_weight=torch.tensor(10)) if c.stopnet else None