From 23f6743ac9cf869679df8fa57faa62681ed3ebb9 Mon Sep 17 00:00:00 2001 From: Eren Golge Date: Tue, 24 Sep 2019 17:19:04 +0200 Subject: [PATCH] fix synthesize.py --- synthesize.py | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/synthesize.py b/synthesize.py index 23c67c73..a0bf6be6 100644 --- a/synthesize.py +++ b/synthesize.py @@ -2,6 +2,7 @@ import os import time import argparse import torch +import json import string from TTS.utils.synthesis import synthesis @@ -16,22 +17,27 @@ def tts(model, VC, text, ap, + ap_vocoder, use_cuda, batched_vocoder, + speaker_id=None, figures=False): t_1 = time.time() use_vocoder_model = vocoder_model is not None waveform, alignment, decoder_outputs, postnet_output, stop_tokens = synthesis( - model, text, C, use_cuda, ap, False, C.enable_eos_bos_chars) + model, text, C, use_cuda, ap, speaker_id, False, C.enable_eos_bos_chars) if C.model == "Tacotron" and use_vocoder_model: postnet_output = ap.out_linear_to_mel(postnet_output.T).T + # correct if there is a scale difference b/w two models + postnet_output = ap._denormalize(postnet_output) + postnet_output = ap_vocoder._normalize(postnet_output) if use_vocoder_model: vocoder_input = torch.FloatTensor(postnet_output.T).unsqueeze(0) waveform = vocoder_model.generate( vocoder_input.cuda() if use_cuda else vocoder_input, batched=batched_vocoder, - target=11000, - overlap=550) + target=8000, + overlap=400) print(" > Run-time: {}".format(time.time() - t_1)) return alignment, postnet_output, stop_tokens, waveform @@ -81,6 +87,12 @@ if __name__ == "__main__": help="JSON file for multi-speaker model.", default="" ) + parser.add_argument( + '--speaker_id', + type=int, + help="target speaker_id if the model is multi-speaker.", + default=None + ) args = parser.parse_args() if args.vocoder_path != "": @@ -109,10 +121,12 @@ if __name__ == "__main__": model.eval() if args.use_cuda: model.cuda() + model.decoder.set_r(cp['r']) # load vocoder model if args.vocoder_path != "": VC = load_config(args.vocoder_config_path) + ap_vocoder = AudioProcessor(**VC.audio) bits = 10 vocoder_model = VocoderModel( rnn_dims=512, @@ -127,6 +141,8 @@ if __name__ == "__main__": res_blocks=10, hop_length=ap.hop_length, sample_rate=ap.sample_rate, + use_aux_net=True, + use_upsample_net=True ) check = torch.load(args.vocoder_path) @@ -137,6 +153,7 @@ if __name__ == "__main__": else: vocoder_model = None VC = None + ap_vocoder = None # synthesize voice print(" > Text: {}".format(args.text)) @@ -147,8 +164,10 @@ if __name__ == "__main__": VC, args.text, ap, + ap_vocoder, args.use_cuda, args.batched_vocoder, + speaker_id=args.speaker_id, figures=False) # save the results