mirror of https://github.com/coqui-ai/TTS.git
fix synthesize.py
parent 113f5860b8
commit 23f6743ac9
synthesize.py
@@ -2,6 +2,7 @@ import os
 import time
 import argparse
 import torch
+import json
 import string

 from TTS.utils.synthesis import synthesis
@@ -16,22 +17,27 @@ def tts(model,
         VC,
         text,
         ap,
+        ap_vocoder,
         use_cuda,
         batched_vocoder,
+        speaker_id=None,
         figures=False):
     t_1 = time.time()
     use_vocoder_model = vocoder_model is not None
     waveform, alignment, decoder_outputs, postnet_output, stop_tokens = synthesis(
-        model, text, C, use_cuda, ap, False, C.enable_eos_bos_chars)
+        model, text, C, use_cuda, ap, speaker_id, False, C.enable_eos_bos_chars)
     if C.model == "Tacotron" and use_vocoder_model:
         postnet_output = ap.out_linear_to_mel(postnet_output.T).T
+        # correct if there is a scale difference b/w two models
+        postnet_output = ap._denormalize(postnet_output)
+        postnet_output = ap_vocoder._normalize(postnet_output)
     if use_vocoder_model:
         vocoder_input = torch.FloatTensor(postnet_output.T).unsqueeze(0)
         waveform = vocoder_model.generate(
             vocoder_input.cuda() if use_cuda else vocoder_input,
             batched=batched_vocoder,
-            target=11000,
-            overlap=550)
+            target=8000,
+            overlap=400)
     print(" > Run-time: {}".format(time.time() - t_1))
     return alignment, postnet_output, stop_tokens, waveform
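The scale-correction block is needed because the TTS model and the vocoder can be trained with different audio normalization settings, so a spectrogram normalized under one config has to be mapped back to raw values and re-normalized under the other. A minimal sketch of that round-trip, assuming AudioProcessor instances built from each model's own audio config (names as in the hunk above):

    ap = AudioProcessor(**C.audio)            # TTS model's audio settings
    ap_vocoder = AudioProcessor(**VC.audio)   # vocoder's audio settings
    # _denormalize/_normalize invert each other across the two configs
    mel_for_vocoder = ap_vocoder._normalize(ap._denormalize(postnet_output))

The new target/overlap values change how batched WaveRNN-style inference splits the signal into parallel folds (target samples each) and cross-fades them back together (overlap samples); they are independent of the speaker_id change.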
@@ -81,6 +87,12 @@ if __name__ == "__main__":
         help="JSON file for multi-speaker model.",
         default=""
     )
+    parser.add_argument(
+        '--speaker_id',
+        type=int,
+        help="target speaker_id if the model is multi-speaker.",
+        default=None
+    )
     args = parser.parse_args()

     if args.vocoder_path != "":
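With the new argument, a target speaker can be selected at synthesis time. A hypothetical invocation (the positional arguments come from the rest of the script and are assumed here):

    python synthesize.py "Hello world." config.json checkpoint.pth.tar out/ --speaker_id 3

The default of None keeps single-speaker models working unchanged.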
@@ -109,10 +121,12 @@ if __name__ == "__main__":
     model.eval()
     if args.use_cuda:
         model.cuda()
+    model.decoder.set_r(cp['r'])

     # load vocoder model
     if args.vocoder_path != "":
         VC = load_config(args.vocoder_config_path)
+        ap_vocoder = AudioProcessor(**VC.audio)
         bits = 10
         vocoder_model = VocoderModel(
             rnn_dims=512,
@@ -127,6 +141,8 @@ if __name__ == "__main__":
             res_blocks=10,
             hop_length=ap.hop_length,
             sample_rate=ap.sample_rate,
+            use_aux_net=True,
+            use_upsample_net=True
         )

         check = torch.load(args.vocoder_path)
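The rnn_dims/res_blocks/upsample-style arguments suggest a fatchord-style WaveRNN behind the VocoderModel alias. A sketch of the load-and-eval flow that typically follows the torch.load() call above; the 'model' checkpoint key is an assumption:

    check = torch.load(args.vocoder_path, map_location='cpu')
    vocoder_model.load_state_dict(check['model'])
    if args.use_cuda:
        vocoder_model.cuda()
    vocoder_model.eval()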
@@ -137,6 +153,7 @@ if __name__ == "__main__":
     else:
         vocoder_model = None
         VC = None
+        ap_vocoder = None

     # synthesize voice
     print(" > Text: {}".format(args.text))
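The added ap_vocoder = None looks like the fix named in the commit title: without it, running without --vocoder_path would raise a NameError once ap_vocoder is passed to tts(). A compact equivalent that keeps the three vocoder-related objects in sync:

    # tts() receives ap_vocoder unconditionally, so it must exist (even
    # as None) when no vocoder checkpoint is given
    if args.vocoder_path == "":
        vocoder_model, VC, ap_vocoder = None, None, None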
@@ -147,8 +164,10 @@ if __name__ == "__main__":
         VC,
         args.text,
         ap,
+        ap_vocoder,
         args.use_cuda,
         args.batched_vocoder,
+        speaker_id=args.speaker_id,
         figures=False)

     # save the results
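After tts() returns, the waveform is written out under "# save the results". A sketch of that step, assuming an out_path argument and the AudioProcessor.save_wav() helper; the file naming is illustrative:

    file_name = args.text.replace(" ", "_")[:20] + ".wav"
    out_path = os.path.join(args.out_path, file_name)
    ap.save_wav(waveform, out_path)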