mirror of https://github.com/coqui-ai/TTS.git
fix synthesize.py
parent 113f5860b8
commit 23f6743ac9
synthesize.py
@@ -2,6 +2,7 @@ import os
 import time
 import argparse
 import torch
+import json
 import string

 from TTS.utils.synthesis import synthesis
@@ -16,22 +17,27 @@ def tts(model,
         VC,
         text,
         ap,
+        ap_vocoder,
         use_cuda,
         batched_vocoder,
+        speaker_id=None,
         figures=False):
     t_1 = time.time()
     use_vocoder_model = vocoder_model is not None
     waveform, alignment, decoder_outputs, postnet_output, stop_tokens = synthesis(
-        model, text, C, use_cuda, ap, False, C.enable_eos_bos_chars)
+        model, text, C, use_cuda, ap, speaker_id, False, C.enable_eos_bos_chars)
     if C.model == "Tacotron" and use_vocoder_model:
         postnet_output = ap.out_linear_to_mel(postnet_output.T).T
+        # correct if there is a scale difference b/w two models
+        postnet_output = ap._denormalize(postnet_output)
+        postnet_output = ap_vocoder._normalize(postnet_output)
     if use_vocoder_model:
         vocoder_input = torch.FloatTensor(postnet_output.T).unsqueeze(0)
         waveform = vocoder_model.generate(
             vocoder_input.cuda() if use_cuda else vocoder_input,
             batched=batched_vocoder,
-            target=11000,
-            overlap=550)
+            target=8000,
+            overlap=400)
     print(" > Run-time: {}".format(time.time() - t_1))
     return alignment, postnet_output, stop_tokens, waveform
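The scale-correction block is needed because the TTS model and the vocoder can be trained with different audio normalization settings, so a spectrogram normalized under one config has to be mapped back to raw values and re-normalized under the other. A minimal sketch of that round-trip, assuming AudioProcessor instances built from each model's own audio config (names as in the hunk above):

    ap = AudioProcessor(**C.audio)            # TTS model's audio settings
    ap_vocoder = AudioProcessor(**VC.audio)   # vocoder's audio settings
    # _denormalize/_normalize invert each other across the two configs
    mel_for_vocoder = ap_vocoder._normalize(ap._denormalize(postnet_output))

The new target/overlap values change how batched WaveRNN-style inference splits the signal into parallel folds (target samples each) and cross-fades them back together (overlap samples); they are independent of the speaker_id change.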
@@ -81,6 +87,12 @@ if __name__ == "__main__":
         help="JSON file for multi-speaker model.",
         default=""
     )
+    parser.add_argument(
+        '--speaker_id',
+        type=int,
+        help="target speaker_id if the model is multi-speaker.",
+        default=None
+    )
     args = parser.parse_args()

     if args.vocoder_path != "":
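With the new argument, a target speaker can be selected at synthesis time. A hypothetical invocation (the positional arguments come from the rest of the script and are assumed here):

    python synthesize.py "Hello world." config.json checkpoint.pth.tar out/ --speaker_id 3

The default of None keeps single-speaker models working unchanged.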
@@ -109,10 +121,12 @@ if __name__ == "__main__":
     model.eval()
     if args.use_cuda:
         model.cuda()
+    model.decoder.set_r(cp['r'])

     # load vocoder model
     if args.vocoder_path != "":
         VC = load_config(args.vocoder_config_path)
+        ap_vocoder = AudioProcessor(**VC.audio)
         bits = 10
         vocoder_model = VocoderModel(
             rnn_dims=512,
@@ -127,6 +141,8 @@ if __name__ == "__main__":
             res_blocks=10,
             hop_length=ap.hop_length,
             sample_rate=ap.sample_rate,
+            use_aux_net=True,
+            use_upsample_net=True
         )

         check = torch.load(args.vocoder_path)
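The rnn_dims/res_blocks/upsample-style arguments suggest a fatchord-style WaveRNN behind the VocoderModel alias. A sketch of the load-and-eval flow that typically follows the torch.load() call above; the 'model' checkpoint key is an assumption:

    check = torch.load(args.vocoder_path, map_location='cpu')
    vocoder_model.load_state_dict(check['model'])
    if args.use_cuda:
        vocoder_model.cuda()
    vocoder_model.eval()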
@@ -137,6 +153,7 @@ if __name__ == "__main__":
     else:
         vocoder_model = None
         VC = None
+        ap_vocoder = None

     # synthesize voice
     print(" > Text: {}".format(args.text))
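The added ap_vocoder = None looks like the fix named in the commit title: without it, running without --vocoder_path would raise a NameError once ap_vocoder is passed to tts(). A compact equivalent that keeps the three vocoder-related objects in sync:

    # tts() receives ap_vocoder unconditionally, so it must exist (even
    # as None) when no vocoder checkpoint is given
    if args.vocoder_path == "":
        vocoder_model, VC, ap_vocoder = None, None, None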
@@ -147,8 +164,10 @@ if __name__ == "__main__":
         VC,
         args.text,
         ap,
+        ap_vocoder,
         args.use_cuda,
         args.batched_vocoder,
+        speaker_id=args.speaker_id,
         figures=False)

     # save the results
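After tts() returns, the waveform is written out under "# save the results". A sketch of that step, assuming an out_path argument and the AudioProcessor.save_wav() helper; the file naming is illustrative:

    file_name = args.text.replace(" ", "_")[:20] + ".wav"
    out_path = os.path.join(args.out_path, file_name)
    ap.save_wav(waveform, out_path)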