From 23f6743ac9cf869679df8fa57faa62681ed3ebb9 Mon Sep 17 00:00:00 2001
From: Eren Golge <egolge@mozilla.com>
Date: Tue, 24 Sep 2019 17:19:04 +0200
Subject: [PATCH] fix synthesize.py

---
 synthesize.py | 25 ++++++++++++++++++++++---
 1 file changed, 22 insertions(+), 3 deletions(-)

diff --git a/synthesize.py b/synthesize.py
index 23c67c73..a0bf6be6 100644
--- a/synthesize.py
+++ b/synthesize.py
@@ -2,6 +2,7 @@ import os
 import time
 import argparse
 import torch
+import json
 import string
 
 from TTS.utils.synthesis import synthesis
@@ -16,22 +17,27 @@ def tts(model,
         VC,
         text,
         ap,
+        ap_vocoder,
         use_cuda,
         batched_vocoder,
+        speaker_id=None, 
         figures=False):
     t_1 = time.time()
     use_vocoder_model = vocoder_model is not None
     waveform, alignment, decoder_outputs, postnet_output, stop_tokens = synthesis(
-        model, text, C, use_cuda, ap, False, C.enable_eos_bos_chars)
+        model, text, C, use_cuda, ap, speaker_id, False, C.enable_eos_bos_chars)
     if C.model == "Tacotron" and use_vocoder_model:
         postnet_output = ap.out_linear_to_mel(postnet_output.T).T
+    # correct for any scale difference between the TTS model and the vocoder
+    postnet_output = ap._denormalize(postnet_output)
+    postnet_output = ap_vocoder._normalize(postnet_output)
     if use_vocoder_model:
         vocoder_input = torch.FloatTensor(postnet_output.T).unsqueeze(0)
         waveform = vocoder_model.generate(
             vocoder_input.cuda() if use_cuda else vocoder_input,
             batched=batched_vocoder,
-            target=11000,
-            overlap=550)
+            target=8000,
+            overlap=400)
     print(" >  Run-time: {}".format(time.time() - t_1))
     return alignment, postnet_output, stop_tokens, waveform
 
@@ -81,6 +87,12 @@ if __name__ == "__main__":
         help="JSON file for multi-speaker model.",
         default=""
     )
+    parser.add_argument(
+        '--speaker_id',
+        type=int,
+        help="target speaker_id if the model is multi-speaker.",
+        default=None
+    )
     args = parser.parse_args()
 
     if args.vocoder_path != "":
@@ -109,10 +121,12 @@ if __name__ == "__main__":
     model.eval()
     if args.use_cuda:
         model.cuda()
+    model.decoder.set_r(cp['r'])
 
     # load vocoder model
     if args.vocoder_path != "":
         VC = load_config(args.vocoder_config_path)
+        ap_vocoder = AudioProcessor(**VC.audio)
         bits = 10
         vocoder_model = VocoderModel(
             rnn_dims=512,
@@ -127,6 +141,8 @@ if __name__ == "__main__":
             res_blocks=10,
             hop_length=ap.hop_length,
             sample_rate=ap.sample_rate,
+            use_aux_net=True,
+            use_upsample_net=True
         )
 
         check = torch.load(args.vocoder_path)
@@ -137,6 +153,7 @@ if __name__ == "__main__":
     else:
         vocoder_model = None
         VC = None
+        ap_vocoder = None
 
     # synthesize voice
     print(" > Text: {}".format(args.text))
@@ -147,8 +164,10 @@ if __name__ == "__main__":
         VC,
         args.text,
         ap,
+        ap_vocoder,
         args.use_cuda,
         args.batched_vocoder,
+        speaker_id=args.speaker_id,
         figures=False)
 
     # save the results