From 179722e3a7220dcba8133080faef78c76d3c1666 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Fri, 23 Apr 2021 17:46:21 +0200 Subject: [PATCH] new arguments to synthesize.py for loading speaker encoder and speaker wavs --- TTS/bin/synthesize.py | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/TTS/bin/synthesize.py b/TTS/bin/synthesize.py index 75a167e9..1f7725eb 100755 --- a/TTS/bin/synthesize.py +++ b/TTS/bin/synthesize.py @@ -100,6 +100,13 @@ def main(): default=None, ) parser.add_argument("--vocoder_config_path", type=str, help="Path to vocoder model config file.", default=None) + parser.add_argument( + "--encoder_path", + type=str, + help="Path to speaker encoder model file.", + default=None, + ) + parser.add_argument("--encoder_config_path", type=str, help="Path to speaker encoder config file.", default=None) # args for multi-speaker synthesis parser.add_argument("--speakers_file_path", type=str, help="JSON file for multi-speaker model.", default=None) @@ -109,6 +116,12 @@ def main(): help="if the tts model is trained with x-vectors, then speaker_idx is a file present in speakers.json else speaker_idx is the speaker id corresponding to a speaker in the speaker embedding layer.", default=None, ) + parser.add_argument( + "--speaker_wav", + nargs="+", + help="wav file(s) to condition a multi-speaker model. You can give multiple file paths. The x_vectors is computed as their average.", + default=None, + ) parser.add_argument("--gst_style", help="Wav path file for GST stylereference.", default=None) parser.add_argument( "--list_speaker_idxs", @@ -139,6 +152,8 @@ def main(): speakers_file_path = None vocoder_path = None vocoder_config_path = None + encoder_path = None + encoder_config_path = None # CASE1: list pre-trained TTS models if args.list_models: @@ -163,9 +178,14 @@ def main(): vocoder_path = args.vocoder_path vocoder_config_path = args.vocoder_config_path + if args.encoder_path is not None: + encoder_path = args.encoder_path + encoder_config_path = args.encoder_config_path + # load models synthesizer = Synthesizer( - model_path, config_path, speakers_file_path, vocoder_path, vocoder_config_path, args.use_cuda + model_path, config_path, speakers_file_path, vocoder_path, vocoder_config_path, encoder_path, + encoder_config_path, args.use_cuda ) # query speaker ids of a multi-speaker model. @@ -180,7 +200,7 @@ def main(): print(" > Text: {}".format(args.text)) # kick it - wav = synthesizer.tts(args.text, args.speaker_idx) + wav = synthesizer.tts(args.text, args.speaker_idx, args.speaker_wav) # save the results print(" > Saving output to {}".format(args.out_path))