diff --git a/TTS/bin/synthesize.py b/TTS/bin/synthesize.py
index b61113a7..da91fbf7 100755
--- a/TTS/bin/synthesize.py
+++ b/TTS/bin/synthesize.py
@@ -29,27 +29,65 @@ def main():
         """You can either use your trained model or choose a model from the provided list.\n\n"""
         """If you don't specify any models, then it uses LJSpeech based English models\n\n"""
         """
-    Example runs:
+    # Example Runs:
 
-    # list provided models
-    ./TTS/bin/synthesize.py --list_models
+    ## Single Speaker Models
 
-    # run tts with default models.
-    ./TTS/bin synthesize.py --text "Text for TTS"
+    - list provided models
 
-    # run a tts model with its default vocoder model.
-    ./TTS/bin synthesize.py --text "Text for TTS" --model_name "<language>/<dataset>/<model_name>"
+    ```
+    $ ./TTS/bin/synthesize.py --list_models
+    ```
 
-    # run with specific tts and vocoder models from the list
-    ./TTS/bin/synthesize.py --text "Text for TTS" --model_name "<language>/<dataset>/<model_name>" --vocoder_name "<language>/<dataset>/<model_name>" --output_path
+    - run tts with default models.
 
-    # run your own TTS model (Using Griffin-Lim Vocoder)
-    ./TTS/bin/synthesize.py --text "Text for TTS" --model_path path/to/model.pth.tar --config_path path/to/config.json --out_path output/path/speech.wav
+    ```
+    $ ./TTS/bin/synthesize.py --text "Text for TTS"
+    ```
 
-    # run your own TTS and Vocoder models
-    ./TTS/bin/synthesize.py --text "Text for TTS" --model_path path/to/config.json --config_path path/to/model.pth.tar --out_path output/path/speech.wav
+    - run a tts model with its default vocoder model.
+
+    ```
+    $ ./TTS/bin/synthesize.py --text "Text for TTS" --model_name "<language>/<dataset>/<model_name>"
+    ```
+
+    - run with specific tts and vocoder models from the list
+
+    ```
+    $ ./TTS/bin/synthesize.py --text "Text for TTS" --model_name "<language>/<dataset>/<model_name>" --vocoder_name "<language>/<dataset>/<model_name>" --out_path output/path/speech.wav
+    ```
+
+    - run your own TTS model (Using Griffin-Lim Vocoder)
+
+    ```
+    $ ./TTS/bin/synthesize.py --text "Text for TTS" --model_path path/to/model.pth.tar --config_path path/to/config.json --out_path output/path/speech.wav
+    ```
+
+    - run your own TTS and Vocoder models
+
+    ```
+    $ ./TTS/bin/synthesize.py --text "Text for TTS" --model_path path/to/model.pth.tar --config_path path/to/config.json --out_path output/path/speech.wav --vocoder_path path/to/vocoder.pth.tar --vocoder_config_path path/to/vocoder_config.json
+    ```
+
+    ## Multi-Speaker Models
+
+    - list the available speakers and choose one among them.
+
+    ```
+    $ ./TTS/bin/synthesize.py --model_name "<language>/<dataset>/<model_name>" --list_speaker_idxs
+    ```
+
+    - run the multi-speaker TTS model with the target speaker ID.
+
+    ```
+    $ ./TTS/bin/synthesize.py --text "Text for TTS." --out_path output/path/speech.wav --model_name "<language>/<dataset>/<model_name>" --speaker_idx "<speaker_id>"
+    ```
+
+    - run your own multi-speaker TTS model.
+
+    ```
+    $ ./TTS/bin/synthesize.py --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/model.pth.tar --config_path path/to/config.json --speakers_file_path path/to/speaker.json --speaker_idx "<speaker_id>"
+    ```
     """,
         formatter_class=RawTextHelpFormatter,
     )
@@ -113,21 +151,23 @@ def main():
     parser.add_argument(
         "--speaker_idx",
         type=str,
-        help="if the tts model is trained with x-vectors, then speaker_idx is a file present in speakers.json else speaker_idx is the speaker id corresponding to a speaker in the speaker embedding layer.",
+        help="Target speaker ID for a multi-speaker TTS model.",
         default=None,
     )
     parser.add_argument(
         "--speaker_wav",
         nargs="+",
-        help="wav file(s) to condition a multi-speaker model. You can give multiple file paths. The x_vectors is computed as their average.",
+        help="wav file(s) to condition a multi-speaker TTS model with a Speaker Encoder. You can give multiple file paths. The x_vector is computed as their average.",
         default=None,
     )
     parser.add_argument("--gst_style", help="Wav path file for GST stylereference.", default=None)
     parser.add_argument(
         "--list_speaker_idxs",
         help="List available speaker ids for the defined multi-speaker model.",
-        default=False,
         type=str2bool,
+        nargs="?",
+        const=True,
+        default=False,
     )
     # aux args
     parser.add_argument(
@@ -202,6 +242,14 @@ def main():
             print(synthesizer.speaker_manager.speaker_ids)
             return
 
+    # check the arguments against a multi-speaker model.
+    if synthesizer.tts_speakers_file and (not args.speaker_idx and not args.speaker_wav):
+        print(
+            " [!] Looks like you are using a multi-speaker model. Define `--speaker_idx` to "
+            "select the target speaker. You can list the available speakers of this model with `--list_speaker_idxs`."
+        )
+        return
+
     # RUN THE SYNTHESIS
     print(" > Text: {}".format(args.text))
 
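Reviewer note: the `--list_speaker_idxs` change combines `type=str2bool` with `nargs="?"` and `const=True` so the flag can be passed bare, as in the help examples above, while still accepting an explicit value. A minimal sketch of the pattern; the `str2bool` below is a stand-in for the helper `synthesize.py` imports:

```python
import argparse


def str2bool(v):
    # Stand-in for TTS's str2bool helper: map common truthy/falsy strings to bool.
    if isinstance(v, bool):
        return v
    if v.lower() in ("yes", "true", "t", "1"):
        return True
    if v.lower() in ("no", "false", "f", "0"):
        return False
    raise argparse.ArgumentTypeError("boolean value expected")


parser = argparse.ArgumentParser()
parser.add_argument(
    "--list_speaker_idxs",
    type=str2bool,
    nargs="?",   # the value is optional ...
    const=True,  # ... so a bare `--list_speaker_idxs` means True
    default=False,
)

assert parser.parse_args([]).list_speaker_idxs is False
assert parser.parse_args(["--list_speaker_idxs"]).list_speaker_idxs is True
assert parser.parse_args(["--list_speaker_idxs", "false"]).list_speaker_idxs is False
```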
diff --git a/TTS/tts/utils/speakers.py b/TTS/tts/utils/speakers.py
index 90e78af0..2d6873e1 100644
--- a/TTS/tts/utils/speakers.py
+++ b/TTS/tts/utils/speakers.py
@@ -51,35 +51,28 @@ def parse_speakers(c, args, meta_data_train, OUT_PATH):
                 print(
                     "WARNING: speakers.json was not found in restore_path, trying to use CONFIG.external_speaker_embedding_file"
                 )
-                speaker_mapping = load_speaker_mapping(
-                    c.external_speaker_embedding_file)
+                speaker_mapping = load_speaker_mapping(c.external_speaker_embedding_file)
                 if not speaker_mapping:
                     raise RuntimeError(
                         "You must copy the file speakers.json to restore_path, or set a valid file in CONFIG.external_speaker_embedding_file"
                     )
-            speaker_embedding_dim = len(speaker_mapping[list(
-                speaker_mapping.keys())[0]]["embedding"])
+            speaker_embedding_dim = len(speaker_mapping[list(speaker_mapping.keys())[0]]["embedding"])
         elif (
-                not c.use_external_speaker_embedding_file
+            not c.use_external_speaker_embedding_file
         ):  # if restore checkpoint and don't use External Embedding file
             prev_out_path = os.path.dirname(args.restore_path)
             speaker_mapping = load_speaker_mapping(prev_out_path)
             speaker_embedding_dim = None
-            assert all(
-                speaker in speaker_mapping
-                for speaker in speakers), ("As of now you, you cannot "
-                                           "introduce new speakers to "
-                                           "a previously trained model.")
-        elif (c.use_external_speaker_embedding_file
-              and c.external_speaker_embedding_file
-              ):  # if start new train using External Embedding file
-            speaker_mapping = load_speaker_mapping(
-                c.external_speaker_embedding_file)
-            speaker_embedding_dim = len(speaker_mapping[list(
-                speaker_mapping.keys())[0]]["embedding"])
+            assert all(speaker in speaker_mapping for speaker in speakers), (
+                "As of now, you cannot introduce new speakers to a previously trained model."
+            )
         elif (
-                c.use_external_speaker_embedding_file
-                and not c.external_speaker_embedding_file
+            c.use_external_speaker_embedding_file and c.external_speaker_embedding_file
+        ):  # if start new train using External Embedding file
+            speaker_mapping = load_speaker_mapping(c.external_speaker_embedding_file)
+            speaker_embedding_dim = len(speaker_mapping[list(speaker_mapping.keys())[0]]["embedding"])
+        elif (
+            c.use_external_speaker_embedding_file and not c.external_speaker_embedding_file
         ):  # if start new train using External Embedding file and don't pass external embedding file
             raise "use_external_speaker_embedding_file is True, so you need pass a external speaker embedding file, run GE2E-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb or AngularPrototypical-Speaker_Encoder-ExtractSpeakerEmbeddings-by-sample.ipynb notebook in notebooks/ folder"
         else:  # if start new train and don't use External Embedding file
@@ -87,8 +80,7 @@ def parse_speakers(c, args, meta_data_train, OUT_PATH):
             speaker_embedding_dim = None
         save_speaker_mapping(OUT_PATH, speaker_mapping)
         num_speakers = len(speaker_mapping)
-        print(" > Training with {} speakers: {}".format(
-            len(speakers), ", ".join(speakers)))
+        print(" > Training with {} speakers: {}".format(len(speakers), ", ".join(speakers)))
     else:
         num_speakers = 0
         speaker_embedding_dim = None
@@ -133,6 +125,7 @@ class SpeakerManager:
         encoder_model_path (str, optional): Path to the speaker encoder model file. Defaults to "".
         encoder_config_path (str, optional): Path to the spealer encoder config file. Defaults to "".
     """
+
     def __init__(
         self,
         x_vectors_file_path: str = "",
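Reviewer note: for context on the `speaker_embedding_dim` one-liners reformatted above, the external speaker embedding file maps keys to entries carrying an `embedding` list, and the dimension is read off the first entry. A minimal sketch, assuming that `speakers.json` layout:

```python
import json


def embedding_dim(speakers_json_path: str) -> int:
    """Return the x_vector size stored in an external speaker embedding file."""
    with open(speakers_json_path, "r") as f:
        speaker_mapping = json.load(f)
    # All entries share one embedding size, so the first entry is enough.
    first_key = list(speaker_mapping.keys())[0]
    return len(speaker_mapping[first_key]["embedding"])
```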
diff --git a/TTS/utils/manage.py b/TTS/utils/manage.py
index f0a81227..0cf69706 100644
--- a/TTS/utils/manage.py
+++ b/TTS/utils/manage.py
@@ -107,6 +107,7 @@ class ModelManager(object):
         os.makedirs(output_path, exist_ok=True)
         print(f" > Downloading model to {output_path}")
         output_stats_path = os.path.join(output_path, "scale_stats.npy")
+        output_speakers_path = os.path.join(output_path, "speakers.json")
         # download files to the output path
         if self._check_dict_key(model_item, "github_rls_url"):
             # download from github release
@@ -119,7 +120,7 @@ class ModelManager(object):
             if self._check_dict_key(model_item, "stats_file"):
                 self._download_gdrive_file(model_item["stats_file"], output_stats_path)
 
-        # set the scale_path.npy file path in the model config.json
+        # update the scale_stats.npy file path in the model config.json
         if self._check_dict_key(model_item, "stats_file") or os.path.exists(output_stats_path):
             # set scale stats path in config.json
             config_path = output_config_path
@@ -127,6 +128,14 @@ class ModelManager(object):
             config = load_config(config_path)
             config["audio"]["stats_path"] = output_stats_path
             with open(config_path, "w") as jf:
                 json.dump(config, jf)
+        # update the speakers.json file path in the model config.json to the current path
+        if os.path.exists(output_speakers_path):
+            # set the speakers file path in config.json
+            config_path = output_config_path
+            config = load_config(config_path)
+            config["external_speaker_embedding_file"] = output_speakers_path
+            with open(config_path, "w") as jf:
+                json.dump(config, jf)
         return output_model_path, output_config_path, model_item
 
     def _download_gdrive_file(self, gdrive_idx, output):
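Reviewer note: the new speakers.json block repeats the load/patch/dump sequence of the stats block above it. A possible follow-up refactor keeping a single read-modify-write path for both keys; this is a hypothetical helper, not part of this patch, and plain `json` stands in here for `load_config`:

```python
import json
import os


def patch_downloaded_config(config_path, stats_path=None, speakers_path=None):
    """Point a downloaded config.json at local stats/speakers files, if they exist."""
    with open(config_path, "r") as jf:
        config = json.load(jf)
    if stats_path and os.path.exists(stats_path):
        config["audio"]["stats_path"] = stats_path
    if speakers_path and os.path.exists(speakers_path):
        config["external_speaker_embedding_file"] = speakers_path
    with open(config_path, "w") as jf:
        json.dump(config, jf)
```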
diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py
index 8b8d1e3e..46dabad9 100644
--- a/TTS/utils/synthesizer.py
+++ b/TTS/utils/synthesizer.py
@@ -127,6 +127,9 @@ class Synthesizer(object):
         self.input_size = len(symbols)
 
         if self.tts_config.use_speaker_embedding is True:
+            self.tts_speakers_file = (
+                self.tts_speakers_file if self.tts_speakers_file else self.tts_config["external_speaker_embedding_file"]
+            )
             self._load_speakers(self.tts_speakers_file)
 
         self.tts_model = setup_model(
@@ -189,15 +192,28 @@ class Synthesizer(object):
         """
         start_time = time.time()
         wavs = []
-        sens = self._split_into_sentences(text)
+        sens = self.split_into_sentences(text)
         print(" > Text splitted to sentences.")
         print(sens)
 
-        # get the speaker embedding from the saved x_vectors.
-        if speaker_idx and isinstance(speaker_idx, str):
-            speaker_embedding = self.speaker_manager.get_x_vectors_by_speaker(speaker_idx)[0]
+        if self.tts_speakers_file:
+            # get the speaker embedding from the saved x_vectors.
+            if speaker_idx and isinstance(speaker_idx, str):
+                speaker_embedding = self.speaker_manager.get_x_vectors_by_speaker(speaker_idx)[0]
+            elif not speaker_idx and not speaker_wav:
+                raise ValueError(
+                    " [!] Looks like you are using a multi-speaker model. "
+                    "You need to define either a `speaker_idx` or a `speaker_wav` to use a multi-speaker model."
+                )
+            else:
+                speaker_embedding = None
         else:
-            speaker_embedding = None
+            if speaker_idx:
+                raise ValueError(
+                    f" [!] Missing speakers.json file path for selecting speaker {speaker_idx}. "
+                    "Define a speakers file path if it is a multi-speaker model or remove the defined `speaker_idx`."
+                )
+            speaker_embedding = None
 
         # compute a new x_vector from the given clip.
         if speaker_wav is not None:
diff --git a/tests/test_speakers_manager.py b/tests/test_speakers_manager.py
index 40914224..b98f990c 100644
--- a/tests/test_speakers_manager.py
+++ b/tests/test_speakers_manager.py
@@ -16,6 +16,7 @@ x_vectors_file_path = os.path.join(get_tests_input_path(), "../data/dummy_speake
 class SpeakerManagerTest(unittest.TestCase):
     """Test SpeakerManager for loading embedding files and computing x_vectors from waveforms"""
 
+    @staticmethod
     def test_speaker_embedding():
         # load config
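Reviewer note: for trying the change end to end, a sketch of the new multi-speaker flow through the Python API. The constructor keyword names, paths, and the speaker ID below are placeholders inferred from the attributes used in this diff, so check them against `Synthesizer.__init__` before copying:

```python
from TTS.utils.synthesizer import Synthesizer

# Placeholder paths; keyword names mirror the attributes used in this diff.
synthesizer = Synthesizer(
    tts_checkpoint="path/to/model.pth.tar",
    tts_config="path/to/config.json",
    tts_speakers="path/to/speakers.json",
)

# The speaker IDs behind the new `--list_speaker_idxs` CLI flag.
print(synthesizer.speaker_manager.speaker_ids)

# With a multi-speaker model, omitting both `speaker_idx` and `speaker_wav`
# now raises ValueError up front instead of failing later in the forward pass.
wav = synthesizer.tts("Text for TTS.", speaker_idx="<speaker_id>")
synthesizer.save_wav(wav, "output/path/speech.wav")
```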