From 0fb1b200c668c09e4a25da49a59cd69d41afa64f Mon Sep 17 00:00:00 2001 From: Edresson Casanova Date: Fri, 20 May 2022 18:29:39 -0300 Subject: [PATCH] Fix rebase issues --- TTS/bin/compute_embeddings.py | 4 +++- TTS/bin/synthesize.py | 11 +++-------- TTS/tts/datasets/__init__.py | 16 ++++++++-------- TTS/tts/models/vits.py | 12 +++++++++--- TTS/tts/utils/synthesis.py | 2 +- TTS/utils/manage.py | 4 ++-- .../test_vits_speaker_emb_with_emotion_train.py | 4 ++-- ...test_vits_speaker_emb_with_prosody_encoder.py | 2 +- 8 files changed, 29 insertions(+), 26 deletions(-) diff --git a/TTS/bin/compute_embeddings.py b/TTS/bin/compute_embeddings.py index f8f3a90c..91d07257 100644 --- a/TTS/bin/compute_embeddings.py +++ b/TTS/bin/compute_embeddings.py @@ -25,7 +25,9 @@ parser.add_argument("--output_path", type=str, help="Path for output `pth` or `j parser.add_argument("--old_file", type=str, help="Previous embedding file to only compute new audios.", default=None) parser.add_argument("--disable_cuda", type=bool, help="Flag to disable cuda.", default=False) parser.add_argument("--no_eval", type=bool, help="Do not compute eval?. Default False", default=False) -parser.add_argument("--use_predicted_label", type=bool, help="If True and predicted label is available with will use it.", default=False) +parser.add_argument( + "--use_predicted_label", type=bool, help="If True and predicted label is available with will use it.", default=False +) args = parser.parse_args() diff --git a/TTS/bin/synthesize.py b/TTS/bin/synthesize.py index e25c0504..2f32ec96 100755 --- a/TTS/bin/synthesize.py +++ b/TTS/bin/synthesize.py @@ -178,14 +178,10 @@ If you don't specify any models, then it uses LJSpeech based English model. help="wav file(s) to condition a multi-speaker TTS model with a Speaker Encoder. You can give multiple file paths. The d_vectors is computed as their average.", default=None, ) - parser.add_argument("--gst_style", help="Wav path file for GST style reference.", default=None) -<<<<<<< HEAD + parser.add_argument("--style_wav", type=str, help="Wav path file for prosody reference.", default=None) parser.add_argument( - "--capacitron_style_wav", type=str, help="Wav path file for Capacitron prosody reference.", default=None + "--capacitron_style_text", type=str, help="Transcription of the style_wav reference.", default=None ) - parser.add_argument("--capacitron_style_text", type=str, help="Transcription of the reference.", default=None) -======= ->>>>>>> 3a524b05... Add prosody encoder params on config parser.add_argument( "--list_speaker_idxs", help="List available speaker ids for the defined multi-speaker model.", @@ -324,9 +320,8 @@ If you don't specify any models, then it uses LJSpeech based English model. args.speaker_idx, args.language_idx, args.speaker_wav, - style_wav=args.gst_style, reference_wav=args.reference_wav, - style_wav=args.capacitron_style_wav, + style_wav=args.style_wav, style_text=args.capacitron_style_text, reference_speaker_name=args.reference_speaker_idx, emotion_name=args.emotion_idx, diff --git a/TTS/tts/datasets/__init__.py b/TTS/tts/datasets/__init__.py index 2eed947f..137c070a 100644 --- a/TTS/tts/datasets/__init__.py +++ b/TTS/tts/datasets/__init__.py @@ -12,16 +12,16 @@ from TTS.tts.datasets.formatters import * def split_dataset(items, eval_split_max_size=None, eval_split_size=0.01): """Split a dataset into train and eval. Consider speaker distribution in multi-speaker training. - Args: - items (List[List]): - A list of samples. 
Each sample is a dict containing the keys "text", "audio_file", and "speaker_name".
+    Args:
+        items (List[List]):
+            A list of samples. Each sample is a dict containing the keys "text", "audio_file", and "speaker_name".

-        eval_split_max_size (int):
-            Number maximum of samples to be used for evaluation in proportion split. Defaults to None (Disabled).
+        eval_split_max_size (int):
+            Number maximum of samples to be used for evaluation in proportion split. Defaults to None (Disabled).

-        eval_split_size (float):
-            If between 0.0 and 1.0 represents the proportion of the dataset to include in the evaluation set.
-            If > 1, represents the absolute number of evaluation samples. Defaults to 0.01 (1%).
+        eval_split_size (float):
+            If between 0.0 and 1.0 represents the proportion of the dataset to include in the evaluation set.
+            If > 1, represents the absolute number of evaluation samples. Defaults to 0.01 (1%).
     """
     speakers = [item["speaker_name"] for item in items]
     is_multi_speaker = len(set(speakers)) > 1
diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py
index f9cdd2e4..cbe310a9 100644
--- a/TTS/tts/models/vits.py
+++ b/TTS/tts/models/vits.py
@@ -700,7 +700,9 @@ class Vits(BaseTTS):

         if self.args.use_text_enc_spk_reversal_classifier:
             self.speaker_text_enc_reversal_classifier = ReversalClassifier(
-                in_channels=self.args.hidden_channels + self.args.emotion_embedding_dim + self.args.prosody_embedding_dim,
+                in_channels=self.args.hidden_channels
+                + self.args.emotion_embedding_dim
+                + self.args.prosody_embedding_dim,
                 out_channels=self.num_speakers,
                 hidden_channels=256,
             )
@@ -1483,7 +1485,7 @@ class Vits(BaseTTS):
                 gt_cons_emb=self.model_outputs_cache["gt_cons_emb"],
                 syn_cons_emb=self.model_outputs_cache["syn_cons_emb"],
                 loss_prosody_enc_spk_rev_classifier=self.model_outputs_cache["loss_prosody_enc_spk_rev_classifier"],
-                loss_text_enc_spk_rev_classifier=self.model_outputs_cache["loss_text_enc_spk_rev_classifier"]
+                loss_text_enc_spk_rev_classifier=self.model_outputs_cache["loss_text_enc_spk_rev_classifier"],
             )
         return self.model_outputs_cache, loss_dict
@@ -1654,7 +1656,11 @@ class Vits(BaseTTS):
         if (
             self.speaker_manager is not None
             and self.speaker_manager.ids
-            and (self.args.use_speaker_embedding or self.args.use_prosody_encoder or self.args.use_text_enc_spk_reversal_classifier)
+            and (
+                self.args.use_speaker_embedding
+                or self.args.use_prosody_encoder
+                or self.args.use_text_enc_spk_reversal_classifier
+            )
         ):
             speaker_ids = [self.speaker_manager.ids[sn] for sn in batch["speaker_names"]]
diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py
index e769648d..e9552d59 100644
--- a/TTS/tts/utils/synthesis.py
+++ b/TTS/tts/utils/synthesis.py
@@ -181,7 +181,7 @@ def synthesis(
         style_feature = compute_style_feature(style_wav, model.ap, cuda=use_cuda)
         style_feature = style_feature.transpose(1, 2)  # [1, time, depth]

-    if hasattr(model, 'compute_style_feature') and style_wav is not None:
+    if hasattr(model, "compute_style_feature") and style_wav is not None:
         style_feature = model.compute_style_feature(style_wav)

     # convert text to sequence of token IDs
diff --git a/TTS/utils/manage.py b/TTS/utils/manage.py
index b171e855..ad18daab 100644
--- a/TTS/utils/manage.py
+++ b/TTS/utils/manage.py
@@ -218,8 +218,8 @@ class ModelManager(object):
         output_stats_path = os.path.join(output_path, "scale_stats.npy")
         output_d_vector_file_path = os.path.join(output_path, "speakers.json")
         output_speaker_ids_file_path = os.path.join(output_path, "speaker_ids.json")
-        speaker_encoder_config_path = os.path.join(output_path, "config_se.json")
-        speaker_encoder_model_path = self._find_speaker_encoder(output_path)
+        encoder_config_path = os.path.join(output_path, "config_se.json")
+        encoder_model_path = self._find_speaker_encoder(output_path)

         # update the scale_path.npy file path in the model config.json
         self._update_path("audio.stats_path", output_stats_path, config_path)

diff --git a/tests/tts_tests/test_vits_speaker_emb_with_emotion_train.py b/tests/tts_tests/test_vits_speaker_emb_with_emotion_train.py
index cd9118ad..4856c364 100644
--- a/tests/tts_tests/test_vits_speaker_emb_with_emotion_train.py
+++ b/tests/tts_tests/test_vits_speaker_emb_with_emotion_train.py
@@ -76,8 +76,8 @@ continue_restore_path, _ = get_last_checkpoint(continue_path)
 out_wav_path = os.path.join(get_tests_output_path(), "output.wav")
 speaker_id = "ljspeech-1"
 emotion_id = "ljspeech-3"
-continue_speakers_path = os.path.join(continue_path, "speakers.json")
-continue_emotion_path = os.path.join(continue_path, "speakers.json")
+continue_speakers_path = config.d_vector_file
+continue_emotion_path = os.path.join(continue_path, "emotions.json")
 inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --emotion_idx {emotion_id} --speakers_file_path {continue_speakers_path} --emotions_file_path {continue_emotion_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}"

diff --git a/tests/tts_tests/test_vits_speaker_emb_with_prosody_encoder.py b/tests/tts_tests/test_vits_speaker_emb_with_prosody_encoder.py
index 6ff4412b..ccd48616 100644
--- a/tests/tts_tests/test_vits_speaker_emb_with_prosody_encoder.py
+++ b/tests/tts_tests/test_vits_speaker_emb_with_prosody_encoder.py
@@ -71,7 +71,7 @@ style_wav_path = "tests/data/ljspeech/wavs/LJ001-0001.wav"

 continue_speakers_path = os.path.join(continue_path, "speakers.json")

-inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path} --gst_style {style_wav_path}"
+inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path} --style_wav {style_wav_path}"
 run_cli(inference_command)

 # restore the model and continue training for one more epoch
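
Usage note: with this change, the prosody/style reference audio is passed through a single --style_wav flag, replacing the separate --gst_style and --capacitron_style_wav options, while --capacitron_style_text still carries the transcription of that reference. A minimal sketch of an inference call exercising the renamed flag, assuming a trained multi-speaker VITS checkpoint and placeholder paths for the model, config, and speakers file (only the speaker id and style wav below are taken from the tests in this patch):

    # Placeholder paths; substitute your own checkpoint, config, and speakers file.
    tts --text 'This is an example.' \
        --speaker_idx ljspeech-1 \
        --speakers_file_path speakers.json \
        --config_path config.json \
        --model_path checkpoint.pth \
        --style_wav tests/data/ljspeech/wavs/LJ001-0001.wav \
        --out_path output.wav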