From 0fb1b200c668c09e4a25da49a59cd69d41afa64f Mon Sep 17 00:00:00 2001 From: Edresson Casanova Date: Fri, 20 May 2022 18:29:39 -0300 Subject: [PATCH] Fix rebase issues --- TTS/bin/compute_embeddings.py | 4 +++- TTS/bin/synthesize.py | 11 +++-------- TTS/tts/datasets/__init__.py | 16 ++++++++-------- TTS/tts/models/vits.py | 12 +++++++++--- TTS/tts/utils/synthesis.py | 2 +- TTS/utils/manage.py | 4 ++-- .../test_vits_speaker_emb_with_emotion_train.py | 4 ++-- ...test_vits_speaker_emb_with_prosody_encoder.py | 2 +- 8 files changed, 29 insertions(+), 26 deletions(-) diff --git a/TTS/bin/compute_embeddings.py b/TTS/bin/compute_embeddings.py index f8f3a90c..91d07257 100644 --- a/TTS/bin/compute_embeddings.py +++ b/TTS/bin/compute_embeddings.py @@ -25,7 +25,9 @@ parser.add_argument("--output_path", type=str, help="Path for output `pth` or `j parser.add_argument("--old_file", type=str, help="Previous embedding file to only compute new audios.", default=None) parser.add_argument("--disable_cuda", type=bool, help="Flag to disable cuda.", default=False) parser.add_argument("--no_eval", type=bool, help="Do not compute eval?. Default False", default=False) -parser.add_argument("--use_predicted_label", type=bool, help="If True and predicted label is available with will use it.", default=False) +parser.add_argument( + "--use_predicted_label", type=bool, help="If True and predicted label is available with will use it.", default=False +) args = parser.parse_args() diff --git a/TTS/bin/synthesize.py b/TTS/bin/synthesize.py index e25c0504..2f32ec96 100755 --- a/TTS/bin/synthesize.py +++ b/TTS/bin/synthesize.py @@ -178,14 +178,10 @@ If you don't specify any models, then it uses LJSpeech based English model. help="wav file(s) to condition a multi-speaker TTS model with a Speaker Encoder. You can give multiple file paths. The d_vectors is computed as their average.", default=None, ) - parser.add_argument("--gst_style", help="Wav path file for GST style reference.", default=None) -<<<<<<< HEAD + parser.add_argument("--style_wav", type=str, help="Wav path file for prosody reference.", default=None) parser.add_argument( - "--capacitron_style_wav", type=str, help="Wav path file for Capacitron prosody reference.", default=None + "--capacitron_style_text", type=str, help="Transcription of the style_wav reference.", default=None ) - parser.add_argument("--capacitron_style_text", type=str, help="Transcription of the reference.", default=None) -======= ->>>>>>> 3a524b05... Add prosody encoder params on config parser.add_argument( "--list_speaker_idxs", help="List available speaker ids for the defined multi-speaker model.", @@ -324,9 +320,8 @@ If you don't specify any models, then it uses LJSpeech based English model. args.speaker_idx, args.language_idx, args.speaker_wav, - style_wav=args.gst_style, reference_wav=args.reference_wav, - style_wav=args.capacitron_style_wav, + style_wav=args.style_wav, style_text=args.capacitron_style_text, reference_speaker_name=args.reference_speaker_idx, emotion_name=args.emotion_idx, diff --git a/TTS/tts/datasets/__init__.py b/TTS/tts/datasets/__init__.py index 2eed947f..137c070a 100644 --- a/TTS/tts/datasets/__init__.py +++ b/TTS/tts/datasets/__init__.py @@ -12,16 +12,16 @@ from TTS.tts.datasets.formatters import * def split_dataset(items, eval_split_max_size=None, eval_split_size=0.01): """Split a dataset into train and eval. Consider speaker distribution in multi-speaker training. - Args: - items (List[List]): - A list of samples. 
Each sample is a dict containing the keys "text", "audio_file", and "speaker_name".
+    Args:
+        items (List[List]):
+            A list of samples. Each sample is a dict containing the keys "text", "audio_file", and "speaker_name".

-        eval_split_max_size (int):
-            Number maximum of samples to be used for evaluation in proportion split. Defaults to None (Disabled).
+        eval_split_max_size (int):
+            Number maximum of samples to be used for evaluation in proportion split. Defaults to None (Disabled).

-        eval_split_size (float):
-            If between 0.0 and 1.0 represents the proportion of the dataset to include in the evaluation set.
-            If > 1, represents the absolute number of evaluation samples. Defaults to 0.01 (1%).
+        eval_split_size (float):
+            If between 0.0 and 1.0 represents the proportion of the dataset to include in the evaluation set.
+            If > 1, represents the absolute number of evaluation samples. Defaults to 0.01 (1%).
     """
     speakers = [item["speaker_name"] for item in items]
     is_multi_speaker = len(set(speakers)) > 1
diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py
index f9cdd2e4..cbe310a9 100644
--- a/TTS/tts/models/vits.py
+++ b/TTS/tts/models/vits.py
@@ -700,7 +700,9 @@ class Vits(BaseTTS):

         if self.args.use_text_enc_spk_reversal_classifier:
             self.speaker_text_enc_reversal_classifier = ReversalClassifier(
-                in_channels=self.args.hidden_channels + self.args.emotion_embedding_dim + self.args.prosody_embedding_dim,
+                in_channels=self.args.hidden_channels
+                + self.args.emotion_embedding_dim
+                + self.args.prosody_embedding_dim,
                 out_channels=self.num_speakers,
                 hidden_channels=256,
             )
@@ -1483,7 +1485,7 @@ class Vits(BaseTTS):
                 gt_cons_emb=self.model_outputs_cache["gt_cons_emb"],
                 syn_cons_emb=self.model_outputs_cache["syn_cons_emb"],
                 loss_prosody_enc_spk_rev_classifier=self.model_outputs_cache["loss_prosody_enc_spk_rev_classifier"],
-                loss_text_enc_spk_rev_classifier=self.model_outputs_cache["loss_text_enc_spk_rev_classifier"]
+                loss_text_enc_spk_rev_classifier=self.model_outputs_cache["loss_text_enc_spk_rev_classifier"],
             )
         return self.model_outputs_cache, loss_dict
@@ -1654,7 +1656,11 @@ class Vits(BaseTTS):
         if (
             self.speaker_manager is not None
             and self.speaker_manager.ids
-            and (self.args.use_speaker_embedding or self.args.use_prosody_encoder or self.args.use_text_enc_spk_reversal_classifier)
+            and (
+                self.args.use_speaker_embedding
+                or self.args.use_prosody_encoder
+                or self.args.use_text_enc_spk_reversal_classifier
+            )
         ):
             speaker_ids = [self.speaker_manager.ids[sn] for sn in batch["speaker_names"]]
diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py
index e769648d..e9552d59 100644
--- a/TTS/tts/utils/synthesis.py
+++ b/TTS/tts/utils/synthesis.py
@@ -181,7 +181,7 @@ def synthesis(
         style_feature = compute_style_feature(style_wav, model.ap, cuda=use_cuda)
         style_feature = style_feature.transpose(1, 2)  # [1, time, depth]

-    if hasattr(model, 'compute_style_feature') and style_wav is not None:
+    if hasattr(model, "compute_style_feature") and style_wav is not None:
         style_feature = model.compute_style_feature(style_wav)

     # convert text to sequence of token IDs
diff --git a/TTS/utils/manage.py b/TTS/utils/manage.py
index b171e855..ad18daab 100644
--- a/TTS/utils/manage.py
+++ b/TTS/utils/manage.py
@@ -218,8 +218,8 @@ class ModelManager(object):
         output_stats_path = os.path.join(output_path, "scale_stats.npy")
         output_d_vector_file_path = os.path.join(output_path, "speakers.json")
         output_speaker_ids_file_path = os.path.join(output_path, "speaker_ids.json")
-        speaker_encoder_config_path = os.path.join(output_path, "config_se.json")
-        speaker_encoder_model_path = self._find_speaker_encoder(output_path)
+        encoder_config_path = os.path.join(output_path, "config_se.json")
+        encoder_model_path = self._find_speaker_encoder(output_path)

         # update the scale_path.npy file path in the model config.json
         self._update_path("audio.stats_path", output_stats_path, config_path)

diff --git a/tests/tts_tests/test_vits_speaker_emb_with_emotion_train.py b/tests/tts_tests/test_vits_speaker_emb_with_emotion_train.py
index cd9118ad..4856c364 100644
--- a/tests/tts_tests/test_vits_speaker_emb_with_emotion_train.py
+++ b/tests/tts_tests/test_vits_speaker_emb_with_emotion_train.py
@@ -76,8 +76,8 @@ continue_restore_path, _ = get_last_checkpoint(continue_path)
 out_wav_path = os.path.join(get_tests_output_path(), "output.wav")
 speaker_id = "ljspeech-1"
 emotion_id = "ljspeech-3"
-continue_speakers_path = os.path.join(continue_path, "speakers.json")
-continue_emotion_path = os.path.join(continue_path, "speakers.json")
+continue_speakers_path = config.d_vector_file
+continue_emotion_path = os.path.join(continue_path, "emotions.json")
 inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --emotion_idx {emotion_id} --speakers_file_path {continue_speakers_path} --emotions_file_path {continue_emotion_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}"

diff --git a/tests/tts_tests/test_vits_speaker_emb_with_prosody_encoder.py b/tests/tts_tests/test_vits_speaker_emb_with_prosody_encoder.py
index 6ff4412b..ccd48616 100644
--- a/tests/tts_tests/test_vits_speaker_emb_with_prosody_encoder.py
+++ b/tests/tts_tests/test_vits_speaker_emb_with_prosody_encoder.py
@@ -71,7 +71,7 @@ style_wav_path = "tests/data/ljspeech/wavs/LJ001-0001.wav"

 continue_speakers_path = os.path.join(continue_path, "speakers.json")

-inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path} --gst_style {style_wav_path}"
+inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path} --style_wav {style_wav_path}"
 run_cli(inference_command)

 # restore the model and continue training for one more epoch
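
Usage note: with this change, the prosody/style reference audio is passed through a single --style_wav flag, replacing the separate --gst_style and --capacitron_style_wav options, while --capacitron_style_text still carries the transcription of that reference. A minimal sketch of an inference call exercising the renamed flag, assuming a trained multi-speaker VITS checkpoint and placeholder paths for the model, config, and speakers file (only the speaker id and style wav below are taken from the tests in this patch):

    # Placeholder paths; substitute your own checkpoint, config, and speakers file.
    tts --text 'This is an example.' \
        --speaker_idx ljspeech-1 \
        --speakers_file_path speakers.json \
        --config_path config.json \
        --model_path checkpoint.pth \
        --style_wav tests/data/ljspeech/wavs/LJ001-0001.wav \
        --out_path output.wav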