Fix rebase issues

Edresson Casanova 2022-05-20 18:29:39 -03:00
parent 98c2834b17
commit 0fb1b200c6
8 changed files with 29 additions and 26 deletions

View File

@@ -25,7 +25,9 @@ parser.add_argument("--output_path", type=str, help="Path for output `pth` or `j
 parser.add_argument("--old_file", type=str, help="Previous embedding file to only compute new audios.", default=None)
 parser.add_argument("--disable_cuda", type=bool, help="Flag to disable cuda.", default=False)
 parser.add_argument("--no_eval", type=bool, help="Do not compute eval?. Default False", default=False)
-parser.add_argument("--use_predicted_label", type=bool, help="If True and predicted label is available with will use it.", default=False)
+parser.add_argument(
+    "--use_predicted_label", type=bool, help="If True and predicted label is available with will use it.", default=False
+)
 args = parser.parse_args()

View File

@@ -178,14 +178,10 @@ If you don't specify any models, then it uses LJSpeech based English model.
     help="wav file(s) to condition a multi-speaker TTS model with a Speaker Encoder. You can give multiple file paths. The d_vectors is computed as their average.",
     default=None,
 )
 parser.add_argument("--gst_style", help="Wav path file for GST style reference.", default=None)
-<<<<<<< HEAD
 parser.add_argument("--style_wav", type=str, help="Wav path file for prosody reference.", default=None)
 parser.add_argument(
-    "--capacitron_style_wav", type=str, help="Wav path file for Capacitron prosody reference.", default=None
+    "--capacitron_style_text", type=str, help="Transcription of the style_wav reference.", default=None
 )
-parser.add_argument("--capacitron_style_text", type=str, help="Transcription of the reference.", default=None)
-=======
->>>>>>> 3a524b05... Add prosody encoder params on config
 parser.add_argument(
     "--list_speaker_idxs",
     help="List available speaker ids for the defined multi-speaker model.",
@@ -324,9 +320,8 @@ If you don't specify any models, then it uses LJSpeech based English model.
         args.speaker_idx,
         args.language_idx,
         args.speaker_wav,
-        style_wav=args.gst_style,
         reference_wav=args.reference_wav,
-        style_wav=args.capacitron_style_wav,
+        style_wav=args.style_wav,
         style_text=args.capacitron_style_text,
         reference_speaker_name=args.reference_speaker_idx,
         emotion_name=args.emotion_idx,
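
As resolved above, a prosody reference wav is now passed through the generic --style_wav flag and --capacitron_style_wav is dropped. A minimal sketch of how this could be exercised, modelled on the repository's CLI tests; the config, checkpoint, and output paths below are placeholders, not files from this commit:

from tests import get_device_id, run_cli

# Reference audio whose prosody should be transferred (path taken from the repo's test data).
style_wav_path = "tests/data/ljspeech/wavs/LJ001-0001.wav"
inference_command = (
    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' "
    f"--config_path config.json --model_path checkpoint.pth --out_path output.wav "
    f"--style_wav {style_wav_path}"
)
run_cli(inference_command)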

View File

@@ -12,16 +12,16 @@ from TTS.tts.datasets.formatters import *
 def split_dataset(items, eval_split_max_size=None, eval_split_size=0.01):
     """Split a dataset into train and eval. Consider speaker distribution in multi-speaker training.
-    Args:
-        items (List[List]):
-            A list of samples. Each sample is a dict containing the keys "text", "audio_file", and "speaker_name".
+    Args:
+        items (List[List]):
+            A list of samples. Each sample is a dict containing the keys "text", "audio_file", and "speaker_name".
-        eval_split_max_size (int):
-            Number maximum of samples to be used for evaluation in proportion split. Defaults to None (Disabled).
+        eval_split_max_size (int):
+            Number maximum of samples to be used for evaluation in proportion split. Defaults to None (Disabled).
-        eval_split_size (float):
-            If between 0.0 and 1.0 represents the proportion of the dataset to include in the evaluation set.
-            If > 1, represents the absolute number of evaluation samples. Defaults to 0.01 (1%).
+        eval_split_size (float):
+            If between 0.0 and 1.0 represents the proportion of the dataset to include in the evaluation set.
+            If > 1, represents the absolute number of evaluation samples. Defaults to 0.01 (1%).
     """
     speakers = [item["speaker_name"] for item in items]
     is_multi_speaker = len(set(speakers)) > 1
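
For reference, a small usage sketch of split_dataset as documented above. The sample items are made up, and the (eval, train) return order is assumed from the upstream implementation rather than shown in this hunk:

from TTS.tts.datasets import split_dataset

# Toy samples with the documented keys; two speakers so the multi-speaker path is taken.
items = [
    {"text": f"sentence {i}", "audio_file": f"wavs/{i:04d}.wav", "speaker_name": f"spk_{i % 2}"}
    for i in range(100)
]

# Reserve 10% of the samples for evaluation; an absolute count (> 1) is also accepted.
eval_samples, train_samples = split_dataset(items, eval_split_max_size=None, eval_split_size=0.1)
print(len(eval_samples), len(train_samples))  # roughly 10 and 90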

View File

@@ -700,7 +700,9 @@ class Vits(BaseTTS):
         if self.args.use_text_enc_spk_reversal_classifier:
             self.speaker_text_enc_reversal_classifier = ReversalClassifier(
-                in_channels=self.args.hidden_channels + self.args.emotion_embedding_dim + self.args.prosody_embedding_dim,
+                in_channels=self.args.hidden_channels
+                + self.args.emotion_embedding_dim
+                + self.args.prosody_embedding_dim,
                 out_channels=self.num_speakers,
                 hidden_channels=256,
             )
@@ -1483,7 +1485,7 @@
                 gt_cons_emb=self.model_outputs_cache["gt_cons_emb"],
                 syn_cons_emb=self.model_outputs_cache["syn_cons_emb"],
                 loss_prosody_enc_spk_rev_classifier=self.model_outputs_cache["loss_prosody_enc_spk_rev_classifier"],
-                loss_text_enc_spk_rev_classifier=self.model_outputs_cache["loss_text_enc_spk_rev_classifier"]
+                loss_text_enc_spk_rev_classifier=self.model_outputs_cache["loss_text_enc_spk_rev_classifier"],
             )
         return self.model_outputs_cache, loss_dict
@@ -1654,7 +1656,11 @@
         if (
             self.speaker_manager is not None
             and self.speaker_manager.ids
-            and (self.args.use_speaker_embedding or self.args.use_prosody_encoder or self.args.use_text_enc_spk_reversal_classifier)
+            and (
+                self.args.use_speaker_embedding
+                or self.args.use_prosody_encoder
+                or self.args.use_text_enc_spk_reversal_classifier
+            )
         ):
             speaker_ids = [self.speaker_manager.ids[sn] for sn in batch["speaker_names"]]
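
The re-wrapped in_channels expression above only changes formatting; the value is still the sum of the text-encoder width and the two embedding sizes. A worked example with hypothetical dimensions (not values from this commit):

# Hypothetical config values, for illustration only.
hidden_channels = 192
emotion_embedding_dim = 64
prosody_embedding_dim = 64

# Classifier input: text-encoder features concatenated with the emotion and prosody embeddings.
in_channels = hidden_channels + emotion_embedding_dim + prosody_embedding_dim
print(in_channels)  # 320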

View File

@@ -181,7 +181,7 @@ def synthesis(
         style_feature = compute_style_feature(style_wav, model.ap, cuda=use_cuda)
         style_feature = style_feature.transpose(1, 2)  # [1, time, depth]
-    if hasattr(model, 'compute_style_feature') and style_wav is not None:
+    if hasattr(model, "compute_style_feature") and style_wav is not None:
         style_feature = model.compute_style_feature(style_wav)
     # convert text to sequence of token IDs

View File

@@ -218,8 +218,8 @@ class ModelManager(object):
         output_stats_path = os.path.join(output_path, "scale_stats.npy")
         output_d_vector_file_path = os.path.join(output_path, "speakers.json")
         output_speaker_ids_file_path = os.path.join(output_path, "speaker_ids.json")
-        speaker_encoder_config_path = os.path.join(output_path, "config_se.json")
-        speaker_encoder_model_path = self._find_speaker_encoder(output_path)
+        encoder_config_path = os.path.join(output_path, "config_se.json")
+        encoder_model_path = self._find_speaker_encoder(output_path)
         # update the scale_path.npy file path in the model config.json
         self._update_path("audio.stats_path", output_stats_path, config_path)

View File

@@ -76,8 +76,8 @@ continue_restore_path, _ = get_last_checkpoint(continue_path)
 out_wav_path = os.path.join(get_tests_output_path(), "output.wav")
 speaker_id = "ljspeech-1"
 emotion_id = "ljspeech-3"
-continue_speakers_path = os.path.join(continue_path, "speakers.json")
-continue_emotion_path = os.path.join(continue_path, "speakers.json")
+continue_speakers_path = config.d_vector_file
+continue_emotion_path = os.path.join(continue_path, "emotions.json")
 inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --emotion_idx {emotion_id} --speakers_file_path {continue_speakers_path} --emotions_file_path {continue_emotion_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}"

View File

@@ -71,7 +71,7 @@ style_wav_path = "tests/data/ljspeech/wavs/LJ001-0001.wav"
 continue_speakers_path = os.path.join(continue_path, "speakers.json")
-inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path} --gst_style {style_wav_path}"
+inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path} --style_wav {style_wav_path}"
 run_cli(inference_command)
 # restore the model and continue training for one more epoch