From 047cebd7b8c86a71235b8a8cc80e04ff661e0ceb Mon Sep 17 00:00:00 2001
From: Edresson Casanova <edresson1@gmail.com>
Date: Wed, 30 Mar 2022 16:51:39 -0300
Subject: [PATCH] Fix Style tests

---
 TTS/bin/compute_embeddings.py | 16 ++++----
 TTS/bin/eval_encoder.py       |  2 +-
 TTS/bin/synthesize.py         |  2 +-
 TTS/tts/models/base_tts.py    |  9 +++--
 TTS/tts/models/vits.py        | 76 ++++++++++++++++++++++++-----------
 TTS/tts/utils/emotions.py     | 19 ++++++---
 TTS/tts/utils/languages.py    |  1 -
 TTS/tts/utils/speakers.py     |  5 +--
 TTS/tts/utils/synthesis.py    | 13 +++++-
 TTS/utils/synthesizer.py      | 45 ++++++++++++++++-----
 10 files changed, 128 insertions(+), 60 deletions(-)

diff --git a/TTS/bin/compute_embeddings.py b/TTS/bin/compute_embeddings.py
index d749f410..71a419b3 100644
--- a/TTS/bin/compute_embeddings.py
+++ b/TTS/bin/compute_embeddings.py
@@ -1,8 +1,8 @@
 import argparse
 import os
-import torch
 from argparse import RawTextHelpFormatter
 
+import torch
 from tqdm import tqdm
 
 from TTS.config import load_config
@@ -30,11 +30,11 @@ parser.add_argument(
     help="Path to dataset config file.",
 )
 parser.add_argument("output_path", type=str, help="path for output .json file.")
-parser.add_argument(
-    "--old_file", type=str, help="Previous .json file, only compute for new audios.", default=None
-)
+parser.add_argument("--old_file", type=str, help="Previous .json file, only compute for new audios.", default=None)
 parser.add_argument("--use_cuda", type=bool, help="flag to set cuda.", default=True)
-parser.add_argument("--use_predicted_label", type=bool, help="If True and predicted label is available with will use it.", default=False)
+parser.add_argument(
+    "--use_predicted_label", type=bool, help="If True and predicted label is available with will use it.", default=False
+)
 parser.add_argument("--eval", type=bool, help="compute eval.", default=True)
 
 args = parser.parse_args()
@@ -71,7 +71,7 @@ for idx, wav_file in enumerate(tqdm(wav_files)):
         embedd = encoder_manager.compute_embedding_from_clip(wav_file)
 
     if args.use_predicted_label:
-        map_classid_to_classname = getattr(encoder_manager.encoder_config, 'map_classid_to_classname', None)
+        map_classid_to_classname = getattr(encoder_manager.encoder_config, "map_classid_to_classname", None)
         if encoder_manager.encoder_criterion is not None and map_classid_to_classname is not None:
             embedding = torch.FloatTensor(embedd).unsqueeze(0)
             if encoder_manager.use_cuda:
@@ -80,9 +80,7 @@ for idx, wav_file in enumerate(tqdm(wav_files)):
             class_id = encoder_manager.encoder_criterion.softmax.inference(embedding).item()
             class_name = map_classid_to_classname[str(class_id)]
         else:
-            raise RuntimeError(
-                    " [!] use_predicted_label is enable and predicted_labels is not available !!"
-                )
+            raise RuntimeError(" [!] use_predicted_label is enable and predicted_labels is not available !!")
 
     # create class_mapping if target dataset is defined
     class_mapping[wav_file_name] = {}
diff --git a/TTS/bin/eval_encoder.py b/TTS/bin/eval_encoder.py
index 6d6c30d2..7fc67bef 100644
--- a/TTS/bin/eval_encoder.py
+++ b/TTS/bin/eval_encoder.py
@@ -12,7 +12,7 @@ from TTS.tts.utils.speakers import SpeakerManager
 def compute_encoder_accuracy(dataset_items, encoder_manager):
 
     class_name_key = encoder_manager.encoder_config.class_name_key
-    map_classid_to_classname = getattr(encoder_manager.encoder_config, 'map_classid_to_classname', None)
+    map_classid_to_classname = getattr(encoder_manager.encoder_config, "map_classid_to_classname", None)
     class_acc_dict = {}
 
     # compute embeddings for all wav_files
diff --git a/TTS/bin/synthesize.py b/TTS/bin/synthesize.py
index 9c21aa49..bdc83d5e 100755
--- a/TTS/bin/synthesize.py
+++ b/TTS/bin/synthesize.py
@@ -319,7 +319,7 @@ If you don't specify any models, then it uses LJSpeech based English model.
         args.speaker_wav,
         reference_wav=args.reference_wav,
         reference_speaker_name=args.reference_speaker_idx,
-        emotion_name=args.emotion_idx
+        emotion_name=args.emotion_idx,
     )
 
     # save the results
diff --git a/TTS/tts/models/base_tts.py b/TTS/tts/models/base_tts.py
index f0a4032b..71da495c 100644
--- a/TTS/tts/models/base_tts.py
+++ b/TTS/tts/models/base_tts.py
@@ -422,9 +422,12 @@ class BaseTTS(BaseTrainerModel):
 
         if hasattr(self, "emotion_manager") and self.emotion_manager is not None:
             output_path = os.path.join(trainer.output_path, "emotions.json")
-            
+
             if hasattr(trainer.config, "model_args"):
-                if trainer.config.model_args.use_emotion_embedding and not trainer.config.model_args.external_emotions_embs_file:
+                if (
+                    trainer.config.model_args.use_emotion_embedding
+                    and not trainer.config.model_args.external_emotions_embs_file
+                ):
                     self.emotion_manager.save_ids_to_file(output_path)
                     trainer.config.model_args.emotions_ids_file = output_path
                 else:
@@ -440,4 +443,4 @@ class BaseTTS(BaseTrainerModel):
 
             trainer.config.save_json(os.path.join(trainer.output_path, "config.json"))
             print(f" > `emotions.json` is saved to {output_path}.")
-            print(" > `emotions_ids_file` or `external_emotions_embs_file` is updated in the config.json.")
\ No newline at end of file
+            print(" > `emotions_ids_file` or `external_emotions_embs_file` is updated in the config.json.")
diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py
index 40b11b6e..596d28c2 100644
--- a/TTS/tts/models/vits.py
+++ b/TTS/tts/models/vits.py
@@ -22,10 +22,10 @@ from TTS.tts.layers.vits.discriminator import VitsDiscriminator
 from TTS.tts.layers.vits.networks import PosteriorEncoder, ResidualCouplingBlocks, TextEncoder
 from TTS.tts.layers.vits.stochastic_duration_predictor import StochasticDurationPredictor
 from TTS.tts.models.base_tts import BaseTTS
+from TTS.tts.utils.emotions import EmotionManager
 from TTS.tts.utils.helpers import generate_path, maximum_path, rand_segments, segment, sequence_mask
 from TTS.tts.utils.languages import LanguageManager
 from TTS.tts.utils.speakers import SpeakerManager
-from TTS.tts.utils.emotions import EmotionManager
 from TTS.tts.utils.synthesis import synthesis
 from TTS.tts.utils.text.characters import BaseCharacters, _characters, _pad, _phonemes, _punctuations
 from TTS.tts.utils.text.tokenizer import TTSTokenizer
@@ -666,8 +666,8 @@ class Vits(BaseTTS):
     def init_consistency_loss(self):
         if self.args.use_speaker_encoder_as_loss and self.args.use_emotion_encoder_as_loss:
             raise RuntimeError(
-                    " [!] The use of speaker consistency loss (SCL) and emotion consistency loss (ECL) together is not supported, please disable one of those !!"
-                )
+                " [!] The use of speaker consistency loss (SCL) and emotion consistency loss (ECL) together is not supported, please disable one of those !!"
+            )
 
         if self.args.use_speaker_encoder_as_loss:
             if self.speaker_manager.encoder is None and (
@@ -773,8 +773,14 @@ class Vits(BaseTTS):
 
     def get_aux_input(self, aux_input: Dict):
         sid, g, lid, eid, eg = self._set_cond_input(aux_input)
-        return {"speaker_ids": sid, "style_wav": None, "d_vectors": g, "language_ids": lid,
-                "emotion_embeddings": eg, "emotion_ids": eid}
+        return {
+            "speaker_ids": sid,
+            "style_wav": None,
+            "d_vectors": g,
+            "language_ids": lid,
+            "emotion_embeddings": eg,
+            "emotion_ids": eid,
+        }
 
     def _freeze_layers(self):
         if self.args.freeze_encoder:
@@ -886,7 +892,13 @@ class Vits(BaseTTS):
         y: torch.tensor,
         y_lengths: torch.tensor,
         waveform: torch.tensor,
-        aux_input={"d_vectors": None, "speaker_ids": None, "language_ids": None, "emotion_embeddings": None, "emotion_ids": None},
+        aux_input={
+            "d_vectors": None,
+            "speaker_ids": None,
+            "language_ids": None,
+            "emotion_embeddings": None,
+            "emotion_ids": None,
+        },
     ) -> Dict:
         """Forward pass of the model.
 
@@ -940,7 +952,7 @@ class Vits(BaseTTS):
             if g is None:
                 g = eg
             else:
-                g = torch.cat([g, eg], dim=1) # [b, h1+h2, 1]
+                g = torch.cat([g, eg], dim=1)  # [b, h1+h2, 1]
 
         # language embedding
         lang_emb = None
@@ -974,7 +986,9 @@ class Vits(BaseTTS):
         )
 
         if self.args.use_speaker_encoder_as_loss or self.args.use_emotion_encoder_as_loss:
-            encoder = self.speaker_manager.encoder if self.args.use_speaker_encoder_as_loss else self.emotion_manager.encoder
+            encoder = (
+                self.speaker_manager.encoder if self.args.use_speaker_encoder_as_loss else self.emotion_manager.encoder
+            )
             # concate generated and GT waveforms
             wavs_batch = torch.cat((wav_seg, o), dim=0)
 
@@ -1018,7 +1032,16 @@ class Vits(BaseTTS):
         return torch.tensor(x.shape[1:2]).to(x.device)
 
     def inference(
-        self, x, aux_input={"x_lengths": None, "d_vectors": None, "speaker_ids": None, "language_ids": None, "emotion_embeddings": None, "emotion_ids": None}
+        self,
+        x,
+        aux_input={
+            "x_lengths": None,
+            "d_vectors": None,
+            "speaker_ids": None,
+            "language_ids": None,
+            "emotion_embeddings": None,
+            "emotion_ids": None,
+        },
     ):  # pylint: disable=dangerous-default-value
         """
         Note:
@@ -1054,7 +1077,7 @@ class Vits(BaseTTS):
             if g is None:
                 g = eg
             else:
-                g = torch.cat([g, eg], dim=1) # [b, h1+h2, 1]
+                g = torch.cat([g, eg], dim=1)  # [b, h1+h2, 1]
 
         # language embedding
         lang_emb = None
@@ -1187,8 +1210,13 @@ class Vits(BaseTTS):
                 spec,
                 spec_lens,
                 waveform,
-                aux_input={"d_vectors": d_vectors, "speaker_ids": speaker_ids, "language_ids": language_ids,
-                            "emotion_embeddings": emotion_embeddings, "emotion_ids": emotion_ids},
+                aux_input={
+                    "d_vectors": d_vectors,
+                    "speaker_ids": speaker_ids,
+                    "language_ids": language_ids,
+                    "emotion_embeddings": emotion_embeddings,
+                    "emotion_ids": emotion_ids,
+                },
             )
 
             # cache tensors for the generator pass
@@ -1246,7 +1274,8 @@ class Vits(BaseTTS):
                     feats_disc_fake=feats_disc_fake,
                     feats_disc_real=feats_disc_real,
                     loss_duration=self.model_outputs_cache["loss_duration"],
-                    use_encoder_consistency_loss=self.args.use_speaker_encoder_as_loss or self.args.use_emotion_encoder_as_loss,
+                    use_encoder_consistency_loss=self.args.use_speaker_encoder_as_loss
+                    or self.args.use_emotion_encoder_as_loss,
                     gt_cons_emb=self.model_outputs_cache["gt_cons_emb"],
                     syn_cons_emb=self.model_outputs_cache["syn_cons_emb"],
                 )
@@ -1348,14 +1377,15 @@ class Vits(BaseTTS):
                 if emotion_name is None:
                     emotion_embedding = self.emotion_manager.get_random_embeddings()
                 else:
-                    emotion_embedding = self.emotion_manager.get_mean_embedding(emotion_name, num_samples=None, randomize=False)
+                    emotion_embedding = self.emotion_manager.get_mean_embedding(
+                        emotion_name, num_samples=None, randomize=False
+                    )
             elif config.use_emotion_embedding:
                 if emotion_name is None:
                     emotion_id = self.emotion_manager.get_random_id()
                 else:
                     emotion_id = self.emotion_manager.ids[emotion_name]
 
-
         return {
             "text": text,
             "speaker_id": speaker_id,
@@ -1364,7 +1394,7 @@ class Vits(BaseTTS):
             "language_id": language_id,
             "language_name": language_name,
             "emotion_embedding": emotion_embedding,
-            "emotion_ids": emotion_id
+            "emotion_ids": emotion_id,
         }
 
     @torch.no_grad()
@@ -1436,7 +1466,11 @@ class Vits(BaseTTS):
             language_ids = torch.LongTensor(language_ids)
 
         # get emotion embedding
-        if self.emotion_manager is not None and self.emotion_manager.embeddings and self.args.use_external_emotions_embeddings:
+        if (
+            self.emotion_manager is not None
+            and self.emotion_manager.embeddings
+            and self.args.use_external_emotions_embeddings
+        ):
             emotion_mapping = self.emotion_manager.embeddings
             emotion_embeddings = [emotion_mapping[w]["embedding"] for w in batch["audio_files"]]
             emotion_embeddings = torch.FloatTensor(emotion_embeddings)
@@ -1627,13 +1661,9 @@ class Vits(BaseTTS):
         emotion_manager = EmotionManager.init_from_config(config)
 
         if config.model_args.encoder_model_path and speaker_manager is not None:
-            speaker_manager.init_encoder(
-                config.model_args.encoder_model_path, config.model_args.encoder_config_path
-            )
+            speaker_manager.init_encoder(config.model_args.encoder_model_path, config.model_args.encoder_config_path)
         elif config.model_args.encoder_model_path and emotion_manager is not None:
-            emotion_manager.init_encoder(
-                config.model_args.encoder_model_path, config.model_args.encoder_config_path
-            )
+            emotion_manager.init_encoder(config.model_args.encoder_model_path, config.model_args.encoder_config_path)
 
         return Vits(new_config, ap, tokenizer, speaker_manager, language_manager, emotion_manager=emotion_manager)
 
diff --git a/TTS/tts/utils/emotions.py b/TTS/tts/utils/emotions.py
index d655ba03..909772ad 100644
--- a/TTS/tts/utils/emotions.py
+++ b/TTS/tts/utils/emotions.py
@@ -8,6 +8,7 @@ from coqpit import Coqpit
 from TTS.config import get_from_config_or_model_args_with_default
 from TTS.tts.utils.managers import EmbeddingManager
 
+
 class EmotionManager(EmbeddingManager):
     """Manage the emotions for emotional TTS. Load a datafile and parse the information
     in a way that can be queried by emotion or clip.
@@ -59,8 +60,8 @@ class EmotionManager(EmbeddingManager):
             id_file_path=emotion_id_file_path,
             encoder_model_path=encoder_model_path,
             encoder_config_path=encoder_config_path,
-            use_cuda=use_cuda
-            )
+            use_cuda=use_cuda,
+        )
 
     @property
     def num_emotions(self):
@@ -98,13 +99,17 @@ class EmotionManager(EmbeddingManager):
                 )
             elif get_from_config_or_model_args_with_default(config, "external_emotions_embs_file", None):
                 emotion_manager = EmotionManager(
-                    embeddings_file_path=get_from_config_or_model_args_with_default(config, "external_emotions_embs_file", None)
+                    embeddings_file_path=get_from_config_or_model_args_with_default(
+                        config, "external_emotions_embs_file", None
+                    )
                 )
 
         if get_from_config_or_model_args_with_default(config, "use_external_emotions_embeddings", False):
             if get_from_config_or_model_args_with_default(config, "external_emotions_embs_file", None):
                 emotion_manager = EmotionManager(
-                    embeddings_file_path=get_from_config_or_model_args_with_default(config, "external_emotions_embs_file", None)
+                    embeddings_file_path=get_from_config_or_model_args_with_default(
+                        config, "external_emotions_embs_file", None
+                    )
                 )
 
         return emotion_manager
@@ -159,7 +164,9 @@ def get_emotion_manager(c: Coqpit, restore_path: str = None, out_path: str = Non
         if c.use_external_emotions_embeddings:
             # restore emotion manager with the embedding file
             if not os.path.exists(emotions_ids_file):
-                print("WARNING: emotions.json was not found in restore_path, trying to use CONFIG.external_emotions_embs_file")
+                print(
+                    "WARNING: emotions.json was not found in restore_path, trying to use CONFIG.external_emotions_embs_file"
+                )
                 if not os.path.exists(c.external_emotions_embs_file):
                     raise RuntimeError(
                         "You must copy the file emotions.json to restore_path, or set a valid file in CONFIG.external_emotions_embs_file"
@@ -177,7 +184,7 @@ def get_emotion_manager(c: Coqpit, restore_path: str = None, out_path: str = Non
     elif c.use_emotion_embedding:
         if "emotions_ids_file" in c and c.emotions_ids_file:
             emotion_manager.load_ids_from_file(c.emotions_ids_file)
-        else: # enable get ids from eternal embedding files
+        else:  # enable get ids from external embedding files
             emotion_manager.load_embeddings_from_file(c.external_emotions_embs_file)
 
     if emotion_manager.num_emotions > 0:
diff --git a/TTS/tts/utils/languages.py b/TTS/tts/utils/languages.py
index 5f92e7f0..9b5e2007 100644
--- a/TTS/tts/utils/languages.py
+++ b/TTS/tts/utils/languages.py
@@ -1,7 +1,6 @@
 import os
 from typing import Any, Dict, List
 
-
 import fsspec
 import numpy as np
 import torch
diff --git a/TTS/tts/utils/speakers.py b/TTS/tts/utils/speakers.py
index 73588897..284d0179 100644
--- a/TTS/tts/utils/speakers.py
+++ b/TTS/tts/utils/speakers.py
@@ -65,9 +65,8 @@ class SpeakerManager(EmbeddingManager):
             id_file_path=speaker_id_file_path,
             encoder_model_path=encoder_model_path,
             encoder_config_path=encoder_config_path,
-            use_cuda=use_cuda
-            )
-
+            use_cuda=use_cuda,
+        )
 
         if data_items:
             self.set_ids_from_data(data_items, parse_key="speaker_name")
diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py
index e9940800..4208b6aa 100644
--- a/TTS/tts/utils/synthesis.py
+++ b/TTS/tts/utils/synthesis.py
@@ -124,7 +124,7 @@ def synthesis(
     d_vector=None,
     language_id=None,
     emotion_id=None,
-    emotion_embedding=None
+    emotion_embedding=None,
 ):
     """Synthesize voice for the given text using Griffin-Lim vocoder or just compute output features to be passed to
     the vocoder model.
@@ -193,7 +193,16 @@ def synthesis(
     text_inputs = numpy_to_torch(text_inputs, torch.long, cuda=use_cuda)
     text_inputs = text_inputs.unsqueeze(0)
     # synthesize voice
-    outputs = run_model_torch(model, text_inputs, speaker_id, style_mel, d_vector=d_vector, language_id=language_id, emotion_id=emotion_id, emotion_embedding=emotion_embedding)
+    outputs = run_model_torch(
+        model,
+        text_inputs,
+        speaker_id,
+        style_mel,
+        d_vector=d_vector,
+        language_id=language_id,
+        emotion_id=emotion_id,
+        emotion_embedding=emotion_embedding,
+    )
     model_outputs = outputs["model_outputs"]
     model_outputs = model_outputs[0].data.cpu().numpy()
     alignments = outputs["alignments"]
diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py
index 62c0ad6a..2763d14e 100644
--- a/TTS/utils/synthesizer.py
+++ b/TTS/utils/synthesizer.py
@@ -121,26 +121,42 @@ class Synthesizer(object):
         if use_cuda:
             self.tts_model.cuda()
 
-        if self.encoder_checkpoint and hasattr(self.tts_model, "speaker_manager") and self.tts_model.speaker_manager is not None:
+        if (
+            self.encoder_checkpoint
+            and hasattr(self.tts_model, "speaker_manager")
+            and self.tts_model.speaker_manager is not None
+        ):
             self.tts_model.speaker_manager.init_encoder(self.encoder_checkpoint, self.encoder_config)
 
-        if self.tts_emotions_file and hasattr(self.tts_model, "emotion_manager") and self.tts_model.emotion_manager is not None:
-            if getattr(self.tts_config, "use_external_emotions_embeddings", False) or (getattr(self.tts_config, "model_args", None) and getattr(self.tts_config.model_args, "use_external_emotions_embeddings", False)):
+        if (
+            self.tts_emotions_file
+            and hasattr(self.tts_model, "emotion_manager")
+            and self.tts_model.emotion_manager is not None
+        ):
+            if getattr(self.tts_config, "use_external_emotions_embeddings", False) or (
+                getattr(self.tts_config, "model_args", None)
+                and getattr(self.tts_config.model_args, "use_external_emotions_embeddings", False)
+            ):
                 self.tts_model.emotion_manager.load_embeddings_from_file(self.tts_emotions_file)
             else:
                 self.tts_model.emotion_manager.load_ids_from_file(self.tts_emotions_file)
 
-        if self.tts_speakers_file and hasattr(self.tts_model, "speaker_manager") and self.tts_model.speaker_manager is not None:
-            if getattr(self.tts_config, "use_d_vector_file", False) or (getattr(self.tts_config, "model_args", None) and getattr(self.tts_config.model_args, "use_d_vector_file", False)):
+        if (
+            self.tts_speakers_file
+            and hasattr(self.tts_model, "speaker_manager")
+            and self.tts_model.speaker_manager is not None
+        ):
+            if getattr(self.tts_config, "use_d_vector_file", False) or (
+                getattr(self.tts_config, "model_args", None)
+                and getattr(self.tts_config.model_args, "use_d_vector_file", False)
+            ):
                 self.tts_model.speaker_manager.load_embeddings_from_file(self.tts_speakers_file)
             else:
                 self.tts_model.speaker_manager.load_ids_from_file(self.tts_speakers_file)
 
     def _set_speaker_encoder_paths_from_tts_config(self):
         """Set the encoder paths from the tts model config for models with speaker encoders."""
-        if hasattr(self.tts_config, "model_args") and hasattr(
-            self.tts_config.model_args, "encoder_config_path"
-        ):
+        if hasattr(self.tts_config, "model_args") and hasattr(self.tts_config.model_args, "encoder_config_path"):
             self.encoder_checkpoint = self.tts_config.model_args.encoder_model_path
             self.encoder_config = self.tts_config.model_args.encoder_config_path
 
@@ -273,11 +289,18 @@ class Synthesizer(object):
 
         # handle emotion
         emotion_embedding, emotion_id = None, None
-        if self.tts_emotions_file or (getattr(self.tts_model, "emotion_manager", None) and getattr(self.tts_model.emotion_manager, "ids", None)):
+        if self.tts_emotions_file or (
+            getattr(self.tts_model, "emotion_manager", None) and getattr(self.tts_model.emotion_manager, "ids", None)
+        ):
             if emotion_name and isinstance(emotion_name, str):
-                if getattr(self.tts_config, "use_external_emotions_embeddings", False) or (getattr(self.tts_config, "model_args", None) and getattr(self.tts_config.model_args, "use_external_emotions_embeddings", False)):
+                if getattr(self.tts_config, "use_external_emotions_embeddings", False) or (
+                    getattr(self.tts_config, "model_args", None)
+                    and getattr(self.tts_config.model_args, "use_external_emotions_embeddings", False)
+                ):
                     # get the average speaker embedding from the saved embeddings.
-                    emotion_embedding = self.tts_model.emotion_manager.get_mean_embedding(emotion_name, num_samples=None, randomize=False)
+                    emotion_embedding = self.tts_model.emotion_manager.get_mean_embedding(
+                        emotion_name, num_samples=None, randomize=False
+                    )
                     emotion_embedding = np.array(emotion_embedding)[None, :]  # [1 x embedding_dim]
                 else:
                     # get speaker idx from the speaker name