diff --git a/TTS/tts/layers/generic/classifier.py b/TTS/tts/layers/generic/classifier.py
index b09ccdc6..1cd60006 100644
--- a/TTS/tts/layers/generic/classifier.py
+++ b/TTS/tts/layers/generic/classifier.py
@@ -60,4 +60,3 @@ class ReversalClassifier(nn.Module):
         target = labels.repeat(input_mask.size(-1), 1).transpose(0, 1).int().long()
         target[~input_mask] = ignore_index
         return nn.functional.cross_entropy(predictions.transpose(1, 2), target, ignore_index=ignore_index)
-
diff --git a/TTS/tts/layers/losses.py b/TTS/tts/layers/losses.py
index 118286a7..89a11139 100644
--- a/TTS/tts/layers/losses.py
+++ b/TTS/tts/layers/losses.py
@@ -761,20 +761,14 @@ class VitsGeneratorLoss(nn.Module):
 
             loss += kl_vae_loss
             return_dict["loss_kl_vae"] = kl_vae_loss
-    
+
         if end2end_info is not None:
-            # do not compute feature loss because for it we need waves segments with the same length
-            '''loss_feat_end2end = (
-                self.feature_loss(feats_real=end2end_info["feats_disc_real"], feats_generated=end2end_info["feats_disc_fake"]) * self.feat_loss_alpha
-            )
-            return_dict["loss_feat_end2end"] = loss_feat_end2end
-            loss += loss_feat_end2end'''
 
             # gen loss
             loss_gen_end2end = self.generator_loss(scores_fake=end2end_info["scores_disc_fake"])[0] * self.gen_loss_alpha
             return_dict["loss_gen_end2end"] = loss_gen_end2end
             loss += loss_gen_end2end
-    
+
             # if do not uses soft dtw
             if end2end_info["z_predicted"] is not None:
                 # loss KL using GT durations
@@ -793,7 +787,7 @@ class VitsGeneratorLoss(nn.Module):
             else:
                 pass
                 # ToDo: implement soft dtw
-    
+
         # pass losses to the dict
         return_dict["loss_gen"] = loss_gen
         return_dict["loss_kl"] = loss_kl
@@ -854,7 +848,7 @@ class VitsDiscriminatorLoss(nn.Module):
 
             loss_disc_end2end, loss_disc_real_end2end, _ = self.discriminator_loss(
                 scores_real=end2end_info["scores_disc_real"], scores_fake=end2end_info["scores_disc_fake"],
             )
-            return_dict["loss_disc_end2end"] = loss_disc_end2end * self.disc_loss_alpha 
+            return_dict["loss_disc_end2end"] = loss_disc_end2end * self.disc_loss_alpha
             return_dict["loss"] += return_dict["loss_disc_end2end"]
             for i, ldr in enumerate(loss_disc_real_end2end):
diff --git a/TTS/tts/layers/vits/discriminator.py b/TTS/tts/layers/vits/discriminator.py
index 6efcc069..8ec67d1e 100644
--- a/TTS/tts/layers/vits/discriminator.py
+++ b/TTS/tts/layers/vits/discriminator.py
@@ -94,9 +94,9 @@ class VitsDiscriminator(nn.Module):
         mp_scores, zp_scores, mp_feats, zp_feats = None, None, None, None
         if self.disc_latent is not None:
             if m_p is not None:
-                mp_scores, mp_feats = self.disc_latent(m_p.unsqueeze(1))
+                mp_scores, mp_feats = self.disc_latent(m_p.unsqueeze(-1))
             if z_p is not None:
-                zp_scores, zp_feats = self.disc_latent(z_p.unsqueeze(1))
+                zp_scores, zp_feats = self.disc_latent(z_p.unsqueeze(-1))
 
         return x_scores, x_feats, x_hat_scores, x_hat_feats, mp_scores, mp_feats, zp_scores, zp_feats
 
@@ -107,7 +107,6 @@ class LatentDiscriminator(nn.Module):
     def __init__(self, use_spectral_norm=False, hidden_channels=None):
         super().__init__()
         norm_f = nn.utils.spectral_norm if use_spectral_norm else nn.utils.weight_norm
-        self.hidden_channels = hidden_channels
         self.discriminators = nn.ModuleList(
             [
                 norm_f(nn.Conv2d(1 if hidden_channels is None else hidden_channels, 32, kernel_size=(3, 9), padding=(1, 4))),
@@ -122,8 +121,6 @@ class LatentDiscriminator(nn.Module):
 
     def forward(self, y):
         fmap = []
-        if self.hidden_channels is not None:
-            y = y.squeeze(1).unsqueeze(-1)
         for _, d in enumerate(self.discriminators):
             y = d(y)
             y = torch.nn.functional.leaky_relu(y, 0.1)
diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py
index bc96d0c3..ca5d607e 100644
--- a/TTS/tts/models/vits.py
+++ b/TTS/tts/models/vits.py
@@ -20,7 +20,7 @@ from TTS.tts.datasets.dataset import TTSDataset, _parse_sample
 from TTS.tts.layers.generic.classifier import ReversalClassifier
 from TTS.tts.layers.glow_tts.duration_predictor import DurationPredictor
 from TTS.tts.layers.vits.prosody_encoder import VitsGST, VitsVAE
-from TTS.tts.layers.vits.discriminator import VitsDiscriminator, LatentDiscriminator
+from TTS.tts.layers.vits.discriminator import VitsDiscriminator
 from TTS.tts.layers.vits.networks import PosteriorEncoder, ResidualCouplingBlocks, TextEncoder
 from TTS.tts.layers.glow_tts.transformer import RelativePositionTransformer
 from TTS.tts.layers.vits.stochastic_duration_predictor import StochasticDurationPredictor
@@ -556,7 +556,7 @@ class VitsArgs(Coqpit):
     use_prosody_encoder_z_p_input: bool = False
     use_prosody_enc_spk_reversal_classifier: bool = False
     use_prosody_enc_emo_classifier: bool = False
-    
+
     use_noise_scale_predictor: bool = False
 
     use_prosody_conditional_flow_module: bool = False
@@ -567,7 +567,6 @@ class VitsArgs(Coqpit):
     use_soft_dtw: bool = False
 
     use_latent_discriminator: bool = False
-    provide_hidden_dim_on_the_latent_discriminator: bool = False
 
     detach_dp_input: bool = True
     use_language_embedding: bool = False
@@ -686,10 +685,10 @@ class Vits(BaseTTS):
 
         dp_extra_inp_dim = 0
         if (self.args.use_emotion_embedding or self.args.use_external_emotions_embeddings) and not self.args.use_prosody_conditional_flow_module and not self.args.use_noise_scale_predictor:
-            dp_extra_inp_dim += self.args.emotion_embedding_dim 
+            dp_extra_inp_dim += self.args.emotion_embedding_dim
 
         if self.args.use_prosody_encoder and not self.args.use_prosody_conditional_flow_module and not self.args.use_noise_scale_predictor:
-            dp_extra_inp_dim += self.args.prosody_embedding_dim 
+            dp_extra_inp_dim += self.args.prosody_embedding_dim
 
         if self.args.use_sdp:
             self.duration_predictor = StochasticDurationPredictor(
@@ -724,7 +723,7 @@ class Vits(BaseTTS):
                     num_mel=self.args.hidden_channels,
                     capacitron_VAE_embedding_dim=self.args.prosody_embedding_dim,
                 )
-            else: 
+            else:
                 raise RuntimeError(
                     f" [!] The Prosody encoder type {self.args.prosody_encoder_type} is not supported !!"
                 )
@@ -734,7 +733,7 @@ class Vits(BaseTTS):
                 out_channels=self.num_speakers,
                 hidden_channels=256,
             )
-        if self.args.use_prosody_enc_emo_classifier: 
+        if self.args.use_prosody_enc_emo_classifier:
             self.pros_enc_emotion_classifier = ReversalClassifier(
                 in_channels=self.args.prosody_embedding_dim,
                 out_channels=self.num_emotions,
@@ -817,7 +816,7 @@ class Vits(BaseTTS):
                 periods=self.args.periods_multi_period_discriminator,
                 use_spectral_norm=self.args.use_spectral_norm_disriminator,
                 use_latent_disc=self.args.use_latent_discriminator,
-                hidden_channels=self.args.hidden_channels if self.args.provide_hidden_dim_on_the_latent_discriminator else None,
+                hidden_channels=self.args.hidden_channels,
             )
 
     def init_multispeaker(self, config: Coqpit):
@@ -952,7 +951,7 @@ class Vits(BaseTTS):
             if value == before_dict[key]:
                 raise RuntimeError(" [!] The weights of Text Encoder was not reinit check it !")
         print(" > Text Encoder was reinit.")
-    
+
     def init_emotion(self, emotion_manager: EmotionManager):  # pylint: disable=attribute-defined-outside-init
         """Initialize emotion modules of a model.
         A model can be trained either with a emotion embedding layer
@@ -1345,7 +1344,7 @@ class Vits(BaseTTS):
 
             z_p_end2end = self.prosody_conditional_module(z_p_end2end, y_mask_end2end, g=eg if (self.args.use_emotion_embedding or self.args.use_external_emotions_embeddings) else pros_emb, reverse=True)
         z_end2end = self.flow(z_p_end2end, y_mask_end2end, g=g, reverse=True)
-    
+
         # interpolate z if needed
         z_end2end, _, _, y_mask_end2end = self.upsampling_z(z, y_lengths=y_lengths_end2end, y_mask=y_mask_end2end)
         # z_slice_end2end, spec_segment_size, slice_ids_end2end, _ = self.upsampling_z(z_slice_end2end, slice_ids=slice_ids_end2end)
@@ -1505,7 +1504,7 @@ class Vits(BaseTTS):
 
         m_p = torch.matmul(attn.transpose(1, 2), m_p.transpose(1, 2)).transpose(1, 2)
         logs_p = torch.matmul(attn.transpose(1, 2), logs_p.transpose(1, 2)).transpose(1, 2)
-    
+
         if self.args.use_noise_scale_predictor:
             nsp_input = torch.transpose(m_p, 1, -1)
             if self.args.use_prosody_encoder and pros_emb is not None:
@@ -1850,7 +1849,7 @@ class Vits(BaseTTS):
 
         if style_wav and style_speaker_name is None:
             raise RuntimeError(
-                f" [!] You must to provide the style_speaker_name for the style_wav !!"
+                " [!] You must to provide the style_speaker_name for the style_wav !!"
            )
 
         # get speaker id/d_vector
diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py
index 3e328506..4283750d 100644
--- a/TTS/utils/synthesizer.py
+++ b/TTS/utils/synthesizer.py
@@ -423,7 +423,7 @@ class Synthesizer(object):
         source_emotion_feature, target_emotion_feature = None, None
         if source_emotion is not None and target_emotion is not None and not getattr(self.tts_model, "prosody_encoder", False) and (self.tts_emotions_file or (
             getattr(self.tts_model, "emotion_manager", None) and getattr(self.tts_model.emotion_manager, "ids", None)
-        )):
+        )):  # pylint: disable=R0916
             if source_emotion and isinstance(source_emotion, str):
                 if getattr(self.tts_config, "use_external_emotions_embeddings", False) or (
                     getattr(self.tts_config, "model_args", None)
diff --git a/tests/tts_tests/test_vits_speaker_emb_with_prosody_encoder.py b/tests/tts_tests/test_vits_speaker_emb_with_prosody_encoder.py
index 4a160fa5..d167d211 100644
--- a/tests/tts_tests/test_vits_speaker_emb_with_prosody_encoder.py
+++ b/tests/tts_tests/test_vits_speaker_emb_with_prosody_encoder.py
@@ -53,7 +53,6 @@ config.model_args.prosody_encoder_type = "gst"
 config.model_args.detach_prosody_enc_input = True
 
 config.model_args.use_latent_discriminator = True
-config.model_args.provide_hidden_dim_on_the_latent_discriminator = True
 config.model_args.use_noise_scale_predictor = False
 
 # enable end2end loss