diff --git a/TTS/tts/models/xtts.py b/TTS/tts/models/xtts.py
index 1fa349cf..f3bebf2c 100644
--- a/TTS/tts/models/xtts.py
+++ b/TTS/tts/models/xtts.py
@@ -377,8 +377,8 @@ class Xtts(BaseTTS):
             sr (int): Sample rate of the audio.
             length (int): Length of the audio in seconds. Defaults to 3.
         """
-
-        audio_22k = torchaudio.functional.resample(audio, sr, 22050)
+        # Resample only when needed; 22.05 kHz input passes through so audio_22k is always bound.
+        audio_22k = audio if sr == 22050 else torchaudio.functional.resample(audio, sr, 22050)
         audio_22k = audio_22k[:, : 22050 * length]
         if self.args.gpt_use_perceiver_resampler:
             mel = wav_to_mel_cloning(audio_22k,
@@ -600,6 +600,7 @@ class Xtts(BaseTTS):
         (gpt_cond_latent, diffusion_conditioning, speaker_embedding) = self.get_conditioning_latents(
             audio_path=ref_audio_path, gpt_cond_len=gpt_cond_len, max_ref_length=max_ref_len, sound_norm_refs=sound_norm_refs
         )
+
         return self.inference(
             text,
             language,