Fix resnet speaker encoder

2021-12-30 14:41:43 +00:00 · 2021-12-30 14:41:43 +00:00 · 36cef5966b
parent 348b5c96a2
commit 36cef5966b
3 changed files with 3 additions and 6 deletions
--- a/TTS/speaker_encoder/models/resnet.py
+++ b/TTS/speaker_encoder/models/resnet.py
@ -198,7 +198,7 @@ class ResNetSpeakerEncoder(nn.Module):
            l2_norm (bool): Whether to L2-normalize the outputs.

        Shapes:
-            - x: :math:`(N, 1, T_{in})`
+            - x: :math:`(N, 1, T_{in})` or :math:`(N, D_{spec}, T_{in})`
        """
        with torch.no_grad():
            with torch.cuda.amp.autocast(enabled=False):
@ -206,8 +206,6 @@ class ResNetSpeakerEncoder(nn.Module):
                # if you torch spec compute it otherwise use the mel spec computed by the AP
                if self.use_torch_spec:
                    x = self.torch_spec(x)
-                else:
-                    x = x.transpose(1, 2)

                if self.log_input:
                    x = (x + 1e-6).log()
--- a/TTS/tts/models/vits.py
+++ b/TTS/tts/models/vits.py
@ -417,8 +417,7 @@ class Vits(BaseTTS):

            if (
                hasattr(self.speaker_manager.speaker_encoder, "audio_config")
-                and self.config.audio["sample_rate"]
-                != self.speaker_manager.speaker_encoder.audio_config["sample_rate"]
+                and self.config.audio["sample_rate"] != self.speaker_manager.speaker_encoder.audio_config["sample_rate"]
            ):
                # TODO: change this with torchaudio Resample
                raise RuntimeError(
--- a/tests/tts_tests/test_vits.py
+++ b/tests/tts_tests/test_vits.py
@ -19,7 +19,7 @@ use_cuda = torch.cuda.is_available()
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


-#pylint: disable=no-self-use
+# pylint: disable=no-self-use
 class TestVits(unittest.TestCase):
    def test_init_multispeaker(self):
        num_speakers = 10