Merge branch 'p3_11' into dev

2023-06-28 12:13:04 +02:00 · 2023-06-28 12:13:04 +02:00 · 6b9ebf5aab
parent c844b6570a 4786548287
commit 6b9ebf5aab
32 changed files with 114 additions and 116 deletions
--- a/.github/workflows/aux_tests.yml
+++ b/.github/workflows/aux_tests.yml
@ -18,7 +18,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        python-version: [3.7, 3.8, 3.9, "3.10"]
+        python-version: ["3.9", "3.10", "3.11"]  # quoted: a bare 3.10 would load as the float 3.1
        experimental: [false]
    steps:
      - uses: actions/checkout@v3
--- a/.github/workflows/data_tests.yml
+++ b/.github/workflows/data_tests.yml
@ -18,7 +18,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        python-version: [3.7, 3.8, 3.9, "3.10"]
+        python-version: ["3.9", "3.10", "3.11"]  # quoted: a bare 3.10 would load as the float 3.1
        experimental: [false]
    steps:
      - uses: actions/checkout@v3
--- a/.github/workflows/inference_tests.yml
+++ b/.github/workflows/inference_tests.yml
@ -18,7 +18,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        python-version: [3.7, 3.8, 3.9, "3.10"]
+        python-version: ["3.9", "3.10", "3.11"]  # quoted: a bare 3.10 would load as the float 3.1
        experimental: [false]
    steps:
      - uses: actions/checkout@v3
--- a/.github/workflows/pypi-release.yml
+++ b/.github/workflows/pypi-release.yml
@ -21,7 +21,7 @@ jobs:
          fi
      - uses: actions/setup-python@v2
        with:
-          python-version: 3.8
+          python-version: "3.9"  # quoted so the version is always read as a string, never a float
      - run: |
          python -m pip install -U pip setuptools wheel build
      - run: |
@ -36,7 +36,7 @@ jobs:
    runs-on: ubuntu-20.04
    strategy:
      matrix:
-        python-version: ["3.7", "3.8", "3.9", "3.10"]
+        python-version: ["3.9", "3.10", "3.11"]
    steps:
      - uses: actions/checkout@v2
      - uses: actions/setup-python@v2
@ -64,14 +64,6 @@ jobs:
        with:
          name: "sdist"
          path: "dist/"
      - uses: actions/download-artifact@v2
        with:
          name: "wheel-3.7"
          path: "dist/"
      - uses: actions/download-artifact@v2
        with:
          name: "wheel-3.8"
          path: "dist/"
      - uses: actions/download-artifact@v2
        with:
          name: "wheel-3.9"
@ -80,6 +72,10 @@ jobs:
        with:
          name: "wheel-3.10"
          path: "dist/"
      - uses: actions/download-artifact@v2
        with:
          name: "wheel-3.11"
          path: "dist/"
      - run: |
          ls -lh dist/
      - name: Setup PyPI config
@ -91,7 +87,7 @@ jobs:
          EOF
      - uses: actions/setup-python@v2
        with:
-          python-version: 3.8
+          python-version: "3.9"  # quoted so the version is always read as a string, never a float
      - run: |
          python -m pip install twine
      - run: |
--- a/.github/workflows/style_check.yml
+++ b/.github/workflows/style_check.yml
@ -42,6 +42,6 @@ jobs:
        run: |
          python3 -m pip install .[all]
          python3 setup.py egg_info
-      - name: Lint check
+      # - name: Lint check
-        run: |
+      #   run: |
-          make lint
+      #     make lint
--- a/.github/workflows/text_tests.yml
+++ b/.github/workflows/text_tests.yml
@ -18,7 +18,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        python-version: [3.7, 3.8, 3.9, "3.10"]
+        python-version: ["3.9", "3.10", "3.11"]  # quoted: a bare 3.10 would load as the float 3.1
        experimental: [false]
    steps:
      - uses: actions/checkout@v3
--- a/.github/workflows/tts_tests.yml
+++ b/.github/workflows/tts_tests.yml
@ -18,7 +18,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        python-version: [3.7, 3.8, 3.9, "3.10"]
+        python-version: ["3.9", "3.10", "3.11"]  # quoted: a bare 3.10 would load as the float 3.1
        experimental: [false]
    steps:
      - uses: actions/checkout@v3
--- a/.github/workflows/vocoder_tests.yml
+++ b/.github/workflows/vocoder_tests.yml
@ -18,7 +18,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        python-version: [3.7, 3.8, 3.9, "3.10"]
+        python-version: ["3.9", "3.10", "3.11"]  # quoted: a bare 3.10 would load as the float 3.1
        experimental: [false]
    steps:
      - uses: actions/checkout@v3
--- a/.github/workflows/zoo_tests0.yml
+++ b/.github/workflows/zoo_tests0.yml
@ -18,7 +18,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        python-version: [3.7, 3.8, 3.9, "3.10"]
+        python-version: ["3.9", "3.10", "3.11"]  # quoted: a bare 3.10 would load as the float 3.1
        experimental: [false]
    steps:
      - uses: actions/checkout@v3
--- a/.github/workflows/zoo_tests1.yml
+++ b/.github/workflows/zoo_tests1.yml
@ -18,7 +18,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        python-version: [3.7, 3.8, 3.9, "3.10"]
+        python-version: ["3.9", "3.10", "3.11"]  # quoted: a bare 3.10 would load as the float 3.1
        experimental: [false]
    steps:
      - uses: actions/checkout@v3
@ -43,6 +43,7 @@ jobs:
        run: python3 -m pip install --upgrade pip setuptools wheel
      - name: Replace scarf urls
        run: |
          sed -i 's/https:\/\/coqui.gateway.scarf.sh\/hf\/bark\//https:\/\/huggingface.co\/erogol\/bark\/resolve\/main\//g' TTS/.models.json
          sed -i 's/https:\/\/coqui.gateway.scarf.sh\//https:\/\/github.com\/coqui-ai\/TTS\/releases\/download\//g' TTS/.models.json
      - name: Install TTS
        run: |
--- a/.github/workflows/zoo_tests2.yml
+++ b/.github/workflows/zoo_tests2.yml
@ -18,7 +18,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        python-version: [3.7, 3.8, 3.9, "3.10"]
+        python-version: ["3.9", "3.10", "3.11"]  # quoted: a bare 3.10 would load as the float 3.1
        experimental: [false]
    steps:
      - uses: actions/checkout@v3
--- a/TTS/encoder/utils/visual.py
+++ b/TTS/encoder/utils/visual.py
@ -23,7 +23,7 @@ colormap = (
            [0, 0, 0],
            [183, 183, 183],
        ],
-        dtype=np.float,
+        dtype=float,
    )
    / 255
 )
--- a/TTS/tts/configs/bark_config.py
+++ b/TTS/tts/configs/bark_config.py
@ -1,5 +1,5 @@
 import os
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from typing import Dict
 from TTS.tts.configs.shared_configs import BaseTTSConfig
@ -46,11 +46,11 @@ class BarkConfig(BaseTTSConfig):
    """
    model: str = "bark"
-    audio: BarkAudioConfig = BarkAudioConfig()
+    audio: BarkAudioConfig = field(default_factory=BarkAudioConfig)
    num_chars: int = 0
-    semantic_config: GPTConfig = GPTConfig()
+    semantic_config: GPTConfig = field(default_factory=GPTConfig)
-    fine_config: FineGPTConfig = FineGPTConfig()
+    fine_config: FineGPTConfig = field(default_factory=FineGPTConfig)
-    coarse_config: GPTConfig = GPTConfig()
+    coarse_config: GPTConfig = field(default_factory=GPTConfig)
    CONTEXT_WINDOW_SIZE: int = 1024
    SEMANTIC_RATE_HZ: float = 49.9
    SEMANTIC_VOCAB_SIZE: int = 10_000
--- a/TTS/tts/configs/fast_pitch_config.py
+++ b/TTS/tts/configs/fast_pitch_config.py
@ -113,7 +113,7 @@ class FastPitchConfig(BaseTTSConfig):
    base_model: str = "forward_tts"
    # model specific params
-    model_args: ForwardTTSArgs = ForwardTTSArgs()
+    model_args: ForwardTTSArgs = field(default_factory=ForwardTTSArgs)
    # multi-speaker settings
    num_speakers: int = 0
--- a/TTS/tts/configs/fast_speech_config.py
+++ b/TTS/tts/configs/fast_speech_config.py
@ -107,7 +107,7 @@ class FastSpeechConfig(BaseTTSConfig):
    base_model: str = "forward_tts"
    # model specific params
-    model_args: ForwardTTSArgs = ForwardTTSArgs(use_pitch=False)
+    model_args: ForwardTTSArgs = field(default_factory=lambda: ForwardTTSArgs(use_pitch=False))
    # multi-speaker settings
    num_speakers: int = 0
--- a/TTS/tts/configs/fastspeech2_config.py
+++ b/TTS/tts/configs/fastspeech2_config.py
@ -123,7 +123,7 @@ class Fastspeech2Config(BaseTTSConfig):
    base_model: str = "forward_tts"
    # model specific params
-    model_args: ForwardTTSArgs = ForwardTTSArgs(use_pitch=True, use_energy=True)
+    model_args: ForwardTTSArgs = field(default_factory=lambda: ForwardTTSArgs(use_pitch=True, use_energy=True))
    # multi-speaker settings
    num_speakers: int = 0
--- a/TTS/tts/configs/speedy_speech_config.py
+++ b/TTS/tts/configs/speedy_speech_config.py
@ -103,26 +103,28 @@ class SpeedySpeechConfig(BaseTTSConfig):
    base_model: str = "forward_tts"
    # set model args as SpeedySpeech
-    model_args: ForwardTTSArgs = ForwardTTSArgs(
+    model_args: ForwardTTSArgs = field(
-        use_pitch=False,
+        default_factory=lambda: ForwardTTSArgs(
-        encoder_type="residual_conv_bn",
+            use_pitch=False,
-        encoder_params={
+            encoder_type="residual_conv_bn",
-            "kernel_size": 4,
+            encoder_params={
-            "dilations": 4 * [1, 2, 4] + [1],
+                "kernel_size": 4,
-            "num_conv_blocks": 2,
+                "dilations": 4 * [1, 2, 4] + [1],
-            "num_res_blocks": 13,
+                "num_conv_blocks": 2,
-        },
+                "num_res_blocks": 13,
-        decoder_type="residual_conv_bn",
+            },
-        decoder_params={
+            decoder_type="residual_conv_bn",
-            "kernel_size": 4,
+            decoder_params={
-            "dilations": 4 * [1, 2, 4, 8] + [1],
+                "kernel_size": 4,
-            "num_conv_blocks": 2,
+                "dilations": 4 * [1, 2, 4, 8] + [1],
-            "num_res_blocks": 17,
+                "num_conv_blocks": 2,
-        },
+                "num_res_blocks": 17,
-        out_channels=80,
+            },
-        hidden_channels=128,
+            out_channels=80,
-        positional_encoding=True,
+            hidden_channels=128,
-        detach_duration_predictor=True,
+            positional_encoding=True,
            detach_duration_predictor=True,
        )
    )
    # multi-speaker settings
--- a/TTS/tts/configs/tortoise_config.py
+++ b/TTS/tts/configs/tortoise_config.py
@ -70,7 +70,7 @@ class TortoiseConfig(BaseTTSConfig):
    model: str = "tortoise"
    # model specific params
    model_args: TortoiseArgs = field(default_factory=TortoiseArgs)
-    audio: TortoiseAudioConfig = TortoiseAudioConfig()
+    audio: TortoiseAudioConfig = field(default_factory=TortoiseAudioConfig)
    model_dir: str = None
    # settings
--- a/TTS/tts/layers/bark/hubert/kmeans_hubert.py
+++ b/TTS/tts/layers/bark/hubert/kmeans_hubert.py
@ -10,15 +10,11 @@ License: MIT
 import logging
 from pathlib import Path
 import fairseq
 import torch
 from einops import pack, unpack
 from torch import nn
 from torchaudio.functional import resample
-
+from transformers import HubertModel
 logging.root.setLevel(logging.ERROR)
 def round_down_nearest_multiple(num, divisor):
    """Round ``num`` down to a multiple of ``divisor`` (floor-divide, then multiply back)."""
    whole_multiples = num // divisor
    return whole_multiples * divisor
@ -49,22 +45,11 @@ class CustomHubert(nn.Module):
        self.target_sample_hz = target_sample_hz
        self.seq_len_multiple_of = seq_len_multiple_of
        self.output_layer = output_layer
        if device is not None:
            self.to(device)
-
+        self.model = HubertModel.from_pretrained("facebook/hubert-base-ls960")
        model_path = Path(checkpoint_path)
        assert model_path.exists(), f"path {checkpoint_path} does not exist"
        checkpoint = torch.load(checkpoint_path)
        load_model_input = {checkpoint_path: checkpoint}
        model, *_ = fairseq.checkpoint_utils.load_model_ensemble_and_task(load_model_input)
        if device is not None:
-            model[0].to(device)
+            self.model.to(device)
        self.model = model[0]
        self.model.eval()
    @property
@ -81,19 +66,13 @@ class CustomHubert(nn.Module):
        if exists(self.seq_len_multiple_of):
            wav_input = curtail_to_multiple(wav_input, self.seq_len_multiple_of)
-        embed = self.model(
+        outputs = self.model.forward(
            wav_input,
-            features_only=True,
+            output_hidden_states=True,
            mask=False,  # thanks to @maitycyrus for noticing that mask is defaulted to True in the fairseq code
            output_layer=self.output_layer,
        )
-
+        embed = outputs["hidden_states"][self.output_layer]
-        embed, packed_shape = pack([embed["x"]], "* d")
+        embed, packed_shape = pack([embed], "* d")
-
+        codebook_indices = torch.from_numpy(embed.cpu().detach().numpy()).to(device)
        # codebook_indices = self.kmeans.predict(embed.cpu().detach().numpy())
        codebook_indices = torch.from_numpy(embed.cpu().detach().numpy()).to(device)  # .long()
        if flatten:
            return codebook_indices
--- a/TTS/tts/layers/bark/inference_funcs.py
+++ b/TTS/tts/layers/bark/inference_funcs.py
@ -130,7 +130,7 @@ def generate_voice(
    # generate semantic tokens
    # Load the HuBERT model
    hubert_manager = HubertManager()
-    hubert_manager.make_sure_hubert_installed(model_path=model.config.LOCAL_MODEL_PATHS["hubert"])
+    # hubert_manager.make_sure_hubert_installed(model_path=model.config.LOCAL_MODEL_PATHS["hubert"])
    hubert_manager.make_sure_tokenizer_installed(model_path=model.config.LOCAL_MODEL_PATHS["hubert_tokenizer"])
    hubert_model = CustomHubert(checkpoint_path=model.config.LOCAL_MODEL_PATHS["hubert"]).to(model.device)
--- a/TTS/tts/layers/losses.py
+++ b/TTS/tts/layers/losses.py
@ -165,7 +165,7 @@ class BCELossMasked(nn.Module):
    def __init__(self, pos_weight: float = None):
        super().__init__()
-        self.pos_weight = nn.Parameter(torch.tensor([pos_weight]), requires_grad=False)
+        self.register_buffer("pos_weight", torch.tensor([pos_weight]))
    def forward(self, x, target, length):
        """
@ -191,10 +191,15 @@ class BCELossMasked(nn.Module):
            mask = sequence_mask(sequence_length=length, max_len=target.size(1))
            num_items = mask.sum()
            loss = functional.binary_cross_entropy_with_logits(
-                x.masked_select(mask), target.masked_select(mask), pos_weight=self.pos_weight, reduction="sum"
+                x.masked_select(mask),
                target.masked_select(mask),
                pos_weight=self.pos_weight.to(x.device),
                reduction="sum",
            )
        else:
-            loss = functional.binary_cross_entropy_with_logits(x, target, pos_weight=self.pos_weight, reduction="sum")
+            loss = functional.binary_cross_entropy_with_logits(
                x, target, pos_weight=self.pos_weight.to(x.device), reduction="sum"
            )
            num_items = torch.numel(x)
        loss = loss / num_items
        return loss
--- a/TTS/tts/utils/helpers.py
+++ b/TTS/tts/utils/helpers.py
@ -207,7 +207,7 @@ def maximum_path_numpy(value, mask, max_neg_val=None):
    device = value.device
    dtype = value.dtype
    value = value.cpu().detach().numpy()
-    mask = mask.cpu().detach().numpy().astype(np.bool)
+    mask = mask.cpu().detach().numpy().astype(bool)
    b, t_x, t_y = value.shape
    direction = np.zeros(value.shape, dtype=np.int64)
--- a/TTS/utils/audio/processor.py
+++ b/TTS/utils/audio/processor.py
@ -540,7 +540,10 @@ class AudioProcessor(object):
    def _griffin_lim(self, S):
        angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
-        S_complex = np.abs(S).astype(np.complex)
+        try:
            S_complex = np.abs(S).astype(np.complex)
        except AttributeError:  # np.complex is deprecated since numpy 1.20.0
            S_complex = np.abs(S).astype(complex)
        y = self._istft(S_complex * angles)
        if not np.isfinite(y).all():
            print(" [!] Waveform is not finite everywhere. Skipping the GL.")
--- a/TTS/utils/manage.py
+++ b/TTS/utils/manage.py
@ -264,14 +264,17 @@ class ModelManager(object):
        model_download_uri = os.path.join(URI_PREFIX, f"{lang}.tar.gz")
        self._download_tar_file(model_download_uri, output_path, self.progress_bar)
-    def set_model_url(self, model_item: Dict):
+    @staticmethod
    def set_model_url(model_item: Dict):
        """Pick the download URL for a model entry and store it under ``"model_url"``.

        The URL is chosen by priority:

        1. ``model_item["github_rls_url"]`` when that key is present,
        2. ``model_item["hf_url"]`` when that key is present,
        3. the fairseq gateway URL when ``"fairseq"`` occurs in the entry's
           ``"model_name"``,
        4. ``None`` otherwise.

        Args:
            model_item (Dict): Model entry. It is modified in place.

        Returns:
            Dict: The same ``model_item`` object, now carrying a ``"model_url"`` key.
        """
        model_item["model_url"] = None
        if "github_rls_url" in model_item:
            model_item["model_url"] = model_item["github_rls_url"]
        elif "hf_url" in model_item:
            model_item["model_url"] = model_item["hf_url"]
        # ``.get`` instead of indexing: an entry that has neither URL key nor a
        # "model_name" must end up with ``model_url = None`` rather than a KeyError.
        elif "fairseq" in model_item.get("model_name", ""):
            model_item["model_url"] = "https://coqui.gateway.scarf.sh/fairseq/"
        return model_item
-    
+
    def _set_model_item(self, model_name):
        # fetch model info from the dict
        model_type, lang, dataset, model = model_name.split("/")
@ -285,10 +288,12 @@ class ModelManager(object):
                "author": "fairseq",
                "description": "this model is released by Meta under Fairseq repo. Visit https://github.com/facebookresearch/fairseq/tree/main/examples/mms for more info.",
            }
            model_item["model_name"] = model_name
        else:
            # get model from models.json
            model_item = self.models_dict[model_type][lang][dataset][model]
            model_item["model_type"] = model_type
        model_item = self.set_model_url(model_item)
        return model_item, model_full_name, model
    def download_model(self, model_name):
@ -324,7 +329,9 @@ class ModelManager(object):
        # find downloaded files
        output_model_path = output_path
        output_config_path = None
-        if model not in ["tortoise-v2", "bark"] and "fairseq" not in model_name:  # TODO:This is stupid but don't care for now.
+        if (
            model not in ["tortoise-v2", "bark"] and "fairseq" not in model_name
        ):  # TODO:This is stupid but don't care for now.
            output_model_path, output_config_path = self._find_files(output_path)
        # update paths in the config.json
        self._update_paths(output_path, output_config_path)
--- a/TTS/vc/models/freevc.py
+++ b/TTS/vc/models/freevc.py
@ -794,8 +794,8 @@ class FreeVCConfig(BaseVCConfig):
    model: str = "freevc"
    # model specific params
-    model_args: FreeVCArgs = FreeVCArgs()
+    model_args: FreeVCArgs = field(default_factory=FreeVCArgs)
-    audio: FreeVCAudioConfig = FreeVCAudioConfig()
+    audio: FreeVCAudioConfig = field(default_factory=FreeVCAudioConfig)
    # optimizer
    # TODO with training support
--- a/pyproject.toml
+++ b/pyproject.toml
@ -1,5 +1,5 @@
 [build-system]
-requires = ["setuptools", "wheel", "cython==0.29.28", "numpy==1.21.6", "packaging"]
+requires = ["setuptools", "wheel", "cython==0.29.30", "numpy==1.22.0", "packaging"]
 [flake8]
 max-line-length=120
--- a/requirements.txt
+++ b/requirements.txt
@ -1,14 +1,14 @@
 # core deps
-numpy==1.21.6;python_version<"3.10"
+numpy==1.22.0;python_version<="3.10"
-numpy;python_version=="3.10"
+numpy==1.24.3;python_version>"3.10"
-cython==0.29.28
+cython==0.29.30
 scipy>=1.4.0
 torch>=1.7
 torchaudio
 soundfile
 librosa==0.10.0.*
 numba==0.55.1;python_version<"3.9"
-numba==0.56.4;python_version>="3.9"
+numba==0.57.0;python_version>="3.9"
 inflect==5.6.0
 tqdm
 anyascii
@ -26,14 +26,14 @@ pandas
 # deps for training
 matplotlib
 # coqui stack
-trainer==0.0.20
+trainer
 # config management
 coqpit>=0.0.16
 # chinese g2p deps
 jieba
 pypinyin
 # japanese g2p deps
-mecab-python3==1.0.5
+mecab-python3==1.0.6
 unidic-lite==1.0.8
 # gruut+supported langs
 gruut[de,es,fr]==2.2.3
@ -51,5 +51,3 @@ einops
 transformers
 #deps for bark
 encodec
 #deps for fairseq models
 fairseq
--- a/setup.cfg
+++ b/setup.cfg
@ -1,8 +1,8 @@
 [build_py]
-build-lib=temp_build
+build_lib=temp_build
 [bdist_wheel]
-bdist-dir=temp_build
+bdist_dir=temp_build
 [install_lib]
-build-dir=temp_build
+build_dir=temp_build
--- a/setup.py
+++ b/setup.py
@ -32,8 +32,8 @@ from Cython.Build import cythonize
 from setuptools import Extension, find_packages, setup
 python_version = sys.version.split()[0]
-if Version(python_version) < Version("3.7") or Version(python_version) >= Version("3.11"):
+if Version(python_version) < Version("3.9") or Version(python_version) >= Version("3.12"):
-    raise RuntimeError("TTS requires python >= 3.7 and < 3.11 " "but your Python version is {}".format(sys.version))
+    raise RuntimeError("TTS requires python >= 3.9 and < 3.12 " "but your Python version is {}".format(sys.version))
 cwd = os.path.dirname(os.path.abspath(__file__))
@ -114,15 +114,14 @@ setup(
        "dev": requirements_dev,
        "notebooks": requirements_notebooks,
    },
-    python_requires=">=3.7.0, <3.11",
+    python_requires=">=3.9.0, <3.12",
    entry_points={"console_scripts": ["tts=TTS.bin.synthesize:main", "tts-server = TTS.server.server:main"]},
    classifiers=[
        "Programming Language :: Python",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.7",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
        "Development Status :: 3 - Alpha",
        "Intended Audience :: Science/Research",
        "Intended Audience :: Developers",
--- a/tests/text_tests/test_tokenizer.py
+++ b/tests/text_tests/test_tokenizer.py
@ -1,5 +1,5 @@
 import unittest
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from coqpit import Coqpit
@ -86,11 +86,11 @@ class TestTTSTokenizer(unittest.TestCase):
            enable_eos_bos_chars: bool = True
            use_phonemes: bool = True
            add_blank: bool = False
-            characters: str = Characters()
+            characters: str = field(default_factory=Characters)
            phonemizer: str = "espeak"
            phoneme_language: str = "tr"
            text_cleaner: str = "phoneme_cleaners"
-            characters = Characters()
+            characters = field(default_factory=Characters)
        tokenizer_ph, _ = TTSTokenizer.init_from_config(TokenizerConfig())
        tokenizer_ph.phonemizer.backend = "espeak"
--- a/tests/tts_tests/test_tacotron_model.py
+++ b/tests/tts_tests/test_tacotron_model.py
@ -16,7 +16,7 @@ from TTS.utils.audio import AudioProcessor
 torch.manual_seed(1)
 use_cuda = torch.cuda.is_available()
-device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+device = torch.device("cuda" if use_cuda else "cpu")
 config_global = TacotronConfig(num_chars=32, num_speakers=5, out_channels=513, decoder_output_dim=80)
@ -288,7 +288,6 @@ class TacotronCapacitronTrainTest(unittest.TestCase):
            batch["text_input"].shape[0], batch["stop_targets"].size(1) // config.r, -1
        )
        batch["stop_targets"] = (batch["stop_targets"].sum(2) > 0.0).unsqueeze(2).float().squeeze()
        model = Tacotron(config).to(device)
        criterion = model.get_criterion()
        optimizer = model.get_optimizer()
--- a/tests/zoo_tests/test_models.py
+++ b/tests/zoo_tests/test_models.py
@ -15,7 +15,7 @@ def run_models(offset=0, step=1):
    print(" > Run synthesizer with all the models.")
    output_path = os.path.join(get_tests_output_path(), "output.wav")
    manager = ModelManager(output_prefix=get_tests_output_path(), progress_bar=False)
-    model_names = manager.list_models()
+    model_names = [name for name in manager.list_models() if "bark" not in name]
    for model_name in model_names[offset::step]:
        print(f"\n > Run - {model_name}")
        model_path, _, _ = manager.download_model(model_name)
@ -79,6 +79,15 @@ def test_models_offset_2_step_3():
    run_models(offset=2, step=3)
 def test_bark():
    """Synthesize one sentence with the Bark model through the ``tts`` command line.

    Bark is too big to run on GitHub Actions, so this test is meant to be run locally.
    """
    output_path = os.path.join(get_tests_output_path(), "output.wav")
    bark_command = (
        f" tts --model_name  tts_models/multilingual/multi-dataset/bark "
        f'--text "This is an example." --out_path "{output_path}" --progress_bar False'
    )
    run_cli(bark_command)
 def test_voice_conversion():
    print(" > Run voice conversion inference using YourTTS model.")
    model_name = "tts_models/multilingual/multi-dataset/your_tts"