mirror of https://github.com/coqui-ai/TTS.git

commit 37b708dac7 (parent 2364c38d16)

    Add bark model
@@ -9,6 +9,19 @@
                 "commit": "e9a1953e",
                 "license": "CC BY-NC-ND 4.0",
                 "contact": "egolge@coqui.ai"
+            },
+            "bark": {
+                "description": "🐶 Bark TTS model released by suno-ai. You can find the original implementation in https://github.com/suno-ai/bark.",
+                "hf_url": [
+                    "https://coqui.gateway.scarf.sh/bark/coarse_2.pt",
+                    "https://coqui.gateway.scarf.sh/bark/fine_2.pt",
+                    "https://coqui.gateway.scarf.sh/bark/text_2.pt",
+                    "https://coqui.gateway.scarf.sh/bark/config.json"
+                ],
+                "default_vocoder": null,
+                "commit": "e9a1953e",
+                "license": "MIT",
+                "contact": "https://www.suno.ai/"
             }
         }
     },
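Note: this hunk registers Bark in the model registry JSON. The surrounding keys suggest the entry lands under the multilingual multi-dataset section, so the dotted model name would be "tts_models/multilingual/multi-dataset/bark"; that path is an inference, since the file name and full nesting are not visible here. A minimal sketch of how such an entry is consumed — the (model_path, config_path, model_item) return triple is confirmed by the api.py hunk further down, the registry path is assumed:

from TTS.api import TTS
from TTS.utils.manage import ModelManager

# Assumed registry path; mirrors the JSON nesting around the new "bark" key.
manager = ModelManager(models_file=TTS.get_models_file_path(), progress_bar=True)
model_path, config_path, model_item = manager.download_model(
    "tts_models/multilingual/multi-dataset/bark"
)
print(model_item["hf_url"])  # the four coqui.gateway.scarf.sh files listed above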
@@ -220,12 +233,12 @@
                 "license": "apache 2.0",
                 "contact": "adamfroghyar@gmail.com"
             }
         },
         "multi-dataset": {
             "tortoise-v2": {
                 "description": "Tortoise tts model https://github.com/neonbjb/tortoise-tts",
-                "github_rls_url": ["https://coqui.gateway.scarf.sh/v0.14.1_models/autoregressive.pth",
+                "github_rls_url": [
+                    "https://coqui.gateway.scarf.sh/v0.14.1_models/autoregressive.pth",
                 "https://coqui.gateway.scarf.sh/v0.14.1_models/clvp2.pth",
                 "https://coqui.gateway.scarf.sh/v0.14.1_models/cvvp.pth",
                 "https://coqui.gateway.scarf.sh/v0.14.1_models/diffusion_decoder.pth",
@@ -342,7 +342,7 @@ class TTS:

     def download_model_by_name(self, model_name: str):
         model_path, config_path, model_item = self.manager.download_model(model_name)
-        if isinstance(model_item["github_rls_url"], list):
+        if isinstance(model_item["model_url"], list):
             # return model directory if there are multiple files
             # we assume that the model knows how to load itself
             return None, None, None, None, model_path
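The lookup key changes from "github_rls_url" to "model_url", presumably because Bark is registered with "hf_url" rather than "github_rls_url" and the manager folds both into one key. That normalization is not part of this hunk; a hypothetical sketch of what it would have to do:

# Hypothetical helper (not in this diff): expose whichever URL key a registry
# item defines under a single "model_url" key, so download_model_by_name
# handles GitHub-hosted and HF-hosted models uniformly.
def normalize_model_item(model_item: dict) -> dict:
    for key in ("github_rls_url", "hf_url"):
        if key in model_item:
            model_item.setdefault("model_url", model_item[key])
    return model_item

item = normalize_model_item({"hf_url": ["https://coqui.gateway.scarf.sh/bark/text_2.pt"]})
assert isinstance(item["model_url"], list)  # list => multi-file => return the directory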
@@ -580,6 +580,8 @@ class TTS:
                 Speed factor to use for 🐸Coqui Studio models, between 0.0 and 2.0. Defaults to None.
             file_path (str, optional):
                 Output file path. Defaults to "output.wav".
+            kwargs (dict, optional):
+                Additional arguments for the model.
         """
         self._check_arguments(speaker=speaker, language=language, speaker_wav=speaker_wav, **kwargs)
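The added docstring lines document the **kwargs pass-through visible in the _check_arguments call. A hypothetical call shape; the model name is the assumed registry path from above, and "voice_dir" is an illustrative model-specific keyword, not one this diff defines:

from TTS.api import TTS

tts = TTS(model_name="tts_models/multilingual/multi-dataset/bark")  # assumed name
tts.tts_to_file(
    text="Hello from Bark.",
    file_path="output.wav",    # the documented default
    voice_dir="bark_voices/",  # illustrative extra kwarg, forwarded to the model
)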
@@ -5,11 +5,14 @@ from typing import Dict
 from TTS.tts.configs.shared_configs import BaseTTSConfig
 from TTS.tts.layers.bark.model import GPTConfig
 from TTS.tts.layers.bark.model_fine import FineGPTConfig
+from TTS.tts.models.bark import BarkAudioConfig
 from TTS.utils.generic_utils import get_user_data_dir


 @dataclass
 class BarkConfig(BaseTTSConfig):
+    model: str = "bark"
+    audio: BarkAudioConfig = BarkAudioConfig()
     num_chars: int = 0
     semantic_config: GPTConfig = GPTConfig()
     fine_config: FineGPTConfig = FineGPTConfig()
@@ -31,7 +34,7 @@ class BarkConfig(BaseTTSConfig):
     COARSE_SEMANTIC_PAD_TOKEN: int = 12_048
     COARSE_INFER_TOKEN: int = 12_050

-    REMOTE_BASE_URL = "https://dl.suno-models.io/bark/models/v0/"
+    REMOTE_BASE_URL = "https://huggingface.co/erogol/bark/tree/main/"
     REMOTE_MODEL_PATHS: Dict = None
     LOCAL_MODEL_PATHS: Dict = None
     SMALL_REMOTE_MODEL_PATHS: Dict = None
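BarkConfig now names its model, carries a BarkAudioConfig, and points the remote weights at the erogol/bark Hugging Face repo instead of dl.suno-models.io. Since BaseTTSConfig is Coqpit-based, the new defaults round-trip through JSON like any other 🐸TTS config; a small sketch assuming only what this hunk shows:

from TTS.tts.configs.bark_config import BarkConfig

config = BarkConfig()
assert config.model == "bark"
config.save_json("bark_config.json")  # save_json/load_json come from Coqpit
config.load_json("bark_config.json")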
@@ -52,7 +52,7 @@ def load_voice(voice: str, extra_voice_dirs: List[str] = []):
         return semantic, coarse, fine

     if voice == "random":
-        return None, None
+        return None, None, None

     voices = get_voices(extra_voice_dirs)
     try:
@@ -183,7 +183,7 @@ def generate_text_semantic(
     assert isinstance(text, str)
     text = _normalize_whitespace(text)
     assert len(text.strip()) > 0
-    if history_prompt is not None or base is not None:
+    if all(v is not None for v in history_prompt) or base is not None:
         if history_prompt is not None:
             semantic_history = history_prompt[0]
         if base is not None:
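This guard change (repeated in generate_coarse and generate_fine below) follows from the load_voice fix above: a random voice now yields a 3-tuple of Nones, and a tuple object is never `None`, so the old test would wrongly take the history branch. A short demonstration:

history_prompt = (None, None, None)  # what load_voice("random") now returns
old_check = history_prompt is not None                  # True: the tuple exists
new_check = all(v is not None for v in history_prompt)  # False: no usable history
assert old_check and not new_check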
@@ -327,7 +327,7 @@ def generate_coarse(
         model.config.COARSE_RATE_HZ / model.config.SEMANTIC_RATE_HZ * model.config.N_COARSE_CODEBOOKS
     )
     max_semantic_history = int(np.floor(max_coarse_history / semantic_to_coarse_ratio))
-    if history_prompt is not None or base is not None:
+    if all(v is not None for v in history_prompt) or base is not None:
         if history_prompt is not None:
             x_history = history_prompt
             x_semantic_history = x_history[0]
@@ -477,7 +477,7 @@ def generate_fine(
         and x_coarse_gen.min() >= 0
         and x_coarse_gen.max() <= model.config.CODEBOOK_SIZE - 1
     )
-    if history_prompt is not None or base is not None:
+    if all(v is not None for v in history_prompt) or base is not None:
         if history_prompt is not None:
             x_fine_history = history_prompt[2]
         if base is not None:
@@ -572,4 +572,4 @@ def codec_decode(fine_tokens, model):
     emb = model.encodec.quantizer.decode(arr)
     out = model.encodec.decoder(emb)
     audio_arr = out.detach().cpu().numpy().squeeze()
-    save_wav(path="test.wav", wav=audio_arr, sample_rate=model.config.sample_rate)
+    return audio_arr
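codec_decode stops writing a hard-coded test.wav on every call and simply returns the array; saving becomes the caller's job. A call-site sketch — the save_wav import path is an assumption, but its keyword signature matches the line this diff removes:

from TTS.utils.audio.numpy_transforms import save_wav  # assumed location

audio_arr = codec_decode(fine_tokens, model)  # fine_tokens/model from the steps above
save_wav(wav=audio_arr, path="bark_sample.wav", sample_rate=model.config.sample_rate)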
@@ -4,6 +4,7 @@ Much of this code is adapted from Andrej Karpathy's NanoGPT
 """
 import math
 from dataclasses import dataclass
+from coqpit import Coqpit

 import torch
 import torch.nn as nn
@@ -131,7 +132,7 @@ class Block(nn.Module):


 @dataclass
-class GPTConfig:
+class GPTConfig(Coqpit):
     block_size: int = 1024
     input_vocab_size: int = 10_048
     output_vocab_size: int = 10_048
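Inheriting from Coqpit is what lets BarkConfig embed GPTConfig and FineGPTConfig as serializable fields. A standalone sketch mirroring the fields above; Coqpit layers dict/JSON round-tripping on top of a plain dataclass:

from dataclasses import dataclass
from coqpit import Coqpit

@dataclass
class GPTConfig(Coqpit):  # as in this diff
    block_size: int = 1024
    input_vocab_size: int = 10_048
    output_vocab_size: int = 10_048

print(GPTConfig().to_dict())  # nested configs serialize with their parent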