Add bark model

Eren Gölge 2023-06-19 14:16:06 +02:00
parent 2364c38d16
commit 37b708dac7
5 changed files with 100 additions and 81 deletions

View File

@@ -9,6 +9,19 @@
"commit": "e9a1953e",
"license": "CC BY-NC-ND 4.0",
"contact": "egolge@coqui.ai"
},
"bark": {
"description": "🐶 Bark TTS model released by suno-ai. You can find the original implementation in https://github.com/suno-ai/bark.",
"hf_url": [
"https://coqui.gateway.scarf.sh/bark/coarse_2.pt",
"https://coqui.gateway.scarf.sh/bark/fine_2.pt",
"https://coqui.gateway.scarf.sh/bark/text_2.pt",
"https://coqui.gateway.scarf.sh/bark/config.json"
],
"default_vocoder": null,
"commit": "e9a1953e",
"license": "MIT",
"contact": "https://www.suno.ai/"
}
}
},
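With this entry in .models.json, Bark becomes downloadable through the model manager like any other 🐸TTS model. A minimal usage sketch, assuming the manager registers it under the usual naming scheme (the exact name tts_models/multilingual/multi-dataset/bark is an assumption, not shown in this diff):

from TTS.api import TTS

# Assumed model name; check TTS.list_models() for the actual entry.
tts = TTS(model_name="tts_models/multilingual/multi-dataset/bark")
tts.tts_to_file(text="Hello from Bark!", file_path="bark.wav")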
@@ -220,12 +233,12 @@
"license": "apache 2.0",
"contact": "adamfroghyar@gmail.com"
}
},
"multi-dataset": {
"tortoise-v2": {
"description": "Tortoise tts model https://github.com/neonbjb/tortoise-tts",
"github_rls_url": ["https://coqui.gateway.scarf.sh/v0.14.1_models/autoregressive.pth",
"github_rls_url": [
"https://coqui.gateway.scarf.sh/v0.14.1_models/autoregressive.pth",
"https://coqui.gateway.scarf.sh/v0.14.1_models/clvp2.pth",
"https://coqui.gateway.scarf.sh/v0.14.1_models/cvvp.pth",
"https://coqui.gateway.scarf.sh/v0.14.1_models/diffusion_decoder.pth",

View File

@@ -342,7 +342,7 @@ class TTS:
def download_model_by_name(self, model_name: str):
model_path, config_path, model_item = self.manager.download_model(model_name)
if isinstance(model_item["github_rls_url"], list):
if isinstance(model_item["model_url"], list):
# return model directory if there are multiple files
# we assume that the model knows how to load itself
return None, None, None, None, model_path
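Switching the check from "github_rls_url" to "model_url" presumably lets the manager handle HF-hosted multi-file models such as Bark the same way as GitHub-released ones like Tortoise. A sketch of the calling convention, with the tuple layout taken from the return statement above (the single-file layout and model name are assumptions):

# Multi-file models return only the directory; the model class is
# expected to load its own checkpoints from it.
model_path, config_path, vocoder_path, vocoder_config, model_dir = tts.download_model_by_name(
    "tts_models/multilingual/multi-dataset/bark"  # assumed name, as above
)
if model_dir is not None:
    print(f"Bark files live in {model_dir}")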
@@ -580,6 +580,8 @@ class TTS:
Speed factor to use for 🐸Coqui Studio models, between 0.0 and 2.0. Defaults to None.
file_path (str, optional):
Output file path. Defaults to "output.wav".
+kwargs (dict, optional):
+Additional arguments for the model.
"""
self._check_arguments(speaker=speaker, language=language, speaker_wav=speaker_wav, **kwargs)
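A hedged usage sketch of the forwarded kwargs; voice_dir and speaker here are illustrative Bark-style options, not arguments confirmed by this diff:

tts.tts_to_file(
    text="Hello world.",
    file_path="output.wav",
    # Hypothetical model-specific options passed through **kwargs:
    voice_dir="bark_voices/",
    speaker="ljspeech",
)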

View File

@@ -5,11 +5,14 @@ from typing import Dict
from TTS.tts.configs.shared_configs import BaseTTSConfig
from TTS.tts.layers.bark.model import GPTConfig
from TTS.tts.layers.bark.model_fine import FineGPTConfig
+from TTS.tts.models.bark import BarkAudioConfig
from TTS.utils.generic_utils import get_user_data_dir
@dataclass
class BarkConfig(BaseTTSConfig):
model: str = "bark"
+audio: BarkAudioConfig = BarkAudioConfig()
+num_chars: int = 0
semantic_config: GPTConfig = GPTConfig()
fine_config: FineGPTConfig = FineGPTConfig()
@@ -31,7 +34,7 @@ class BarkConfig(BaseTTSConfig):
COARSE_SEMANTIC_PAD_TOKEN: int = 12_048
COARSE_INFER_TOKEN: int = 12_050
REMOTE_BASE_URL = "https://dl.suno-models.io/bark/models/v0/"
REMOTE_BASE_URL = "https://huggingface.co/erogol/bark/tree/main/"
REMOTE_MODEL_PATHS: Dict = None
LOCAL_MODEL_PATHS: Dict = None
SMALL_REMOTE_MODEL_PATHS: Dict = None
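Because BarkConfig is a dataclass built on BaseTTSConfig, it can be constructed and inspected directly. A minimal sketch, assuming the file lives at TTS/tts/configs/bark_config.py:

from TTS.tts.configs.bark_config import BarkConfig

config = BarkConfig()
print(config.model)               # "bark"
print(config.COARSE_INFER_TOKEN)  # 12050
print(config.REMOTE_BASE_URL)     # the new Hugging Face base URL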

View File

@@ -52,7 +52,7 @@ def load_voice(voice: str, extra_voice_dirs: List[str] = []):
return semantic, coarse, fine
if voice == "random":
-return None, None
+return None, None, None
voices = get_voices(extra_voice_dirs)
try:
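The fix keeps the "random" branch consistent with the function's normal return value, which callers unpack into three prompt components. A sketch:

# load_voice() always yields (semantic, coarse, fine); for "random",
# all three are None instead of a malformed 2-tuple.
semantic, coarse, fine = load_voice("random")
assert semantic is None and coarse is None and fine is None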
@@ -183,7 +183,7 @@ def generate_text_semantic(
assert isinstance(text, str)
text = _normalize_whitespace(text)
assert len(text.strip()) > 0
-if history_prompt is not None or base is not None:
+if all(v is not None for v in history_prompt) or base is not None:
if history_prompt is not None:
semantic_history = history_prompt[0]
if base is not None:
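The guard changes because an "empty" history prompt is now the tuple (None, None, None) rather than None, so the old identity check was always true and the history branch misfired. Note the new check assumes history_prompt is always a tuple by the time it reaches these functions. A sketch of the difference:

history_prompt = (None, None, None)                # random voice, no history
print(history_prompt is not None)                  # True  -> old check misfires
print(all(v is not None for v in history_prompt))  # False -> new check skips history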
@@ -327,7 +327,7 @@ def generate_coarse(
model.config.COARSE_RATE_HZ / model.config.SEMANTIC_RATE_HZ * model.config.N_COARSE_CODEBOOKS
)
max_semantic_history = int(np.floor(max_coarse_history / semantic_to_coarse_ratio))
-if history_prompt is not None or base is not None:
+if all(v is not None for v in history_prompt) or base is not None:
if history_prompt is not None:
x_history = history_prompt
x_semantic_history = x_history[0]
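For reference, a worked example of the history budget computed above, using Bark's usual constants (assumed values, not shown in this diff): COARSE_RATE_HZ = 75, SEMANTIC_RATE_HZ = 49.9, N_COARSE_CODEBOOKS = 2, max_coarse_history = 630:

import numpy as np

semantic_to_coarse_ratio = 75 / 49.9 * 2  # ≈ 3.006 coarse tokens per semantic token
max_semantic_history = int(np.floor(630 / semantic_to_coarse_ratio))
print(max_semantic_history)  # 209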
@@ -477,7 +477,7 @@ def generate_fine(
and x_coarse_gen.min() >= 0
and x_coarse_gen.max() <= model.config.CODEBOOK_SIZE - 1
)
-if history_prompt is not None or base is not None:
+if all(v is not None for v in history_prompt) or base is not None:
if history_prompt is not None:
x_fine_history = history_prompt[2]
if base is not None:
@@ -572,4 +572,4 @@ def codec_decode(fine_tokens, model):
emb = model.encodec.quantizer.decode(arr)
out = model.encodec.decoder(emb)
audio_arr = out.detach().cpu().numpy().squeeze()
-save_wav(path="test.wav", wav=audio_arr, sample_rate=model.config.sample_rate)
+return audio_arr
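With the hard-coded save to test.wav removed, writing audio to disk is now the caller's job. A sketch reusing the exact call the deleted line made (the import location of save_wav is assumed to match this module's existing imports):

audio = codec_decode(fine_tokens, model)
# Same call the function used to make internally, now at the call site:
save_wav(path="bark_output.wav", wav=audio, sample_rate=model.config.sample_rate)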

View File

@@ -4,6 +4,7 @@ Much of this code is adapted from Andrej Karpathy's NanoGPT
"""
import math
from dataclasses import dataclass
+from coqpit import Coqpit
import torch
import torch.nn as nn
@@ -131,7 +132,7 @@ class Block(nn.Module):
@dataclass
-class GPTConfig:
+class GPTConfig(Coqpit):
block_size: int = 1024
input_vocab_size: int = 10_048
output_vocab_size: int = 10_048
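Subclassing Coqpit keeps GPTConfig a plain dataclass while adding the serialization helpers the other 🐸TTS configs rely on. A minimal sketch, assuming coqpit's standard to_dict/to_json API:

config = GPTConfig(block_size=2048)
print(config.to_dict()["block_size"])  # 2048
print(config.to_json()[:40])           # JSON serialization for free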