mirror of https://github.com/coqui-ai/TTS.git

Add bark model

parent 2364c38d16
commit 37b708dac7
@@ -9,6 +9,19 @@
             "commit": "e9a1953e",
             "license": "CC BY-NC-ND 4.0",
             "contact": "egolge@coqui.ai"
         },
+        "bark": {
+            "description": "🐶 Bark TTS model released by suno-ai. You can find the original implementation in https://github.com/suno-ai/bark.",
+            "hf_url": [
+                "https://coqui.gateway.scarf.sh/bark/coarse_2.pt",
+                "https://coqui.gateway.scarf.sh/bark/fine_2.pt",
+                "https://coqui.gateway.scarf.sh/bark/text_2.pt",
+                "https://coqui.gateway.scarf.sh/bark/config.json"
+            ],
+            "default_vocoder": null,
+            "commit": "e9a1953e",
+            "license": "MIT",
+            "contact": "https://www.suno.ai/"
+        }
     }
 },
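For context, this is roughly how a registered entry like the one above is consumed through the Python API. The dotted model name below is an assumption based on where the entry sits in .models.json, not something this diff states:

    from TTS.api import TTS

    # Hypothetical usage sketch: the model-name path assumes the new bark
    # entry lives under tts_models/multilingual/multi-dataset.
    tts = TTS(model_name="tts_models/multilingual/multi-dataset/bark")
    tts.tts_to_file(text="Hello from Bark!", file_path="bark_out.wav")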
@@ -220,12 +233,12 @@
             "license": "apache 2.0",
             "contact": "adamfroghyar@gmail.com"
         }
 
     },
     "multi-dataset": {
         "tortoise-v2": {
             "description": "Tortoise tts model https://github.com/neonbjb/tortoise-tts",
-            "github_rls_url": ["https://coqui.gateway.scarf.sh/v0.14.1_models/autoregressive.pth",
+            "github_rls_url": [
+                "https://coqui.gateway.scarf.sh/v0.14.1_models/autoregressive.pth",
                 "https://coqui.gateway.scarf.sh/v0.14.1_models/clvp2.pth",
                 "https://coqui.gateway.scarf.sh/v0.14.1_models/cvvp.pth",
                 "https://coqui.gateway.scarf.sh/v0.14.1_models/diffusion_decoder.pth",
@@ -342,7 +342,7 @@ class TTS:
 
     def download_model_by_name(self, model_name: str):
         model_path, config_path, model_item = self.manager.download_model(model_name)
-        if isinstance(model_item["github_rls_url"], list):
+        if isinstance(model_item["model_url"], list):
             # return model directory if there are multiple files
             # we assume that the model knows how to load itself
             return None, None, None, None, model_path
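The key lookup changes from "github_rls_url" to "model_url", which suggests the manager now exposes one normalized URL field regardless of whether an entry was defined with hf_url or github_rls_url. A hedged sketch of that kind of normalization (the helper name is hypothetical, not part of this diff):

    def resolve_model_url(model_item: dict):
        # Hypothetical helper: return the first download-URL field the
        # .models.json entry defines; "model_url" is assumed to be the
        # normalized key set by the model manager.
        for key in ("model_url", "hf_url", "github_rls_url"):
            if key in model_item:
                return model_item[key]
        raise KeyError("no download URL in model entry")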
@@ -580,6 +580,8 @@ class TTS:
                 Speed factor to use for 🐸Coqui Studio models, between 0.0 and 2.0. Defaults to None.
             file_path (str, optional):
                 Output file path. Defaults to "output.wav".
+            kwargs (dict, optional):
+                Additional arguments for the model.
         """
         self._check_arguments(speaker=speaker, language=language, speaker_wav=speaker_wav, **kwargs)
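With kwargs documented here and forwarded through _check_arguments, model-specific options can ride along with the standard arguments. An illustrative, undocumented call (the "temperature" name is a stand-in, not a confirmed bark parameter):

    # Hypothetical usage: any extra keyword is passed through **kwargs
    # to the underlying model.
    tts.tts_to_file(
        text="Hello!",
        file_path="output.wav",
        temperature=0.7,  # assumed model-specific sampling knob
    )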
@@ -5,11 +5,14 @@ from typing import Dict
 from TTS.tts.configs.shared_configs import BaseTTSConfig
+from TTS.tts.layers.bark.model import GPTConfig
+from TTS.tts.layers.bark.model_fine import FineGPTConfig
+from TTS.tts.models.bark import BarkAudioConfig
 from TTS.utils.generic_utils import get_user_data_dir
 
 
 @dataclass
 class BarkConfig(BaseTTSConfig):
     model: str = "bark"
     audio: BarkAudioConfig = BarkAudioConfig()
     num_chars: int = 0
     semantic_config: GPTConfig = GPTConfig()
     fine_config: FineGPTConfig = FineGPTConfig()
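A minimal sketch of building the config defined above and reading its nested sub-configs; field names and defaults are taken from this diff, and the module path is assumed from the hunk's context:

    from TTS.tts.configs.bark_config import BarkConfig

    config = BarkConfig()
    print(config.model)                       # "bark"
    print(config.semantic_config.block_size)  # 1024, per GPTConfig below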
@@ -31,7 +34,7 @@ class BarkConfig(BaseTTSConfig):
     COARSE_SEMANTIC_PAD_TOKEN: int = 12_048
     COARSE_INFER_TOKEN: int = 12_050
 
-    REMOTE_BASE_URL = "https://dl.suno-models.io/bark/models/v0/"
+    REMOTE_BASE_URL = "https://huggingface.co/erogol/bark/tree/main/"
     REMOTE_MODEL_PATHS: Dict = None
     LOCAL_MODEL_PATHS: Dict = None
     SMALL_REMOTE_MODEL_PATHS: Dict = None
@@ -52,7 +52,7 @@ def load_voice(voice: str, extra_voice_dirs: List[str] = []):
         return semantic, coarse, fine
 
     if voice == "random":
-        return None, None
+        return None, None, None
 
     voices = get_voices(extra_voice_dirs)
     try:
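load_voice now consistently returns a (semantic, coarse, fine) 3-tuple, with all three elements None for the "random" voice; the guards changed in the generate_* hunks below test exactly that shape. A small sketch:

    # Sketch only: unpack the 3-tuple and apply the same presence test
    # the generation functions below now use.
    semantic, coarse, fine = load_voice("random")
    history_prompt = (semantic, coarse, fine)
    if all(v is not None for v in history_prompt):
        print("conditioning on a stored voice")
    else:
        print("random voice: no history prompt")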
@@ -183,7 +183,7 @@ def generate_text_semantic(
     assert isinstance(text, str)
     text = _normalize_whitespace(text)
     assert len(text.strip()) > 0
-    if history_prompt is not None or base is not None:
+    if all(v is not None for v in history_prompt) or base is not None:
         if history_prompt is not None:
             semantic_history = history_prompt[0]
         if base is not None:
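The same tuple-aware guard recurs in generate_coarse and generate_fine below: since history_prompt is now a tuple, a bare None-check is no longer enough, and every element must be present before history_prompt[0] (or [1], [2]) is indexed. In brief:

    history_prompt = (None, None, None)  # e.g. the "random" voice
    assert not all(v is not None for v in history_prompt)
    # so the branch that reads history_prompt[0] is skipped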
@@ -327,7 +327,7 @@ def generate_coarse(
         model.config.COARSE_RATE_HZ / model.config.SEMANTIC_RATE_HZ * model.config.N_COARSE_CODEBOOKS
     )
     max_semantic_history = int(np.floor(max_coarse_history / semantic_to_coarse_ratio))
-    if history_prompt is not None or base is not None:
+    if all(v is not None for v in history_prompt) or base is not None:
         if history_prompt is not None:
             x_history = history_prompt
             x_semantic_history = x_history[0]
@@ -477,7 +477,7 @@ def generate_fine(
         and x_coarse_gen.min() >= 0
         and x_coarse_gen.max() <= model.config.CODEBOOK_SIZE - 1
     )
-    if history_prompt is not None or base is not None:
+    if all(v is not None for v in history_prompt) or base is not None:
         if history_prompt is not None:
             x_fine_history = history_prompt[2]
         if base is not None:
@@ -572,4 +572,4 @@ def codec_decode(fine_tokens, model):
     emb = model.encodec.quantizer.decode(arr)
     out = model.encodec.decoder(emb)
     audio_arr = out.detach().cpu().numpy().squeeze()
+    save_wav(path="test.wav", wav=audio_arr, sample_rate=model.config.sample_rate)
     return audio_arr
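The added save_wav call writes the decoded audio to a fixed test.wav as a side effect before returning the array; callers only need the return value. A hedged usage sketch (variable names assumed, not shown in this diff):

    # fine_tokens: EnCodec codebook indices produced by generate_fine
    # above; model bundles the bark modules plus the EnCodec codec.
    audio_arr = codec_decode(fine_tokens, model)
    # audio_arr is a float numpy waveform at model.config.sample_rate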
@@ -4,6 +4,7 @@ Much of this code is adapted from Andrej Karpathy's NanoGPT
 """
 import math
 from dataclasses import dataclass
+from coqpit import Coqpit
 
 import torch
 import torch.nn as nn
@@ -131,7 +132,7 @@ class Block(nn.Module):
 
 
 @dataclass
-class GPTConfig:
+class GPTConfig(Coqpit):
     block_size: int = 1024
     input_vocab_size: int = 10_048
     output_vocab_size: int = 10_048
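Subclassing Coqpit lines GPTConfig up with the rest of the 🐸TTS configs, adding dict round-tripping and override handling. A minimal sketch, assuming Coqpit's standard to_dict/from_dict API (the DemoConfig class is illustrative):

    from dataclasses import dataclass
    from coqpit import Coqpit

    @dataclass
    class DemoConfig(Coqpit):
        block_size: int = 1024

    cfg = DemoConfig()
    data = cfg.to_dict()        # serialize to a plain dict
    restored = DemoConfig()
    restored.from_dict(data)    # rebuild the config from that dict
    assert restored.block_size == 1024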