mirror of https://github.com/coqui-ai/TTS.git

commit 37b708dac7 (parent 2364c38d16)

    Add bark model
@@ -9,6 +9,19 @@
                 "commit": "e9a1953e",
                 "license": "CC BY-NC-ND 4.0",
                 "contact": "egolge@coqui.ai"
+            },
+            "bark": {
+                "description": "🐶 Bark TTS model released by suno-ai. You can find the original implementation in https://github.com/suno-ai/bark.",
+                "hf_url": [
+                    "https://coqui.gateway.scarf.sh/bark/coarse_2.pt",
+                    "https://coqui.gateway.scarf.sh/bark/fine_2.pt",
+                    "https://coqui.gateway.scarf.sh/bark/text_2.pt",
+                    "https://coqui.gateway.scarf.sh/bark/config.json"
+                ],
+                "default_vocoder": null,
+                "commit": "e9a1953e",
+                "license": "MIT",
+                "contact": "https://www.suno.ai/"
             }
         }
     },
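Note: this hunk registers Bark in the model registry JSON. The surrounding keys suggest the entry lands under the multilingual multi-dataset section, so the dotted model name would be "tts_models/multilingual/multi-dataset/bark"; that path is an inference, since the file name and full nesting are not visible here. A minimal sketch of how such an entry is consumed — the (model_path, config_path, model_item) return triple is confirmed by the api.py hunk further down, the registry path is assumed:

from TTS.api import TTS
from TTS.utils.manage import ModelManager

# Assumed registry path; mirrors the JSON nesting around the new "bark" key.
manager = ModelManager(models_file=TTS.get_models_file_path(), progress_bar=True)
model_path, config_path, model_item = manager.download_model(
    "tts_models/multilingual/multi-dataset/bark"
)
print(model_item["hf_url"])  # the four coqui.gateway.scarf.sh files listed above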
@@ -220,12 +233,12 @@
                 "license": "apache 2.0",
                 "contact": "adamfroghyar@gmail.com"
             }
         },
         "multi-dataset": {
             "tortoise-v2": {
                 "description": "Tortoise tts model https://github.com/neonbjb/tortoise-tts",
-                "github_rls_url": ["https://coqui.gateway.scarf.sh/v0.14.1_models/autoregressive.pth",
+                "github_rls_url": [
+                    "https://coqui.gateway.scarf.sh/v0.14.1_models/autoregressive.pth",
                 "https://coqui.gateway.scarf.sh/v0.14.1_models/clvp2.pth",
                 "https://coqui.gateway.scarf.sh/v0.14.1_models/cvvp.pth",
                 "https://coqui.gateway.scarf.sh/v0.14.1_models/diffusion_decoder.pth",
@@ -342,7 +342,7 @@ class TTS:

     def download_model_by_name(self, model_name: str):
         model_path, config_path, model_item = self.manager.download_model(model_name)
-        if isinstance(model_item["github_rls_url"], list):
+        if isinstance(model_item["model_url"], list):
             # return model directory if there are multiple files
             # we assume that the model knows how to load itself
             return None, None, None, None, model_path
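The lookup key changes from "github_rls_url" to "model_url", presumably because Bark is registered with "hf_url" rather than "github_rls_url" and the manager folds both into one key. That normalization is not part of this hunk; a hypothetical sketch of what it would have to do:

# Hypothetical helper (not in this diff): expose whichever URL key a registry
# item defines under a single "model_url" key, so download_model_by_name
# handles GitHub-hosted and HF-hosted models uniformly.
def normalize_model_item(model_item: dict) -> dict:
    for key in ("github_rls_url", "hf_url"):
        if key in model_item:
            model_item.setdefault("model_url", model_item[key])
    return model_item

item = normalize_model_item({"hf_url": ["https://coqui.gateway.scarf.sh/bark/text_2.pt"]})
assert isinstance(item["model_url"], list)  # list => multi-file => return the directory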
@@ -580,6 +580,8 @@ class TTS:
                 Speed factor to use for 🐸Coqui Studio models, between 0.0 and 2.0. Defaults to None.
             file_path (str, optional):
                 Output file path. Defaults to "output.wav".
+            kwargs (dict, optional):
+                Additional arguments for the model.
         """
         self._check_arguments(speaker=speaker, language=language, speaker_wav=speaker_wav, **kwargs)
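The added docstring lines document the **kwargs pass-through visible in the _check_arguments call. A hypothetical call shape; the model name is the assumed registry path from above, and "voice_dir" is an illustrative model-specific keyword, not one this diff defines:

from TTS.api import TTS

tts = TTS(model_name="tts_models/multilingual/multi-dataset/bark")  # assumed name
tts.tts_to_file(
    text="Hello from Bark.",
    file_path="output.wav",    # the documented default
    voice_dir="bark_voices/",  # illustrative extra kwarg, forwarded to the model
)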
@@ -5,11 +5,14 @@ from typing import Dict
 from TTS.tts.configs.shared_configs import BaseTTSConfig
 from TTS.tts.layers.bark.model import GPTConfig
 from TTS.tts.layers.bark.model_fine import FineGPTConfig
+from TTS.tts.models.bark import BarkAudioConfig
 from TTS.utils.generic_utils import get_user_data_dir


 @dataclass
 class BarkConfig(BaseTTSConfig):
+    model: str = "bark"
+    audio: BarkAudioConfig = BarkAudioConfig()
     num_chars: int = 0
     semantic_config: GPTConfig = GPTConfig()
     fine_config: FineGPTConfig = FineGPTConfig()
@@ -31,7 +34,7 @@ class BarkConfig(BaseTTSConfig):
     COARSE_SEMANTIC_PAD_TOKEN: int = 12_048
     COARSE_INFER_TOKEN: int = 12_050

-    REMOTE_BASE_URL = "https://dl.suno-models.io/bark/models/v0/"
+    REMOTE_BASE_URL = "https://huggingface.co/erogol/bark/tree/main/"
     REMOTE_MODEL_PATHS: Dict = None
     LOCAL_MODEL_PATHS: Dict = None
     SMALL_REMOTE_MODEL_PATHS: Dict = None
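BarkConfig now names its model, carries a BarkAudioConfig, and points the remote weights at the erogol/bark Hugging Face repo instead of dl.suno-models.io. Since BaseTTSConfig is Coqpit-based, the new defaults round-trip through JSON like any other 🐸TTS config; a small sketch assuming only what this hunk shows:

from TTS.tts.configs.bark_config import BarkConfig

config = BarkConfig()
assert config.model == "bark"
config.save_json("bark_config.json")  # save_json/load_json come from Coqpit
config.load_json("bark_config.json")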
@@ -52,7 +52,7 @@ def load_voice(voice: str, extra_voice_dirs: List[str] = []):
         return semantic, coarse, fine

     if voice == "random":
-        return None, None
+        return None, None, None

     voices = get_voices(extra_voice_dirs)
     try:
@@ -183,7 +183,7 @@ def generate_text_semantic(
     assert isinstance(text, str)
     text = _normalize_whitespace(text)
     assert len(text.strip()) > 0
-    if history_prompt is not None or base is not None:
+    if all(v is not None for v in history_prompt) or base is not None:
         if history_prompt is not None:
             semantic_history = history_prompt[0]
         if base is not None:
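This guard change (repeated in generate_coarse and generate_fine below) follows from the load_voice fix above: a random voice now yields a 3-tuple of Nones, and a tuple object is never `None`, so the old test would wrongly take the history branch. A short demonstration:

history_prompt = (None, None, None)  # what load_voice("random") now returns
old_check = history_prompt is not None                  # True: the tuple exists
new_check = all(v is not None for v in history_prompt)  # False: no usable history
assert old_check and not new_check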
@@ -327,7 +327,7 @@ def generate_coarse(
         model.config.COARSE_RATE_HZ / model.config.SEMANTIC_RATE_HZ * model.config.N_COARSE_CODEBOOKS
     )
     max_semantic_history = int(np.floor(max_coarse_history / semantic_to_coarse_ratio))
-    if history_prompt is not None or base is not None:
+    if all(v is not None for v in history_prompt) or base is not None:
         if history_prompt is not None:
             x_history = history_prompt
             x_semantic_history = x_history[0]
@@ -477,7 +477,7 @@ def generate_fine(
         and x_coarse_gen.min() >= 0
         and x_coarse_gen.max() <= model.config.CODEBOOK_SIZE - 1
     )
-    if history_prompt is not None or base is not None:
+    if all(v is not None for v in history_prompt) or base is not None:
         if history_prompt is not None:
             x_fine_history = history_prompt[2]
         if base is not None:
@@ -572,4 +572,4 @@ def codec_decode(fine_tokens, model):
     emb = model.encodec.quantizer.decode(arr)
     out = model.encodec.decoder(emb)
     audio_arr = out.detach().cpu().numpy().squeeze()
-    save_wav(path="test.wav", wav=audio_arr, sample_rate=model.config.sample_rate)
+    return audio_arr
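codec_decode stops writing a hard-coded test.wav on every call and simply returns the array; saving becomes the caller's job. A call-site sketch — the save_wav import path is an assumption, but its keyword signature matches the line this diff removes:

from TTS.utils.audio.numpy_transforms import save_wav  # assumed location

audio_arr = codec_decode(fine_tokens, model)  # fine_tokens/model from the steps above
save_wav(wav=audio_arr, path="bark_sample.wav", sample_rate=model.config.sample_rate)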
@@ -4,6 +4,7 @@ Much of this code is adapted from Andrej Karpathy's NanoGPT
 """
 import math
 from dataclasses import dataclass
+from coqpit import Coqpit

 import torch
 import torch.nn as nn
@@ -131,7 +132,7 @@ class Block(nn.Module):


 @dataclass
-class GPTConfig:
+class GPTConfig(Coqpit):
     block_size: int = 1024
     input_vocab_size: int = 10_048
     output_vocab_size: int = 10_048
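Inheriting from Coqpit is what lets BarkConfig embed GPTConfig and FineGPTConfig as serializable fields. A standalone sketch mirroring the fields above; Coqpit layers dict/JSON round-tripping on top of a plain dataclass:

from dataclasses import dataclass
from coqpit import Coqpit

@dataclass
class GPTConfig(Coqpit):  # as in this diff
    block_size: int = 1024
    input_vocab_size: int = 10_048
    output_vocab_size: int = 10_048

print(GPTConfig().to_dict())  # nested configs serialize with their parent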