Add bark model

This commit is contained in:
Eren Gölge 2023-06-19 14:16:06 +02:00
parent 2364c38d16
commit 37b708dac7
5 changed files with 100 additions and 81 deletions

View File

@ -1,20 +1,33 @@
{
"tts_models": {
"multilingual":{
"multi-dataset":{
"your_tts":{
"multilingual": {
"multi-dataset": {
"your_tts": {
"description": "Your TTS model accompanying the paper https://arxiv.org/abs/2112.02418",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.10.1_models/tts_models--multilingual--multi-dataset--your_tts.zip",
"default_vocoder": null,
"commit": "e9a1953e",
"license": "CC BY-NC-ND 4.0",
"contact": "egolge@coqui.ai"
},
"bark": {
"description": "🐶 Bark TTS model released by suno-ai. You can find the original implementation in https://github.com/suno-ai/bark.",
"hf_url": [
"https://coqui.gateway.scarf.sh/bark/coarse_2.pt",
"https://coqui.gateway.scarf.sh/bark/fine_2.pt",
"https://coqui.gateway.scarf.sh/bark/text_2.pt",
"https://coqui.gateway.scarf.sh/bark/config.json"
],
"default_vocoder": null,
"commit": "e9a1953e",
"license": "MIT",
"contact": "https://www.suno.ai/"
}
}
},
"bg": {
"cv": {
"vits":{
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--bg--cv--vits.zip",
"default_vocoder": null,
"commit": null,
@ -25,7 +38,7 @@
},
"cs": {
"cv": {
"vits":{
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--cs--cv--vits.zip",
"default_vocoder": null,
"commit": null,
@ -36,7 +49,7 @@
},
"da": {
"cv": {
"vits":{
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--da--cv--vits.zip",
"default_vocoder": null,
"commit": null,
@ -47,7 +60,7 @@
},
"et": {
"cv": {
"vits":{
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--et--cv--vits.zip",
"default_vocoder": null,
"commit": null,
@ -58,7 +71,7 @@
},
"ga": {
"cv": {
"vits":{
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--ga--cv--vits.zip",
"default_vocoder": null,
"commit": null,
@ -180,7 +193,7 @@
"license": "apache 2.0",
"contact": "egolge@coqui.ai"
},
"fast_pitch":{
"fast_pitch": {
"description": "FastPitch model trained on VCTK dataset.",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--vctk--fast_pitch.zip",
"default_vocoder": null,
@ -220,21 +233,21 @@
"license": "apache 2.0",
"contact": "adamfroghyar@gmail.com"
}
},
"multi-dataset":{
"tortoise-v2":{
"multi-dataset": {
"tortoise-v2": {
"description": "Tortoise tts model https://github.com/neonbjb/tortoise-tts",
"github_rls_url": ["https://coqui.gateway.scarf.sh/v0.14.1_models/autoregressive.pth",
"https://coqui.gateway.scarf.sh/v0.14.1_models/clvp2.pth",
"https://coqui.gateway.scarf.sh/v0.14.1_models/cvvp.pth",
"https://coqui.gateway.scarf.sh/v0.14.1_models/diffusion_decoder.pth",
"https://coqui.gateway.scarf.sh/v0.14.1_models/rlg_auto.pth",
"https://coqui.gateway.scarf.sh/v0.14.1_models/rlg_diffuser.pth",
"https://coqui.gateway.scarf.sh/v0.14.1_models/vocoder.pth",
"https://coqui.gateway.scarf.sh/v0.14.1_models/mel_norms.pth",
"https://coqui.gateway.scarf.sh/v0.14.1_models/config.json"
],
"github_rls_url": [
"https://coqui.gateway.scarf.sh/v0.14.1_models/autoregressive.pth",
"https://coqui.gateway.scarf.sh/v0.14.1_models/clvp2.pth",
"https://coqui.gateway.scarf.sh/v0.14.1_models/cvvp.pth",
"https://coqui.gateway.scarf.sh/v0.14.1_models/diffusion_decoder.pth",
"https://coqui.gateway.scarf.sh/v0.14.1_models/rlg_auto.pth",
"https://coqui.gateway.scarf.sh/v0.14.1_models/rlg_diffuser.pth",
"https://coqui.gateway.scarf.sh/v0.14.1_models/vocoder.pth",
"https://coqui.gateway.scarf.sh/v0.14.1_models/mel_norms.pth",
"https://coqui.gateway.scarf.sh/v0.14.1_models/config.json"
],
"commit": "c1875f6",
"default_vocoder": null,
"author": "@neonbjb - James Betker, @manmay-nakhashi Manmay Nakhashi",
@ -242,7 +255,7 @@
}
},
"jenny": {
"jenny":{
"jenny": {
"description": "VITS model trained with Jenny(Dioco) dataset. Named as Jenny as demanded by the license. Original URL for the model https://www.kaggle.com/datasets/noml4u/tts-models--en--jenny-dioco--vits",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.14.0_models/tts_models--en--jenny--jenny.zip",
"default_vocoder": null,
@ -263,8 +276,8 @@
"contact": "egolge@coqui.com"
}
},
"css10":{
"vits":{
"css10": {
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--es--css10--vits.zip",
"default_vocoder": null,
"commit": null,
@ -284,8 +297,8 @@
"contact": "egolge@coqui.com"
}
},
"css10":{
"vits":{
"css10": {
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--fr--css10--vits.zip",
"default_vocoder": null,
"commit": null,
@ -294,17 +307,17 @@
}
}
},
"uk":{
"uk": {
"mai": {
"glow-tts": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--uk--mai--glow-tts.zip",
"author":"@robinhad",
"author": "@robinhad",
"commit": "bdab788d",
"license": "MIT",
"contact": "",
"default_vocoder": "vocoder_models/uk/mai/multiband-melgan"
},
"vits":{
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--uk--mai--vits.zip",
"default_vocoder": null,
"commit": null,
@ -335,8 +348,8 @@
"commit": "540d811"
}
},
"css10":{
"vits":{
"css10": {
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--nl--css10--vits.zip",
"default_vocoder": null,
"commit": null,
@ -371,7 +384,7 @@
}
},
"css10": {
"vits-neon":{
"vits-neon": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--de--css10--vits.zip",
"default_vocoder": null,
"author": "@NeonGeckoCom",
@ -392,9 +405,9 @@
}
}
},
"tr":{
"tr": {
"common-voice": {
"glow-tts":{
"glow-tts": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--tr--common-voice--glow-tts.zip",
"default_vocoder": "vocoder_models/tr/common-voice/hifigan",
"license": "MIT",
@ -406,7 +419,7 @@
},
"it": {
"mai_female": {
"glow-tts":{
"glow-tts": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_female--glow-tts.zip",
"default_vocoder": null,
"description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
@ -414,7 +427,7 @@
"license": "apache 2.0",
"commit": null
},
"vits":{
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_female--vits.zip",
"default_vocoder": null,
"description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
@ -424,7 +437,7 @@
}
},
"mai_male": {
"glow-tts":{
"glow-tts": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_male--glow-tts.zip",
"default_vocoder": null,
"description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
@ -432,7 +445,7 @@
"license": "apache 2.0",
"commit": null
},
"vits":{
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_male--vits.zip",
"default_vocoder": null,
"description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
@ -444,7 +457,7 @@
},
"ewe": {
"openbible": {
"vits":{
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--ewe--openbible--vits.zip",
"default_vocoder": null,
"license": "CC-BY-SA 4.0",
@ -456,7 +469,7 @@
},
"hau": {
"openbible": {
"vits":{
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--hau--openbible--vits.zip",
"default_vocoder": null,
"license": "CC-BY-SA 4.0",
@ -468,7 +481,7 @@
},
"lin": {
"openbible": {
"vits":{
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--lin--openbible--vits.zip",
"default_vocoder": null,
"license": "CC-BY-SA 4.0",
@ -480,7 +493,7 @@
},
"tw_akuapem": {
"openbible": {
"vits":{
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--tw_akuapem--openbible--vits.zip",
"default_vocoder": null,
"license": "CC-BY-SA 4.0",
@ -492,7 +505,7 @@
},
"tw_asante": {
"openbible": {
"vits":{
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--tw_asante--openbible--vits.zip",
"default_vocoder": null,
"license": "CC-BY-SA 4.0",
@ -504,7 +517,7 @@
},
"yor": {
"openbible": {
"vits":{
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--yor--openbible--vits.zip",
"default_vocoder": null,
"license": "CC-BY-SA 4.0",
@ -538,7 +551,7 @@
},
"fi": {
"css10": {
"vits":{
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--fi--css10--vits.zip",
"default_vocoder": null,
"commit": null,
@ -549,7 +562,7 @@
},
"hr": {
"cv": {
"vits":{
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--hr--cv--vits.zip",
"default_vocoder": null,
"commit": null,
@ -560,7 +573,7 @@
},
"lt": {
"cv": {
"vits":{
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--lt--cv--vits.zip",
"default_vocoder": null,
"commit": null,
@ -571,7 +584,7 @@
},
"lv": {
"cv": {
"vits":{
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--lv--cv--vits.zip",
"default_vocoder": null,
"commit": null,
@ -582,7 +595,7 @@
},
"mt": {
"cv": {
"vits":{
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--mt--cv--vits.zip",
"default_vocoder": null,
"commit": null,
@ -593,7 +606,7 @@
},
"pl": {
"mai_female": {
"vits":{
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--pl--mai_female--vits.zip",
"default_vocoder": null,
"commit": null,
@ -604,7 +617,7 @@
},
"pt": {
"cv": {
"vits":{
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--pt--cv--vits.zip",
"default_vocoder": null,
"commit": null,
@ -615,7 +628,7 @@
},
"ro": {
"cv": {
"vits":{
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--ro--cv--vits.zip",
"default_vocoder": null,
"commit": null,
@ -626,7 +639,7 @@
},
"sk": {
"cv": {
"vits":{
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--sk--cv--vits.zip",
"default_vocoder": null,
"commit": null,
@ -637,7 +650,7 @@
},
"sl": {
"cv": {
"vits":{
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--sl--cv--vits.zip",
"default_vocoder": null,
"commit": null,
@ -648,7 +661,7 @@
},
"sv": {
"cv": {
"vits":{
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--sv--cv--vits.zip",
"default_vocoder": null,
"commit": null,
@ -659,7 +672,7 @@
},
"ca": {
"custom": {
"vits":{
"vits": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.10.1_models/tts_models--ca--custom--vits.zip",
"default_vocoder": null,
"commit": null,
@ -669,8 +682,8 @@
}
}
},
"fa":{
"custom":{
"fa": {
"custom": {
"glow-tts": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.10.1_models/tts_models--fa--custom--glow-tts.zip",
"default_vocoder": null,
@ -681,18 +694,18 @@
}
}
},
"bn":{
"custom":{
"vits-male":{
"github_rls_url":"https://coqui.gateway.scarf.sh/v0.13.3_models/tts_models--bn--custom--vits_male.zip",
"bn": {
"custom": {
"vits-male": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.13.3_models/tts_models--bn--custom--vits_male.zip",
"default_vocoder": null,
"commit": null,
"description": "Single speaker Bangla male model. For more information -> https://github.com/mobassir94/comprehensive-bangla-tts",
"author": "@mobassir94",
"license": "Apache 2.0"
},
"vits-female":{
"github_rls_url":"https://coqui.gateway.scarf.sh/v0.13.3_models/tts_models--bn--custom--vits_female.zip",
"vits-female": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.13.3_models/tts_models--bn--custom--vits_female.zip",
"default_vocoder": null,
"commit": null,
"description": "Single speaker Bangla female model. For more information -> https://github.com/mobassir94/comprehensive-bangla-tts",
@ -834,16 +847,16 @@
"mai": {
"multiband-melgan": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--uk--mai--multiband-melgan.zip",
"author":"@robinhad",
"author": "@robinhad",
"commit": "bdab788d",
"license": "MIT",
"contact": ""
}
}
},
"tr":{
"tr": {
"common-voice": {
"hifigan":{
"hifigan": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--tr--common-voice--hifigan.zip",
"description": "HifiGAN model using an unknown speaker from the Common-Voice dataset.",
"author": "Fatih Akademi",
@ -853,10 +866,10 @@
}
}
},
"voice_conversion_models":{
"multilingual":{
"vctk":{
"freevc24":{
"voice_conversion_models": {
"multilingual": {
"vctk": {
"freevc24": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.13.0_models/voice_conversion_models--multilingual--vctk--freevc24.zip",
"description": "FreeVC model trained on VCTK dataset from https://github.com/OlaWod/FreeVC",
"author": "Jing-Yi Li @OlaWod",
@ -866,4 +879,4 @@
}
}
}
}
}

View File

@ -342,7 +342,7 @@ class TTS:
def download_model_by_name(self, model_name: str):
model_path, config_path, model_item = self.manager.download_model(model_name)
if isinstance(model_item["github_rls_url"], list):
if isinstance(model_item["model_url"], list):
# return model directory if there are multiple files
# we assume that the model knows how to load itself
return None, None, None, None, model_path
@ -580,6 +580,8 @@ class TTS:
Speed factor to use for 🐸Coqui Studio models, between 0.0 and 2.0. Defaults to None.
file_path (str, optional):
Output file path. Defaults to "output.wav".
kwargs (dict, optional):
Additional arguments for the model.
"""
self._check_arguments(speaker=speaker, language=language, speaker_wav=speaker_wav, **kwargs)

View File

@ -5,11 +5,14 @@ from typing import Dict
from TTS.tts.configs.shared_configs import BaseTTSConfig
from TTS.tts.layers.bark.model import GPTConfig
from TTS.tts.layers.bark.model_fine import FineGPTConfig
from TTS.tts.models.bark import BarkAudioConfig
from TTS.utils.generic_utils import get_user_data_dir
@dataclass
class BarkConfig(BaseTTSConfig):
model: str = "bark"
audio: BarkAudioConfig = BarkAudioConfig()
num_chars: int = 0
semantic_config: GPTConfig = GPTConfig()
fine_config: FineGPTConfig = FineGPTConfig()
@ -31,7 +34,7 @@ class BarkConfig(BaseTTSConfig):
COARSE_SEMANTIC_PAD_TOKEN: int = 12_048
COARSE_INFER_TOKEN: int = 12_050
REMOTE_BASE_URL = "https://dl.suno-models.io/bark/models/v0/"
REMOTE_BASE_URL = "https://huggingface.co/erogol/bark/tree/main/"
REMOTE_MODEL_PATHS: Dict = None
LOCAL_MODEL_PATHS: Dict = None
SMALL_REMOTE_MODEL_PATHS: Dict = None

View File

@ -52,7 +52,7 @@ def load_voice(voice: str, extra_voice_dirs: List[str] = []):
return semantic, coarse, fine
if voice == "random":
return None, None
return None, None, None
voices = get_voices(extra_voice_dirs)
try:
@ -183,7 +183,7 @@ def generate_text_semantic(
assert isinstance(text, str)
text = _normalize_whitespace(text)
assert len(text.strip()) > 0
if history_prompt is not None or base is not None:
if all(v is not None for v in history_prompt) or base is not None:
if history_prompt is not None:
semantic_history = history_prompt[0]
if base is not None:
@ -327,7 +327,7 @@ def generate_coarse(
model.config.COARSE_RATE_HZ / model.config.SEMANTIC_RATE_HZ * model.config.N_COARSE_CODEBOOKS
)
max_semantic_history = int(np.floor(max_coarse_history / semantic_to_coarse_ratio))
if history_prompt is not None or base is not None:
if all(v is not None for v in history_prompt) or base is not None:
if history_prompt is not None:
x_history = history_prompt
x_semantic_history = x_history[0]
@ -477,7 +477,7 @@ def generate_fine(
and x_coarse_gen.min() >= 0
and x_coarse_gen.max() <= model.config.CODEBOOK_SIZE - 1
)
if history_prompt is not None or base is not None:
if all(v is not None for v in history_prompt) or base is not None:
if history_prompt is not None:
x_fine_history = history_prompt[2]
if base is not None:
@ -572,4 +572,4 @@ def codec_decode(fine_tokens, model):
emb = model.encodec.quantizer.decode(arr)
out = model.encodec.decoder(emb)
audio_arr = out.detach().cpu().numpy().squeeze()
save_wav(path="test.wav", wav=audio_arr, sample_rate=model.config.sample_rate)
return audio_arr

View File

@ -4,6 +4,7 @@ Much of this code is adapted from Andrej Karpathy's NanoGPT
"""
import math
from dataclasses import dataclass
from coqpit import Coqpit
import torch
import torch.nn as nn
@ -131,7 +132,7 @@ class Block(nn.Module):
@dataclass
class GPTConfig:
class GPTConfig(Coqpit):
block_size: int = 1024
input_vocab_size: int = 10_048
output_vocab_size: int = 10_048