mirror of https://github.com/coqui-ai/TTS.git
Add bark model
This commit is contained in:
parent
2364c38d16
commit
37b708dac7
159
TTS/.models.json
159
TTS/.models.json
|
@ -1,20 +1,33 @@
|
|||
{
|
||||
"tts_models": {
|
||||
"multilingual":{
|
||||
"multi-dataset":{
|
||||
"your_tts":{
|
||||
"multilingual": {
|
||||
"multi-dataset": {
|
||||
"your_tts": {
|
||||
"description": "Your TTS model accompanying the paper https://arxiv.org/abs/2112.02418",
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.10.1_models/tts_models--multilingual--multi-dataset--your_tts.zip",
|
||||
"default_vocoder": null,
|
||||
"commit": "e9a1953e",
|
||||
"license": "CC BY-NC-ND 4.0",
|
||||
"contact": "egolge@coqui.ai"
|
||||
},
|
||||
"bark": {
|
||||
"description": "🐶 Bark TTS model released by suno-ai. You can find the original implementation in https://github.com/suno-ai/bark.",
|
||||
"hf_url": [
|
||||
"https://coqui.gateway.scarf.sh/bark/coarse_2.pt",
|
||||
"https://coqui.gateway.scarf.sh/bark/fine_2.pt",
|
||||
"https://coqui.gateway.scarf.sh/bark/text_2.pt",
|
||||
"https://coqui.gateway.scarf.sh/bark/config.json"
|
||||
],
|
||||
"default_vocoder": null,
|
||||
"commit": "e9a1953e",
|
||||
"license": "MIT",
|
||||
"contact": "https://www.suno.ai/"
|
||||
}
|
||||
}
|
||||
},
|
||||
"bg": {
|
||||
"cv": {
|
||||
"vits":{
|
||||
"vits": {
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--bg--cv--vits.zip",
|
||||
"default_vocoder": null,
|
||||
"commit": null,
|
||||
|
@ -25,7 +38,7 @@
|
|||
},
|
||||
"cs": {
|
||||
"cv": {
|
||||
"vits":{
|
||||
"vits": {
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--cs--cv--vits.zip",
|
||||
"default_vocoder": null,
|
||||
"commit": null,
|
||||
|
@ -36,7 +49,7 @@
|
|||
},
|
||||
"da": {
|
||||
"cv": {
|
||||
"vits":{
|
||||
"vits": {
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--da--cv--vits.zip",
|
||||
"default_vocoder": null,
|
||||
"commit": null,
|
||||
|
@ -47,7 +60,7 @@
|
|||
},
|
||||
"et": {
|
||||
"cv": {
|
||||
"vits":{
|
||||
"vits": {
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--et--cv--vits.zip",
|
||||
"default_vocoder": null,
|
||||
"commit": null,
|
||||
|
@ -58,7 +71,7 @@
|
|||
},
|
||||
"ga": {
|
||||
"cv": {
|
||||
"vits":{
|
||||
"vits": {
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--ga--cv--vits.zip",
|
||||
"default_vocoder": null,
|
||||
"commit": null,
|
||||
|
@ -180,7 +193,7 @@
|
|||
"license": "apache 2.0",
|
||||
"contact": "egolge@coqui.ai"
|
||||
},
|
||||
"fast_pitch":{
|
||||
"fast_pitch": {
|
||||
"description": "FastPitch model trained on VCTK dataset.",
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--vctk--fast_pitch.zip",
|
||||
"default_vocoder": null,
|
||||
|
@ -220,21 +233,21 @@
|
|||
"license": "apache 2.0",
|
||||
"contact": "adamfroghyar@gmail.com"
|
||||
}
|
||||
|
||||
},
|
||||
"multi-dataset":{
|
||||
"tortoise-v2":{
|
||||
"multi-dataset": {
|
||||
"tortoise-v2": {
|
||||
"description": "Tortoise tts model https://github.com/neonbjb/tortoise-tts",
|
||||
"github_rls_url": ["https://coqui.gateway.scarf.sh/v0.14.1_models/autoregressive.pth",
|
||||
"https://coqui.gateway.scarf.sh/v0.14.1_models/clvp2.pth",
|
||||
"https://coqui.gateway.scarf.sh/v0.14.1_models/cvvp.pth",
|
||||
"https://coqui.gateway.scarf.sh/v0.14.1_models/diffusion_decoder.pth",
|
||||
"https://coqui.gateway.scarf.sh/v0.14.1_models/rlg_auto.pth",
|
||||
"https://coqui.gateway.scarf.sh/v0.14.1_models/rlg_diffuser.pth",
|
||||
"https://coqui.gateway.scarf.sh/v0.14.1_models/vocoder.pth",
|
||||
"https://coqui.gateway.scarf.sh/v0.14.1_models/mel_norms.pth",
|
||||
"https://coqui.gateway.scarf.sh/v0.14.1_models/config.json"
|
||||
],
|
||||
"github_rls_url": [
|
||||
"https://coqui.gateway.scarf.sh/v0.14.1_models/autoregressive.pth",
|
||||
"https://coqui.gateway.scarf.sh/v0.14.1_models/clvp2.pth",
|
||||
"https://coqui.gateway.scarf.sh/v0.14.1_models/cvvp.pth",
|
||||
"https://coqui.gateway.scarf.sh/v0.14.1_models/diffusion_decoder.pth",
|
||||
"https://coqui.gateway.scarf.sh/v0.14.1_models/rlg_auto.pth",
|
||||
"https://coqui.gateway.scarf.sh/v0.14.1_models/rlg_diffuser.pth",
|
||||
"https://coqui.gateway.scarf.sh/v0.14.1_models/vocoder.pth",
|
||||
"https://coqui.gateway.scarf.sh/v0.14.1_models/mel_norms.pth",
|
||||
"https://coqui.gateway.scarf.sh/v0.14.1_models/config.json"
|
||||
],
|
||||
"commit": "c1875f6",
|
||||
"default_vocoder": null,
|
||||
"author": "@neonbjb - James Betker, @manmay-nakhashi Manmay Nakhashi",
|
||||
|
@ -242,7 +255,7 @@
|
|||
}
|
||||
},
|
||||
"jenny": {
|
||||
"jenny":{
|
||||
"jenny": {
|
||||
"description": "VITS model trained with Jenny(Dioco) dataset. Named as Jenny as demanded by the license. Original URL for the model https://www.kaggle.com/datasets/noml4u/tts-models--en--jenny-dioco--vits",
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.14.0_models/tts_models--en--jenny--jenny.zip",
|
||||
"default_vocoder": null,
|
||||
|
@ -263,8 +276,8 @@
|
|||
"contact": "egolge@coqui.com"
|
||||
}
|
||||
},
|
||||
"css10":{
|
||||
"vits":{
|
||||
"css10": {
|
||||
"vits": {
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--es--css10--vits.zip",
|
||||
"default_vocoder": null,
|
||||
"commit": null,
|
||||
|
@ -284,8 +297,8 @@
|
|||
"contact": "egolge@coqui.com"
|
||||
}
|
||||
},
|
||||
"css10":{
|
||||
"vits":{
|
||||
"css10": {
|
||||
"vits": {
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--fr--css10--vits.zip",
|
||||
"default_vocoder": null,
|
||||
"commit": null,
|
||||
|
@ -294,17 +307,17 @@
|
|||
}
|
||||
}
|
||||
},
|
||||
"uk":{
|
||||
"uk": {
|
||||
"mai": {
|
||||
"glow-tts": {
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--uk--mai--glow-tts.zip",
|
||||
"author":"@robinhad",
|
||||
"author": "@robinhad",
|
||||
"commit": "bdab788d",
|
||||
"license": "MIT",
|
||||
"contact": "",
|
||||
"default_vocoder": "vocoder_models/uk/mai/multiband-melgan"
|
||||
},
|
||||
"vits":{
|
||||
"vits": {
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--uk--mai--vits.zip",
|
||||
"default_vocoder": null,
|
||||
"commit": null,
|
||||
|
@ -335,8 +348,8 @@
|
|||
"commit": "540d811"
|
||||
}
|
||||
},
|
||||
"css10":{
|
||||
"vits":{
|
||||
"css10": {
|
||||
"vits": {
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--nl--css10--vits.zip",
|
||||
"default_vocoder": null,
|
||||
"commit": null,
|
||||
|
@ -371,7 +384,7 @@
|
|||
}
|
||||
},
|
||||
"css10": {
|
||||
"vits-neon":{
|
||||
"vits-neon": {
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--de--css10--vits.zip",
|
||||
"default_vocoder": null,
|
||||
"author": "@NeonGeckoCom",
|
||||
|
@ -392,9 +405,9 @@
|
|||
}
|
||||
}
|
||||
},
|
||||
"tr":{
|
||||
"tr": {
|
||||
"common-voice": {
|
||||
"glow-tts":{
|
||||
"glow-tts": {
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--tr--common-voice--glow-tts.zip",
|
||||
"default_vocoder": "vocoder_models/tr/common-voice/hifigan",
|
||||
"license": "MIT",
|
||||
|
@ -406,7 +419,7 @@
|
|||
},
|
||||
"it": {
|
||||
"mai_female": {
|
||||
"glow-tts":{
|
||||
"glow-tts": {
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_female--glow-tts.zip",
|
||||
"default_vocoder": null,
|
||||
"description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
|
||||
|
@ -414,7 +427,7 @@
|
|||
"license": "apache 2.0",
|
||||
"commit": null
|
||||
},
|
||||
"vits":{
|
||||
"vits": {
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_female--vits.zip",
|
||||
"default_vocoder": null,
|
||||
"description": "VITS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
|
||||
|
@ -424,7 +437,7 @@
|
|||
}
|
||||
},
|
||||
"mai_male": {
|
||||
"glow-tts":{
|
||||
"glow-tts": {
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_male--glow-tts.zip",
|
||||
"default_vocoder": null,
|
||||
"description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
|
||||
|
@ -432,7 +445,7 @@
|
|||
"license": "apache 2.0",
|
||||
"commit": null
|
||||
},
|
||||
"vits":{
|
||||
"vits": {
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_male--vits.zip",
|
||||
"default_vocoder": null,
|
||||
"description": "VITS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
|
||||
|
@ -444,7 +457,7 @@
|
|||
},
|
||||
"ewe": {
|
||||
"openbible": {
|
||||
"vits":{
|
||||
"vits": {
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--ewe--openbible--vits.zip",
|
||||
"default_vocoder": null,
|
||||
"license": "CC-BY-SA 4.0",
|
||||
|
@ -456,7 +469,7 @@
|
|||
},
|
||||
"hau": {
|
||||
"openbible": {
|
||||
"vits":{
|
||||
"vits": {
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--hau--openbible--vits.zip",
|
||||
"default_vocoder": null,
|
||||
"license": "CC-BY-SA 4.0",
|
||||
|
@ -468,7 +481,7 @@
|
|||
},
|
||||
"lin": {
|
||||
"openbible": {
|
||||
"vits":{
|
||||
"vits": {
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--lin--openbible--vits.zip",
|
||||
"default_vocoder": null,
|
||||
"license": "CC-BY-SA 4.0",
|
||||
|
@ -480,7 +493,7 @@
|
|||
},
|
||||
"tw_akuapem": {
|
||||
"openbible": {
|
||||
"vits":{
|
||||
"vits": {
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--tw_akuapem--openbible--vits.zip",
|
||||
"default_vocoder": null,
|
||||
"license": "CC-BY-SA 4.0",
|
||||
|
@ -492,7 +505,7 @@
|
|||
},
|
||||
"tw_asante": {
|
||||
"openbible": {
|
||||
"vits":{
|
||||
"vits": {
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--tw_asante--openbible--vits.zip",
|
||||
"default_vocoder": null,
|
||||
"license": "CC-BY-SA 4.0",
|
||||
|
@ -504,7 +517,7 @@
|
|||
},
|
||||
"yor": {
|
||||
"openbible": {
|
||||
"vits":{
|
||||
"vits": {
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--yor--openbible--vits.zip",
|
||||
"default_vocoder": null,
|
||||
"license": "CC-BY-SA 4.0",
|
||||
|
@ -538,7 +551,7 @@
|
|||
},
|
||||
"fi": {
|
||||
"css10": {
|
||||
"vits":{
|
||||
"vits": {
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--fi--css10--vits.zip",
|
||||
"default_vocoder": null,
|
||||
"commit": null,
|
||||
|
@ -549,7 +562,7 @@
|
|||
},
|
||||
"hr": {
|
||||
"cv": {
|
||||
"vits":{
|
||||
"vits": {
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--hr--cv--vits.zip",
|
||||
"default_vocoder": null,
|
||||
"commit": null,
|
||||
|
@ -560,7 +573,7 @@
|
|||
},
|
||||
"lt": {
|
||||
"cv": {
|
||||
"vits":{
|
||||
"vits": {
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--lt--cv--vits.zip",
|
||||
"default_vocoder": null,
|
||||
"commit": null,
|
||||
|
@ -571,7 +584,7 @@
|
|||
},
|
||||
"lv": {
|
||||
"cv": {
|
||||
"vits":{
|
||||
"vits": {
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--lv--cv--vits.zip",
|
||||
"default_vocoder": null,
|
||||
"commit": null,
|
||||
|
@ -582,7 +595,7 @@
|
|||
},
|
||||
"mt": {
|
||||
"cv": {
|
||||
"vits":{
|
||||
"vits": {
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--mt--cv--vits.zip",
|
||||
"default_vocoder": null,
|
||||
"commit": null,
|
||||
|
@ -593,7 +606,7 @@
|
|||
},
|
||||
"pl": {
|
||||
"mai_female": {
|
||||
"vits":{
|
||||
"vits": {
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--pl--mai_female--vits.zip",
|
||||
"default_vocoder": null,
|
||||
"commit": null,
|
||||
|
@ -604,7 +617,7 @@
|
|||
},
|
||||
"pt": {
|
||||
"cv": {
|
||||
"vits":{
|
||||
"vits": {
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--pt--cv--vits.zip",
|
||||
"default_vocoder": null,
|
||||
"commit": null,
|
||||
|
@ -615,7 +628,7 @@
|
|||
},
|
||||
"ro": {
|
||||
"cv": {
|
||||
"vits":{
|
||||
"vits": {
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--ro--cv--vits.zip",
|
||||
"default_vocoder": null,
|
||||
"commit": null,
|
||||
|
@ -626,7 +639,7 @@
|
|||
},
|
||||
"sk": {
|
||||
"cv": {
|
||||
"vits":{
|
||||
"vits": {
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--sk--cv--vits.zip",
|
||||
"default_vocoder": null,
|
||||
"commit": null,
|
||||
|
@ -637,7 +650,7 @@
|
|||
},
|
||||
"sl": {
|
||||
"cv": {
|
||||
"vits":{
|
||||
"vits": {
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--sl--cv--vits.zip",
|
||||
"default_vocoder": null,
|
||||
"commit": null,
|
||||
|
@ -648,7 +661,7 @@
|
|||
},
|
||||
"sv": {
|
||||
"cv": {
|
||||
"vits":{
|
||||
"vits": {
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--sv--cv--vits.zip",
|
||||
"default_vocoder": null,
|
||||
"commit": null,
|
||||
|
@ -659,7 +672,7 @@
|
|||
},
|
||||
"ca": {
|
||||
"custom": {
|
||||
"vits":{
|
||||
"vits": {
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.10.1_models/tts_models--ca--custom--vits.zip",
|
||||
"default_vocoder": null,
|
||||
"commit": null,
|
||||
|
@ -669,8 +682,8 @@
|
|||
}
|
||||
}
|
||||
},
|
||||
"fa":{
|
||||
"custom":{
|
||||
"fa": {
|
||||
"custom": {
|
||||
"glow-tts": {
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.10.1_models/tts_models--fa--custom--glow-tts.zip",
|
||||
"default_vocoder": null,
|
||||
|
@ -681,18 +694,18 @@
|
|||
}
|
||||
}
|
||||
},
|
||||
"bn":{
|
||||
"custom":{
|
||||
"vits-male":{
|
||||
"github_rls_url":"https://coqui.gateway.scarf.sh/v0.13.3_models/tts_models--bn--custom--vits_male.zip",
|
||||
"bn": {
|
||||
"custom": {
|
||||
"vits-male": {
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.13.3_models/tts_models--bn--custom--vits_male.zip",
|
||||
"default_vocoder": null,
|
||||
"commit": null,
|
||||
"description": "Single speaker Bangla male model. For more information -> https://github.com/mobassir94/comprehensive-bangla-tts",
|
||||
"author": "@mobassir94",
|
||||
"license": "Apache 2.0"
|
||||
},
|
||||
"vits-female":{
|
||||
"github_rls_url":"https://coqui.gateway.scarf.sh/v0.13.3_models/tts_models--bn--custom--vits_female.zip",
|
||||
"vits-female": {
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.13.3_models/tts_models--bn--custom--vits_female.zip",
|
||||
"default_vocoder": null,
|
||||
"commit": null,
|
||||
"description": "Single speaker Bangla female model. For more information -> https://github.com/mobassir94/comprehensive-bangla-tts",
|
||||
|
@ -834,16 +847,16 @@
|
|||
"mai": {
|
||||
"multiband-melgan": {
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--uk--mai--multiband-melgan.zip",
|
||||
"author":"@robinhad",
|
||||
"author": "@robinhad",
|
||||
"commit": "bdab788d",
|
||||
"license": "MIT",
|
||||
"contact": ""
|
||||
}
|
||||
}
|
||||
},
|
||||
"tr":{
|
||||
"tr": {
|
||||
"common-voice": {
|
||||
"hifigan":{
|
||||
"hifigan": {
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--tr--common-voice--hifigan.zip",
|
||||
"description": "HifiGAN model using an unknown speaker from the Common-Voice dataset.",
|
||||
"author": "Fatih Akademi",
|
||||
|
@ -853,10 +866,10 @@
|
|||
}
|
||||
}
|
||||
},
|
||||
"voice_conversion_models":{
|
||||
"multilingual":{
|
||||
"vctk":{
|
||||
"freevc24":{
|
||||
"voice_conversion_models": {
|
||||
"multilingual": {
|
||||
"vctk": {
|
||||
"freevc24": {
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.13.0_models/voice_conversion_models--multilingual--vctk--freevc24.zip",
|
||||
"description": "FreeVC model trained on VCTK dataset from https://github.com/OlaWod/FreeVC",
|
||||
"author": "Jing-Yi Li @OlaWod",
|
||||
|
@ -866,4 +879,4 @@
|
|||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -342,7 +342,7 @@ class TTS:
|
|||
|
||||
def download_model_by_name(self, model_name: str):
|
||||
model_path, config_path, model_item = self.manager.download_model(model_name)
|
||||
if isinstance(model_item["github_rls_url"], list):
|
||||
if isinstance(model_item["model_url"], list):
|
||||
# return model directory if there are multiple files
|
||||
# we assume that the model knows how to load itself
|
||||
return None, None, None, None, model_path
|
||||
|
@ -580,6 +580,8 @@ class TTS:
|
|||
Speed factor to use for 🐸Coqui Studio models, between 0.0 and 2.0. Defaults to None.
|
||||
file_path (str, optional):
|
||||
Output file path. Defaults to "output.wav".
|
||||
kwargs (dict, optional):
|
||||
Additional arguments for the model.
|
||||
"""
|
||||
self._check_arguments(speaker=speaker, language=language, speaker_wav=speaker_wav, **kwargs)
|
||||
|
||||
|
|
|
@ -5,11 +5,14 @@ from typing import Dict
|
|||
from TTS.tts.configs.shared_configs import BaseTTSConfig
|
||||
from TTS.tts.layers.bark.model import GPTConfig
|
||||
from TTS.tts.layers.bark.model_fine import FineGPTConfig
|
||||
from TTS.tts.models.bark import BarkAudioConfig
|
||||
from TTS.utils.generic_utils import get_user_data_dir
|
||||
|
||||
|
||||
@dataclass
|
||||
class BarkConfig(BaseTTSConfig):
|
||||
model: str = "bark"
|
||||
audio: BarkAudioConfig = BarkAudioConfig()
|
||||
num_chars: int = 0
|
||||
semantic_config: GPTConfig = GPTConfig()
|
||||
fine_config: FineGPTConfig = FineGPTConfig()
|
||||
|
@ -31,7 +34,7 @@ class BarkConfig(BaseTTSConfig):
|
|||
COARSE_SEMANTIC_PAD_TOKEN: int = 12_048
|
||||
COARSE_INFER_TOKEN: int = 12_050
|
||||
|
||||
REMOTE_BASE_URL = "https://dl.suno-models.io/bark/models/v0/"
|
||||
REMOTE_BASE_URL = "https://huggingface.co/erogol/bark/tree/main/"
|
||||
REMOTE_MODEL_PATHS: Dict = None
|
||||
LOCAL_MODEL_PATHS: Dict = None
|
||||
SMALL_REMOTE_MODEL_PATHS: Dict = None
|
||||
|
|
|
@ -52,7 +52,7 @@ def load_voice(voice: str, extra_voice_dirs: List[str] = []):
|
|||
return semantic, coarse, fine
|
||||
|
||||
if voice == "random":
|
||||
return None, None
|
||||
return None, None, None
|
||||
|
||||
voices = get_voices(extra_voice_dirs)
|
||||
try:
|
||||
|
@ -183,7 +183,7 @@ def generate_text_semantic(
|
|||
assert isinstance(text, str)
|
||||
text = _normalize_whitespace(text)
|
||||
assert len(text.strip()) > 0
|
||||
if history_prompt is not None or base is not None:
|
||||
if all(v is not None for v in history_prompt) or base is not None:
|
||||
if history_prompt is not None:
|
||||
semantic_history = history_prompt[0]
|
||||
if base is not None:
|
||||
|
@ -327,7 +327,7 @@ def generate_coarse(
|
|||
model.config.COARSE_RATE_HZ / model.config.SEMANTIC_RATE_HZ * model.config.N_COARSE_CODEBOOKS
|
||||
)
|
||||
max_semantic_history = int(np.floor(max_coarse_history / semantic_to_coarse_ratio))
|
||||
if history_prompt is not None or base is not None:
|
||||
if all(v is not None for v in history_prompt) or base is not None:
|
||||
if history_prompt is not None:
|
||||
x_history = history_prompt
|
||||
x_semantic_history = x_history[0]
|
||||
|
@ -477,7 +477,7 @@ def generate_fine(
|
|||
and x_coarse_gen.min() >= 0
|
||||
and x_coarse_gen.max() <= model.config.CODEBOOK_SIZE - 1
|
||||
)
|
||||
if history_prompt is not None or base is not None:
|
||||
if all(v is not None for v in history_prompt) or base is not None:
|
||||
if history_prompt is not None:
|
||||
x_fine_history = history_prompt[2]
|
||||
if base is not None:
|
||||
|
@ -572,4 +572,4 @@ def codec_decode(fine_tokens, model):
|
|||
emb = model.encodec.quantizer.decode(arr)
|
||||
out = model.encodec.decoder(emb)
|
||||
audio_arr = out.detach().cpu().numpy().squeeze()
|
||||
save_wav(path="test.wav", wav=audio_arr, sample_rate=model.config.sample_rate)
|
||||
return audio_arr
|
||||
|
|
|
@ -4,6 +4,7 @@ Much of this code is adapted from Andrej Karpathy's NanoGPT
|
|||
"""
|
||||
import math
|
||||
from dataclasses import dataclass
|
||||
from coqpit import Coqpit
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
@ -131,7 +132,7 @@ class Block(nn.Module):
|
|||
|
||||
|
||||
@dataclass
|
||||
class GPTConfig:
|
||||
class GPTConfig(Coqpit):
|
||||
block_size: int = 1024
|
||||
input_vocab_size: int = 10_048
|
||||
output_vocab_size: int = 10_048
|
||||
|
|
Loading…
Reference in New Issue