From 37b708dac75b744fb14d60e0a45f6b4c59b416ad Mon Sep 17 00:00:00 2001 From: Eren Gölge Date: Mon, 19 Jun 2023 14:16:06 +0200 Subject: [PATCH] Add bark model --- TTS/.models.json | 159 +++++++++++++------------ TTS/api.py | 4 +- TTS/tts/configs/bark_config.py | 5 +- TTS/tts/layers/bark/inference_funcs.py | 10 +- TTS/tts/layers/bark/model.py | 3 +- 5 files changed, 100 insertions(+), 81 deletions(-) diff --git a/TTS/.models.json b/TTS/.models.json index b396e641..801485a1 100644 --- a/TTS/.models.json +++ b/TTS/.models.json @@ -1,20 +1,33 @@ { "tts_models": { - "multilingual":{ - "multi-dataset":{ - "your_tts":{ + "multilingual": { + "multi-dataset": { + "your_tts": { "description": "Your TTS model accompanying the paper https://arxiv.org/abs/2112.02418", "github_rls_url": "https://coqui.gateway.scarf.sh/v0.10.1_models/tts_models--multilingual--multi-dataset--your_tts.zip", "default_vocoder": null, "commit": "e9a1953e", "license": "CC BY-NC-ND 4.0", "contact": "egolge@coqui.ai" + }, + "bark": { + "description": "🐶 Bark TTS model released by suno-ai. 
You can find the original implementation in https://github.com/suno-ai/bark.", + "hf_url": [ + "https://coqui.gateway.scarf.sh/bark/coarse_2.pt", + "https://coqui.gateway.scarf.sh/bark/fine_2.pt", + "https://coqui.gateway.scarf.sh/bark/text_2.pt", + "https://coqui.gateway.scarf.sh/bark/config.json" + ], + "default_vocoder": null, + "commit": "e9a1953e", + "license": "MIT", + "contact": "https://www.suno.ai/" } } }, "bg": { "cv": { - "vits":{ + "vits": { "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--bg--cv--vits.zip", "default_vocoder": null, "commit": null, @@ -25,7 +38,7 @@ }, "cs": { "cv": { - "vits":{ + "vits": { "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--cs--cv--vits.zip", "default_vocoder": null, "commit": null, @@ -36,7 +49,7 @@ }, "da": { "cv": { - "vits":{ + "vits": { "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--da--cv--vits.zip", "default_vocoder": null, "commit": null, @@ -47,7 +60,7 @@ }, "et": { "cv": { - "vits":{ + "vits": { "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--et--cv--vits.zip", "default_vocoder": null, "commit": null, @@ -58,7 +71,7 @@ }, "ga": { "cv": { - "vits":{ + "vits": { "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--ga--cv--vits.zip", "default_vocoder": null, "commit": null, @@ -180,7 +193,7 @@ "license": "apache 2.0", "contact": "egolge@coqui.ai" }, - "fast_pitch":{ + "fast_pitch": { "description": "FastPitch model trained on VCTK dataseset.", "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--vctk--fast_pitch.zip", "default_vocoder": null, @@ -220,21 +233,21 @@ "license": "apache 2.0", "contact": "adamfroghyar@gmail.com" } - }, - "multi-dataset":{ - "tortoise-v2":{ + "multi-dataset": { + "tortoise-v2": { "description": "Tortoise tts model https://github.com/neonbjb/tortoise-tts", - "github_rls_url": 
["https://coqui.gateway.scarf.sh/v0.14.1_models/autoregressive.pth", - "https://coqui.gateway.scarf.sh/v0.14.1_models/clvp2.pth", - "https://coqui.gateway.scarf.sh/v0.14.1_models/cvvp.pth", - "https://coqui.gateway.scarf.sh/v0.14.1_models/diffusion_decoder.pth", - "https://coqui.gateway.scarf.sh/v0.14.1_models/rlg_auto.pth", - "https://coqui.gateway.scarf.sh/v0.14.1_models/rlg_diffuser.pth", - "https://coqui.gateway.scarf.sh/v0.14.1_models/vocoder.pth", - "https://coqui.gateway.scarf.sh/v0.14.1_models/mel_norms.pth", - "https://coqui.gateway.scarf.sh/v0.14.1_models/config.json" - ], + "github_rls_url": [ + "https://coqui.gateway.scarf.sh/v0.14.1_models/autoregressive.pth", + "https://coqui.gateway.scarf.sh/v0.14.1_models/clvp2.pth", + "https://coqui.gateway.scarf.sh/v0.14.1_models/cvvp.pth", + "https://coqui.gateway.scarf.sh/v0.14.1_models/diffusion_decoder.pth", + "https://coqui.gateway.scarf.sh/v0.14.1_models/rlg_auto.pth", + "https://coqui.gateway.scarf.sh/v0.14.1_models/rlg_diffuser.pth", + "https://coqui.gateway.scarf.sh/v0.14.1_models/vocoder.pth", + "https://coqui.gateway.scarf.sh/v0.14.1_models/mel_norms.pth", + "https://coqui.gateway.scarf.sh/v0.14.1_models/config.json" + ], "commit": "c1875f6", "default_vocoder": null, "author": "@neonbjb - James Betker, @manmay-nakhashi Manmay Nakhashi", @@ -242,7 +255,7 @@ } }, "jenny": { - "jenny":{ + "jenny": { "description": "VITS model trained with Jenny(Dioco) dataset. Named as Jenny as demanded by the license. 
Original URL for the model https://www.kaggle.com/datasets/noml4u/tts-models--en--jenny-dioco--vits", "github_rls_url": "https://coqui.gateway.scarf.sh/v0.14.0_models/tts_models--en--jenny--jenny.zip", "default_vocoder": null, @@ -263,8 +276,8 @@ "contact": "egolge@coqui.com" } }, - "css10":{ - "vits":{ + "css10": { + "vits": { "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--es--css10--vits.zip", "default_vocoder": null, "commit": null, @@ -284,8 +297,8 @@ "contact": "egolge@coqui.com" } }, - "css10":{ - "vits":{ + "css10": { + "vits": { "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--fr--css10--vits.zip", "default_vocoder": null, "commit": null, @@ -294,17 +307,17 @@ } } }, - "uk":{ + "uk": { "mai": { "glow-tts": { "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--uk--mai--glow-tts.zip", - "author":"@robinhad", + "author": "@robinhad", "commit": "bdab788d", "license": "MIT", "contact": "", "default_vocoder": "vocoder_models/uk/mai/multiband-melgan" }, - "vits":{ + "vits": { "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--uk--mai--vits.zip", "default_vocoder": null, "commit": null, @@ -335,8 +348,8 @@ "commit": "540d811" } }, - "css10":{ - "vits":{ + "css10": { + "vits": { "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--nl--css10--vits.zip", "default_vocoder": null, "commit": null, @@ -371,7 +384,7 @@ } }, "css10": { - "vits-neon":{ + "vits-neon": { "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--de--css10--vits.zip", "default_vocoder": null, "author": "@NeonGeckoCom", @@ -392,9 +405,9 @@ } } }, - "tr":{ + "tr": { "common-voice": { - "glow-tts":{ + "glow-tts": { "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--tr--common-voice--glow-tts.zip", "default_vocoder": "vocoder_models/tr/common-voice/hifigan", "license": "MIT", @@ -406,7 +419,7 @@ }, "it": { "mai_female": { - 
"glow-tts":{ + "glow-tts": { "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_female--glow-tts.zip", "default_vocoder": null, "description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.", @@ -414,7 +427,7 @@ "license": "apache 2.0", "commit": null }, - "vits":{ + "vits": { "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_female--vits.zip", "default_vocoder": null, "description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.", @@ -424,7 +437,7 @@ } }, "mai_male": { - "glow-tts":{ + "glow-tts": { "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_male--glow-tts.zip", "default_vocoder": null, "description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.", @@ -432,7 +445,7 @@ "license": "apache 2.0", "commit": null }, - "vits":{ + "vits": { "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_male--vits.zip", "default_vocoder": null, "description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.", @@ -444,7 +457,7 @@ }, "ewe": { "openbible": { - "vits":{ + "vits": { "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--ewe--openbible--vits.zip", "default_vocoder": null, "license": "CC-BY-SA 4.0", @@ -456,7 +469,7 @@ }, "hau": { "openbible": { - "vits":{ + "vits": { "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--hau--openbible--vits.zip", "default_vocoder": null, "license": "CC-BY-SA 4.0", @@ -468,7 +481,7 @@ }, "lin": { "openbible": { - "vits":{ + "vits": { "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--lin--openbible--vits.zip", "default_vocoder": null, "license": "CC-BY-SA 4.0", @@ -480,7 +493,7 @@ }, "tw_akuapem": { "openbible": { - "vits":{ + "vits": { "github_rls_url": 
"https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--tw_akuapem--openbible--vits.zip", "default_vocoder": null, "license": "CC-BY-SA 4.0", @@ -492,7 +505,7 @@ }, "tw_asante": { "openbible": { - "vits":{ + "vits": { "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--tw_asante--openbible--vits.zip", "default_vocoder": null, "license": "CC-BY-SA 4.0", @@ -504,7 +517,7 @@ }, "yor": { "openbible": { - "vits":{ + "vits": { "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--yor--openbible--vits.zip", "default_vocoder": null, "license": "CC-BY-SA 4.0", @@ -538,7 +551,7 @@ }, "fi": { "css10": { - "vits":{ + "vits": { "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--fi--css10--vits.zip", "default_vocoder": null, "commit": null, @@ -549,7 +562,7 @@ }, "hr": { "cv": { - "vits":{ + "vits": { "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--hr--cv--vits.zip", "default_vocoder": null, "commit": null, @@ -560,7 +573,7 @@ }, "lt": { "cv": { - "vits":{ + "vits": { "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--lt--cv--vits.zip", "default_vocoder": null, "commit": null, @@ -571,7 +584,7 @@ }, "lv": { "cv": { - "vits":{ + "vits": { "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--lv--cv--vits.zip", "default_vocoder": null, "commit": null, @@ -582,7 +595,7 @@ }, "mt": { "cv": { - "vits":{ + "vits": { "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--mt--cv--vits.zip", "default_vocoder": null, "commit": null, @@ -593,7 +606,7 @@ }, "pl": { "mai_female": { - "vits":{ + "vits": { "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--pl--mai_female--vits.zip", "default_vocoder": null, "commit": null, @@ -604,7 +617,7 @@ }, "pt": { "cv": { - "vits":{ + "vits": { "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--pt--cv--vits.zip", "default_vocoder": null, 
"commit": null, @@ -615,7 +628,7 @@ }, "ro": { "cv": { - "vits":{ + "vits": { "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--ro--cv--vits.zip", "default_vocoder": null, "commit": null, @@ -626,7 +639,7 @@ }, "sk": { "cv": { - "vits":{ + "vits": { "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--sk--cv--vits.zip", "default_vocoder": null, "commit": null, @@ -637,7 +650,7 @@ }, "sl": { "cv": { - "vits":{ + "vits": { "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--sl--cv--vits.zip", "default_vocoder": null, "commit": null, @@ -648,7 +661,7 @@ }, "sv": { "cv": { - "vits":{ + "vits": { "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--sv--cv--vits.zip", "default_vocoder": null, "commit": null, @@ -659,7 +672,7 @@ }, "ca": { "custom": { - "vits":{ + "vits": { "github_rls_url": "https://coqui.gateway.scarf.sh/v0.10.1_models/tts_models--ca--custom--vits.zip", "default_vocoder": null, "commit": null, @@ -669,8 +682,8 @@ } } }, - "fa":{ - "custom":{ + "fa": { + "custom": { "glow-tts": { "github_rls_url": "https://coqui.gateway.scarf.sh/v0.10.1_models/tts_models--fa--custom--glow-tts.zip", "default_vocoder": null, @@ -681,18 +694,18 @@ } } }, - "bn":{ - "custom":{ - "vits-male":{ - "github_rls_url":"https://coqui.gateway.scarf.sh/v0.13.3_models/tts_models--bn--custom--vits_male.zip", + "bn": { + "custom": { + "vits-male": { + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.13.3_models/tts_models--bn--custom--vits_male.zip", "default_vocoder": null, "commit": null, "description": "Single speaker Bangla male model. 
For more information -> https://github.com/mobassir94/comprehensive-bangla-tts", "author": "@mobassir94", "license": "Apache 2.0" }, - "vits-female":{ - "github_rls_url":"https://coqui.gateway.scarf.sh/v0.13.3_models/tts_models--bn--custom--vits_female.zip", + "vits-female": { + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.13.3_models/tts_models--bn--custom--vits_female.zip", "default_vocoder": null, "commit": null, "description": "Single speaker Bangla female model. For more information -> https://github.com/mobassir94/comprehensive-bangla-tts", @@ -834,16 +847,16 @@ "mai": { "multiband-melgan": { "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--uk--mai--multiband-melgan.zip", - "author":"@robinhad", + "author": "@robinhad", "commit": "bdab788d", "license": "MIT", "contact": "" } } }, - "tr":{ + "tr": { "common-voice": { - "hifigan":{ + "hifigan": { "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--tr--common-voice--hifigan.zip", "description": "HifiGAN model using an unknown speaker from the Common-Voice dataset.", "author": "Fatih Akademi", @@ -853,10 +866,10 @@ } } }, - "voice_conversion_models":{ - "multilingual":{ - "vctk":{ - "freevc24":{ + "voice_conversion_models": { + "multilingual": { + "vctk": { + "freevc24": { "github_rls_url": "https://coqui.gateway.scarf.sh/v0.13.0_models/voice_conversion_models--multilingual--vctk--freevc24.zip", "description": "FreeVC model trained on VCTK dataset from https://github.com/OlaWod/FreeVC", "author": "Jing-Yi Li @OlaWod", @@ -866,4 +879,4 @@ } } } -} +} \ No newline at end of file diff --git a/TTS/api.py b/TTS/api.py index 8bd087f6..190fe6b8 100644 --- a/TTS/api.py +++ b/TTS/api.py @@ -342,7 +342,7 @@ class TTS: def download_model_by_name(self, model_name: str): model_path, config_path, model_item = self.manager.download_model(model_name) - if isinstance(model_item["github_rls_url"], list): + if isinstance(model_item["model_url"], list): # return 
model directory if there are multiple files # we assume that the model knows how to load itself return None, None, None, None, model_path @@ -580,6 +580,8 @@ class TTS: Speed factor to use for 🐸Coqui Studio models, between 0.0 and 2.0. Defaults to None. file_path (str, optional): Output file path. Defaults to "output.wav". + kwargs (dict, optional): + Additional arguments for the model. """ self._check_arguments(speaker=speaker, language=language, speaker_wav=speaker_wav, **kwargs) diff --git a/TTS/tts/configs/bark_config.py b/TTS/tts/configs/bark_config.py index 760776a8..57ccf2d0 100644 --- a/TTS/tts/configs/bark_config.py +++ b/TTS/tts/configs/bark_config.py @@ -5,11 +5,14 @@ from typing import Dict from TTS.tts.configs.shared_configs import BaseTTSConfig from TTS.tts.layers.bark.model import GPTConfig from TTS.tts.layers.bark.model_fine import FineGPTConfig +from TTS.tts.models.bark import BarkAudioConfig from TTS.utils.generic_utils import get_user_data_dir @dataclass class BarkConfig(BaseTTSConfig): + model: str = "bark" + audio: BarkAudioConfig = BarkAudioConfig() num_chars: int = 0 semantic_config: GPTConfig = GPTConfig() fine_config: FineGPTConfig = FineGPTConfig() @@ -31,7 +34,7 @@ class BarkConfig(BaseTTSConfig): COARSE_SEMANTIC_PAD_TOKEN: int = 12_048 COARSE_INFER_TOKEN: int = 12_050 - REMOTE_BASE_URL = "https://dl.suno-models.io/bark/models/v0/" + REMOTE_BASE_URL = "https://huggingface.co/erogol/bark/tree/main/" REMOTE_MODEL_PATHS: Dict = None LOCAL_MODEL_PATHS: Dict = None SMALL_REMOTE_MODEL_PATHS: Dict = None diff --git a/TTS/tts/layers/bark/inference_funcs.py b/TTS/tts/layers/bark/inference_funcs.py index 73c9ee71..6fa87c37 100644 --- a/TTS/tts/layers/bark/inference_funcs.py +++ b/TTS/tts/layers/bark/inference_funcs.py @@ -52,7 +52,7 @@ def load_voice(voice: str, extra_voice_dirs: List[str] = []): return semantic, coarse, fine if voice == "random": - return None, None + return None, None, None voices = get_voices(extra_voice_dirs) try: @@ -183,7 
+183,7 @@ def generate_text_semantic( assert isinstance(text, str) text = _normalize_whitespace(text) assert len(text.strip()) > 0 - if history_prompt is not None or base is not None: + if all(v is not None for v in history_prompt) or base is not None: if history_prompt is not None: semantic_history = history_prompt[0] if base is not None: @@ -327,7 +327,7 @@ def generate_coarse( model.config.COARSE_RATE_HZ / model.config.SEMANTIC_RATE_HZ * model.config.N_COARSE_CODEBOOKS ) max_semantic_history = int(np.floor(max_coarse_history / semantic_to_coarse_ratio)) - if history_prompt is not None or base is not None: + if all(v is not None for v in history_prompt) or base is not None: if history_prompt is not None: x_history = history_prompt x_semantic_history = x_history[0] @@ -477,7 +477,7 @@ def generate_fine( and x_coarse_gen.min() >= 0 and x_coarse_gen.max() <= model.config.CODEBOOK_SIZE - 1 ) - if history_prompt is not None or base is not None: + if all(v is not None for v in history_prompt) or base is not None: if history_prompt is not None: x_fine_history = history_prompt[2] if base is not None: @@ -572,4 +572,4 @@ def codec_decode(fine_tokens, model): emb = model.encodec.quantizer.decode(arr) out = model.encodec.decoder(emb) audio_arr = out.detach().cpu().numpy().squeeze() - save_wav(path="test.wav", wav=audio_arr, sample_rate=model.config.sample_rate) + return audio_arr diff --git a/TTS/tts/layers/bark/model.py b/TTS/tts/layers/bark/model.py index 485e6665..81117b3e 100644 --- a/TTS/tts/layers/bark/model.py +++ b/TTS/tts/layers/bark/model.py @@ -4,6 +4,7 @@ Much of this code is adapted from Andrej Karpathy's NanoGPT """ import math from dataclasses import dataclass +from coqpit import Coqpit import torch import torch.nn as nn @@ -131,7 +132,7 @@ class Block(nn.Module): @dataclass -class GPTConfig: +class GPTConfig(Coqpit): block_size: int = 1024 input_vocab_size: int = 10_048 output_vocab_size: int = 10_048