From 03c347b7f3f5af29027ad1919c809d4e6cf434c8 Mon Sep 17 00:00:00 2001 From: Eren Gölge Date: Wed, 21 Jun 2023 11:58:18 +0200 Subject: [PATCH] Update Bark Config --- TTS/bin/synthesize.py | 2 +- TTS/tts/configs/bark_config.py | 39 ++++++++++++++++++++++++++++++++-- 2 files changed, 38 insertions(+), 3 deletions(-) diff --git a/TTS/bin/synthesize.py b/TTS/bin/synthesize.py index 8a7e178d..0334c023 100755 --- a/TTS/bin/synthesize.py +++ b/TTS/bin/synthesize.py @@ -356,7 +356,7 @@ If you don't specify any models, then it uses LJSpeech based English model. vc_config_path = config_path # tts model with multiple files to be loaded from the directory path - if isinstance(model_item["github_rls_url"], list): + if isinstance(model_item["model_url"], list): model_dir = model_path tts_path = None tts_config_path = None diff --git a/TTS/tts/configs/bark_config.py b/TTS/tts/configs/bark_config.py index 57ccf2d0..943f3dea 100644 --- a/TTS/tts/configs/bark_config.py +++ b/TTS/tts/configs/bark_config.py @@ -1,5 +1,5 @@ import os -from dataclasses import dataclass, field +from dataclasses import dataclass from typing import Dict from TTS.tts.configs.shared_configs import BaseTTSConfig @@ -11,6 +11,40 @@ from TTS.utils.generic_utils import get_user_data_dir @dataclass class BarkConfig(BaseTTSConfig): + """ Bark TTS configuration + + Args: + model (str): model name that registers the model. + audio (BarkAudioConfig): audio configuration. Defaults to BarkAudioConfig(). + num_chars (int): number of characters in the alphabet. Defaults to 0. + semantic_config (GPTConfig): semantic configuration. Defaults to GPTConfig(). + fine_config (FineGPTConfig): fine configuration. Defaults to FineGPTConfig(). + coarse_config (GPTConfig): coarse configuration. Defaults to GPTConfig(). + CONTEXT_WINDOW_SIZE (int): GPT context window size. Defaults to 1024. + SEMANTIC_RATE_HZ (float): semantic tokens rate in Hz. Defaults to 49.9. + SEMANTIC_VOCAB_SIZE (int): semantic vocabulary size.
Defaults to 10_000. + CODEBOOK_SIZE (int): encodec codebook size. Defaults to 1024. + N_COARSE_CODEBOOKS (int): number of coarse codebooks. Defaults to 2. + N_FINE_CODEBOOKS (int): number of fine codebooks. Defaults to 8. + COARSE_RATE_HZ (int): coarse tokens rate in Hz. Defaults to 75. + SAMPLE_RATE (int): sample rate. Defaults to 24_000. + USE_SMALLER_MODELS (bool): use smaller models. Defaults to False. + TEXT_ENCODING_OFFSET (int): text encoding offset. Defaults to 10_048. + SEMANTIC_PAD_TOKEN (int): semantic pad token. Defaults to 10_000. + TEXT_PAD_TOKEN (int): text pad token. Defaults to 10_048. + TEXT_EOS_TOKEN (int): text end of sentence token. Defaults to 10_049. + TEXT_SOS_TOKEN (int): text start of sentence token. Defaults to 10_050. + SEMANTIC_INFER_TOKEN (int): semantic infer token. Defaults to 10_051. + COARSE_SEMANTIC_PAD_TOKEN (int): coarse semantic pad token. Defaults to 12_048. + COARSE_INFER_TOKEN (int): coarse infer token. Defaults to 12_050. + REMOTE_BASE_URL (str): remote base url. Defaults to "https://huggingface.co/erogol/bark/tree". + REMOTE_MODEL_PATHS (Dict): remote model paths. Defaults to None. + LOCAL_MODEL_PATHS (Dict): local model paths. Defaults to None. + SMALL_REMOTE_MODEL_PATHS (Dict): small remote model paths. Defaults to None. + CACHE_DIR (str): local cache directory. Defaults to get_user_data_dir("tts/suno/bark_v0"). + DEF_SPEAKER_DIR (str): default speaker directory to store speaker values for voice cloning. Defaults to get_user_data_dir("tts/bark_v0/speakers").
+ """ + model: str = "bark" audio: BarkAudioConfig = BarkAudioConfig() num_chars: int = 0 @@ -39,6 +73,7 @@ class BarkConfig(BaseTTSConfig): LOCAL_MODEL_PATHS: Dict = None SMALL_REMOTE_MODEL_PATHS: Dict = None CACHE_DIR: str = str(get_user_data_dir("tts/suno/bark_v0")) + DEF_SPEAKER_DIR: str = str(get_user_data_dir("tts/bark_v0/speakers")) def __post_init__(self): self.REMOTE_MODEL_PATHS = { @@ -67,4 +102,4 @@ class BarkConfig(BaseTTSConfig): "coarse": {"path": os.path.join(self.REMOTE_BASE_URL, "coarse.pt")}, "fine": {"path": os.path.join(self.REMOTE_BASE_URL, "fine.pt")}, } - self.sample_rate = self.SAMPLE_RATE + self.sample_rate = self.SAMPLE_RATE # pylint: disable=attribute-defined-outside-init