Fix here and there

Eren Gölge 2023-06-21 11:59:27 +02:00
parent 03c347b7f3
commit 0f8932a6a9
8 changed files with 138 additions and 160 deletions

TTS/tts/layers/bark/hubert/hubert_manager.py

@@ -17,6 +17,7 @@ class HubertManager:
             urllib.request.urlretrieve(download_url, model_path)
             print("Downloaded HuBERT")
             return model_path
+        return None

     @staticmethod
     def make_sure_tokenizer_installed(
@@ -31,3 +32,4 @@ class HubertManager:
             shutil.move(os.path.join(model_dir, model), model_path)
             print("Downloaded tokenizer")
             return model_path
+        return None
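Both hunks give the download helpers an explicit fall-through value: the path is returned only when something was actually fetched, and `None` signals the asset was already in place. A minimal sketch of the resulting calling pattern, using a hypothetical helper name rather than the real `HubertManager` methods:

```python
import os
import urllib.request


def make_sure_asset_installed(download_url: str, model_path: str):
    """Hypothetical stand-in for the HubertManager download helpers."""
    if not os.path.isfile(model_path):
        urllib.request.urlretrieve(download_url, model_path)
        return model_path  # freshly downloaded
    return None  # already installed, nothing was downloaded


if make_sure_asset_installed("https://example.com/hubert.pt", "hubert.pt") is None:
    print("asset was already installed")
```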

TTS/tts/layers/bark/hubert/tokenizer.py

@@ -16,7 +16,7 @@ from torch.serialization import MAP_LOCATION
 class HubertTokenizer(nn.Module):
     def __init__(self, hidden_size=1024, input_size=768, output_size=10000, version=0):
-        super(HubertTokenizer, self).__init__()
+        super().__init__()
         next_size = input_size
         if version == 0:
             self.lstm = nn.LSTM(input_size, hidden_size, 2, batch_first=True)
@@ -181,7 +181,7 @@ def auto_train(data_path, save_path="model.pth", load_model: str = None, save_ep
     epoch = 1
     while 1:
-        for i in range(save_epochs):
+        for _ in range(save_epochs):
             j = 0
             for x, y in zip(data_x, data_y):
                 model_training.train_step(
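Both edits here are stylistic: the zero-argument `super()` form resolves the class and instance automatically in Python 3, and `_` marks the loop counter as intentionally unused. A standalone illustration of the `super()` equivalence:

```python
from torch import nn


class Child(nn.Module):
    def __init__(self):
        # same as super(Child, self).__init__() in Python 3,
        # but stays correct if the class is ever renamed
        super().__init__()
```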

TTS/tts/layers/bark/inference_funcs.py

@@ -16,7 +16,7 @@ from torch.nn import functional as F
 from TTS.tts.layers.bark.hubert.hubert_manager import HubertManager
 from TTS.tts.layers.bark.hubert.kmeans_hubert import CustomHubert
 from TTS.tts.layers.bark.hubert.tokenizer import HubertTokenizer
-from TTS.tts.layers.bark.load_model import _clear_cuda_cache, _inference_mode
+from TTS.tts.layers.bark.load_model import clear_cuda_cache, inference_mode

 logger = logging.getLogger(__name__)

@@ -34,34 +34,53 @@ def _normalize_whitespace(text):
 def get_voices(extra_voice_dirs: List[str] = []):
-    voices = {}
-    for dir in extra_voice_dirs:
-        paths = list(glob(f"{dir}/*.npz"))
-        for path in paths:
-            name = os.path.basename(path).replace(".npz", "")
-            voices[name] = path
+    dirs = extra_voice_dirs
+    voices: Dict[str, List[str]] = {}
+    for d in dirs:
+        subs = os.listdir(d)
+        for sub in subs:
+            subj = os.path.join(d, sub)
+            if os.path.isdir(subj):
+                voices[sub] = list(glob(f"{subj}/*.npz"))
+                # fetch audio files if no npz files are found
+                if len(voices[sub]) == 0:
+                    voices[sub] = list(glob(f"{subj}/*.wav")) + list(glob(f"{subj}/*.mp3"))
     return voices


-def load_voice(voice: str, extra_voice_dirs: List[str] = []):
-    def load_npz(npz_file):
-        x_history = np.load(npz_file)
-        semantic = x_history["semantic_prompt"]
-        coarse = x_history["coarse_prompt"]
-        fine = x_history["fine_prompt"]
-        return semantic, coarse, fine
+def load_npz(npz_file):
+    x_history = np.load(npz_file)
+    semantic = x_history["semantic_prompt"]
+    coarse = x_history["coarse_prompt"]
+    fine = x_history["fine_prompt"]
+    return semantic, coarse, fine
+
+
+def load_voice(model, voice: str, extra_voice_dirs: List[str] = []):  # pylint: disable=dangerous-default-value
     if voice == "random":
         return None, None, None

     voices = get_voices(extra_voice_dirs)
-    try:
-        path = voices[voice]
-    except KeyError:
-        raise KeyError(f"Voice {voice} not found in {extra_voice_dirs}")
-    prompt = load_npz(path)
-    return prompt
+    try:
+        paths = voices[voice]
+    except KeyError as e:
+        raise KeyError(f"Voice {voice} not found in {extra_voice_dirs}") from e
+
+    # bark only uses a single sample for cloning
+    if len(paths) > 1:
+        raise ValueError(f"Voice {voice} has multiple paths: {paths}")
+
+    if len(paths) == 1 and paths[0].endswith(".npz"):
+        return load_npz(paths[0])
+    else:
+        audio_path = paths[0]
+        # replace the file extension with .npz
+        output_path = os.path.splitext(audio_path)[0] + ".npz"
+        generate_voice(audio=audio_path, model=model, output_path=output_path)
+        return load_voice(model, voice, extra_voice_dirs)


 def zero_crossing_rate(audio, frame_length=1024, hop_length=512):
     zero_crossings = np.sum(np.abs(np.diff(np.sign(audio))) / 2)
@@ -85,7 +104,6 @@ def compute_average_bass_energy(audio_data, sample_rate, max_bass_freq=250):
 def generate_voice(
     audio,
-    text,
     model,
     output_path,
 ):
@@ -106,9 +124,6 @@ def generate_voice(
     encoded_frames = model.encodec.encode(audio)
     codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1).squeeze()  # [n_q, T]

-    # get seconds of audio
-    seconds = audio.shape[-1] / model.config.sample_rate
-
     # move codes to cpu
     codes = codes.cpu().numpy()
@@ -133,36 +148,6 @@ def generate_voice(
     np.savez(output_path, fine_prompt=codes, coarse_prompt=codes[:2, :], semantic_prompt=semantic_tokens)

-    # while attempts < max_attempts:
-    #     if attempts > 0 and base is not None:
-    #         # Reset the base model token
-    #         print(f"Reset the base model token Regenerating...")
-    #         base = None
-    #     audio_array, x = model.generate_audio(text, history_promp=None, base=base, **kwargs)
-    #     zcr = zero_crossing_rate(audio_array)
-    #     spectral_contrast = compute_spectral_contrast(audio_array, model.config.sample_rate)
-    #     bass_energy = compute_average_bass_energy(audio_array, model.config.sample_rate)
-    #     print(f"Attempt {attempts + 1}: ZCR = {zcr}, Spectral Contrast = {spectral_contrast:.2f}, Bass Energy = {bass_energy:.2f}")
-    #     # Save the audio array to the output_array directory with a random name for debugging
-    #     # output_file = os.path.join(output_directory, f"audio_{zcr:.2f}_sc{spectral_contrast:.2f}_be{bass_energy:.2f}.wav")
-    #     # wavfile.write(output_file, sample_rate, audio_array)
-    #     # print(f"Saved audio array to {output_file}")
-    #     if zcr < zcr_threshold and spectral_contrast < spectral_threshold and bass_energy < bass_energy_threshold:
-    #         print(f"Audio passed ZCR, Spectral Contrast, and Bass Energy thresholds. No need to regenerate.")
-    #         break
-    #     else:
-    #         print(f"Audio failed ZCR, Spectral Contrast, and/or Bass Energy thresholds. Regenerating...")
-    #         attempts += 1
-    # if attempts == max_attempts:
-    #     print("Reached maximum attempts. Returning the last generated audio.")
-    # return audio_array, x, zcr, spectral_contrast, bass_energy


 def generate_text_semantic(
     text,
@@ -224,7 +209,7 @@ def generate_text_semantic(
         np.hstack([encoded_text, semantic_history, np.array([model.config.SEMANTIC_INFER_TOKEN])]).astype(np.int64)
     )[None]
     assert x.shape[1] == 256 + 256 + 1
-    with _inference_mode():
+    with inference_mode():
         x = x.to(model.device)
         n_tot_steps = 768
         # custom tqdm updates since we don't know when eos will occur
@@ -285,8 +270,8 @@ def generate_text_semantic(
                 pbar_state = req_pbar_state
     pbar.close()
     out = x.detach().cpu().numpy().squeeze()[256 + 256 + 1 :]
-    assert all(0 <= out) and all(out < model.config.SEMANTIC_VOCAB_SIZE)
-    _clear_cuda_cache()
+    assert all(out >= 0) and all(out < model.config.SEMANTIC_VOCAB_SIZE)
+    clear_cuda_cache()
     return out
@@ -382,7 +367,7 @@ def generate_coarse(
     x_semantic = np.hstack([x_semantic_history, x_semantic]).astype(np.int32)
     x_coarse = x_coarse_history.astype(np.int32)
     base_semantic_idx = len(x_semantic_history)
-    with _inference_mode():
+    with inference_mode():
         x_semantic_in = torch.from_numpy(x_semantic)[None].to(model.device)
         x_coarse_in = torch.from_numpy(x_coarse)[None].to(model.device)
         n_window_steps = int(np.ceil(n_steps / sliding_window_len))
@@ -456,7 +441,7 @@ def generate_coarse(
     )
     for n in range(1, model.config.N_COARSE_CODEBOOKS):
         gen_coarse_audio_arr[n, :] -= n * model.config.CODEBOOK_SIZE
-    _clear_cuda_cache()
+    clear_cuda_cache()
     return gen_coarse_audio_arr
@@ -526,7 +511,7 @@ def generate_fine(
     )
     # we can be lazy about fractional loop and just keep overwriting codebooks
     n_loops = np.max([0, int(np.ceil((x_coarse_gen.shape[1] - (1024 - n_history)) / 512))]) + 1
-    with _inference_mode():
+    with inference_mode():
         in_arr = torch.tensor(in_arr.T).to(model.device)
         for n in tqdm.tqdm(range(n_loops), disable=silent):
             start_idx = np.min([n * 512, in_arr.shape[0] - 1024])
@@ -558,14 +543,12 @@ def generate_fine(
     if n_remove_from_end > 0:
         gen_fine_arr = gen_fine_arr[:, :-n_remove_from_end]
     assert gen_fine_arr.shape[-1] == x_coarse_gen.shape[-1]
-    _clear_cuda_cache()
+    clear_cuda_cache()
     return gen_fine_arr


 def codec_decode(fine_tokens, model):
     """Turn quantized audio codes into audio array using encodec."""
-    from TTS.utils.audio.numpy_transforms import save_wav
-
     arr = torch.from_numpy(fine_tokens)[None]
     arr = arr.to(model.device)
     arr = arr.transpose(0, 1)
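Taken together, this file's changes turn a voice from a single `.npz` file into a directory of samples: `get_voices` maps each subdirectory name to its `.npz` (or, failing that, `.wav`/`.mp3`) files, and `load_voice` builds the `.npz` prompt cache on demand by calling `generate_voice` on the first audio sample. A sketch of the intended flow, assuming a loaded Bark `model` and the module path implied by the imports above:

```python
# hypothetical layout:
#   my_voices/
#     speaker1/
#       sample.wav        # converted to sample.npz on first load
from TTS.tts.layers.bark.inference_funcs import get_voices, load_voice

voices = get_voices(extra_voice_dirs=["my_voices"])
print(voices)  # {'speaker1': ['my_voices/speaker1/sample.wav']}

# `model` is a loaded Bark instance (loading elided); the first call writes
# sample.npz via generate_voice(), then reloads it as the cloning prompt
semantic, coarse, fine = load_voice(model, "speaker1", extra_voice_dirs=["my_voices"])
```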

TTS/tts/layers/bark/load_model.py

@@ -1,17 +1,12 @@
 import contextlib
-# import funcy
 import functools
 import hashlib
 import logging
 import os
-import re

 import requests
 import torch
 import tqdm
-from encodec import EncodecModel
-from transformers import BertTokenizer

 from TTS.tts.layers.bark.model import GPT, GPTConfig
 from TTS.tts.layers.bark.model_fine import FineGPT, FineGPTConfig
@@ -31,8 +26,6 @@ else:
 # hold models in global scope to lazy load
-global models
-models = {}

 logger = logging.getLogger(__name__)
@@ -44,10 +37,10 @@ if not hasattr(torch.nn.functional, "scaled_dot_product_attention"):
     )


-def _string_md5(s):
-    m = hashlib.md5()
-    m.update(s.encode("utf-8"))
-    return m.hexdigest()
+# def _string_md5(s):
+#     m = hashlib.md5()
+#     m.update(s.encode("utf-8"))
+#     return m.hexdigest()


 def _md5(fname):
@@ -58,18 +51,18 @@ def _md5(fname):
     return hash_md5.hexdigest()


-def _get_ckpt_path(model_type, CACHE_DIR):
-    model_name = _string_md5(REMOTE_MODEL_PATHS[model_type]["path"])
-    return os.path.join(CACHE_DIR, f"{model_name}.pt")
+# def _get_ckpt_path(model_type, CACHE_DIR):
+#     model_name = _string_md5(REMOTE_MODEL_PATHS[model_type]["path"])
+#     return os.path.join(CACHE_DIR, f"{model_name}.pt")


-S3_BUCKET_PATH_RE = r"s3\:\/\/(.+?)\/"
+# S3_BUCKET_PATH_RE = r"s3\:\/\/(.+?)\/"


-def _parse_s3_filepath(s3_filepath):
-    bucket_name = re.search(S3_BUCKET_PATH_RE, s3_filepath).group(1)
-    rel_s3_filepath = re.sub(S3_BUCKET_PATH_RE, "", s3_filepath)
-    return bucket_name, rel_s3_filepath
+# def _parse_s3_filepath(s3_filepath):
+#     bucket_name = re.search(S3_BUCKET_PATH_RE, s3_filepath).group(1)
+#     rel_s3_filepath = re.sub(S3_BUCKET_PATH_RE, "", s3_filepath)
+#     return bucket_name, rel_s3_filepath


 def _download(from_s3_path, to_local_path, CACHE_DIR):
@@ -83,7 +76,7 @@ def _download(from_s3_path, to_local_path, CACHE_DIR):
             progress_bar.update(len(data))
             file.write(data)
     progress_bar.close()
-    if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes:
+    if total_size_in_bytes not in [0, progress_bar.n]:
         raise ValueError("ERROR, something went wrong")
@@ -107,27 +100,27 @@ if torch.cuda.is_available():
 @contextlib.contextmanager
-def _inference_mode():
+def inference_mode():
     with InferenceContext(), torch.inference_mode(), torch.no_grad(), autocast():
         yield


-def _clear_cuda_cache():
+def clear_cuda_cache():
     if torch.cuda.is_available():
         torch.cuda.empty_cache()
         torch.cuda.synchronize()


-def clean_models(model_key=None):
-    global models
-    model_keys = [model_key] if model_key is not None else models.keys()
-    for k in model_keys:
-        if k in models:
-            del models[k]
-            _clear_cuda_cache()
+# def clean_models(model_key=None):
+#     global models
+#     model_keys = [model_key] if model_key is not None else models.keys()
+#     for k in model_keys:
+#         if k in models:
+#             del models[k]
+#             clear_cuda_cache()


-def _load_model(ckpt_path, device, config, model_type="text"):
+def load_model(ckpt_path, device, config, model_type="text"):
     logger.info(f"loading {model_type} model from {ckpt_path}...")

     if device == "cpu":
@@ -174,13 +167,13 @@ def _load_model(ckpt_path, device, config, model_type="text"):
     state_dict = checkpoint["model"]
     # fixup checkpoint
     unwanted_prefix = "_orig_mod."
-    for k, v in list(state_dict.items()):
+    for k, _ in list(state_dict.items()):
         if k.startswith(unwanted_prefix):
             state_dict[k[len(unwanted_prefix) :]] = state_dict.pop(k)
     extra_keys = set(state_dict.keys()) - set(model.state_dict().keys())
-    extra_keys = set([k for k in extra_keys if not k.endswith(".attn.bias")])
+    extra_keys = set(k for k in extra_keys if not k.endswith(".attn.bias"))
     missing_keys = set(model.state_dict().keys()) - set(state_dict.keys())
-    missing_keys = set([k for k in missing_keys if not k.endswith(".attn.bias")])
+    missing_keys = set(k for k in missing_keys if not k.endswith(".attn.bias"))
     if len(extra_keys) != 0:
         raise ValueError(f"extra keys found: {extra_keys}")
     if len(missing_keys) != 0:
@@ -192,63 +185,63 @@ def _load_model(ckpt_path, device, config, model_type="text"):
     model.eval()
     model.to(device)
     del checkpoint, state_dict
-    _clear_cuda_cache()
+    clear_cuda_cache()
     return model, config


-def _load_codec_model(device):
-    model = EncodecModel.encodec_model_24khz()
-    model.set_target_bandwidth(6.0)
-    model.eval()
-    model.to(device)
-    _clear_cuda_cache()
-    return model
+# def _load_codec_model(device):
+#     model = EncodecModel.encodec_model_24khz()
+#     model.set_target_bandwidth(6.0)
+#     model.eval()
+#     model.to(device)
+#     clear_cuda_cache()
+#     return model


-def load_model(ckpt_path=None, use_gpu=True, force_reload=False, model_type="text"):
-    _load_model_f = functools.partial(_load_model, model_type=model_type)
-    if model_type not in ("text", "coarse", "fine"):
-        raise NotImplementedError()
-    global models
-    if torch.cuda.device_count() == 0 or not use_gpu:
-        device = "cpu"
-    else:
-        device = "cuda"
-    model_key = str(device) + f"__{model_type}"
-    if model_key not in models or force_reload:
-        if ckpt_path is None:
-            ckpt_path = _get_ckpt_path(model_type)
-        clean_models(model_key=model_key)
-        model = _load_model_f(ckpt_path, device)
-        models[model_key] = model
-    return models[model_key]
+# def load_model(ckpt_path=None, use_gpu=True, force_reload=False, model_type="text"):
+#     _load_model_f = functools.partial(_load_model, model_type=model_type)
+#     if model_type not in ("text", "coarse", "fine"):
+#         raise NotImplementedError()
+#     global models
+#     if torch.cuda.device_count() == 0 or not use_gpu:
+#         device = "cpu"
+#     else:
+#         device = "cuda"
+#     model_key = str(device) + f"__{model_type}"
+#     if model_key not in models or force_reload:
+#         if ckpt_path is None:
+#             ckpt_path = _get_ckpt_path(model_type)
+#         clean_models(model_key=model_key)
+#         model = _load_model_f(ckpt_path, device)
+#         models[model_key] = model
+#     return models[model_key]


-def load_codec_model(use_gpu=True, force_reload=False):
-    global models
-    if torch.cuda.device_count() == 0 or not use_gpu:
-        device = "cpu"
-    else:
-        device = "cuda"
-    model_key = str(device) + f"__codec"
-    if model_key not in models or force_reload:
-        clean_models(model_key=model_key)
-        model = _load_codec_model(device)
-        models[model_key] = model
-    return models[model_key]
+# def load_codec_model(use_gpu=True, force_reload=False):
+#     global models
+#     if torch.cuda.device_count() == 0 or not use_gpu:
+#         device = "cpu"
+#     else:
+#         device = "cuda"
+#     model_key = str(device) + f"__codec"
+#     if model_key not in models or force_reload:
+#         clean_models(model_key=model_key)
+#         model = _load_codec_model(device)
+#         models[model_key] = model
+#     return models[model_key]


-def preload_models(
-    text_ckpt_path=None, coarse_ckpt_path=None, fine_ckpt_path=None, use_gpu=True, use_smaller_models=False
-):
-    global USE_SMALLER_MODELS
-    global REMOTE_MODEL_PATHS
-    if use_smaller_models:
-        USE_SMALLER_MODELS = True
-        logger.info("Using smaller models generation.py")
-        REMOTE_MODEL_PATHS = SMALL_REMOTE_MODEL_PATHS
-
-    _ = load_model(ckpt_path=text_ckpt_path, model_type="text", use_gpu=use_gpu, force_reload=True)
-    _ = load_model(ckpt_path=coarse_ckpt_path, model_type="coarse", use_gpu=use_gpu, force_reload=True)
-    _ = load_model(ckpt_path=fine_ckpt_path, model_type="fine", use_gpu=use_gpu, force_reload=True)
-    _ = load_codec_model(use_gpu=use_gpu, force_reload=True)
+# def preload_models(
+#     text_ckpt_path=None, coarse_ckpt_path=None, fine_ckpt_path=None, use_gpu=True, use_smaller_models=False
+# ):
+#     global USE_SMALLER_MODELS
+#     global REMOTE_MODEL_PATHS
+#     if use_smaller_models:
+#         USE_SMALLER_MODELS = True
+#         logger.info("Using smaller models generation.py")
+#         REMOTE_MODEL_PATHS = SMALL_REMOTE_MODEL_PATHS
+#
+#     _ = load_model(ckpt_path=text_ckpt_path, model_type="text", use_gpu=use_gpu, force_reload=True)
+#     _ = load_model(ckpt_path=coarse_ckpt_path, model_type="coarse", use_gpu=use_gpu, force_reload=True)
+#     _ = load_model(ckpt_path=fine_ckpt_path, model_type="fine", use_gpu=use_gpu, force_reload=True)
+#     _ = load_codec_model(use_gpu=use_gpu, force_reload=True)
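With the leading underscores dropped, `inference_mode` and `clear_cuda_cache` become the public helpers that `inference_funcs.py` imports above. A minimal usage sketch; `run_generation_step` is a hypothetical stand-in for any bark inference call:

```python
from TTS.tts.layers.bark.load_model import clear_cuda_cache, inference_mode

with inference_mode():  # stacks InferenceContext, torch.inference_mode, no_grad, and autocast
    tokens = run_generation_step()  # hypothetical inference call
clear_cuda_cache()  # empty_cache() + synchronize() when CUDA is available
```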

TTS/tts/layers/bark/model.py

@@ -6,8 +6,8 @@ import math
 from dataclasses import dataclass

 import torch
-import torch.nn as nn
 from coqpit import Coqpit
+from torch import nn
 from torch.nn import functional as F

@@ -19,8 +19,8 @@ class LayerNorm(nn.Module):
         self.weight = nn.Parameter(torch.ones(ndim))
         self.bias = nn.Parameter(torch.zeros(ndim)) if bias else None

-    def forward(self, input):
-        return F.layer_norm(input, self.weight.shape, self.weight, self.bias, 1e-5)
+    def forward(self, x):
+        return F.layer_norm(x, self.weight.shape, self.weight, self.bias, 1e-5)


 class CausalSelfAttention(nn.Module):
@@ -177,7 +177,7 @@ class GPT(nn.Module):
     def forward(self, idx, merge_context=False, past_kv=None, position_ids=None, use_cache=False):
         device = idx.device
-        b, t = idx.size()
+        _, t = idx.size()
         if past_kv is not None:
             assert t == 1
             tok_emb = self.transformer.wte(idx)  # token embeddings of shape (b, t, n_embd)
@@ -219,7 +219,7 @@ class GPT(nn.Module):
         new_kv = () if use_cache else None

-        for i, (block, past_layer_kv) in enumerate(zip(self.transformer.h, past_kv)):
+        for _, (block, past_layer_kv) in enumerate(zip(self.transformer.h, past_kv)):
             x, kv = block(x, past_kv=past_layer_kv, use_cache=use_cache)

             if use_cache:
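The only rename with user-visible surface here is `LayerNorm.forward`'s argument, which previously shadowed the `input` builtin; behavior is unchanged. A quick sketch, assuming the constructor signature `__init__(self, ndim, bias)` implied by the hunk:

```python
import torch

from TTS.tts.layers.bark.model import LayerNorm  # module path assumed from this diff

ln = LayerNorm(8, bias=True)  # weight = ones(8), bias = zeros(8)
y = ln(torch.randn(2, 4, 8))  # forward(x) applies F.layer_norm over the last dim
print(y.shape)  # torch.Size([2, 4, 8])
```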

TTS/tts/layers/bark/model_fine.py

@@ -6,7 +6,7 @@ import math
 from dataclasses import dataclass

 import torch
-import torch.nn as nn
+from torch import nn
 from torch.nn import functional as F

 from .model import GPT, MLP, GPTConfig

TTS/utils/synthesizer.py

@@ -341,7 +341,7 @@ class Synthesizer(object):
         use_gl = self.vocoder_model is None

-        if not reference_wav:
+        if not reference_wav:  # not voice conversion
             for sen in sens:
                 if hasattr(self.tts_model, "synthesize"):
                     sp_name = "random" if speaker_name is None else speaker_name

docs/source/models/tortoise.md

@@ -12,7 +12,7 @@ from TTS.tts.configs.tortoise_config import TortoiseConfig
 from TTS.tts.models.tortoise import Tortoise

 config = TortoiseConfig()
-model = Tortoise.inif_from_config(config)
+model = Tortoise.init_from_config(config)
 model.load_checkpoint(config, checkpoint_dir="paths/to/models_dir/", eval=True)

 # with random speaker
@@ -32,7 +32,7 @@ tts = TTS("tts_models/en/multi-dataset/tortoise-v2")
 # with custom inference settings overriding defaults.
 tts.tts_to_file(text="Hello, my name is Manmay , how are you?",
                 file_path="output.wav",
-                voice_dir="TTS/tts/utils/assets/tortoise/voices/",
+                voice_dir="path/to/tortoise/voices/dir/",
                 speaker="lj",
                 num_autoregressive_samples=1,
                 diffusion_iterations=10)
@@ -40,7 +40,7 @@ tts.tts_to_file(text="Hello, my name is Manmay , how are you?",
 # Using presets with the same voice
 tts.tts_to_file(text="Hello, my name is Manmay , how are you?",
                 file_path="output.wav",
-                voice_dir="TTS/tts/utils/assets/tortoise/voices/",
+                voice_dir="path/to/tortoise/voices/dir/",
                 speaker="lj",
                 preset="ultra_fast")
@@ -55,15 +55,15 @@ Using 🐸TTS Command line:
 # cloning the `lj` voice
 tts --model_name tts_models/en/multi-dataset/tortoise-v2 \
     --text "This is an example." \
-    --out_path "/data/speech_synth/coqui-tts/TTS/tests/outputs/output.wav" \
-    --voice_dir TTS/tts/utils/assets/tortoise/voices/ \
+    --out_path "output.wav" \
+    --voice_dir path/to/tortoise/voices/dir/ \
     --speaker_idx "lj" \
     --progress_bar True

 # Random voice generation
 tts --model_name tts_models/en/multi-dataset/tortoise-v2 \
     --text "This is an example." \
-    --out_path "/data/speech_synth/coqui-tts/TTS/tests/outputs/output.wav" \
+    --out_path "output.wav" \
     --progress_bar True
 ```