Fix here and there

Eren Gölge 2023-06-21 11:59:27 +02:00
parent 03c347b7f3
commit 0f8932a6a9
8 changed files with 138 additions and 160 deletions

TTS/tts/layers/bark/hubert/hubert_manager.py

@@ -17,6 +17,7 @@ class HubertManager:
             urllib.request.urlretrieve(download_url, model_path)
             print("Downloaded HuBERT")
             return model_path
+        return None

     @staticmethod
     def make_sure_tokenizer_installed(
@@ -31,3 +32,4 @@ class HubertManager:
             shutil.move(os.path.join(model_dir, model), model_path)
             print("Downloaded tokenizer")
             return model_path
+        return None
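Both hunks give the download helpers an explicit fall-through value: the path is returned only when something was actually fetched, and `None` signals the asset was already in place. A minimal sketch of the resulting calling pattern, using a hypothetical helper name rather than the real `HubertManager` methods:

```python
import os
import urllib.request


def make_sure_asset_installed(download_url: str, model_path: str):
    """Hypothetical stand-in for the HubertManager download helpers."""
    if not os.path.isfile(model_path):
        urllib.request.urlretrieve(download_url, model_path)
        return model_path  # freshly downloaded
    return None  # already installed, nothing was downloaded


if make_sure_asset_installed("https://example.com/hubert.pt", "hubert.pt") is None:
    print("asset was already installed")
```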

TTS/tts/layers/bark/hubert/tokenizer.py

@@ -16,7 +16,7 @@ from torch.serialization import MAP_LOCATION
 class HubertTokenizer(nn.Module):
     def __init__(self, hidden_size=1024, input_size=768, output_size=10000, version=0):
-        super(HubertTokenizer, self).__init__()
+        super().__init__()
         next_size = input_size
         if version == 0:
             self.lstm = nn.LSTM(input_size, hidden_size, 2, batch_first=True)
@@ -181,7 +181,7 @@ def auto_train(data_path, save_path="model.pth", load_model: str = None, save_ep
     epoch = 1
     while 1:
-        for i in range(save_epochs):
+        for _ in range(save_epochs):
             j = 0
             for x, y in zip(data_x, data_y):
                 model_training.train_step(
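Both edits here are stylistic: the zero-argument `super()` form resolves the class and instance automatically in Python 3, and `_` marks the loop counter as intentionally unused. A standalone illustration of the `super()` equivalence:

```python
from torch import nn


class Child(nn.Module):
    def __init__(self):
        # same as super(Child, self).__init__() in Python 3,
        # but stays correct if the class is ever renamed
        super().__init__()
```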

TTS/tts/layers/bark/inference_funcs.py

@@ -16,7 +16,7 @@ from torch.nn import functional as F
 from TTS.tts.layers.bark.hubert.hubert_manager import HubertManager
 from TTS.tts.layers.bark.hubert.kmeans_hubert import CustomHubert
 from TTS.tts.layers.bark.hubert.tokenizer import HubertTokenizer
-from TTS.tts.layers.bark.load_model import _clear_cuda_cache, _inference_mode
+from TTS.tts.layers.bark.load_model import clear_cuda_cache, inference_mode

 logger = logging.getLogger(__name__)

@@ -34,34 +34,53 @@ def _normalize_whitespace(text):
 def get_voices(extra_voice_dirs: List[str] = []):
-    voices = {}
-    for dir in extra_voice_dirs:
-        paths = list(glob(f"{dir}/*.npz"))
-        for path in paths:
-            name = os.path.basename(path).replace(".npz", "")
-            voices[name] = path
+    dirs = extra_voice_dirs
+    voices: Dict[str, List[str]] = {}
+    for d in dirs:
+        subs = os.listdir(d)
+        for sub in subs:
+            subj = os.path.join(d, sub)
+            if os.path.isdir(subj):
+                voices[sub] = list(glob(f"{subj}/*.npz"))
+                # fetch audio files if no npz files are found
+                if len(voices[sub]) == 0:
+                    voices[sub] = list(glob(f"{subj}/*.wav")) + list(glob(f"{subj}/*.mp3"))
     return voices


-def load_voice(voice: str, extra_voice_dirs: List[str] = []):
-    def load_npz(npz_file):
-        x_history = np.load(npz_file)
-        semantic = x_history["semantic_prompt"]
-        coarse = x_history["coarse_prompt"]
-        fine = x_history["fine_prompt"]
-        return semantic, coarse, fine
+def load_npz(npz_file):
+    x_history = np.load(npz_file)
+    semantic = x_history["semantic_prompt"]
+    coarse = x_history["coarse_prompt"]
+    fine = x_history["fine_prompt"]
+    return semantic, coarse, fine
+
+
+def load_voice(model, voice: str, extra_voice_dirs: List[str] = []):  # pylint: disable=dangerous-default-value
     if voice == "random":
         return None, None, None

     voices = get_voices(extra_voice_dirs)
-    try:
-        path = voices[voice]
-    except KeyError:
-        raise KeyError(f"Voice {voice} not found in {extra_voice_dirs}")
-    prompt = load_npz(path)
-    return prompt
+    try:
+        paths = voices[voice]
+    except KeyError as e:
+        raise KeyError(f"Voice {voice} not found in {extra_voice_dirs}") from e
+
+    # bark only uses a single sample for cloning
+    if len(paths) > 1:
+        raise ValueError(f"Voice {voice} has multiple paths: {paths}")
+
+    if len(paths) == 1 and paths[0].endswith(".npz"):
+        return load_npz(paths[0])
+    else:
+        audio_path = paths[0]
+        # replace the file extension with .npz
+        output_path = os.path.splitext(audio_path)[0] + ".npz"
+        generate_voice(audio=audio_path, model=model, output_path=output_path)
+        return load_voice(model, voice, extra_voice_dirs)


 def zero_crossing_rate(audio, frame_length=1024, hop_length=512):
     zero_crossings = np.sum(np.abs(np.diff(np.sign(audio))) / 2)
@@ -85,7 +104,6 @@ def compute_average_bass_energy(audio_data, sample_rate, max_bass_freq=250):
 def generate_voice(
     audio,
-    text,
     model,
     output_path,
 ):
@@ -106,9 +124,6 @@ def generate_voice(
     encoded_frames = model.encodec.encode(audio)
     codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1).squeeze()  # [n_q, T]

-    # get seconds of audio
-    seconds = audio.shape[-1] / model.config.sample_rate
-
     # move codes to cpu
     codes = codes.cpu().numpy()
@@ -133,36 +148,6 @@ def generate_voice(
     np.savez(output_path, fine_prompt=codes, coarse_prompt=codes[:2, :], semantic_prompt=semantic_tokens)

-    # while attempts < max_attempts:
-    #     if attempts > 0 and base is not None:
-    #         # Reset the base model token
-    #         print(f"Reset the base model token Regenerating...")
-    #         base = None
-    #     audio_array, x = model.generate_audio(text, history_promp=None, base=base, **kwargs)
-    #     zcr = zero_crossing_rate(audio_array)
-    #     spectral_contrast = compute_spectral_contrast(audio_array, model.config.sample_rate)
-    #     bass_energy = compute_average_bass_energy(audio_array, model.config.sample_rate)
-    #     print(f"Attempt {attempts + 1}: ZCR = {zcr}, Spectral Contrast = {spectral_contrast:.2f}, Bass Energy = {bass_energy:.2f}")
-    #     # Save the audio array to the output_array directory with a random name for debugging
-    #     # output_file = os.path.join(output_directory, f"audio_{zcr:.2f}_sc{spectral_contrast:.2f}_be{bass_energy:.2f}.wav")
-    #     # wavfile.write(output_file, sample_rate, audio_array)
-    #     # print(f"Saved audio array to {output_file}")
-    #     if zcr < zcr_threshold and spectral_contrast < spectral_threshold and bass_energy < bass_energy_threshold:
-    #         print(f"Audio passed ZCR, Spectral Contrast, and Bass Energy thresholds. No need to regenerate.")
-    #         break
-    #     else:
-    #         print(f"Audio failed ZCR, Spectral Contrast, and/or Bass Energy thresholds. Regenerating...")
-    #         attempts += 1
-    # if attempts == max_attempts:
-    #     print("Reached maximum attempts. Returning the last generated audio.")
-    # return audio_array, x, zcr, spectral_contrast, bass_energy


 def generate_text_semantic(
     text,
@@ -224,7 +209,7 @@ def generate_text_semantic(
         np.hstack([encoded_text, semantic_history, np.array([model.config.SEMANTIC_INFER_TOKEN])]).astype(np.int64)
     )[None]
     assert x.shape[1] == 256 + 256 + 1
-    with _inference_mode():
+    with inference_mode():
         x = x.to(model.device)
         n_tot_steps = 768
         # custom tqdm updates since we don't know when eos will occur
@@ -285,8 +270,8 @@ def generate_text_semantic(
                 pbar_state = req_pbar_state
     pbar.close()
     out = x.detach().cpu().numpy().squeeze()[256 + 256 + 1 :]
-    assert all(0 <= out) and all(out < model.config.SEMANTIC_VOCAB_SIZE)
-    _clear_cuda_cache()
+    assert all(out >= 0) and all(out < model.config.SEMANTIC_VOCAB_SIZE)
+    clear_cuda_cache()
     return out
@@ -382,7 +367,7 @@ def generate_coarse(
     x_semantic = np.hstack([x_semantic_history, x_semantic]).astype(np.int32)
     x_coarse = x_coarse_history.astype(np.int32)
     base_semantic_idx = len(x_semantic_history)
-    with _inference_mode():
+    with inference_mode():
         x_semantic_in = torch.from_numpy(x_semantic)[None].to(model.device)
         x_coarse_in = torch.from_numpy(x_coarse)[None].to(model.device)
         n_window_steps = int(np.ceil(n_steps / sliding_window_len))
@@ -456,7 +441,7 @@ def generate_coarse(
     )
     for n in range(1, model.config.N_COARSE_CODEBOOKS):
         gen_coarse_audio_arr[n, :] -= n * model.config.CODEBOOK_SIZE
-    _clear_cuda_cache()
+    clear_cuda_cache()
     return gen_coarse_audio_arr
@@ -526,7 +511,7 @@ def generate_fine(
     )
     # we can be lazy about fractional loop and just keep overwriting codebooks
     n_loops = np.max([0, int(np.ceil((x_coarse_gen.shape[1] - (1024 - n_history)) / 512))]) + 1
-    with _inference_mode():
+    with inference_mode():
         in_arr = torch.tensor(in_arr.T).to(model.device)
         for n in tqdm.tqdm(range(n_loops), disable=silent):
             start_idx = np.min([n * 512, in_arr.shape[0] - 1024])
@@ -558,14 +543,12 @@ def generate_fine(
     if n_remove_from_end > 0:
         gen_fine_arr = gen_fine_arr[:, :-n_remove_from_end]
     assert gen_fine_arr.shape[-1] == x_coarse_gen.shape[-1]
-    _clear_cuda_cache()
+    clear_cuda_cache()
     return gen_fine_arr


 def codec_decode(fine_tokens, model):
     """Turn quantized audio codes into audio array using encodec."""
-    from TTS.utils.audio.numpy_transforms import save_wav
-
     arr = torch.from_numpy(fine_tokens)[None]
     arr = arr.to(model.device)
     arr = arr.transpose(0, 1)
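Taken together, this file's changes turn a voice from a single `.npz` file into a directory of samples: `get_voices` maps each subdirectory name to its `.npz` (or, failing that, `.wav`/`.mp3`) files, and `load_voice` builds the `.npz` prompt cache on demand by calling `generate_voice` on the first audio sample. A sketch of the intended flow, assuming a loaded Bark `model` and the module path implied by the imports above:

```python
# hypothetical layout:
#   my_voices/
#     speaker1/
#       sample.wav        # converted to sample.npz on first load
from TTS.tts.layers.bark.inference_funcs import get_voices, load_voice

voices = get_voices(extra_voice_dirs=["my_voices"])
print(voices)  # {'speaker1': ['my_voices/speaker1/sample.wav']}

# `model` is a loaded Bark instance (loading elided); the first call writes
# sample.npz via generate_voice(), then reloads it as the cloning prompt
semantic, coarse, fine = load_voice(model, "speaker1", extra_voice_dirs=["my_voices"])
```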

TTS/tts/layers/bark/load_model.py

@@ -1,17 +1,12 @@
 import contextlib
-# import funcy
 import functools
 import hashlib
 import logging
 import os
-import re

 import requests
 import torch
 import tqdm
-from encodec import EncodecModel
-from transformers import BertTokenizer

 from TTS.tts.layers.bark.model import GPT, GPTConfig
 from TTS.tts.layers.bark.model_fine import FineGPT, FineGPTConfig
@@ -31,8 +26,6 @@ else:
 # hold models in global scope to lazy load
-global models
-models = {}

 logger = logging.getLogger(__name__)
@@ -44,10 +37,10 @@ if not hasattr(torch.nn.functional, "scaled_dot_product_attention"):
     )


-def _string_md5(s):
-    m = hashlib.md5()
-    m.update(s.encode("utf-8"))
-    return m.hexdigest()
+# def _string_md5(s):
+#     m = hashlib.md5()
+#     m.update(s.encode("utf-8"))
+#     return m.hexdigest()


 def _md5(fname):
@@ -58,18 +51,18 @@ def _md5(fname):
     return hash_md5.hexdigest()


-def _get_ckpt_path(model_type, CACHE_DIR):
-    model_name = _string_md5(REMOTE_MODEL_PATHS[model_type]["path"])
-    return os.path.join(CACHE_DIR, f"{model_name}.pt")
+# def _get_ckpt_path(model_type, CACHE_DIR):
+#     model_name = _string_md5(REMOTE_MODEL_PATHS[model_type]["path"])
+#     return os.path.join(CACHE_DIR, f"{model_name}.pt")


-S3_BUCKET_PATH_RE = r"s3\:\/\/(.+?)\/"
+# S3_BUCKET_PATH_RE = r"s3\:\/\/(.+?)\/"


-def _parse_s3_filepath(s3_filepath):
-    bucket_name = re.search(S3_BUCKET_PATH_RE, s3_filepath).group(1)
-    rel_s3_filepath = re.sub(S3_BUCKET_PATH_RE, "", s3_filepath)
-    return bucket_name, rel_s3_filepath
+# def _parse_s3_filepath(s3_filepath):
+#     bucket_name = re.search(S3_BUCKET_PATH_RE, s3_filepath).group(1)
+#     rel_s3_filepath = re.sub(S3_BUCKET_PATH_RE, "", s3_filepath)
+#     return bucket_name, rel_s3_filepath


 def _download(from_s3_path, to_local_path, CACHE_DIR):
@@ -83,7 +76,7 @@ def _download(from_s3_path, to_local_path, CACHE_DIR):
             progress_bar.update(len(data))
             file.write(data)
     progress_bar.close()
-    if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes:
+    if total_size_in_bytes not in [0, progress_bar.n]:
         raise ValueError("ERROR, something went wrong")
@@ -107,27 +100,27 @@ if torch.cuda.is_available():
 @contextlib.contextmanager
-def _inference_mode():
+def inference_mode():
     with InferenceContext(), torch.inference_mode(), torch.no_grad(), autocast():
         yield


-def _clear_cuda_cache():
+def clear_cuda_cache():
     if torch.cuda.is_available():
         torch.cuda.empty_cache()
         torch.cuda.synchronize()


-def clean_models(model_key=None):
-    global models
-    model_keys = [model_key] if model_key is not None else models.keys()
-    for k in model_keys:
-        if k in models:
-            del models[k]
-            _clear_cuda_cache()
+# def clean_models(model_key=None):
+#     global models
+#     model_keys = [model_key] if model_key is not None else models.keys()
+#     for k in model_keys:
+#         if k in models:
+#             del models[k]
+#             clear_cuda_cache()


-def _load_model(ckpt_path, device, config, model_type="text"):
+def load_model(ckpt_path, device, config, model_type="text"):
     logger.info(f"loading {model_type} model from {ckpt_path}...")

     if device == "cpu":
@@ -174,13 +167,13 @@ def _load_model(ckpt_path, device, config, model_type="text"):
     state_dict = checkpoint["model"]
     # fixup checkpoint
     unwanted_prefix = "_orig_mod."
-    for k, v in list(state_dict.items()):
+    for k, _ in list(state_dict.items()):
         if k.startswith(unwanted_prefix):
             state_dict[k[len(unwanted_prefix) :]] = state_dict.pop(k)
     extra_keys = set(state_dict.keys()) - set(model.state_dict().keys())
-    extra_keys = set([k for k in extra_keys if not k.endswith(".attn.bias")])
+    extra_keys = set(k for k in extra_keys if not k.endswith(".attn.bias"))
     missing_keys = set(model.state_dict().keys()) - set(state_dict.keys())
-    missing_keys = set([k for k in missing_keys if not k.endswith(".attn.bias")])
+    missing_keys = set(k for k in missing_keys if not k.endswith(".attn.bias"))
     if len(extra_keys) != 0:
         raise ValueError(f"extra keys found: {extra_keys}")
     if len(missing_keys) != 0:
@@ -192,63 +185,63 @@ def _load_model(ckpt_path, device, config, model_type="text"):
     model.eval()
     model.to(device)
     del checkpoint, state_dict
-    _clear_cuda_cache()
+    clear_cuda_cache()
     return model, config


-def _load_codec_model(device):
-    model = EncodecModel.encodec_model_24khz()
-    model.set_target_bandwidth(6.0)
-    model.eval()
-    model.to(device)
-    _clear_cuda_cache()
-    return model
+# def _load_codec_model(device):
+#     model = EncodecModel.encodec_model_24khz()
+#     model.set_target_bandwidth(6.0)
+#     model.eval()
+#     model.to(device)
+#     clear_cuda_cache()
+#     return model


-def load_model(ckpt_path=None, use_gpu=True, force_reload=False, model_type="text"):
-    _load_model_f = functools.partial(_load_model, model_type=model_type)
-    if model_type not in ("text", "coarse", "fine"):
-        raise NotImplementedError()
-    global models
-    if torch.cuda.device_count() == 0 or not use_gpu:
-        device = "cpu"
-    else:
-        device = "cuda"
-    model_key = str(device) + f"__{model_type}"
-    if model_key not in models or force_reload:
-        if ckpt_path is None:
-            ckpt_path = _get_ckpt_path(model_type)
-        clean_models(model_key=model_key)
-        model = _load_model_f(ckpt_path, device)
-        models[model_key] = model
-    return models[model_key]
+# def load_model(ckpt_path=None, use_gpu=True, force_reload=False, model_type="text"):
+#     _load_model_f = functools.partial(_load_model, model_type=model_type)
+#     if model_type not in ("text", "coarse", "fine"):
+#         raise NotImplementedError()
+#     global models
+#     if torch.cuda.device_count() == 0 or not use_gpu:
+#         device = "cpu"
+#     else:
+#         device = "cuda"
+#     model_key = str(device) + f"__{model_type}"
+#     if model_key not in models or force_reload:
+#         if ckpt_path is None:
+#             ckpt_path = _get_ckpt_path(model_type)
+#         clean_models(model_key=model_key)
+#         model = _load_model_f(ckpt_path, device)
+#         models[model_key] = model
+#     return models[model_key]


-def load_codec_model(use_gpu=True, force_reload=False):
-    global models
-    if torch.cuda.device_count() == 0 or not use_gpu:
-        device = "cpu"
-    else:
-        device = "cuda"
-    model_key = str(device) + f"__codec"
-    if model_key not in models or force_reload:
-        clean_models(model_key=model_key)
-        model = _load_codec_model(device)
-        models[model_key] = model
-    return models[model_key]
+# def load_codec_model(use_gpu=True, force_reload=False):
+#     global models
+#     if torch.cuda.device_count() == 0 or not use_gpu:
+#         device = "cpu"
+#     else:
+#         device = "cuda"
+#     model_key = str(device) + f"__codec"
+#     if model_key not in models or force_reload:
+#         clean_models(model_key=model_key)
+#         model = _load_codec_model(device)
+#         models[model_key] = model
+#     return models[model_key]


-def preload_models(
-    text_ckpt_path=None, coarse_ckpt_path=None, fine_ckpt_path=None, use_gpu=True, use_smaller_models=False
-):
-    global USE_SMALLER_MODELS
-    global REMOTE_MODEL_PATHS
-    if use_smaller_models:
-        USE_SMALLER_MODELS = True
-        logger.info("Using smaller models generation.py")
-        REMOTE_MODEL_PATHS = SMALL_REMOTE_MODEL_PATHS
-
-    _ = load_model(ckpt_path=text_ckpt_path, model_type="text", use_gpu=use_gpu, force_reload=True)
-    _ = load_model(ckpt_path=coarse_ckpt_path, model_type="coarse", use_gpu=use_gpu, force_reload=True)
-    _ = load_model(ckpt_path=fine_ckpt_path, model_type="fine", use_gpu=use_gpu, force_reload=True)
-    _ = load_codec_model(use_gpu=use_gpu, force_reload=True)
+# def preload_models(
+#     text_ckpt_path=None, coarse_ckpt_path=None, fine_ckpt_path=None, use_gpu=True, use_smaller_models=False
+# ):
+#     global USE_SMALLER_MODELS
+#     global REMOTE_MODEL_PATHS
+#     if use_smaller_models:
+#         USE_SMALLER_MODELS = True
+#         logger.info("Using smaller models generation.py")
+#         REMOTE_MODEL_PATHS = SMALL_REMOTE_MODEL_PATHS
+#
+#     _ = load_model(ckpt_path=text_ckpt_path, model_type="text", use_gpu=use_gpu, force_reload=True)
+#     _ = load_model(ckpt_path=coarse_ckpt_path, model_type="coarse", use_gpu=use_gpu, force_reload=True)
+#     _ = load_model(ckpt_path=fine_ckpt_path, model_type="fine", use_gpu=use_gpu, force_reload=True)
+#     _ = load_codec_model(use_gpu=use_gpu, force_reload=True)
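With the leading underscores dropped, `inference_mode` and `clear_cuda_cache` become the public helpers that `inference_funcs.py` imports above. A minimal usage sketch; `run_generation_step` is a hypothetical stand-in for any bark inference call:

```python
from TTS.tts.layers.bark.load_model import clear_cuda_cache, inference_mode

with inference_mode():  # stacks InferenceContext, torch.inference_mode, no_grad, and autocast
    tokens = run_generation_step()  # hypothetical inference call
clear_cuda_cache()  # empty_cache() + synchronize() when CUDA is available
```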

TTS/tts/layers/bark/model.py

@@ -6,8 +6,8 @@ import math
 from dataclasses import dataclass

 import torch
-import torch.nn as nn
 from coqpit import Coqpit
+from torch import nn
 from torch.nn import functional as F

@@ -19,8 +19,8 @@ class LayerNorm(nn.Module):
         self.weight = nn.Parameter(torch.ones(ndim))
         self.bias = nn.Parameter(torch.zeros(ndim)) if bias else None

-    def forward(self, input):
-        return F.layer_norm(input, self.weight.shape, self.weight, self.bias, 1e-5)
+    def forward(self, x):
+        return F.layer_norm(x, self.weight.shape, self.weight, self.bias, 1e-5)


 class CausalSelfAttention(nn.Module):
@@ -177,7 +177,7 @@ class GPT(nn.Module):
     def forward(self, idx, merge_context=False, past_kv=None, position_ids=None, use_cache=False):
         device = idx.device
-        b, t = idx.size()
+        _, t = idx.size()
         if past_kv is not None:
             assert t == 1
             tok_emb = self.transformer.wte(idx)  # token embeddings of shape (b, t, n_embd)
@@ -219,7 +219,7 @@ class GPT(nn.Module):
         new_kv = () if use_cache else None

-        for i, (block, past_layer_kv) in enumerate(zip(self.transformer.h, past_kv)):
+        for _, (block, past_layer_kv) in enumerate(zip(self.transformer.h, past_kv)):
             x, kv = block(x, past_kv=past_layer_kv, use_cache=use_cache)

             if use_cache:
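The only rename with user-visible surface here is `LayerNorm.forward`'s argument, which previously shadowed the `input` builtin; behavior is unchanged. A quick sketch, assuming the constructor signature `__init__(self, ndim, bias)` implied by the hunk:

```python
import torch

from TTS.tts.layers.bark.model import LayerNorm  # module path assumed from this diff

ln = LayerNorm(8, bias=True)  # weight = ones(8), bias = zeros(8)
y = ln(torch.randn(2, 4, 8))  # forward(x) applies F.layer_norm over the last dim
print(y.shape)  # torch.Size([2, 4, 8])
```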

TTS/tts/layers/bark/model_fine.py

@@ -6,7 +6,7 @@ import math
 from dataclasses import dataclass

 import torch
-import torch.nn as nn
+from torch import nn
 from torch.nn import functional as F

 from .model import GPT, MLP, GPTConfig

TTS/utils/synthesizer.py

@@ -341,7 +341,7 @@ class Synthesizer(object):
         use_gl = self.vocoder_model is None

-        if not reference_wav:
+        if not reference_wav:  # not voice conversion
             for sen in sens:
                 if hasattr(self.tts_model, "synthesize"):
                     sp_name = "random" if speaker_name is None else speaker_name

docs/source/models/tortoise.md

@@ -12,7 +12,7 @@ from TTS.tts.configs.tortoise_config import TortoiseConfig
 from TTS.tts.models.tortoise import Tortoise

 config = TortoiseConfig()
-model = Tortoise.inif_from_config(config)
+model = Tortoise.init_from_config(config)
 model.load_checkpoint(config, checkpoint_dir="paths/to/models_dir/", eval=True)

 # with random speaker
@@ -32,7 +32,7 @@ tts = TTS("tts_models/en/multi-dataset/tortoise-v2")
 # with custom inference settings overriding defaults.
 tts.tts_to_file(text="Hello, my name is Manmay , how are you?",
                 file_path="output.wav",
-                voice_dir="TTS/tts/utils/assets/tortoise/voices/",
+                voice_dir="path/to/tortoise/voices/dir/",
                 speaker="lj",
                 num_autoregressive_samples=1,
                 diffusion_iterations=10)
@@ -40,7 +40,7 @@ tts.tts_to_file(text="Hello, my name is Manmay , how are you?",
 # Using presets with the same voice
 tts.tts_to_file(text="Hello, my name is Manmay , how are you?",
                 file_path="output.wav",
-                voice_dir="TTS/tts/utils/assets/tortoise/voices/",
+                voice_dir="path/to/tortoise/voices/dir/",
                 speaker="lj",
                 preset="ultra_fast")
@@ -55,15 +55,15 @@ Using 🐸TTS Command line:
 # cloning the `lj` voice
 tts --model_name tts_models/en/multi-dataset/tortoise-v2 \
     --text "This is an example." \
-    --out_path "/data/speech_synth/coqui-tts/TTS/tests/outputs/output.wav" \
-    --voice_dir TTS/tts/utils/assets/tortoise/voices/ \
+    --out_path "output.wav" \
+    --voice_dir path/to/tortoise/voices/dir/ \
     --speaker_idx "lj" \
     --progress_bar True

 # Random voice generation
 tts --model_name tts_models/en/multi-dataset/tortoise-v2 \
     --text "This is an example." \
-    --out_path "/data/speech_synth/coqui-tts/TTS/tests/outputs/output.wav" \
+    --out_path "output.wav" \
     --progress_bar True
 ```