mirror of https://github.com/coqui-ai/TTS.git
Merge pull request #7 from eginhard/pin-black
Pin black for consistent outputs
This commit is contained in:
commit 1aef5ff091
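Most of the hunks below are mechanical reformatting produced by the newly pinned black 24.2.0, whose stable style differs from 23.x in three recurring ways: a blank line is enforced after a module docstring, stub bodies consisting only of `...` collapse onto the `def` line, and a multiline conditional expression used as a value is wrapped in parentheses. A minimal sketch of the new style (hypothetical code, not part of this commit):

    """Hypothetical module illustrating the black 24.x stable-style changes applied below."""

    # 1) A blank line now separates the module docstring from whatever follows it.
    import math


    class Model:
        scaling_factor = 2.0  # hypothetical attribute, for illustration only

        # 2) Stub bodies consisting only of "..." collapse onto the "def" line
        #    (black 23.x kept the "..." on its own indented line).
        def eval_step(self): ...

        def forward(self, x: float) -> float:
            # 3) A conditional expression too long for one line is wrapped in
            #    parentheses instead of hanging bare after the "=" sign.
            result = (
                math.sqrt(x) * self.scaling_factor if x >= 0 else abs(x) * self.scaling_factor
            )
            return result

The same parenthesization rule drives the dict-value and keyword-argument rewrites in the GPT, XTTS, Tacotron, and VITS hunks below.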
@@ -8,7 +8,7 @@ repos:
 #      - id: end-of-file-fixer
 #      - id: trailing-whitespace
   - repo: "https://github.com/psf/black"
-    rev: 23.12.0
+    rev: 24.2.0
     hooks:
       - id: black
         language_version: python3

@@ -1,4 +1,5 @@
 """Get detailed info about the working environment."""
+
 import json
 import os
 import platform

@@ -1,4 +1,5 @@
 """Find all the unique characters in a dataset"""
+
 import argparse
 from argparse import RawTextHelpFormatter

@@ -1,4 +1,5 @@
 """Find all the unique characters in a dataset"""
+
 import argparse
 import multiprocessing
 from argparse import RawTextHelpFormatter

@@ -1,4 +1,5 @@
 """Search a good noise schedule for WaveGrad for a given number of inference iterations"""
+
 import argparse
 from itertools import product as cartesian_product

@@ -457,9 +457,11 @@ class TTSDataset(Dataset):

             # lengths adjusted by the reduction factor
             mel_lengths_adjusted = [
-                m.shape[1] + (self.outputs_per_step - (m.shape[1] % self.outputs_per_step))
-                if m.shape[1] % self.outputs_per_step
-                else m.shape[1]
+                (
+                    m.shape[1] + (self.outputs_per_step - (m.shape[1] % self.outputs_per_step))
+                    if m.shape[1] % self.outputs_per_step
+                    else m.shape[1]
+                )
                 for m in mel
             ]

@@ -2,6 +2,7 @@
 Much of this code is adapted from Andrej Karpathy's NanoGPT
 (https://github.com/karpathy/nanoGPT)
 """
+
 import math
 from dataclasses import dataclass

@@ -2,6 +2,7 @@
 Much of this code is adapted from Andrej Karpathy's NanoGPT
 (https://github.com/karpathy/nanoGPT)
 """
+
 import math
 from dataclasses import dataclass

@@ -187,9 +187,9 @@ class GPT(nn.Module):
     def get_grad_norm_parameter_groups(self):
         return {
             "conditioning_encoder": list(self.conditioning_encoder.parameters()),
-            "conditioning_perceiver": list(self.conditioning_perceiver.parameters())
-            if self.use_perceiver_resampler
-            else None,
+            "conditioning_perceiver": (
+                list(self.conditioning_perceiver.parameters()) if self.use_perceiver_resampler else None
+            ),
             "gpt": list(self.gpt.parameters()),
             "heads": list(self.text_head.parameters()) + list(self.mel_head.parameters()),
         }

@@ -186,9 +186,9 @@ class XTTSDataset(torch.utils.data.Dataset):
             "wav_lengths": torch.tensor(wav.shape[-1], dtype=torch.long),
             "filenames": audiopath,
             "conditioning": cond.unsqueeze(1),
-            "cond_lens": torch.tensor(cond_len, dtype=torch.long)
-            if cond_len is not torch.nan
-            else torch.tensor([cond_len]),
+            "cond_lens": (
+                torch.tensor(cond_len, dtype=torch.long) if cond_len is not torch.nan else torch.tensor([cond_len])
+            ),
             "cond_idxs": torch.tensor(cond_idxs) if cond_idxs is not torch.nan else torch.tensor([cond_idxs]),
         }
         return res

@@ -225,14 +225,11 @@ class Bark(BaseTTS):

         return return_dict

-    def eval_step(self):
-        ...
+    def eval_step(self): ...

-    def forward(self):
-        ...
+    def forward(self): ...

-    def inference(self):
-        ...
+    def inference(self): ...

     @staticmethod
     def init_from_config(config: "BarkConfig", **kwargs):  # pylint: disable=unused-argument

@@ -369,9 +369,11 @@ class BaseTTS(BaseTrainerModel):
             d_vector = (random.sample(sorted(d_vector), 1),)

         aux_inputs = {
-            "speaker_id": None
-            if not self.config.use_speaker_embedding
-            else random.sample(sorted(self.speaker_manager.name_to_id.values()), 1),
+            "speaker_id": (
+                None
+                if not self.config.use_speaker_embedding
+                else random.sample(sorted(self.speaker_manager.name_to_id.values()), 1)
+            ),
             "d_vector": d_vector,
             "style_wav": None,  # TODO: handle GST style input
         }

@@ -101,12 +101,16 @@ class Tacotron(BaseTacotron):
                 num_mel=self.decoder_output_dim,
                 encoder_output_dim=self.encoder_in_features,
                 capacitron_VAE_embedding_dim=self.capacitron_vae.capacitron_VAE_embedding_dim,
-                speaker_embedding_dim=self.embedded_speaker_dim
-                if self.use_speaker_embedding and self.capacitron_vae.capacitron_use_speaker_embedding
-                else None,
-                text_summary_embedding_dim=self.capacitron_vae.capacitron_text_summary_embedding_dim
-                if self.capacitron_vae.capacitron_use_text_summary_embeddings
-                else None,
+                speaker_embedding_dim=(
+                    self.embedded_speaker_dim
+                    if self.use_speaker_embedding and self.capacitron_vae.capacitron_use_speaker_embedding
+                    else None
+                ),
+                text_summary_embedding_dim=(
+                    self.capacitron_vae.capacitron_text_summary_embedding_dim
+                    if self.capacitron_vae.capacitron_use_text_summary_embeddings
+                    else None
+                ),
             )

         # backward pass decoder

@@ -171,9 +175,9 @@ class Tacotron(BaseTacotron):
             encoder_outputs, *capacitron_vae_outputs = self.compute_capacitron_VAE_embedding(
                 encoder_outputs,
                 reference_mel_info=[mel_specs, mel_lengths],
-                text_info=[inputs, text_lengths]
-                if self.capacitron_vae.capacitron_use_text_summary_embeddings
-                else None,
+                text_info=(
+                    [inputs, text_lengths] if self.capacitron_vae.capacitron_use_text_summary_embeddings else None
+                ),
                 speaker_embedding=embedded_speakers if self.capacitron_vae.capacitron_use_speaker_embedding else None,
             )
         else:

@@ -237,13 +241,13 @@ class Tacotron(BaseTacotron):
             # B x capacitron_VAE_embedding_dim
             encoder_outputs, *_ = self.compute_capacitron_VAE_embedding(
                 encoder_outputs,
-                reference_mel_info=[aux_input["style_mel"], reference_mel_length]
-                if aux_input["style_mel"] is not None
-                else None,
+                reference_mel_info=(
+                    [aux_input["style_mel"], reference_mel_length] if aux_input["style_mel"] is not None else None
+                ),
                 text_info=[style_text_embedding, style_text_length] if aux_input["style_text"] is not None else None,
-                speaker_embedding=aux_input["d_vectors"]
-                if self.capacitron_vae.capacitron_use_speaker_embedding
-                else None,
+                speaker_embedding=(
+                    aux_input["d_vectors"] if self.capacitron_vae.capacitron_use_speaker_embedding else None
+                ),
             )
         if self.num_speakers > 1:
             if not self.use_d_vector_file:

@@ -113,12 +113,14 @@ class Tacotron2(BaseTacotron):
                 num_mel=self.decoder_output_dim,
                 encoder_output_dim=self.encoder_in_features,
                 capacitron_VAE_embedding_dim=self.capacitron_vae.capacitron_VAE_embedding_dim,
-                speaker_embedding_dim=self.embedded_speaker_dim
-                if self.capacitron_vae.capacitron_use_speaker_embedding
-                else None,
-                text_summary_embedding_dim=self.capacitron_vae.capacitron_text_summary_embedding_dim
-                if self.capacitron_vae.capacitron_use_text_summary_embeddings
-                else None,
+                speaker_embedding_dim=(
+                    self.embedded_speaker_dim if self.capacitron_vae.capacitron_use_speaker_embedding else None
+                ),
+                text_summary_embedding_dim=(
+                    self.capacitron_vae.capacitron_text_summary_embedding_dim
+                    if self.capacitron_vae.capacitron_use_text_summary_embeddings
+                    else None
+                ),
             )

         # backward pass decoder

@@ -191,9 +193,11 @@ class Tacotron2(BaseTacotron):
             encoder_outputs, *capacitron_vae_outputs = self.compute_capacitron_VAE_embedding(
                 encoder_outputs,
                 reference_mel_info=[mel_specs, mel_lengths],
-                text_info=[embedded_inputs.transpose(1, 2), text_lengths]
-                if self.capacitron_vae.capacitron_use_text_summary_embeddings
-                else None,
+                text_info=(
+                    [embedded_inputs.transpose(1, 2), text_lengths]
+                    if self.capacitron_vae.capacitron_use_text_summary_embeddings
+                    else None
+                ),
                 speaker_embedding=embedded_speakers if self.capacitron_vae.capacitron_use_speaker_embedding else None,
             )
         else:

@@ -265,13 +269,13 @@ class Tacotron2(BaseTacotron):
             # B x capacitron_VAE_embedding_dim
             encoder_outputs, *_ = self.compute_capacitron_VAE_embedding(
                 encoder_outputs,
-                reference_mel_info=[aux_input["style_mel"], reference_mel_length]
-                if aux_input["style_mel"] is not None
-                else None,
+                reference_mel_info=(
+                    [aux_input["style_mel"], reference_mel_length] if aux_input["style_mel"] is not None else None
+                ),
                 text_info=[style_text_embedding, style_text_length] if aux_input["style_text"] is not None else None,
-                speaker_embedding=aux_input["d_vectors"]
-                if self.capacitron_vae.capacitron_use_speaker_embedding
-                else None,
+                speaker_embedding=(
+                    aux_input["d_vectors"] if self.capacitron_vae.capacitron_use_speaker_embedding else None
+                ),
             )

         if self.num_speakers > 1:

@@ -715,8 +715,9 @@ class Tortoise(BaseTTS):
         self.autoregressive = self.autoregressive.to(self.device)
         if verbose:
             print("Generating autoregressive samples..")
-        with self.temporary_cuda(self.autoregressive) as autoregressive, torch.autocast(
-            device_type="cuda", dtype=torch.float16, enabled=half
+        with (
+            self.temporary_cuda(self.autoregressive) as autoregressive,
+            torch.autocast(device_type="cuda", dtype=torch.float16, enabled=half),
         ):
             for b in tqdm(range(num_batches), disable=not verbose):
                 codes = autoregressive.inference_speech(

@@ -737,8 +738,9 @@ class Tortoise(BaseTTS):
             self.autoregressive_batch_size = orig_batch_size  # in the case of single_sample

         clip_results = []
-        with self.temporary_cuda(self.clvp) as clvp, torch.autocast(
-            device_type="cuda", dtype=torch.float16, enabled=half
+        with (
+            self.temporary_cuda(self.clvp) as clvp,
+            torch.autocast(device_type="cuda", dtype=torch.float16, enabled=half),
         ):
             for batch in tqdm(samples, disable=not verbose):
                 for i in range(batch.shape[0]):

@@ -1887,9 +1887,11 @@ class Vits(BaseTTS):
         import onnxruntime as ort

         providers = [
-            "CPUExecutionProvider"
-            if cuda is False
-            else ("CUDAExecutionProvider", {"cudnn_conv_algo_search": "DEFAULT"})
+            (
+                "CPUExecutionProvider"
+                if cuda is False
+                else ("CUDAExecutionProvider", {"cudnn_conv_algo_search": "DEFAULT"})
+            )
         ]
         sess_options = ort.SessionOptions()
         self.onnx_sess = ort.InferenceSession(

@@ -207,6 +207,7 @@ class SSIMLoss(_Loss):
     https://ece.uwaterloo.ca/~z70wang/publications/ssim.pdf,
     DOI:`10.1109/TIP.2003.819861`
     """
+
     __constants__ = ["kernel_size", "k1", "k2", "sigma", "kernel", "reduction"]

     def __init__(

@@ -1,4 +1,5 @@
 """Set of default text cleaners"""
+
 # TODO: pick the cleaner for languages dynamically

 import re

@@ -36,13 +36,16 @@ def stream_url(
     if start_byte:
         req.headers["Range"] = "bytes={}-".format(start_byte)

-    with urllib.request.urlopen(req) as upointer, tqdm(
-        unit="B",
-        unit_scale=True,
-        unit_divisor=1024,
-        total=url_size,
-        disable=not progress_bar,
-    ) as pbar:
+    with (
+        urllib.request.urlopen(req) as upointer,
+        tqdm(
+            unit="B",
+            unit_scale=True,
+            unit_divisor=1024,
+            total=url_size,
+            disable=not progress_bar,
+        ) as pbar,
+    ):
         num_bytes = 0
         while True:
             chunk = upointer.read(block_size)

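The Tortoise and stream_url hunks above show a related black 24 rule: a `with` statement holding multiple context managers is now split by parenthesizing the managers, one per line, instead of breaking inside the trailing call's argument list. Black only emits this form when the target Python version accepts the parenthesized syntax (officially 3.10+). A hedged sketch with stand-in context managers:

    from contextlib import nullcontext


    def read_lengths(items: list) -> int:
        # black 24.x formats multiple context managers as a parenthesized,
        # one-per-line list, each entry ending with a comma.
        with (
            nullcontext("first resource with a deliberately long descriptive name") as first,
            nullcontext("second resource with a deliberately long descriptive name") as second,
        ):
            return len(first) + len(second) + len(items)


    print(read_lengths(["a", "b"]))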
@@ -357,9 +357,11 @@ class BaseVC(BaseTrainerModel):
             d_vector = (random.sample(sorted(d_vector), 1),)

         aux_inputs = {
-            "speaker_id": None
-            if not self.config.use_speaker_embedding
-            else random.sample(sorted(self.speaker_manager.name_to_id.values()), 1),
+            "speaker_id": (
+                None
+                if not self.config.use_speaker_embedding
+                else random.sample(sorted(self.speaker_manager.name_to_id.values()), 1)
+            ),
             "d_vector": d_vector,
             "style_wav": None,  # TODO: handle GST style input
         }

@@ -544,8 +544,7 @@ class FreeVC(BaseVC):
         audio = audio[0][0].data.cpu().float().numpy()
         return audio

-    def eval_step():
-        ...
+    def eval_step(): ...

     @staticmethod
     def init_from_config(config: FreeVCConfig, samples: Union[List[List], List[Dict]] = None, verbose=True):

@@ -558,5 +557,4 @@ class FreeVC(BaseVC):
         if eval:
             self.eval()

-    def train_step():
-        ...
+    def train_step(): ...

@@ -155,7 +155,9 @@ def compute_mask_indices(

 class WavLMConfig:
     def __init__(self, cfg=None):
-        self.extractor_mode: str = "default"  # mode for feature extractor. default has a single group norm with d groups in the first conv block, whereas layer_norm has layer norms in every block (meant to use with normalize=True)
+        self.extractor_mode: str = (
+            "default"  # mode for feature extractor. default has a single group norm with d groups in the first conv block, whereas layer_norm has layer norms in every block (meant to use with normalize=True)
+        )
         self.encoder_layers: int = 12  # num encoder layers in the transformer

         self.encoder_embed_dim: int = 768  # encoder embedding dimension

@@ -164,7 +166,9 @@ class WavLMConfig:
         self.activation_fn: str = "gelu"  # activation function to use

         self.layer_norm_first: bool = False  # apply layernorm first in the transformer
-        self.conv_feature_layers: str = "[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2"  # string describing convolutional feature extraction layers in form of a python list that contains [(dim, kernel_size, stride), ...]
+        self.conv_feature_layers: str = (
+            "[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2"  # string describing convolutional feature extraction layers in form of a python list that contains [(dim, kernel_size, stride), ...]
+        )
         self.conv_bias: bool = False  # include bias in conv encoder
         self.feature_grad_mult: float = 1.0  # multiply feature extractor var grads by this

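The WavLM hunks illustrate one more facet of the parenthesization rule: when a long trailing comment pushes an assignment past the line-length limit, black 24 wraps the value in parentheses and lets the comment ride on the indented value line, rather than leaving the over-long line untouched. A small hypothetical sketch:

    class Config:
        def __init__(self):
            # The value is wrapped in parentheses solely because the trailing
            # comment would otherwise push the line past the length limit.
            self.extractor_mode: str = (
                "default"  # mode for the feature extractor; a comment long enough to exceed the configured line length
            )


    print(Config().extractor_mode)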
@@ -1,4 +1,4 @@
-black
+black==24.2.0
 coverage
 nose2
 ruff==0.3.0

@@ -234,8 +234,12 @@ class TestZH_CN_Phonemizer(unittest.TestCase):
 class TestBN_Phonemizer(unittest.TestCase):
     def setUp(self):
         self.phonemizer = BN_Phonemizer()
-        self._TEST_CASES = "রাসূলুল্লাহ সাল্লাল্লাহু আলাইহি ওয়া সাল্লাম শিক্ষা দিয়েছেন যে, কেউ যদি কোন খারাপ কিছুর সম্মুখীন হয়, তখনও যেন"
-        self._EXPECTED = "রাসূলুল্লাহ সাল্লাল্লাহু আলাইহি ওয়া সাল্লাম শিক্ষা দিয়েছেন যে কেউ যদি কোন খারাপ কিছুর সম্মুখীন হয় তখনও যেন।"
+        self._TEST_CASES = (
+            "রাসূলুল্লাহ সাল্লাল্লাহু আলাইহি ওয়া সাল্লাম শিক্ষা দিয়েছেন যে, কেউ যদি কোন খারাপ কিছুর সম্মুখীন হয়, তখনও যেন"
+        )
+        self._EXPECTED = (
+            "রাসূলুল্লাহ সাল্লাল্লাহু আলাইহি ওয়া সাল্লাম শিক্ষা দিয়েছেন যে কেউ যদি কোন খারাপ কিছুর সম্মুখীন হয় তখনও যেন।"
+        )

     def test_phonemize(self):
         self.assertEqual(self.phonemizer.phonemize(self._TEST_CASES, separator=""), self._EXPECTED)

@@ -115,20 +115,14 @@ class TestFreeVC(unittest.TestCase):
             output_wav.shape[0] + config.audio.hop_length == source_wav.shape[0]
         ), f"{output_wav.shape} != {source_wav.shape}"

-    def test_train_step(self):
-        ...
+    def test_train_step(self): ...

-    def test_train_eval_log(self):
-        ...
+    def test_train_eval_log(self): ...

-    def test_test_run(self):
-        ...
+    def test_test_run(self): ...

-    def test_load_checkpoint(self):
-        ...
+    def test_load_checkpoint(self): ...

-    def test_get_criterion(self):
-        ...
+    def test_get_criterion(self): ...

-    def test_init_from_config(self):
-        ...
+    def test_init_from_config(self): ...