Merge pull request #7 from eginhard/pin-black

Pin black for consistent outputs
Enno Hermann 2024-03-07 17:32:02 +01:00 committed by GitHub
commit 1aef5ff091
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
25 changed files with 113 additions and 87 deletions
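Most of the churn in the files below comes from formatting rules promoted to Black's stable style in the 24.x releases: stub bodies consisting only of "..." are collapsed onto the def line, multi-line conditional expressions and multi-line with-statement context managers are wrapped in parentheses, and a blank line is enforced after module docstrings (the otherwise empty +1-line hunks at the top of several scripts). A minimal sketch of these patterns, using invented names rather than code from this repository:

# Illustrative sketch only: invented names, not code from this repository.
"""Example module; Black 24.x enforces a blank line after this docstring."""

SPEAKERS = ["spk_0", "spk_1"]


class Example:
    use_embedding = False

    # Stub bodies consisting of a lone "..." are collapsed onto the "def" line.
    def eval_step(self): ...

    def forward(self): ...

    def build_inputs(self):
        # Conditional expressions get their own parentheses when they must be
        # split (shown pre-wrapped here for illustration; Black only splits
        # lines that exceed the configured length limit).
        return {
            "speaker_id": (
                None if not self.use_embedding else sorted(SPEAKERS)[:1]
            ),
        }


def concat_files(path_a, path_b, out_path):
    # Multiple context managers are grouped in one parenthesized "with"
    # statement when the targeted Python version supports that syntax.
    with (
        open(path_a, "rb") as fin_a,
        open(path_b, "rb") as fin_b,
        open(out_path, "wb") as fout,
    ):
        fout.write(fin_a.read())
        fout.write(fin_b.read())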

View File

@@ -8,7 +8,7 @@ repos:
# - id: end-of-file-fixer
# - id: trailing-whitespace
- repo: "https://github.com/psf/black"
-rev: 23.12.0
+rev: 24.2.0
hooks:
- id: black
language_version: python3

View File

@@ -1,4 +1,5 @@
"""Get detailed info about the working environment."""
import json
import os
import platform

View File

@@ -1,4 +1,5 @@
"""Find all the unique characters in a dataset"""
import argparse
from argparse import RawTextHelpFormatter

View File

@@ -1,4 +1,5 @@
"""Find all the unique characters in a dataset"""
import argparse
import multiprocessing
from argparse import RawTextHelpFormatter

View File

@@ -1,4 +1,5 @@
"""Search a good noise schedule for WaveGrad for a given number of inference iterations"""
import argparse
from itertools import product as cartesian_product

View File

@@ -457,9 +457,11 @@ class TTSDataset(Dataset):
# lengths adjusted by the reduction factor
mel_lengths_adjusted = [
-m.shape[1] + (self.outputs_per_step - (m.shape[1] % self.outputs_per_step))
-if m.shape[1] % self.outputs_per_step
-else m.shape[1]
+(
+m.shape[1] + (self.outputs_per_step - (m.shape[1] % self.outputs_per_step))
+if m.shape[1] % self.outputs_per_step
+else m.shape[1]
+)
for m in mel
]

View File

@@ -2,6 +2,7 @@
Much of this code is adapted from Andrej Karpathy's NanoGPT
(https://github.com/karpathy/nanoGPT)
"""
import math
from dataclasses import dataclass

View File

@@ -2,6 +2,7 @@
Much of this code is adapted from Andrej Karpathy's NanoGPT
(https://github.com/karpathy/nanoGPT)
"""
import math
from dataclasses import dataclass

View File

@@ -187,9 +187,9 @@ class GPT(nn.Module):
def get_grad_norm_parameter_groups(self):
return {
"conditioning_encoder": list(self.conditioning_encoder.parameters()),
"conditioning_perceiver": list(self.conditioning_perceiver.parameters())
if self.use_perceiver_resampler
else None,
"conditioning_perceiver": (
list(self.conditioning_perceiver.parameters()) if self.use_perceiver_resampler else None
),
"gpt": list(self.gpt.parameters()),
"heads": list(self.text_head.parameters()) + list(self.mel_head.parameters()),
}

View File

@@ -186,9 +186,9 @@ class XTTSDataset(torch.utils.data.Dataset):
"wav_lengths": torch.tensor(wav.shape[-1], dtype=torch.long),
"filenames": audiopath,
"conditioning": cond.unsqueeze(1),
"cond_lens": torch.tensor(cond_len, dtype=torch.long)
if cond_len is not torch.nan
else torch.tensor([cond_len]),
"cond_lens": (
torch.tensor(cond_len, dtype=torch.long) if cond_len is not torch.nan else torch.tensor([cond_len])
),
"cond_idxs": torch.tensor(cond_idxs) if cond_idxs is not torch.nan else torch.tensor([cond_idxs]),
}
return res

View File

@@ -225,14 +225,11 @@ class Bark(BaseTTS):
return return_dict
-def eval_step(self):
-...
+def eval_step(self): ...
-def forward(self):
-...
+def forward(self): ...
-def inference(self):
-...
+def inference(self): ...
@staticmethod
def init_from_config(config: "BarkConfig", **kwargs): # pylint: disable=unused-argument

View File

@@ -369,9 +369,11 @@ class BaseTTS(BaseTrainerModel):
d_vector = (random.sample(sorted(d_vector), 1),)
aux_inputs = {
"speaker_id": None
if not self.config.use_speaker_embedding
else random.sample(sorted(self.speaker_manager.name_to_id.values()), 1),
"speaker_id": (
None
if not self.config.use_speaker_embedding
else random.sample(sorted(self.speaker_manager.name_to_id.values()), 1)
),
"d_vector": d_vector,
"style_wav": None, # TODO: handle GST style input
}

View File

@@ -101,12 +101,16 @@ class Tacotron(BaseTacotron):
num_mel=self.decoder_output_dim,
encoder_output_dim=self.encoder_in_features,
capacitron_VAE_embedding_dim=self.capacitron_vae.capacitron_VAE_embedding_dim,
-speaker_embedding_dim=self.embedded_speaker_dim
-if self.use_speaker_embedding and self.capacitron_vae.capacitron_use_speaker_embedding
-else None,
-text_summary_embedding_dim=self.capacitron_vae.capacitron_text_summary_embedding_dim
-if self.capacitron_vae.capacitron_use_text_summary_embeddings
-else None,
+speaker_embedding_dim=(
+self.embedded_speaker_dim
+if self.use_speaker_embedding and self.capacitron_vae.capacitron_use_speaker_embedding
+else None
+),
+text_summary_embedding_dim=(
+self.capacitron_vae.capacitron_text_summary_embedding_dim
+if self.capacitron_vae.capacitron_use_text_summary_embeddings
+else None
+),
)
# backward pass decoder
@@ -171,9 +175,9 @@ class Tacotron(BaseTacotron):
encoder_outputs, *capacitron_vae_outputs = self.compute_capacitron_VAE_embedding(
encoder_outputs,
reference_mel_info=[mel_specs, mel_lengths],
-text_info=[inputs, text_lengths]
-if self.capacitron_vae.capacitron_use_text_summary_embeddings
-else None,
+text_info=(
+[inputs, text_lengths] if self.capacitron_vae.capacitron_use_text_summary_embeddings else None
+),
speaker_embedding=embedded_speakers if self.capacitron_vae.capacitron_use_speaker_embedding else None,
)
else:
@@ -237,13 +241,13 @@ class Tacotron(BaseTacotron):
# B x capacitron_VAE_embedding_dim
encoder_outputs, *_ = self.compute_capacitron_VAE_embedding(
encoder_outputs,
-reference_mel_info=[aux_input["style_mel"], reference_mel_length]
-if aux_input["style_mel"] is not None
-else None,
+reference_mel_info=(
+[aux_input["style_mel"], reference_mel_length] if aux_input["style_mel"] is not None else None
+),
text_info=[style_text_embedding, style_text_length] if aux_input["style_text"] is not None else None,
-speaker_embedding=aux_input["d_vectors"]
-if self.capacitron_vae.capacitron_use_speaker_embedding
-else None,
+speaker_embedding=(
+aux_input["d_vectors"] if self.capacitron_vae.capacitron_use_speaker_embedding else None
+),
)
if self.num_speakers > 1:
if not self.use_d_vector_file:

View File

@@ -113,12 +113,14 @@ class Tacotron2(BaseTacotron):
num_mel=self.decoder_output_dim,
encoder_output_dim=self.encoder_in_features,
capacitron_VAE_embedding_dim=self.capacitron_vae.capacitron_VAE_embedding_dim,
-speaker_embedding_dim=self.embedded_speaker_dim
-if self.capacitron_vae.capacitron_use_speaker_embedding
-else None,
-text_summary_embedding_dim=self.capacitron_vae.capacitron_text_summary_embedding_dim
-if self.capacitron_vae.capacitron_use_text_summary_embeddings
-else None,
+speaker_embedding_dim=(
+self.embedded_speaker_dim if self.capacitron_vae.capacitron_use_speaker_embedding else None
+),
+text_summary_embedding_dim=(
+self.capacitron_vae.capacitron_text_summary_embedding_dim
+if self.capacitron_vae.capacitron_use_text_summary_embeddings
+else None
+),
)
# backward pass decoder
@@ -191,9 +193,11 @@ class Tacotron2(BaseTacotron):
encoder_outputs, *capacitron_vae_outputs = self.compute_capacitron_VAE_embedding(
encoder_outputs,
reference_mel_info=[mel_specs, mel_lengths],
-text_info=[embedded_inputs.transpose(1, 2), text_lengths]
-if self.capacitron_vae.capacitron_use_text_summary_embeddings
-else None,
+text_info=(
+[embedded_inputs.transpose(1, 2), text_lengths]
+if self.capacitron_vae.capacitron_use_text_summary_embeddings
+else None
+),
speaker_embedding=embedded_speakers if self.capacitron_vae.capacitron_use_speaker_embedding else None,
)
else:
@@ -265,13 +269,13 @@ class Tacotron2(BaseTacotron):
# B x capacitron_VAE_embedding_dim
encoder_outputs, *_ = self.compute_capacitron_VAE_embedding(
encoder_outputs,
-reference_mel_info=[aux_input["style_mel"], reference_mel_length]
-if aux_input["style_mel"] is not None
-else None,
+reference_mel_info=(
+[aux_input["style_mel"], reference_mel_length] if aux_input["style_mel"] is not None else None
+),
text_info=[style_text_embedding, style_text_length] if aux_input["style_text"] is not None else None,
-speaker_embedding=aux_input["d_vectors"]
-if self.capacitron_vae.capacitron_use_speaker_embedding
-else None,
+speaker_embedding=(
+aux_input["d_vectors"] if self.capacitron_vae.capacitron_use_speaker_embedding else None
+),
)
if self.num_speakers > 1:

View File

@@ -715,8 +715,9 @@ class Tortoise(BaseTTS):
self.autoregressive = self.autoregressive.to(self.device)
if verbose:
print("Generating autoregressive samples..")
-with self.temporary_cuda(self.autoregressive) as autoregressive, torch.autocast(
-device_type="cuda", dtype=torch.float16, enabled=half
+with (
+self.temporary_cuda(self.autoregressive) as autoregressive,
+torch.autocast(device_type="cuda", dtype=torch.float16, enabled=half),
):
for b in tqdm(range(num_batches), disable=not verbose):
codes = autoregressive.inference_speech(
@@ -737,8 +738,9 @@ class Tortoise(BaseTTS):
self.autoregressive_batch_size = orig_batch_size # in the case of single_sample
clip_results = []
-with self.temporary_cuda(self.clvp) as clvp, torch.autocast(
-device_type="cuda", dtype=torch.float16, enabled=half
+with (
+self.temporary_cuda(self.clvp) as clvp,
+torch.autocast(device_type="cuda", dtype=torch.float16, enabled=half),
):
for batch in tqdm(samples, disable=not verbose):
for i in range(batch.shape[0]):

View File

@@ -1887,9 +1887,11 @@ class Vits(BaseTTS):
import onnxruntime as ort
providers = [
"CPUExecutionProvider"
if cuda is False
else ("CUDAExecutionProvider", {"cudnn_conv_algo_search": "DEFAULT"})
(
"CPUExecutionProvider"
if cuda is False
else ("CUDAExecutionProvider", {"cudnn_conv_algo_search": "DEFAULT"})
)
]
sess_options = ort.SessionOptions()
self.onnx_sess = ort.InferenceSession(

View File

@@ -207,6 +207,7 @@ class SSIMLoss(_Loss):
https://ece.uwaterloo.ca/~z70wang/publications/ssim.pdf,
DOI:`10.1109/TIP.2003.819861`
"""
__constants__ = ["kernel_size", "k1", "k2", "sigma", "kernel", "reduction"]
def __init__(

View File

@@ -1,4 +1,5 @@
"""Set of default text cleaners"""
# TODO: pick the cleaner for languages dynamically
import re

View File

@@ -36,13 +36,16 @@ def stream_url(
if start_byte:
req.headers["Range"] = "bytes={}-".format(start_byte)
-with urllib.request.urlopen(req) as upointer, tqdm(
-unit="B",
-unit_scale=True,
-unit_divisor=1024,
-total=url_size,
-disable=not progress_bar,
-) as pbar:
+with (
+urllib.request.urlopen(req) as upointer,
+tqdm(
+unit="B",
+unit_scale=True,
+unit_divisor=1024,
+total=url_size,
+disable=not progress_bar,
+) as pbar,
+):
num_bytes = 0
while True:
chunk = upointer.read(block_size)

View File

@@ -357,9 +357,11 @@ class BaseVC(BaseTrainerModel):
d_vector = (random.sample(sorted(d_vector), 1),)
aux_inputs = {
"speaker_id": None
if not self.config.use_speaker_embedding
else random.sample(sorted(self.speaker_manager.name_to_id.values()), 1),
"speaker_id": (
None
if not self.config.use_speaker_embedding
else random.sample(sorted(self.speaker_manager.name_to_id.values()), 1)
),
"d_vector": d_vector,
"style_wav": None, # TODO: handle GST style input
}

View File

@@ -544,8 +544,7 @@ class FreeVC(BaseVC):
audio = audio[0][0].data.cpu().float().numpy()
return audio
-def eval_step():
-...
+def eval_step(): ...
@staticmethod
def init_from_config(config: FreeVCConfig, samples: Union[List[List], List[Dict]] = None, verbose=True):
@@ -558,5 +557,4 @@ class FreeVC(BaseVC):
if eval:
self.eval()
-def train_step():
-...
+def train_step(): ...

View File

@@ -155,7 +155,9 @@ def compute_mask_indices(
class WavLMConfig:
def __init__(self, cfg=None):
-self.extractor_mode: str = "default" # mode for feature extractor. default has a single group norm with d groups in the first conv block, whereas layer_norm has layer norms in every block (meant to use with normalize=True)
+self.extractor_mode: str = (
+"default" # mode for feature extractor. default has a single group norm with d groups in the first conv block, whereas layer_norm has layer norms in every block (meant to use with normalize=True)
+)
self.encoder_layers: int = 12 # num encoder layers in the transformer
self.encoder_embed_dim: int = 768 # encoder embedding dimension
@@ -164,7 +166,9 @@ class WavLMConfig:
self.activation_fn: str = "gelu" # activation function to use
self.layer_norm_first: bool = False # apply layernorm first in the transformer
-self.conv_feature_layers: str = "[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2" # string describing convolutional feature extraction layers in form of a python list that contains [(dim, kernel_size, stride), ...]
+self.conv_feature_layers: str = (
+"[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2" # string describing convolutional feature extraction layers in form of a python list that contains [(dim, kernel_size, stride), ...]
+)
self.conv_bias: bool = False # include bias in conv encoder
self.feature_grad_mult: float = 1.0 # multiply feature extractor var grads by this

View File

@@ -1,4 +1,4 @@
-black
+black==24.2.0
coverage
nose2
ruff==0.3.0

View File

@@ -234,8 +234,12 @@ class TestZH_CN_Phonemizer(unittest.TestCase):
class TestBN_Phonemizer(unittest.TestCase):
def setUp(self):
self.phonemizer = BN_Phonemizer()
self._TEST_CASES = "রাসূলুল্লাহ সাল্লাল্লাহু আলাইহি ওয়া সাল্লাম শিক্ষা দিয়েছেন যে, কেউ যদি কোন খারাপ কিছুর সম্মুখীন হয়, তখনও যেন"
self._EXPECTED = "রাসূলুল্লাহ সাল্লাল্লাহু আলাইহি ওয়া সাল্লাম শিক্ষা দিয়েছেন যে কেউ যদি কোন খারাপ কিছুর সম্মুখীন হয় তখনও যেন।"
self._TEST_CASES = (
"রাসূলুল্লাহ সাল্লাল্লাহু আলাইহি ওয়া সাল্লাম শিক্ষা দিয়েছেন যে, কেউ যদি কোন খারাপ কিছুর সম্মুখীন হয়, তখনও যেন"
)
self._EXPECTED = (
"রাসূলুল্লাহ সাল্লাল্লাহু আলাইহি ওয়া সাল্লাম শিক্ষা দিয়েছেন যে কেউ যদি কোন খারাপ কিছুর সম্মুখীন হয় তখনও যেন।"
)
def test_phonemize(self):
self.assertEqual(self.phonemizer.phonemize(self._TEST_CASES, separator=""), self._EXPECTED)

View File

@@ -115,20 +115,14 @@ class TestFreeVC(unittest.TestCase):
output_wav.shape[0] + config.audio.hop_length == source_wav.shape[0]
), f"{output_wav.shape} != {source_wav.shape}"
-def test_train_step(self):
-...
+def test_train_step(self): ...
-def test_train_eval_log(self):
-...
+def test_train_eval_log(self): ...
-def test_test_run(self):
-...
+def test_test_run(self): ...
-def test_load_checkpoint(self):
-...
+def test_load_checkpoint(self): ...
-def test_get_criterion(self):
-...
+def test_get_criterion(self): ...
-def test_init_from_config(self):
-...
+def test_init_from_config(self): ...