diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 4853cb07..eeb02fde 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -8,7 +8,7 @@ repos:
       # - id: end-of-file-fixer
       # - id: trailing-whitespace
   - repo: "https://github.com/psf/black"
-    rev: 23.12.0
+    rev: 24.2.0
     hooks:
       - id: black
         language_version: python3
diff --git a/TTS/bin/collect_env_info.py b/TTS/bin/collect_env_info.py
index e76f6a75..32aa303e 100644
--- a/TTS/bin/collect_env_info.py
+++ b/TTS/bin/collect_env_info.py
@@ -1,4 +1,5 @@
 """Get detailed info about the working environment."""
+
 import json
 import os
 import platform
diff --git a/TTS/bin/find_unique_chars.py b/TTS/bin/find_unique_chars.py
index ea169748..81f2f446 100644
--- a/TTS/bin/find_unique_chars.py
+++ b/TTS/bin/find_unique_chars.py
@@ -1,4 +1,5 @@
 """Find all the unique characters in a dataset"""
+
 import argparse
 from argparse import RawTextHelpFormatter
 
diff --git a/TTS/bin/find_unique_phonemes.py b/TTS/bin/find_unique_phonemes.py
index 2df07006..48f2e7b7 100644
--- a/TTS/bin/find_unique_phonemes.py
+++ b/TTS/bin/find_unique_phonemes.py
@@ -1,4 +1,5 @@
 """Find all the unique characters in a dataset"""
+
 import argparse
 import multiprocessing
 from argparse import RawTextHelpFormatter
diff --git a/TTS/bin/tune_wavegrad.py b/TTS/bin/tune_wavegrad.py
index 09582cea..a4b10009 100644
--- a/TTS/bin/tune_wavegrad.py
+++ b/TTS/bin/tune_wavegrad.py
@@ -1,4 +1,5 @@
 """Search a good noise schedule for WaveGrad for a given number of inference iterations"""
+
 import argparse
 from itertools import product as cartesian_product
 
diff --git a/TTS/tts/datasets/dataset.py b/TTS/tts/datasets/dataset.py
index d5928940..9d0c45ad 100644
--- a/TTS/tts/datasets/dataset.py
+++ b/TTS/tts/datasets/dataset.py
@@ -457,9 +457,11 @@ class TTSDataset(Dataset):
 
         # lengths adjusted by the reduction factor
         mel_lengths_adjusted = [
-            m.shape[1] + (self.outputs_per_step - (m.shape[1] % self.outputs_per_step))
-            if m.shape[1] % self.outputs_per_step
-            else m.shape[1]
+            (
+                m.shape[1] + (self.outputs_per_step - (m.shape[1] % self.outputs_per_step))
+                if m.shape[1] % self.outputs_per_step
+                else m.shape[1]
+            )
             for m in mel
         ]
 
diff --git a/TTS/tts/layers/bark/model.py b/TTS/tts/layers/bark/model.py
index c84022bd..68c50dbd 100644
--- a/TTS/tts/layers/bark/model.py
+++ b/TTS/tts/layers/bark/model.py
@@ -2,6 +2,7 @@
 Much of this code is adapted from Andrej Karpathy's NanoGPT
 (https://github.com/karpathy/nanoGPT)
 """
+
 import math
 from dataclasses import dataclass
 
diff --git a/TTS/tts/layers/bark/model_fine.py b/TTS/tts/layers/bark/model_fine.py
index 09e5f476..29126b41 100644
--- a/TTS/tts/layers/bark/model_fine.py
+++ b/TTS/tts/layers/bark/model_fine.py
@@ -2,6 +2,7 @@
 Much of this code is adapted from Andrej Karpathy's NanoGPT
 (https://github.com/karpathy/nanoGPT)
 """
+
 import math
 from dataclasses import dataclass
 
diff --git a/TTS/tts/layers/xtts/gpt.py b/TTS/tts/layers/xtts/gpt.py
index ca0dc7cc..b55b84d9 100644
--- a/TTS/tts/layers/xtts/gpt.py
+++ b/TTS/tts/layers/xtts/gpt.py
@@ -187,9 +187,9 @@ class GPT(nn.Module):
     def get_grad_norm_parameter_groups(self):
         return {
             "conditioning_encoder": list(self.conditioning_encoder.parameters()),
-            "conditioning_perceiver": list(self.conditioning_perceiver.parameters())
-            if self.use_perceiver_resampler
-            else None,
+            "conditioning_perceiver": (
+                list(self.conditioning_perceiver.parameters()) if self.use_perceiver_resampler else None
+            ),
             "gpt": list(self.gpt.parameters()),
             "heads": list(self.text_head.parameters()) + list(self.mel_head.parameters()),
         }
diff --git a/TTS/tts/layers/xtts/trainer/dataset.py b/TTS/tts/layers/xtts/trainer/dataset.py
index 4d6d6ede..0a19997a 100644
--- a/TTS/tts/layers/xtts/trainer/dataset.py
+++ b/TTS/tts/layers/xtts/trainer/dataset.py
@@ -186,9 +186,9 @@ class XTTSDataset(torch.utils.data.Dataset):
             "wav_lengths": torch.tensor(wav.shape[-1], dtype=torch.long),
             "filenames": audiopath,
             "conditioning": cond.unsqueeze(1),
-            "cond_lens": torch.tensor(cond_len, dtype=torch.long)
-            if cond_len is not torch.nan
-            else torch.tensor([cond_len]),
+            "cond_lens": (
+                torch.tensor(cond_len, dtype=torch.long) if cond_len is not torch.nan else torch.tensor([cond_len])
+            ),
             "cond_idxs": torch.tensor(cond_idxs) if cond_idxs is not torch.nan else torch.tensor([cond_idxs]),
         }
         return res
diff --git a/TTS/tts/models/bark.py b/TTS/tts/models/bark.py
index e5edffd4..833a9093 100644
--- a/TTS/tts/models/bark.py
+++ b/TTS/tts/models/bark.py
@@ -225,14 +225,11 @@ class Bark(BaseTTS):
 
         return return_dict
 
-    def eval_step(self):
-        ...
+    def eval_step(self): ...
 
-    def forward(self):
-        ...
+    def forward(self): ...
 
-    def inference(self):
-        ...
+    def inference(self): ...
 
     @staticmethod
     def init_from_config(config: "BarkConfig", **kwargs):  # pylint: disable=unused-argument
diff --git a/TTS/tts/models/base_tts.py b/TTS/tts/models/base_tts.py
index be76f6c2..0aa5edc6 100644
--- a/TTS/tts/models/base_tts.py
+++ b/TTS/tts/models/base_tts.py
@@ -369,9 +369,11 @@ class BaseTTS(BaseTrainerModel):
             d_vector = (random.sample(sorted(d_vector), 1),)
 
         aux_inputs = {
-            "speaker_id": None
-            if not self.config.use_speaker_embedding
-            else random.sample(sorted(self.speaker_manager.name_to_id.values()), 1),
+            "speaker_id": (
+                None
+                if not self.config.use_speaker_embedding
+                else random.sample(sorted(self.speaker_manager.name_to_id.values()), 1)
+            ),
             "d_vector": d_vector,
             "style_wav": None,  # TODO: handle GST style input
         }
diff --git a/TTS/tts/models/tacotron.py b/TTS/tts/models/tacotron.py
index 474ec464..400a86d0 100644
--- a/TTS/tts/models/tacotron.py
+++ b/TTS/tts/models/tacotron.py
@@ -101,12 +101,16 @@ class Tacotron(BaseTacotron):
                 num_mel=self.decoder_output_dim,
                 encoder_output_dim=self.encoder_in_features,
                 capacitron_VAE_embedding_dim=self.capacitron_vae.capacitron_VAE_embedding_dim,
-                speaker_embedding_dim=self.embedded_speaker_dim
-                if self.use_speaker_embedding and self.capacitron_vae.capacitron_use_speaker_embedding
-                else None,
-                text_summary_embedding_dim=self.capacitron_vae.capacitron_text_summary_embedding_dim
-                if self.capacitron_vae.capacitron_use_text_summary_embeddings
-                else None,
+                speaker_embedding_dim=(
+                    self.embedded_speaker_dim
+                    if self.use_speaker_embedding and self.capacitron_vae.capacitron_use_speaker_embedding
+                    else None
+                ),
+                text_summary_embedding_dim=(
+                    self.capacitron_vae.capacitron_text_summary_embedding_dim
+                    if self.capacitron_vae.capacitron_use_text_summary_embeddings
+                    else None
+                ),
             )
 
         # backward pass decoder
@@ -171,9 +175,9 @@
             encoder_outputs, *capacitron_vae_outputs = self.compute_capacitron_VAE_embedding(
                 encoder_outputs,
                 reference_mel_info=[mel_specs, mel_lengths],
-                text_info=[inputs, text_lengths]
-                if self.capacitron_vae.capacitron_use_text_summary_embeddings
-                else None,
+                text_info=(
+                    [inputs, text_lengths] if self.capacitron_vae.capacitron_use_text_summary_embeddings else None
+                ),
                 speaker_embedding=embedded_speakers if self.capacitron_vae.capacitron_use_speaker_embedding else None,
             )
         else:
@@ -237,13 +241,13 @@
             # B x capacitron_VAE_embedding_dim
             encoder_outputs, *_ = self.compute_capacitron_VAE_embedding(
                 encoder_outputs,
-                reference_mel_info=[aux_input["style_mel"], reference_mel_length]
-                if aux_input["style_mel"] is not None
-                else None,
+                reference_mel_info=(
+                    [aux_input["style_mel"], reference_mel_length] if aux_input["style_mel"] is not None else None
+                ),
                 text_info=[style_text_embedding, style_text_length] if aux_input["style_text"] is not None else None,
-                speaker_embedding=aux_input["d_vectors"]
-                if self.capacitron_vae.capacitron_use_speaker_embedding
-                else None,
+                speaker_embedding=(
+                    aux_input["d_vectors"] if self.capacitron_vae.capacitron_use_speaker_embedding else None
+                ),
             )
         if self.num_speakers > 1:
             if not self.use_d_vector_file:
diff --git a/TTS/tts/models/tacotron2.py b/TTS/tts/models/tacotron2.py
index 71ab1eac..4b1317f4 100644
--- a/TTS/tts/models/tacotron2.py
+++ b/TTS/tts/models/tacotron2.py
@@ -113,12 +113,14 @@ class Tacotron2(BaseTacotron):
                 num_mel=self.decoder_output_dim,
                 encoder_output_dim=self.encoder_in_features,
                 capacitron_VAE_embedding_dim=self.capacitron_vae.capacitron_VAE_embedding_dim,
-                speaker_embedding_dim=self.embedded_speaker_dim
-                if self.capacitron_vae.capacitron_use_speaker_embedding
-                else None,
-                text_summary_embedding_dim=self.capacitron_vae.capacitron_text_summary_embedding_dim
-                if self.capacitron_vae.capacitron_use_text_summary_embeddings
-                else None,
+                speaker_embedding_dim=(
+                    self.embedded_speaker_dim if self.capacitron_vae.capacitron_use_speaker_embedding else None
+                ),
+                text_summary_embedding_dim=(
+                    self.capacitron_vae.capacitron_text_summary_embedding_dim
+                    if self.capacitron_vae.capacitron_use_text_summary_embeddings
+                    else None
+                ),
             )
 
         # backward pass decoder
@@ -191,9 +193,11 @@ class Tacotron2(BaseTacotron):
             encoder_outputs, *capacitron_vae_outputs = self.compute_capacitron_VAE_embedding(
                 encoder_outputs,
                 reference_mel_info=[mel_specs, mel_lengths],
-                text_info=[embedded_inputs.transpose(1, 2), text_lengths]
-                if self.capacitron_vae.capacitron_use_text_summary_embeddings
-                else None,
+                text_info=(
+                    [embedded_inputs.transpose(1, 2), text_lengths]
+                    if self.capacitron_vae.capacitron_use_text_summary_embeddings
+                    else None
+                ),
                 speaker_embedding=embedded_speakers if self.capacitron_vae.capacitron_use_speaker_embedding else None,
             )
         else:
@@ -265,13 +269,13 @@
             # B x capacitron_VAE_embedding_dim
             encoder_outputs, *_ = self.compute_capacitron_VAE_embedding(
                 encoder_outputs,
-                reference_mel_info=[aux_input["style_mel"], reference_mel_length]
-                if aux_input["style_mel"] is not None
-                else None,
+                reference_mel_info=(
+                    [aux_input["style_mel"], reference_mel_length] if aux_input["style_mel"] is not None else None
+                ),
                 text_info=[style_text_embedding, style_text_length] if aux_input["style_text"] is not None else None,
-                speaker_embedding=aux_input["d_vectors"]
-                if self.capacitron_vae.capacitron_use_speaker_embedding
-                else None,
+                speaker_embedding=(
+                    aux_input["d_vectors"] if self.capacitron_vae.capacitron_use_speaker_embedding else None
+                ),
             )
 
         if self.num_speakers > 1:
diff --git a/TTS/tts/models/tortoise.py b/TTS/tts/models/tortoise.py
index 16644ff9..99e0107f 100644
--- a/TTS/tts/models/tortoise.py
+++ b/TTS/tts/models/tortoise.py
@@ -715,8 +715,9 @@ class Tortoise(BaseTTS):
             self.autoregressive = self.autoregressive.to(self.device)
             if verbose:
                 print("Generating autoregressive samples..")
-            with self.temporary_cuda(self.autoregressive) as autoregressive, torch.autocast(
-                device_type="cuda", dtype=torch.float16, enabled=half
+            with (
+                self.temporary_cuda(self.autoregressive) as autoregressive,
+                torch.autocast(device_type="cuda", dtype=torch.float16, enabled=half),
             ):
                 for b in tqdm(range(num_batches), disable=not verbose):
                     codes = autoregressive.inference_speech(
@@ -737,8 +738,9 @@
             self.autoregressive_batch_size = orig_batch_size  # in the case of single_sample
 
             clip_results = []
-            with self.temporary_cuda(self.clvp) as clvp, torch.autocast(
-                device_type="cuda", dtype=torch.float16, enabled=half
+            with (
+                self.temporary_cuda(self.clvp) as clvp,
+                torch.autocast(device_type="cuda", dtype=torch.float16, enabled=half),
             ):
                 for batch in tqdm(samples, disable=not verbose):
                     for i in range(batch.shape[0]):
diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py
index 2c60ece7..e91d26b9 100644
--- a/TTS/tts/models/vits.py
+++ b/TTS/tts/models/vits.py
@@ -1887,9 +1887,11 @@ class Vits(BaseTTS):
         import onnxruntime as ort
 
         providers = [
-            "CPUExecutionProvider"
-            if cuda is False
-            else ("CUDAExecutionProvider", {"cudnn_conv_algo_search": "DEFAULT"})
+            (
+                "CPUExecutionProvider"
+                if cuda is False
+                else ("CUDAExecutionProvider", {"cudnn_conv_algo_search": "DEFAULT"})
+            )
         ]
         sess_options = ort.SessionOptions()
         self.onnx_sess = ort.InferenceSession(
diff --git a/TTS/tts/utils/ssim.py b/TTS/tts/utils/ssim.py
index 4bc3befc..eddf05db 100644
--- a/TTS/tts/utils/ssim.py
+++ b/TTS/tts/utils/ssim.py
@@ -207,6 +207,7 @@ class SSIMLoss(_Loss):
         https://ece.uwaterloo.ca/~z70wang/publications/ssim.pdf,
         DOI:`10.1109/TIP.2003.819861`
     """
+
     __constants__ = ["kernel_size", "k1", "k2", "sigma", "kernel", "reduction"]
 
     def __init__(
diff --git a/TTS/tts/utils/text/cleaners.py b/TTS/tts/utils/text/cleaners.py
index 74d3910b..794a87c8 100644
--- a/TTS/tts/utils/text/cleaners.py
+++ b/TTS/tts/utils/text/cleaners.py
@@ -1,4 +1,5 @@
 """Set of default text cleaners"""
+
 # TODO: pick the cleaner for languages dynamically
 
 import re
diff --git a/TTS/utils/download.py b/TTS/utils/download.py
index 3f06b578..37e6ed3c 100644
--- a/TTS/utils/download.py
+++ b/TTS/utils/download.py
@@ -36,13 +36,16 @@ def stream_url(
     if start_byte:
         req.headers["Range"] = "bytes={}-".format(start_byte)
 
-    with urllib.request.urlopen(req) as upointer, tqdm(
-        unit="B",
-        unit_scale=True,
-        unit_divisor=1024,
-        total=url_size,
-        disable=not progress_bar,
-    ) as pbar:
+    with (
+        urllib.request.urlopen(req) as upointer,
+        tqdm(
+            unit="B",
+            unit_scale=True,
+            unit_divisor=1024,
+            total=url_size,
+            disable=not progress_bar,
+        ) as pbar,
+    ):
         num_bytes = 0
         while True:
             chunk = upointer.read(block_size)
diff --git a/TTS/vc/models/base_vc.py b/TTS/vc/models/base_vc.py
index 19f2761b..78f1556b 100644
--- a/TTS/vc/models/base_vc.py
+++ b/TTS/vc/models/base_vc.py
@@ -357,9 +357,11 @@ class BaseVC(BaseTrainerModel):
             d_vector = (random.sample(sorted(d_vector), 1),)
 
         aux_inputs = {
-            "speaker_id": None
-            if not self.config.use_speaker_embedding
-            else random.sample(sorted(self.speaker_manager.name_to_id.values()), 1),
+            "speaker_id": (
+                None
+                if not self.config.use_speaker_embedding
+                else random.sample(sorted(self.speaker_manager.name_to_id.values()), 1)
+            ),
             "d_vector": d_vector,
             "style_wav": None,  # TODO: handle GST style input
         }
diff --git a/TTS/vc/models/freevc.py b/TTS/vc/models/freevc.py
index a5a340f2..8f2a35d2 100644
--- a/TTS/vc/models/freevc.py
+++ b/TTS/vc/models/freevc.py
@@ -544,8 +544,7 @@ class FreeVC(BaseVC):
         audio = audio[0][0].data.cpu().float().numpy()
         return audio
 
-    def eval_step():
-        ...
+    def eval_step(): ...
 
     @staticmethod
     def init_from_config(config: FreeVCConfig, samples: Union[List[List], List[Dict]] = None, verbose=True):
@@ -558,5 +557,4 @@
         if eval:
             self.eval()
 
-    def train_step():
-        ...
+    def train_step(): ...
diff --git a/TTS/vc/modules/freevc/wavlm/wavlm.py b/TTS/vc/modules/freevc/wavlm/wavlm.py
index d2f28d19..10dd09ed 100644
--- a/TTS/vc/modules/freevc/wavlm/wavlm.py
+++ b/TTS/vc/modules/freevc/wavlm/wavlm.py
@@ -155,7 +155,9 @@ def compute_mask_indices(
 
 class WavLMConfig:
     def __init__(self, cfg=None):
-        self.extractor_mode: str = "default"  # mode for feature extractor. default has a single group norm with d groups in the first conv block, whereas layer_norm has layer norms in every block (meant to use with normalize=True)
+        self.extractor_mode: str = (
+            "default"  # mode for feature extractor. default has a single group norm with d groups in the first conv block, whereas layer_norm has layer norms in every block (meant to use with normalize=True)
+        )
 
         self.encoder_layers: int = 12  # num encoder layers in the transformer
         self.encoder_embed_dim: int = 768  # encoder embedding dimension
@@ -164,7 +166,9 @@ class WavLMConfig:
         self.activation_fn: str = "gelu"  # activation function to use
         self.layer_norm_first: bool = False  # apply layernorm first in the transformer
 
-        self.conv_feature_layers: str = "[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2"  # string describing convolutional feature extraction layers in form of a python list that contains [(dim, kernel_size, stride), ...]
+        self.conv_feature_layers: str = (
+            "[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2"  # string describing convolutional feature extraction layers in form of a python list that contains [(dim, kernel_size, stride), ...]
+        )
         self.conv_bias: bool = False  # include bias in conv encoder
         self.feature_grad_mult: float = 1.0  # multiply feature extractor var grads by this
 
diff --git a/requirements.dev.txt b/requirements.dev.txt
index 1c23a181..68450fca 100644
--- a/requirements.dev.txt
+++ b/requirements.dev.txt
@@ -1,4 +1,4 @@
-black
+black==24.2.0
 coverage
 nose2
 ruff==0.3.0
diff --git a/tests/text_tests/test_phonemizer.py b/tests/text_tests/test_phonemizer.py
index 88105544..ca25b302 100644
--- a/tests/text_tests/test_phonemizer.py
+++ b/tests/text_tests/test_phonemizer.py
@@ -234,8 +234,12 @@ class TestZH_CN_Phonemizer(unittest.TestCase):
 class TestBN_Phonemizer(unittest.TestCase):
     def setUp(self):
         self.phonemizer = BN_Phonemizer()
-        self._TEST_CASES = "রাসূলুল্লাহ সাল্লাল্লাহু আলাইহি ওয়া সাল্লাম শিক্ষা দিয়েছেন যে, কেউ যদি কোন খারাপ কিছুর সম্মুখীন হয়, তখনও যেন"
-        self._EXPECTED = "রাসূলুল্লাহ সাল্লাল্লাহু আলাইহি ওয়া সাল্লাম শিক্ষা দিয়েছেন যে কেউ যদি কোন খারাপ কিছুর সম্মুখীন হয় তখনও যেন।"
+        self._TEST_CASES = (
+            "রাসূলুল্লাহ সাল্লাল্লাহু আলাইহি ওয়া সাল্লাম শিক্ষা দিয়েছেন যে, কেউ যদি কোন খারাপ কিছুর সম্মুখীন হয়, তখনও যেন"
+        )
+        self._EXPECTED = (
+            "রাসূলুল্লাহ সাল্লাল্লাহু আলাইহি ওয়া সাল্লাম শিক্ষা দিয়েছেন যে কেউ যদি কোন খারাপ কিছুর সম্মুখীন হয় তখনও যেন।"
+        )
 
     def test_phonemize(self):
         self.assertEqual(self.phonemizer.phonemize(self._TEST_CASES, separator=""), self._EXPECTED)
diff --git a/tests/vc_tests/test_freevc.py b/tests/vc_tests/test_freevc.py
index 3755ab3f..c9e6cedf 100644
--- a/tests/vc_tests/test_freevc.py
+++ b/tests/vc_tests/test_freevc.py
@@ -115,20 +115,14 @@ class TestFreeVC(unittest.TestCase):
             output_wav.shape[0] + config.audio.hop_length == source_wav.shape[0]
         ), f"{output_wav.shape} != {source_wav.shape}"
 
-    def test_train_step(self):
-        ...
+    def test_train_step(self): ...
 
-    def test_train_eval_log(self):
-        ...
+    def test_train_eval_log(self): ...
 
-    def test_test_run(self):
-        ...
+    def test_test_run(self): ...
 
-    def test_load_checkpoint(self):
-        ...
+    def test_load_checkpoint(self): ...
 
-    def test_get_criterion(self):
-        ...
+    def test_get_criterion(self): ...
 
-    def test_init_from_config(self):
-        ...
+    def test_init_from_config(self): ...
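
Note on the hunks above: apart from the two version pins, every change is mechanical restyling produced by Black 24.x. Three style changes account for nearly all of the diff: bodies consisting only of `...` are collapsed onto the `def` line, long conditional (ternary) expressions are wrapped in their own parentheses with one clause per line, and multiple context managers are grouped into a single parenthesized `with`. The sketch below is illustrative only — the names are made up and not from the TTS codebase, short expressions are shown pre-split for readability (Black itself only splits lines that exceed the line length), and the parenthesized `with` assumes a Python version whose grammar accepts it (officially 3.10+):

    # Illustrative sketch of the three Black 24.x styles applied in this diff.
    from contextlib import nullcontext


    class Stub:
        # 1. Black 24.x collapses `...`-only bodies onto the `def` line.
        def eval_step(self): ...


    def pick_device(use_gpu: bool = False) -> str:
        # 2. A long conditional expression is wrapped in its own parentheses,
        #    one clause per line, instead of hanging off the assignment.
        device = (
            "cuda"
            if use_gpu
            else "cpu"
        )
        # 3. Several context managers are grouped in one parenthesized `with`
        #    instead of a single overlong header line or nested blocks.
        with (
            nullcontext(device) as dev,
            nullcontext("ready") as status,
        ):
            return f"{dev}:{status}"


    print(pick_device())  # -> cpu:ready

Pinning black==24.2.0 in both .pre-commit-config.yaml and requirements.dev.txt keeps the pre-commit hook and a manually installed formatter producing identical output, so CI and local runs cannot disagree on style.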