style: run black

Enno Hermann 2024-03-07 11:46:51 +01:00
parent c86cf9b2ef
commit efdafd5a7f
23 changed files with 111 additions and 85 deletions

View File

@@ -1,4 +1,5 @@
 """Get detailed info about the working environment."""
+
 import json
 import os
 import platform
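
Note on the pattern: the only change in this file and in the next three scripts is a blank line inserted after the module docstring, presumably the rule in Black's 2024 stable style that separates a module docstring from the first statement. A minimal sketch of the before/after, using this file's first lines:

    # before: first import directly under the docstring
    """Get detailed info about the working environment."""
    import json

    # after: blank line enforced after the module docstring
    """Get detailed info about the working environment."""

    import json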

View File

@@ -1,4 +1,5 @@
 """Find all the unique characters in a dataset"""
+
 import argparse
 from argparse import RawTextHelpFormatter

View File

@@ -1,4 +1,5 @@
 """Find all the unique characters in a dataset"""
+
 import argparse
 import multiprocessing
 from argparse import RawTextHelpFormatter

View File

@@ -1,4 +1,5 @@
 """Search a good noise schedule for WaveGrad for a given number of inference iterations"""
+
 import argparse
 from itertools import product as cartesian_product

View File

@@ -457,9 +457,11 @@ class TTSDataset(Dataset):
             # lengths adjusted by the reduction factor
             mel_lengths_adjusted = [
-                m.shape[1] + (self.outputs_per_step - (m.shape[1] % self.outputs_per_step))
-                if m.shape[1] % self.outputs_per_step
-                else m.shape[1]
+                (
+                    m.shape[1] + (self.outputs_per_step - (m.shape[1] % self.outputs_per_step))
+                    if m.shape[1] % self.outputs_per_step
+                    else m.shape[1]
+                )
                 for m in mel
             ]
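
Most of the remaining hunks in this commit apply the same rule: a conditional expression that spans multiple lines is wrapped in explicit parentheses, with no change in behaviour; this appears to be another part of Black's 2024 stable style. For reference, the comprehension above rounds each spectrogram length up to the next multiple of the reduction factor. A standalone sketch of that arithmetic (round_up is a hypothetical helper, not part of the codebase):

    def round_up(length: int, outputs_per_step: int) -> int:
        # pad to the next multiple of the reduction factor, unless already aligned
        if length % outputs_per_step:
            return length + (outputs_per_step - (length % outputs_per_step))
        return length

    assert round_up(10, 4) == 12  # 10 frames become 12 when outputs_per_step is 4
    assert round_up(12, 4) == 12  # already a multiple, unchanged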

View File

@@ -2,6 +2,7 @@
 Much of this code is adapted from Andrej Karpathy's NanoGPT
 (https://github.com/karpathy/nanoGPT)
 """
+
 import math
 from dataclasses import dataclass

View File

@@ -2,6 +2,7 @@
 Much of this code is adapted from Andrej Karpathy's NanoGPT
 (https://github.com/karpathy/nanoGPT)
 """
+
 import math
 from dataclasses import dataclass

View File

@@ -187,9 +187,9 @@ class GPT(nn.Module):
     def get_grad_norm_parameter_groups(self):
         return {
             "conditioning_encoder": list(self.conditioning_encoder.parameters()),
-            "conditioning_perceiver": list(self.conditioning_perceiver.parameters())
-            if self.use_perceiver_resampler
-            else None,
+            "conditioning_perceiver": (
+                list(self.conditioning_perceiver.parameters()) if self.use_perceiver_resampler else None
+            ),
             "gpt": list(self.gpt.parameters()),
             "heads": list(self.text_head.parameters()) + list(self.mel_head.parameters()),
         }

View File

@@ -186,9 +186,9 @@ class XTTSDataset(torch.utils.data.Dataset):
             "wav_lengths": torch.tensor(wav.shape[-1], dtype=torch.long),
             "filenames": audiopath,
             "conditioning": cond.unsqueeze(1),
-            "cond_lens": torch.tensor(cond_len, dtype=torch.long)
-            if cond_len is not torch.nan
-            else torch.tensor([cond_len]),
+            "cond_lens": (
+                torch.tensor(cond_len, dtype=torch.long) if cond_len is not torch.nan else torch.tensor([cond_len])
+            ),
             "cond_idxs": torch.tensor(cond_idxs) if cond_idxs is not torch.nan else torch.tensor([cond_idxs]),
         }
         return res

View File

@@ -225,14 +225,11 @@ class Bark(BaseTTS):
         return return_dict

-    def eval_step(self):
-        ...
+    def eval_step(self): ...

-    def forward(self):
-        ...
+    def forward(self): ...

-    def inference(self):
-        ...
+    def inference(self): ...

     @staticmethod
     def init_from_config(config: "BarkConfig", **kwargs):  # pylint: disable=unused-argument
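
The same Black release also collapses dummy implementations whose only body is an Ellipsis onto a single line, as seen here and again in FreeVC and its tests further down. The pattern in isolation:

    # before: the stub body sits on its own line
    def eval_step():
        ...

    # after: a compact single-line dummy implementation
    def eval_step(): ...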

View File

@@ -369,9 +369,11 @@ class BaseTTS(BaseTrainerModel):
             d_vector = (random.sample(sorted(d_vector), 1),)

         aux_inputs = {
-            "speaker_id": None
-            if not self.config.use_speaker_embedding
-            else random.sample(sorted(self.speaker_manager.name_to_id.values()), 1),
+            "speaker_id": (
+                None
+                if not self.config.use_speaker_embedding
+                else random.sample(sorted(self.speaker_manager.name_to_id.values()), 1)
+            ),
             "d_vector": d_vector,
             "style_wav": None,  # TODO: handle GST style input
         }

View File

@@ -101,12 +101,16 @@ class Tacotron(BaseTacotron):
                 num_mel=self.decoder_output_dim,
                 encoder_output_dim=self.encoder_in_features,
                 capacitron_VAE_embedding_dim=self.capacitron_vae.capacitron_VAE_embedding_dim,
-                speaker_embedding_dim=self.embedded_speaker_dim
-                if self.use_speaker_embedding and self.capacitron_vae.capacitron_use_speaker_embedding
-                else None,
-                text_summary_embedding_dim=self.capacitron_vae.capacitron_text_summary_embedding_dim
-                if self.capacitron_vae.capacitron_use_text_summary_embeddings
-                else None,
+                speaker_embedding_dim=(
+                    self.embedded_speaker_dim
+                    if self.use_speaker_embedding and self.capacitron_vae.capacitron_use_speaker_embedding
+                    else None
+                ),
+                text_summary_embedding_dim=(
+                    self.capacitron_vae.capacitron_text_summary_embedding_dim
+                    if self.capacitron_vae.capacitron_use_text_summary_embeddings
+                    else None
+                ),
             )

         # backward pass decoder
@@ -171,9 +175,9 @@ class Tacotron(BaseTacotron):
                 encoder_outputs, *capacitron_vae_outputs = self.compute_capacitron_VAE_embedding(
                     encoder_outputs,
                     reference_mel_info=[mel_specs, mel_lengths],
-                    text_info=[inputs, text_lengths]
-                    if self.capacitron_vae.capacitron_use_text_summary_embeddings
-                    else None,
+                    text_info=(
+                        [inputs, text_lengths] if self.capacitron_vae.capacitron_use_text_summary_embeddings else None
+                    ),
                     speaker_embedding=embedded_speakers if self.capacitron_vae.capacitron_use_speaker_embedding else None,
                 )
             else:
@@ -237,13 +241,13 @@ class Tacotron(BaseTacotron):
                 # B x capacitron_VAE_embedding_dim
                 encoder_outputs, *_ = self.compute_capacitron_VAE_embedding(
                     encoder_outputs,
-                    reference_mel_info=[aux_input["style_mel"], reference_mel_length]
-                    if aux_input["style_mel"] is not None
-                    else None,
+                    reference_mel_info=(
+                        [aux_input["style_mel"], reference_mel_length] if aux_input["style_mel"] is not None else None
+                    ),
                     text_info=[style_text_embedding, style_text_length] if aux_input["style_text"] is not None else None,
-                    speaker_embedding=aux_input["d_vectors"]
-                    if self.capacitron_vae.capacitron_use_speaker_embedding
-                    else None,
+                    speaker_embedding=(
+                        aux_input["d_vectors"] if self.capacitron_vae.capacitron_use_speaker_embedding else None
+                    ),
                 )

         if self.num_speakers > 1:
             if not self.use_d_vector_file:

View File

@@ -113,12 +113,14 @@ class Tacotron2(BaseTacotron):
                 num_mel=self.decoder_output_dim,
                 encoder_output_dim=self.encoder_in_features,
                 capacitron_VAE_embedding_dim=self.capacitron_vae.capacitron_VAE_embedding_dim,
-                speaker_embedding_dim=self.embedded_speaker_dim
-                if self.capacitron_vae.capacitron_use_speaker_embedding
-                else None,
-                text_summary_embedding_dim=self.capacitron_vae.capacitron_text_summary_embedding_dim
-                if self.capacitron_vae.capacitron_use_text_summary_embeddings
-                else None,
+                speaker_embedding_dim=(
+                    self.embedded_speaker_dim if self.capacitron_vae.capacitron_use_speaker_embedding else None
+                ),
+                text_summary_embedding_dim=(
+                    self.capacitron_vae.capacitron_text_summary_embedding_dim
+                    if self.capacitron_vae.capacitron_use_text_summary_embeddings
+                    else None
+                ),
             )

         # backward pass decoder
@@ -191,9 +193,11 @@ class Tacotron2(BaseTacotron):
                 encoder_outputs, *capacitron_vae_outputs = self.compute_capacitron_VAE_embedding(
                     encoder_outputs,
                     reference_mel_info=[mel_specs, mel_lengths],
-                    text_info=[embedded_inputs.transpose(1, 2), text_lengths]
-                    if self.capacitron_vae.capacitron_use_text_summary_embeddings
-                    else None,
+                    text_info=(
+                        [embedded_inputs.transpose(1, 2), text_lengths]
+                        if self.capacitron_vae.capacitron_use_text_summary_embeddings
+                        else None
+                    ),
                     speaker_embedding=embedded_speakers if self.capacitron_vae.capacitron_use_speaker_embedding else None,
                 )
             else:
@@ -265,13 +269,13 @@ class Tacotron2(BaseTacotron):
                 # B x capacitron_VAE_embedding_dim
                 encoder_outputs, *_ = self.compute_capacitron_VAE_embedding(
                     encoder_outputs,
-                    reference_mel_info=[aux_input["style_mel"], reference_mel_length]
-                    if aux_input["style_mel"] is not None
-                    else None,
+                    reference_mel_info=(
+                        [aux_input["style_mel"], reference_mel_length] if aux_input["style_mel"] is not None else None
+                    ),
                     text_info=[style_text_embedding, style_text_length] if aux_input["style_text"] is not None else None,
-                    speaker_embedding=aux_input["d_vectors"]
-                    if self.capacitron_vae.capacitron_use_speaker_embedding
-                    else None,
+                    speaker_embedding=(
+                        aux_input["d_vectors"] if self.capacitron_vae.capacitron_use_speaker_embedding else None
+                    ),
                 )

         if self.num_speakers > 1:

View File

@@ -715,8 +715,9 @@ class Tortoise(BaseTTS):
         self.autoregressive = self.autoregressive.to(self.device)
         if verbose:
             print("Generating autoregressive samples..")
-        with self.temporary_cuda(self.autoregressive) as autoregressive, torch.autocast(
-            device_type="cuda", dtype=torch.float16, enabled=half
+        with (
+            self.temporary_cuda(self.autoregressive) as autoregressive,
+            torch.autocast(device_type="cuda", dtype=torch.float16, enabled=half),
         ):
             for b in tqdm(range(num_batches), disable=not verbose):
                 codes = autoregressive.inference_speech(
@@ -737,8 +738,9 @@ class Tortoise(BaseTTS):
             self.autoregressive_batch_size = orig_batch_size  # in the case of single_sample

         clip_results = []
-        with self.temporary_cuda(self.clvp) as clvp, torch.autocast(
-            device_type="cuda", dtype=torch.float16, enabled=half
+        with (
+            self.temporary_cuda(self.clvp) as clvp,
+            torch.autocast(device_type="cuda", dtype=torch.float16, enabled=half),
         ):
             for batch in tqdm(samples, disable=not verbose):
                 for i in range(batch.shape[0]):
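
These two hunks rewrite with statements that hold several context managers using the parenthesized form, which Black emits when the configured target Python version supports the syntax (3.9 and later, assuming that is what this project targets); the managed objects and the body are unchanged. A self-contained sketch of the same transformation:

    from contextlib import nullcontext

    # before: both managers crammed onto one header line
    with nullcontext("a") as first, nullcontext("b") as second:
        print(first, second)

    # after: parenthesized context managers, one per line
    with (
        nullcontext("a") as first,
        nullcontext("b") as second,
    ):
        print(first, second)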

View File

@@ -1887,9 +1887,11 @@ class Vits(BaseTTS):
         import onnxruntime as ort

         providers = [
-            "CPUExecutionProvider"
-            if cuda is False
-            else ("CUDAExecutionProvider", {"cudnn_conv_algo_search": "DEFAULT"})
+            (
+                "CPUExecutionProvider"
+                if cuda is False
+                else ("CUDAExecutionProvider", {"cudnn_conv_algo_search": "DEFAULT"})
+            )
         ]
         sess_options = ort.SessionOptions()
         self.onnx_sess = ort.InferenceSession(
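
For context, the providers list built here is passed straight to onnxruntime when the ONNX session is created; each entry is either a provider name or a (name, options) pair. A hedged sketch of that call, where the model path and the use_cuda flag are illustrative stand-ins rather than names from the diff:

    import onnxruntime as ort

    use_cuda = False  # placeholder for the method's `cuda` argument
    providers = [
        (
            "CPUExecutionProvider"
            if not use_cuda
            else ("CUDAExecutionProvider", {"cudnn_conv_algo_search": "DEFAULT"})
        )
    ]
    sess_options = ort.SessionOptions()
    # "model.onnx" is a placeholder path for the exported graph
    sess = ort.InferenceSession("model.onnx", sess_options=sess_options, providers=providers)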

View File

@@ -207,6 +207,7 @@ class SSIMLoss(_Loss):
     https://ece.uwaterloo.ca/~z70wang/publications/ssim.pdf,
     DOI:`10.1109/TIP.2003.819861`
     """
+
     __constants__ = ["kernel_size", "k1", "k2", "sigma", "kernel", "reduction"]

     def __init__(

View File

@@ -1,4 +1,5 @@
 """Set of default text cleaners"""
+
 # TODO: pick the cleaner for languages dynamically
 import re

View File

@@ -36,13 +36,16 @@ def stream_url(
     if start_byte:
         req.headers["Range"] = "bytes={}-".format(start_byte)

-    with urllib.request.urlopen(req) as upointer, tqdm(
-        unit="B",
-        unit_scale=True,
-        unit_divisor=1024,
-        total=url_size,
-        disable=not progress_bar,
-    ) as pbar:
+    with (
+        urllib.request.urlopen(req) as upointer,
+        tqdm(
+            unit="B",
+            unit_scale=True,
+            unit_divisor=1024,
+            total=url_size,
+            disable=not progress_bar,
+        ) as pbar,
+    ):
         num_bytes = 0
         while True:
             chunk = upointer.read(block_size)

View File

@@ -357,9 +357,11 @@ class BaseVC(BaseTrainerModel):
             d_vector = (random.sample(sorted(d_vector), 1),)

         aux_inputs = {
-            "speaker_id": None
-            if not self.config.use_speaker_embedding
-            else random.sample(sorted(self.speaker_manager.name_to_id.values()), 1),
+            "speaker_id": (
+                None
+                if not self.config.use_speaker_embedding
+                else random.sample(sorted(self.speaker_manager.name_to_id.values()), 1)
+            ),
             "d_vector": d_vector,
             "style_wav": None,  # TODO: handle GST style input
         }

View File

@@ -544,8 +544,7 @@ class FreeVC(BaseVC):
         audio = audio[0][0].data.cpu().float().numpy()
         return audio

-    def eval_step():
-        ...
+    def eval_step(): ...

     @staticmethod
     def init_from_config(config: FreeVCConfig, samples: Union[List[List], List[Dict]] = None, verbose=True):
@@ -558,5 +557,4 @@ class FreeVC(BaseVC):
         if eval:
             self.eval()

-    def train_step():
-        ...
+    def train_step(): ...

View File

@@ -155,7 +155,9 @@ def compute_mask_indices(

 class WavLMConfig:
     def __init__(self, cfg=None):
-        self.extractor_mode: str = "default"  # mode for feature extractor. default has a single group norm with d groups in the first conv block, whereas layer_norm has layer norms in every block (meant to use with normalize=True)
+        self.extractor_mode: str = (
+            "default"  # mode for feature extractor. default has a single group norm with d groups in the first conv block, whereas layer_norm has layer norms in every block (meant to use with normalize=True)
+        )
         self.encoder_layers: int = 12  # num encoder layers in the transformer
         self.encoder_embed_dim: int = 768  # encoder embedding dimension
@@ -164,7 +166,9 @@ class WavLMConfig:
         self.activation_fn: str = "gelu"  # activation function to use
         self.layer_norm_first: bool = False  # apply layernorm first in the transformer
-        self.conv_feature_layers: str = "[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2"  # string describing convolutional feature extraction layers in form of a python list that contains [(dim, kernel_size, stride), ...]
+        self.conv_feature_layers: str = (
+            "[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2"  # string describing convolutional feature extraction layers in form of a python list that contains [(dim, kernel_size, stride), ...]
+        )
         self.conv_bias: bool = False  # include bias in conv encoder
         self.feature_grad_mult: float = 1.0  # multiply feature extractor var grads by this
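
The two WavLM hunks show one more rule from the same Black release: when an assignment with a long trailing comment exceeds the line length, the right-hand side is wrapped in parentheses rather than left on one very long line; the value and the comment themselves are untouched. A tiny sketch of the pattern, with a shortened name and an invented comment for illustration:

    # before: value plus a long trailing comment on a single over-long line
    extractor_mode = "default"  # imagine a comment long enough to push this line well past the configured limit

    # after: right-hand side wrapped in parentheses, comment kept with the value
    extractor_mode = (
        "default"  # imagine a comment long enough to push this line well past the configured limit
    )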

View File

@@ -234,8 +234,12 @@ class TestZH_CN_Phonemizer(unittest.TestCase):
 class TestBN_Phonemizer(unittest.TestCase):
     def setUp(self):
         self.phonemizer = BN_Phonemizer()
-        self._TEST_CASES = "রাসূলুল্লাহ সাল্লাল্লাহু আলাইহি ওয়া সাল্লাম শিক্ষা দিয়েছেন যে, কেউ যদি কোন খারাপ কিছুর সম্মুখীন হয়, তখনও যেন"
-        self._EXPECTED = "রাসূলুল্লাহ সাল্লাল্লাহু আলাইহি ওয়া সাল্লাম শিক্ষা দিয়েছেন যে কেউ যদি কোন খারাপ কিছুর সম্মুখীন হয় তখনও যেন।"
+        self._TEST_CASES = (
+            "রাসূলুল্লাহ সাল্লাল্লাহু আলাইহি ওয়া সাল্লাম শিক্ষা দিয়েছেন যে, কেউ যদি কোন খারাপ কিছুর সম্মুখীন হয়, তখনও যেন"
+        )
+        self._EXPECTED = (
+            "রাসূলুল্লাহ সাল্লাল্লাহু আলাইহি ওয়া সাল্লাম শিক্ষা দিয়েছেন যে কেউ যদি কোন খারাপ কিছুর সম্মুখীন হয় তখনও যেন।"
+        )

     def test_phonemize(self):
         self.assertEqual(self.phonemizer.phonemize(self._TEST_CASES, separator=""), self._EXPECTED)

View File

@@ -115,20 +115,14 @@ class TestFreeVC(unittest.TestCase):
             output_wav.shape[0] + config.audio.hop_length == source_wav.shape[0]
         ), f"{output_wav.shape} != {source_wav.shape}"

-    def test_train_step(self):
-        ...
+    def test_train_step(self): ...

-    def test_train_eval_log(self):
-        ...
+    def test_train_eval_log(self): ...

-    def test_test_run(self):
-        ...
+    def test_test_run(self): ...

-    def test_load_checkpoint(self):
-        ...
+    def test_load_checkpoint(self): ...

-    def test_get_criterion(self):
-        ...
+    def test_get_criterion(self): ...

-    def test_init_from_config(self):
-        ...
+    def test_init_from_config(self): ...