diff --git a/TTS/VERSION b/TTS/VERSION
index 5eef0f10..d9df1bbc 100644
--- a/TTS/VERSION
+++ b/TTS/VERSION
@@ -1 +1 @@
-0.10.2
+0.11.0
diff --git a/TTS/api.py b/TTS/api.py
index 850f0681..0e694263 100644
--- a/TTS/api.py
+++ b/TTS/api.py
@@ -102,7 +102,7 @@ class TTS:
         return model_path, config_path, vocoder_path, vocoder_config_path

     def load_model_by_name(self, model_name: str, gpu: bool = False):
-        """ Load one of 🐸TTS models by name.
+        """Load one of 🐸TTS models by name.

         Args:
             model_name (str): Model name to load. You can list models by ```tts.models```.
diff --git a/TTS/bin/eval_encoder.py b/TTS/bin/eval_encoder.py
index 7f9fdf93..60fed139 100644
--- a/TTS/bin/eval_encoder.py
+++ b/TTS/bin/eval_encoder.py
@@ -10,7 +10,6 @@ from TTS.tts.utils.speakers import SpeakerManager


 def compute_encoder_accuracy(dataset_items, encoder_manager):
-
     class_name_key = encoder_manager.encoder_config.class_name_key
     map_classid_to_classname = getattr(encoder_manager.encoder_config, "map_classid_to_classname", None)
diff --git a/TTS/bin/extract_tts_spectrograms.py b/TTS/bin/extract_tts_spectrograms.py
index 8cfd156b..9eadee07 100755
--- a/TTS/bin/extract_tts_spectrograms.py
+++ b/TTS/bin/extract_tts_spectrograms.py
@@ -164,7 +164,6 @@ def extract_spectrograms(
     model.eval()
     export_metadata = []
     for _, data in tqdm(enumerate(data_loader), total=len(data_loader)):
-
         # format data
         (
             text_input,
diff --git a/TTS/bin/resample.py b/TTS/bin/resample.py
index eb4ee58e..a3f28485 100644
--- a/TTS/bin/resample.py
+++ b/TTS/bin/resample.py
@@ -35,7 +35,6 @@ def resample_files(input_dir, output_sr, output_dir=None, file_ext="wav", n_jobs


 if __name__ == "__main__":
-
     parser = argparse.ArgumentParser(
         description="""Resample a folder recusively with librosa
                        Can be used in place or create a copy of the folder as an output.\n\n
diff --git a/TTS/encoder/utils/generic_utils.py b/TTS/encoder/utils/generic_utils.py
index 91a896f6..1da02961 100644
--- a/TTS/encoder/utils/generic_utils.py
+++ b/TTS/encoder/utils/generic_utils.py
@@ -14,7 +14,6 @@ from TTS.utils.io import save_fsspec


 class AugmentWAV(object):
     def __init__(self, ap, augmentation_config):
-
         self.ap = ap
         self.use_additive_noise = False
@@ -67,7 +66,6 @@ class AugmentWAV(object):
         self.global_noise_list.append("RIR_AUG")

     def additive_noise(self, noise_type, audio):
-
         clean_db = 10 * np.log10(np.mean(audio**2) + 1e-4)

         noise_list = random.sample(
diff --git a/TTS/tts/datasets/dataset.py b/TTS/tts/datasets/dataset.py
index a8ff9772..db74186b 100644
--- a/TTS/tts/datasets/dataset.py
+++ b/TTS/tts/datasets/dataset.py
@@ -411,7 +411,6 @@ class TTSDataset(Dataset):
         # Puts each data field into a tensor with outer dimension batch size
         if isinstance(batch[0], collections.abc.Mapping):
-
             token_ids_lengths = np.array([len(d["token_ids"]) for d in batch])

             # sort items with text input length for RNN efficiency
diff --git a/TTS/tts/layers/feed_forward/decoder.py b/TTS/tts/layers/feed_forward/decoder.py
index 34c586aa..0376e2e3 100644
--- a/TTS/tts/layers/feed_forward/decoder.py
+++ b/TTS/tts/layers/feed_forward/decoder.py
@@ -81,7 +81,6 @@ class RelativePositionTransformerDecoder(nn.Module):
     """

     def __init__(self, in_channels, out_channels, hidden_channels, params):
-
         super().__init__()
         self.prenet = Conv1dBN(in_channels, hidden_channels, 1, 1)
         self.rel_pos_transformer = RelativePositionTransformer(in_channels, out_channels, hidden_channels, **params)
@@ -111,7 +110,6 @@ class FFTransformerDecoder(nn.Module):
     """

     def __init__(self, in_channels, out_channels, params):
-
         super().__init__()
         self.transformer_block = FFTransformerBlock(in_channels, **params)
         self.postnet = nn.Conv1d(in_channels, out_channels, 1)
diff --git a/TTS/tts/layers/feed_forward/duration_predictor.py b/TTS/tts/layers/feed_forward/duration_predictor.py
index 5392aeca..4422648f 100644
--- a/TTS/tts/layers/feed_forward/duration_predictor.py
+++ b/TTS/tts/layers/feed_forward/duration_predictor.py
@@ -18,7 +18,6 @@ class DurationPredictor(nn.Module):
     """

     def __init__(self, hidden_channels):
-
         super().__init__()

         self.layers = nn.ModuleList(
diff --git a/TTS/tts/layers/generic/res_conv_bn.py b/TTS/tts/layers/generic/res_conv_bn.py
index 30c134cd..4beda291 100644
--- a/TTS/tts/layers/generic/res_conv_bn.py
+++ b/TTS/tts/layers/generic/res_conv_bn.py
@@ -100,7 +100,6 @@ class ResidualConv1dBNBlock(nn.Module):
     def __init__(
         self, in_channels, out_channels, hidden_channels, kernel_size, dilations, num_res_blocks=13, num_conv_blocks=2
     ):
-
         super().__init__()
         assert len(dilations) == num_res_blocks
         self.res_blocks = nn.ModuleList()
diff --git a/TTS/tts/layers/generic/wavenet.py b/TTS/tts/layers/generic/wavenet.py
index 613ad19d..bc89da4f 100644
--- a/TTS/tts/layers/generic/wavenet.py
+++ b/TTS/tts/layers/generic/wavenet.py
@@ -153,7 +153,6 @@ class WNBlocks(nn.Module):
         dropout_p=0,
         weight_norm=True,
     ):
-
         super().__init__()
         self.wn_blocks = nn.ModuleList()
         for idx in range(num_blocks):
diff --git a/TTS/tts/layers/glow_tts/transformer.py b/TTS/tts/layers/glow_tts/transformer.py
index 0f837abf..02688d61 100644
--- a/TTS/tts/layers/glow_tts/transformer.py
+++ b/TTS/tts/layers/glow_tts/transformer.py
@@ -64,7 +64,6 @@ class RelativePositionMultiHeadAttention(nn.Module):
         proximal_bias=False,
         proximal_init=False,
     ):
-
         super().__init__()
         assert channels % num_heads == 0, " [!] channels should be divisible by num_heads."
         # class attributes
@@ -272,7 +271,6 @@ class FeedForwardNetwork(nn.Module):
     """

     def __init__(self, in_channels, out_channels, hidden_channels, kernel_size, dropout_p=0.0, causal=False):
-
         super().__init__()
         self.in_channels = in_channels
         self.out_channels = out_channels
diff --git a/TTS/tts/layers/losses.py b/TTS/tts/layers/losses.py
index f39431fa..98be5b88 100644
--- a/TTS/tts/layers/losses.py
+++ b/TTS/tts/layers/losses.py
@@ -363,7 +363,6 @@ class TacotronLoss(torch.nn.Module):
         alignments_backwards,
         input_lens,
     ):
-
         # decoder outputs linear or mel spectrograms for Tacotron and Tacotron2
         # the target should be set acccordingly
         postnet_target = linear_input if self.config.model.lower() in ["tacotron"] else mel_input
diff --git a/TTS/tts/layers/overflow/common_layers.py b/TTS/tts/layers/overflow/common_layers.py
index ba9a139e..b036dd1b 100644
--- a/TTS/tts/layers/overflow/common_layers.py
+++ b/TTS/tts/layers/overflow/common_layers.py
@@ -22,7 +22,6 @@ class Encoder(nn.Module):
     """

     def __init__(self, num_chars, state_per_phone, in_out_channels=512, n_convolutions=3):
-
         super().__init__()

         self.state_per_phone = state_per_phone
diff --git a/TTS/tts/layers/overflow/decoder.py b/TTS/tts/layers/overflow/decoder.py
index 4e65993f..4cd7ae88 100644
--- a/TTS/tts/layers/overflow/decoder.py
+++ b/TTS/tts/layers/overflow/decoder.py
@@ -36,7 +36,6 @@ class Decoder(nn.Module):
         sigmoid_scale=False,
         c_in_channels=0,
     ):
-
         super().__init__()

         self.glow_decoder = GlowDecoder(
diff --git a/TTS/tts/layers/overflow/neural_hmm.py b/TTS/tts/layers/overflow/neural_hmm.py
index 231aab83..0631ba98 100644
--- a/TTS/tts/layers/overflow/neural_hmm.py
+++ b/TTS/tts/layers/overflow/neural_hmm.py
@@ -123,7 +123,6 @@ class NeuralHMM(nn.Module):
         h_memory, c_memory = self._init_lstm_states(batch_size, self.memory_rnn_dim, mels)

         for t in range(T_max):
-
             # Process Autoregression
             h_memory, c_memory = self._process_ar_timestep(t, ar_inputs, h_memory, c_memory)
             # Get mean, std and transition vector from decoder for this timestep
@@ -418,7 +417,6 @@ class NeuralHMM(nn.Module):
         output_parameter_values = []
         quantile = 1
         while True:
-
             memory_input = self.prenet(prenet_input.flatten(1).unsqueeze(0))
             # will be 1 while sampling
             h_memory, c_memory = self.memory_rnn(memory_input.squeeze(0), (h_memory, c_memory))
diff --git a/TTS/tts/layers/tacotron/attentions.py b/TTS/tts/layers/tacotron/attentions.py
index d8a90d72..25c3798e 100644
--- a/TTS/tts/layers/tacotron/attentions.py
+++ b/TTS/tts/layers/tacotron/attentions.py
@@ -50,7 +50,6 @@ class GravesAttention(nn.Module):
     COEF = 0.3989422917366028  # numpy.sqrt(1/(2*numpy.pi))

     def __init__(self, query_dim, K):
-
         super().__init__()
         self._mask_value = 1e-8
         self.K = K
diff --git a/TTS/tts/layers/tacotron/capacitron_layers.py b/TTS/tts/layers/tacotron/capacitron_layers.py
index 68321358..2181ffa7 100644
--- a/TTS/tts/layers/tacotron/capacitron_layers.py
+++ b/TTS/tts/layers/tacotron/capacitron_layers.py
@@ -83,7 +83,6 @@ class ReferenceEncoder(nn.Module):
     """

     def __init__(self, num_mel, out_dim):
-
         super().__init__()
         self.num_mel = num_mel
         filters = [1] + [32, 32, 64, 64, 128, 128]
diff --git a/TTS/tts/layers/tacotron/gst_layers.py b/TTS/tts/layers/tacotron/gst_layers.py
index ec622e4d..05dba708 100644
--- a/TTS/tts/layers/tacotron/gst_layers.py
+++ b/TTS/tts/layers/tacotron/gst_layers.py
@@ -31,7 +31,6 @@ class ReferenceEncoder(nn.Module):
     """

     def __init__(self, num_mel, embedding_dim):
-
         super().__init__()
         self.num_mel = num_mel
         filters = [1] + [32, 32, 64, 64, 128, 128]
@@ -119,7 +118,6 @@ class MultiHeadAttention(nn.Module):
     """

     def __init__(self, query_dim, key_dim, num_units, num_heads):
-
         super().__init__()
         self.num_units = num_units
         self.num_heads = num_heads
diff --git a/TTS/tts/layers/tacotron/tacotron.py b/TTS/tts/layers/tacotron/tacotron.py
index bddaf449..7a47c35e 100644
--- a/TTS/tts/layers/tacotron/tacotron.py
+++ b/TTS/tts/layers/tacotron/tacotron.py
@@ -27,7 +27,6 @@ class BatchNormConv1d(nn.Module):
     """

     def __init__(self, in_channels, out_channels, kernel_size, stride, padding, activation=None):
-
         super().__init__()
         self.padding = padding
         self.padder = nn.ConstantPad1d(padding, 0)
@@ -149,7 +148,7 @@ class CBHG(nn.Module):
         activations += [None]
         # setup conv1d projection layers
         layer_set = []
-        for (in_size, out_size, ac) in zip(out_features, conv_projections, activations):
+        for in_size, out_size, ac in zip(out_features, conv_projections, activations):
             layer = BatchNormConv1d(in_size, out_size, kernel_size=3, stride=1, padding=[1, 1], activation=ac)
             layer_set.append(layer)
         self.conv1d_projections = nn.ModuleList(layer_set)
diff --git a/TTS/tts/layers/vits/transforms.py b/TTS/tts/layers/vits/transforms.py
index c1505554..3cac1b8d 100644
--- a/TTS/tts/layers/vits/transforms.py
+++ b/TTS/tts/layers/vits/transforms.py
@@ -21,7 +21,6 @@ def piecewise_rational_quadratic_transform(
     min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
     min_derivative=DEFAULT_MIN_DERIVATIVE,
 ):
-
     if tails is None:
         spline_fn = rational_quadratic_spline
         spline_kwargs = {}
diff --git a/TTS/tts/models/align_tts.py b/TTS/tts/models/align_tts.py
index 4fdaa596..b2e51de7 100644
--- a/TTS/tts/models/align_tts.py
+++ b/TTS/tts/models/align_tts.py
@@ -109,7 +109,6 @@ class AlignTTS(BaseTTS):
         tokenizer: "TTSTokenizer" = None,
         speaker_manager: SpeakerManager = None,
     ):
-
         super().__init__(config, ap, tokenizer, speaker_manager)
         self.speaker_manager = speaker_manager
         self.phase = -1
diff --git a/TTS/tts/models/base_tacotron.py b/TTS/tts/models/base_tacotron.py
index 4aaf5261..f38dace2 100644
--- a/TTS/tts/models/base_tacotron.py
+++ b/TTS/tts/models/base_tacotron.py
@@ -252,7 +252,12 @@ class BaseTacotron(BaseTTS):

     def compute_capacitron_VAE_embedding(self, inputs, reference_mel_info, text_info=None, speaker_embedding=None):
         """Capacitron Variational Autoencoder"""
-        (VAE_outputs, posterior_distribution, prior_distribution, capacitron_beta,) = self.capacitron_vae_layer(
+        (
+            VAE_outputs,
+            posterior_distribution,
+            prior_distribution,
+            capacitron_beta,
+        ) = self.capacitron_vae_layer(
             reference_mel_info,
             text_info,
             speaker_embedding,  # pylint: disable=not-callable
diff --git a/TTS/tts/models/base_tts.py b/TTS/tts/models/base_tts.py
index 69980e72..2059612d 100644
--- a/TTS/tts/models/base_tts.py
+++ b/TTS/tts/models/base_tts.py
@@ -357,7 +357,6 @@ class BaseTTS(BaseTrainerModel):
     def _get_test_aux_input(
         self,
     ) -> Dict:
-
         d_vector = None
         if self.config.use_d_vector_file:
             d_vector = [self.speaker_manager.embeddings[name]["embedding"] for name in self.speaker_manager.embeddings]
diff --git a/TTS/tts/models/glow_tts.py b/TTS/tts/models/glow_tts.py
index cc241c43..bfd1a2b6 100644
--- a/TTS/tts/models/glow_tts.py
+++ b/TTS/tts/models/glow_tts.py
@@ -63,7 +63,6 @@ class GlowTTS(BaseTTS):
         tokenizer: "TTSTokenizer" = None,
         speaker_manager: SpeakerManager = None,
     ):
-
         super().__init__(config, ap, tokenizer, speaker_manager)

         # pass all config fields to `self`
diff --git a/TTS/tts/models/tacotron.py b/TTS/tts/models/tacotron.py
index 7bfa6ba5..474ec464 100644
--- a/TTS/tts/models/tacotron.py
+++ b/TTS/tts/models/tacotron.py
@@ -36,7 +36,6 @@ class Tacotron(BaseTacotron):
         tokenizer: "TTSTokenizer" = None,
         speaker_manager: SpeakerManager = None,
     ):
-
         super().__init__(config, ap, tokenizer, speaker_manager)

         # pass all config fields to `self`
diff --git a/TTS/tts/models/tacotron2.py b/TTS/tts/models/tacotron2.py
index 95d339f1..71ab1eac 100644
--- a/TTS/tts/models/tacotron2.py
+++ b/TTS/tts/models/tacotron2.py
@@ -50,7 +50,6 @@ class Tacotron2(BaseTacotron):
         tokenizer: "TTSTokenizer" = None,
         speaker_manager: SpeakerManager = None,
     ):
-
         super().__init__(config, ap, tokenizer, speaker_manager)

         self.decoder_output_dim = config.out_channels
diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py
index 1b367cd7..14c76add 100644
--- a/TTS/tts/models/vits.py
+++ b/TTS/tts/models/vits.py
@@ -633,7 +633,6 @@ class Vits(BaseTTS):
         speaker_manager: SpeakerManager = None,
         language_manager: LanguageManager = None,
     ):
-
         super().__init__(config, ap, tokenizer, speaker_manager, language_manager)

         self.init_multispeaker(config)
@@ -1280,7 +1279,6 @@ class Vits(BaseTTS):

         # compute melspec segment
         with autocast(enabled=False):
-
             if self.args.encoder_sample_rate:
                 spec_segment_size = self.spec_segment_size * int(self.interpolate_factor)
             else:
diff --git a/TTS/tts/utils/text/phonemizers/base.py b/TTS/tts/utils/text/phonemizers/base.py
index 3f8e8eaf..4fc79874 100644
--- a/TTS/tts/utils/text/phonemizers/base.py
+++ b/TTS/tts/utils/text/phonemizers/base.py
@@ -32,7 +32,6 @@ class BasePhonemizer(abc.ABC):
     """

     def __init__(self, language, punctuations=Punctuation.default_puncs(), keep_puncs=False):
-
         # ensure the backend is installed on the system
         if not self.is_available():
             raise RuntimeError("{} not installed on your system".format(self.name()))  # pragma: nocover
diff --git a/TTS/utils/audio/processor.py b/TTS/utils/audio/processor.py
index 9d16474a..c872efa3 100644
--- a/TTS/utils/audio/processor.py
+++ b/TTS/utils/audio/processor.py
@@ -158,7 +158,6 @@ class AudioProcessor(object):
         verbose=True,
         **_,
     ):
-
         # setup class attributed
         self.sample_rate = sample_rate
         self.resample = resample
diff --git a/TTS/utils/download.py b/TTS/utils/download.py
index de9b31a7..3f06b578 100644
--- a/TTS/utils/download.py
+++ b/TTS/utils/download.py
@@ -43,7 +43,6 @@ def stream_url(
         total=url_size,
         disable=not progress_bar,
     ) as pbar:
-
         num_bytes = 0
         while True:
             chunk = upointer.read(block_size)
diff --git a/TTS/utils/radam.py b/TTS/utils/radam.py
index 73426e64..cbd14990 100644
--- a/TTS/utils/radam.py
+++ b/TTS/utils/radam.py
@@ -31,13 +31,11 @@ class RAdam(Optimizer):
         super().__setstate__(state)

     def step(self, closure=None):
-
         loss = None
         if closure is not None:
             loss = closure()

         for group in self.param_groups:
-
             for p in group["params"]:
                 if p.grad is None:
                     continue
diff --git a/TTS/utils/samplers.py b/TTS/utils/samplers.py
index df5d4185..b08a763a 100644
--- a/TTS/utils/samplers.py
+++ b/TTS/utils/samplers.py
@@ -72,7 +72,6 @@ class PerfectBatchSampler(Sampler):
         self._num_classes_in_batch = num_classes_in_batch

     def __iter__(self):
-
         batch = []
         if self._num_classes_in_batch != len(self._samplers):
             valid_samplers_idx = random.sample(range(len(self._samplers)), self._num_classes_in_batch)
diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py
index 2310baf9..c197b1f5 100644
--- a/TTS/utils/synthesizer.py
+++ b/TTS/utils/synthesizer.py
@@ -212,7 +212,6 @@ class Synthesizer(object):
         speaker_embedding = None
         speaker_id = None
         if self.tts_speakers_file or hasattr(self.tts_model.speaker_manager, "name_to_id"):
-
             # handle Neon models with single speaker.
             if len(self.tts_model.speaker_manager.name_to_id) == 1:
                 speaker_id = list(self.tts_model.speaker_manager.name_to_id.values())[0]
@@ -247,7 +246,6 @@ class Synthesizer(object):
         if self.tts_languages_file or (
             hasattr(self.tts_model, "language_manager") and self.tts_model.language_manager is not None
         ):
-
             if len(self.tts_model.language_manager.name_to_id) == 1:
                 language_id = list(self.tts_model.language_manager.name_to_id.values())[0]
diff --git a/TTS/utils/vad.py b/TTS/utils/vad.py
index c978c837..90c45e49 100644
--- a/TTS/utils/vad.py
+++ b/TTS/utils/vad.py
@@ -47,7 +47,6 @@ def get_vad_model_and_utils(use_cuda=False):
 def remove_silence(
     model_and_utils, audio_path, out_path, vad_sample_rate=8000, trim_just_beginning_and_end=True, use_cuda=False
 ):
-
     # get the VAD model and utils functions
     model, get_speech_timestamps, _, collect_chunks = model_and_utils
diff --git a/TTS/vocoder/datasets/gan_dataset.py b/TTS/vocoder/datasets/gan_dataset.py
index a782067e..50c38c4d 100644
--- a/TTS/vocoder/datasets/gan_dataset.py
+++ b/TTS/vocoder/datasets/gan_dataset.py
@@ -118,7 +118,6 @@ class GANDataset(Dataset):
                 mel = self.ap.melspectrogram(audio)
                 audio, mel = self._pad_short_samples(audio, mel)
         else:
-
             # load precomputed features
             wavpath, feat_path = self.item_list[idx]
diff --git a/TTS/vocoder/datasets/wavegrad_dataset.py b/TTS/vocoder/datasets/wavegrad_dataset.py
index d941eab3..305fe430 100644
--- a/TTS/vocoder/datasets/wavegrad_dataset.py
+++ b/TTS/vocoder/datasets/wavegrad_dataset.py
@@ -30,7 +30,6 @@ class WaveGradDataset(Dataset):
         use_cache=False,
         verbose=False,
     ):
-
         super().__init__()
         self.ap = ap
         self.item_list = items
diff --git a/TTS/vocoder/datasets/wavernn_dataset.py b/TTS/vocoder/datasets/wavernn_dataset.py
index 2c771cf0..c3907964 100644
--- a/TTS/vocoder/datasets/wavernn_dataset.py
+++ b/TTS/vocoder/datasets/wavernn_dataset.py
@@ -12,7 +12,6 @@ class WaveRNNDataset(Dataset):
     def __init__(
         self, ap, items, seq_len, hop_len, pad, mode, mulaw, is_training=True, verbose=False, return_segments=True
     ):
-
         super().__init__()
         self.ap = ap
         self.compute_feat = not isinstance(items[0], (tuple, list))
@@ -52,7 +51,6 @@ class WaveRNNDataset(Dataset):
         else compute it on the fly
         """
         if self.compute_feat:
-
             wavpath = self.item_list[index]
             audio = self.ap.load_wav(wavpath)
             if self.return_segments:
@@ -74,7 +72,6 @@ class WaveRNNDataset(Dataset):
                 raise RuntimeError("Unknown dataset mode - ", self.mode)

         else:
-
             wavpath, feat_path = self.item_list[index]
             mel = np.load(feat_path.replace("/quant/", "/mel/"))
diff --git a/TTS/vocoder/models/parallel_wavegan_generator.py b/TTS/vocoder/models/parallel_wavegan_generator.py
index c741774a..5587fb72 100644
--- a/TTS/vocoder/models/parallel_wavegan_generator.py
+++ b/TTS/vocoder/models/parallel_wavegan_generator.py
@@ -33,7 +33,6 @@ class ParallelWaveganGenerator(torch.nn.Module):
         upsample_factors=[4, 4, 4, 4],
         inference_padding=2,
     ):
-
         super().__init__()
         self.in_channels = in_channels
         self.out_channels = out_channels
diff --git a/TTS/vocoder/models/random_window_discriminator.py b/TTS/vocoder/models/random_window_discriminator.py
index ea95668a..79b68e97 100644
--- a/TTS/vocoder/models/random_window_discriminator.py
+++ b/TTS/vocoder/models/random_window_discriminator.py
@@ -77,7 +77,7 @@ class ConditionalDiscriminator(nn.Module):
         # layers before condition features
         self.pre_cond_layers += [DBlock(in_channels, 64, 1)]
         in_channels = 64
-        for (i, channel) in enumerate(out_channels):
+        for i, channel in enumerate(out_channels):
             self.pre_cond_layers.append(DBlock(in_channels, channel, downsample_factors[i]))
             in_channels = channel
@@ -116,7 +116,7 @@ class UnconditionalDiscriminator(nn.Module):
         self.layers = nn.ModuleList()
         self.layers += [DBlock(self.in_channels, base_channels, 1)]
         in_channels = base_channels
-        for (i, factor) in enumerate(downsample_factors):
+        for i, factor in enumerate(downsample_factors):
             self.layers.append(DBlock(in_channels, out_channels[i], factor))
             in_channels *= 2
         self.layers += [
@@ -147,7 +146,6 @@ class RandomWindowDiscriminator(nn.Module):
         cond_disc_out_channels=((128, 128, 256, 256), (128, 256, 256), (128, 256), (256,), (128, 256)),
         window_sizes=(512, 1024, 2048, 4096, 8192),
     ):
-
         super().__init__()
         self.cond_channels = cond_channels
         self.window_sizes = window_sizes
@@ -185,14 +184,14 @@ class RandomWindowDiscriminator(nn.Module):
         scores = []
         feats = []
         # unconditional pass
-        for (window_size, layer) in zip(self.window_sizes, self.unconditional_discriminators):
+        for window_size, layer in zip(self.window_sizes, self.unconditional_discriminators):
             index = np.random.randint(x.shape[-1] - window_size)

             score = layer(x[:, :, index : index + window_size])
             scores.append(score)

         # conditional pass
-        for (window_size, layer) in zip(self.window_sizes, self.conditional_discriminators):
+        for window_size, layer in zip(self.window_sizes, self.conditional_discriminators):
             frame_size = window_size // self.hop_length
             lc_index = np.random.randint(c.shape[-1] - frame_size)
             sample_index = lc_index * self.hop_length
diff --git a/TTS/vocoder/models/univnet_discriminator.py b/TTS/vocoder/models/univnet_discriminator.py
index 34e2d1c2..4c09520c 100644
--- a/TTS/vocoder/models/univnet_discriminator.py
+++ b/TTS/vocoder/models/univnet_discriminator.py
@@ -32,7 +32,6 @@ class SpecDiscriminator(nn.Module):
         self.out = norm_f(nn.Conv2d(32, 1, 3, 1, 1))

     def forward(self, y):
-
         fmap = []
         with torch.no_grad():
             y = y.squeeze(1)
@@ -53,7 +52,6 @@ class MultiResSpecDiscriminator(torch.nn.Module):
     def __init__(  # pylint: disable=dangerous-default-value
         self, fft_sizes=[1024, 2048, 512], hop_sizes=[120, 240, 50], win_lengths=[600, 1200, 240], window="hann_window"
     ):
-
         super().__init__()
         self.discriminators = nn.ModuleList(
             [
diff --git a/TTS/vocoder/models/wavernn.py b/TTS/vocoder/models/wavernn.py
index 0ea6b6e0..903f4b7e 100644
--- a/TTS/vocoder/models/wavernn.py
+++ b/TTS/vocoder/models/wavernn.py
@@ -312,7 +312,6 @@ class Wavernn(BaseVocoder):
         return self.fc3(x)

     def inference(self, mels, batched=None, target=None, overlap=None):
-
         self.eval()
         output = []
         start = time.time()
@@ -346,7 +345,6 @@ class Wavernn(BaseVocoder):
                 aux_split = [aux[:, :, d * i : d * (i + 1)] for i in range(4)]

             for i in range(seq_len):
-
                 m_t = mels[:, i, :]

                 if self.args.use_aux_net:
diff --git a/tests/aux_tests/test_extract_tts_spectrograms.py b/tests/aux_tests/test_extract_tts_spectrograms.py
index f9392706..f2d119ac 100644
--- a/tests/aux_tests/test_extract_tts_spectrograms.py
+++ b/tests/aux_tests/test_extract_tts_spectrograms.py
@@ -9,6 +9,7 @@ from TTS.tts.models import setup_model

 torch.manual_seed(1)

+
 # pylint: disable=protected-access
 class TestExtractTTSSpectrograms(unittest.TestCase):
     @staticmethod
diff --git a/tests/aux_tests/test_find_unique_phonemes.py b/tests/aux_tests/test_find_unique_phonemes.py
index f48e5c3a..018679f5 100644
--- a/tests/aux_tests/test_find_unique_phonemes.py
+++ b/tests/aux_tests/test_find_unique_phonemes.py
@@ -29,6 +29,7 @@ dataset_config_pt = BaseDatasetConfig(
 )
 """

+
 # pylint: disable=protected-access
 class TestFindUniquePhonemes(unittest.TestCase):
     @staticmethod
diff --git a/tests/aux_tests/test_speaker_encoder.py b/tests/aux_tests/test_speaker_encoder.py
index f2875cc1..01f6118a 100644
--- a/tests/aux_tests/test_speaker_encoder.py
+++ b/tests/aux_tests/test_speaker_encoder.py
@@ -129,7 +129,6 @@ class AngleProtoLossTests(unittest.TestCase):
 class SoftmaxAngleProtoLossTests(unittest.TestCase):
     # pylint: disable=R0201
     def test_in_out(self):
-
         embedding_dim = 64
         num_speakers = 5
         batch_size = 4
diff --git a/tests/data_tests/test_loader.py b/tests/data_tests/test_loader.py
index bc69cdb7..cbd98fc0 100644
--- a/tests/data_tests/test_loader.py
+++ b/tests/data_tests/test_loader.py
@@ -45,7 +45,6 @@ class TestTTSDataset(unittest.TestCase):
         self.ap = AudioProcessor(**c.audio)

     def _create_dataloader(self, batch_size, r, bgs, start_by_longest=False):
-
         # load dataset
         meta_data_train, meta_data_eval = load_tts_samples(dataset_config, eval_split=True, eval_split_size=0.2)
         items = meta_data_train + meta_data_eval
diff --git a/tests/data_tests/test_samplers.py b/tests/data_tests/test_samplers.py
index b2129811..0975d5ed 100644
--- a/tests/data_tests/test_samplers.py
+++ b/tests/data_tests/test_samplers.py
@@ -75,7 +75,6 @@ class TestSamplers(unittest.TestCase):
         assert is_balanced(en, pt), "Language Weighted sampler is supposed to be balanced"

     def test_speaker_weighted_random_sampler(self):  # pylint: disable=no-self-use
-
         weighted_sampler = torch.utils.data.sampler.WeightedRandomSampler(
             get_speaker_balancer_weights(train_samples), len(train_samples)
         )
diff --git a/tests/inference_tests/test_python_api.py b/tests/inference_tests/test_python_api.py
index 02569607..a44c98e8 100644
--- a/tests/inference_tests/test_python_api.py
+++ b/tests/inference_tests/test_python_api.py
@@ -2,7 +2,6 @@ import os
 import unittest

 from tests import get_tests_data_path, get_tests_output_path
-
 from TTS.api import TTS

 OUTPUT_PATH = os.path.join(get_tests_output_path(), "test_python_api.wav")
diff --git a/tests/text_tests/test_phonemizer.py b/tests/text_tests/test_phonemizer.py
index 4ca62384..a96a3a2f 100644
--- a/tests/text_tests/test_phonemizer.py
+++ b/tests/text_tests/test_phonemizer.py
@@ -235,7 +235,6 @@ class TestMultiPhonemizer(unittest.TestCase):
         self.phonemizer = MultiPhonemizer({"tr": "espeak", "en-us": "", "de": "gruut", "zh-cn": ""})

     def test_phonemize(self):
-
         # Enlish espeak
         text = "Be a voice, not an! echo?"
         gt = "biː ɐ vˈɔɪs, nˈɑːt æn! ˈɛkoʊ?"
diff --git a/tests/tts_tests/test_tacotron2_model.py b/tests/tts_tests/test_tacotron2_model.py
index ed79a26d..b1bdeb9f 100644
--- a/tests/tts_tests/test_tacotron2_model.py
+++ b/tests/tts_tests/test_tacotron2_model.py
@@ -332,7 +332,6 @@ class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase):

     @staticmethod
     def test_train_step():
-
         config = config_global.copy()
         config.use_d_vector_file = True
diff --git a/tests/tts_tests/test_vits.py b/tests/tts_tests/test_vits.py
index 8e408519..fca99556 100644
--- a/tests/tts_tests/test_vits.py
+++ b/tests/tts_tests/test_vits.py
@@ -401,7 +401,6 @@ class TestVits(unittest.TestCase):
     def test_train_step(self):
         # setup the model
         with torch.autograd.set_detect_anomaly(True):
-
             config = VitsConfig(model_args=VitsArgs(num_chars=32, spec_segment_size=10))
             model = Vits(config).to(device)
             model.train()