pep8 check

Eren Golge 2018-04-03 03:24:57 -07:00
parent b4a4377875
commit a9eadd1b8a
23 changed files with 198 additions and 228 deletions

View File

@@ -55,7 +55,8 @@ class LJSpeechDataset(Dataset):
ignored.append(idx)
else:
new_frames.append(self.frames[idx])
- print(" | > {} instances are ignored by min_seq_len ({})".format(len(ignored), self.min_seq_len))
+ print(" | > {} instances are ignored by min_seq_len ({})".format(
+     len(ignored), self.min_seq_len))
self.frames = new_frames
def __len__(self):
@@ -65,7 +66,8 @@ class LJSpeechDataset(Dataset):
wav_name = os.path.join(self.root_dir,
self.frames[idx][0]) + '.wav'
text = self.frames[idx][1]
- text = np.asarray(text_to_sequence(text, [self.cleaners]), dtype=np.int32)
+ text = np.asarray(text_to_sequence(
+     text, [self.cleaners]), dtype=np.int32)
wav = np.asarray(self.load_wav(wav_name)[0], dtype=np.float32)
sample = {'text': text, 'wav': wav, 'item_idx': self.frames[idx][0]}
return sample
@@ -99,10 +101,12 @@ class LJSpeechDataset(Dataset):
mel_lengths = [m.shape[1] + 1 for m in mel] # +1 for zero-frame
# compute 'stop token' targets
- stop_targets = [np.array([0.]*(mel_len-1)) for mel_len in mel_lengths]
+ stop_targets = [np.array([0.]*(mel_len-1))
+     for mel_len in mel_lengths]
# PAD stop targets
- stop_targets = prepare_stop_target(stop_targets, self.outputs_per_step)
+ stop_targets = prepare_stop_target(
+     stop_targets, self.outputs_per_step)
# PAD sequences with largest length of the batch
text = prepare_data(text).astype(np.int32)
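
prepare_stop_target and prepare_data are defined in the data utilities, not in this file. A minimal sketch of what the stop-target padding is assumed to do (pad every per-frame 0/1 target to a batch-wide length divisible by outputs_per_step, filling the padding with 1 so trailing frames read as "stop"; the names here are illustrative):

    import numpy as np

    def prepare_stop_target_sketch(stop_targets, out_steps):
        # Longest target in the batch, rounded up to a multiple of
        # out_steps (the decoder emits out_steps frames per step).
        max_len = max(x.shape[0] for x in stop_targets) + 1  # +1 zero-frame
        remainder = max_len % out_steps
        pad_len = max_len + (out_steps - remainder) if remainder > 0 else max_len
        # Padded positions get 1.0: "stop" is the positive class.
        return np.stack([np.pad(x, (0, pad_len - x.shape[0]),
                                mode='constant', constant_values=1.0)
                         for x in stop_targets])

    # Example: two items with 4 and 2 real frames, out_steps=2
    print(prepare_stop_target_sketch([np.zeros(4), np.zeros(2)], 2))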

View File

@@ -22,7 +22,7 @@
"num_loader_workers": 16,
- "save_step":1 ,
+ "save_step": 1,
"data_path": "/data/shared/KeithIto/LJSpeech-1.0",
"output_path": "result",
"log_dir": "/home/erogol/projects/TTS/logs/"

View File

@@ -25,7 +25,8 @@ class BahdanauAttention(nn.Module):
processed_annots = self.annot_layer(annots)
# (batch, max_time, 1)
- alignment = self.v(nn.functional.tanh(processed_query + processed_annots))
+ alignment = self.v(nn.functional.tanh(
+     processed_query + processed_annots))
# (batch, max_time)
return alignment.squeeze(-1)
@@ -85,5 +86,3 @@ class AttentionRNN(nn.Module):
context = torch.bmm(alignment.unsqueeze(1), annotations)
context = context.squeeze(1)
return rnn_output, context, alignment
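
The score being reformatted here is standard additive (Bahdanau) attention, and the AttentionRNN tail above turns the alignment into a context vector with a batched matmul. A self-contained shape sketch (torch.tanh and the explicit softmax are substitutions of mine; the module's own forward uses nn.functional.tanh, which later PyTorch releases deprecate):

    import torch
    import torch.nn as nn

    class BahdanauScore(nn.Module):
        def __init__(self, query_dim, annot_dim, attn_dim):
            super().__init__()
            self.query_layer = nn.Linear(query_dim, attn_dim)
            self.annot_layer = nn.Linear(annot_dim, attn_dim)
            self.v = nn.Linear(attn_dim, 1, bias=False)

        def forward(self, query, annots):
            # (batch, query_dim) -> (batch, 1, attn_dim)
            processed_query = self.query_layer(query).unsqueeze(1)
            # (batch, max_time, annot_dim) -> (batch, max_time, attn_dim)
            processed_annots = self.annot_layer(annots)
            # (batch, max_time, 1) -> (batch, max_time)
            return self.v(torch.tanh(
                processed_query + processed_annots)).squeeze(-1)

    annots = torch.randn(4, 10, 256)
    scores = BahdanauScore(256, 256, 256)(torch.randn(4, 256), annots)
    alignment = torch.softmax(scores, dim=-1)                       # (4, 10)
    context = torch.bmm(alignment.unsqueeze(1), annots).squeeze(1)  # (4, 256)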

View File

@@ -51,7 +51,8 @@ class L1LossMasked(nn.Module):
# losses: (batch, max_len, dim)
losses = losses_flat.view(*target.size())
# mask: (batch, max_len, 1)
- mask = _sequence_mask(sequence_length=length, max_len=target.size(1)).unsqueeze(2)
+ mask = _sequence_mask(sequence_length=length,
+     max_len=target.size(1)).unsqueeze(2)
losses = losses * mask.float()
loss = losses.sum() / (length.float().sum() * float(target.shape[2]))
return loss
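
_sequence_mask itself sits outside this hunk. A plausible implementation consistent with how it is used here, returning 1.0 for valid timesteps and 0.0 for padding (the exact body in the repo may differ):

    import torch

    def sequence_mask_sketch(sequence_length, max_len=None):
        # sequence_length: (batch,) tensor of valid lengths
        if max_len is None:
            max_len = int(sequence_length.max())
        steps = torch.arange(max_len).unsqueeze(0)  # (1, max_len)
        # (batch, max_len): 1.0 where t < length, else 0.0
        return (steps < sequence_length.unsqueeze(1)).float()

    print(sequence_mask_sketch(torch.tensor([3, 5]), 5))
    # tensor([[1., 1., 1., 0., 0.],
    #         [1., 1., 1., 1., 1.]])

The loss then zeroes the padded positions and normalizes by the number of valid elements, length.sum() * dim.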

View File

@@ -6,6 +6,7 @@ from torch import nn
from .attention import AttentionRNN
from .attention import get_mask_from_lengths
class Prenet(nn.Module):
r""" Prenet as explained at https://arxiv.org/abs/1703.10135.
It creates as many layers as given by 'out_features'
@@ -214,6 +215,7 @@ class Decoder(nn.Module):
r (int): number of outputs per time step.
eps (float): threshold for detecting the end of a sentence.
"""
def __init__(self, in_features, memory_dim, r, eps=0.05, mode='train'):
super(Decoder, self).__init__()
self.mode = mode
@@ -251,23 +253,18 @@ class Decoder(nn.Module):
- memory: batch x #mel_specs x mel_spec_dim
"""
B = inputs.size(0)
# Run greedy decoding if memory is None
greedy = not self.training
if memory is not None:
# Grouping multiple frames if necessary
if memory.size(-1) == self.memory_dim:
memory = memory.view(B, memory.size(1) // self.r, -1)
" !! Dimension mismatch {} vs {} * {}".format(memory.size(-1),
self.memory_dim, self.r)
T_decoder = memory.size(1)
# go frame - 0 frames starting the sequence
initial_memory = Variable(
inputs.data.new(B, self.memory_dim * self.r).zero_())
# Init decoder states
attention_rnn_hidden = Variable(
inputs.data.new(B, 256).zero_())
@@ -276,14 +273,11 @@ class Decoder(nn.Module):
for _ in range(len(self.decoder_rnns))]
current_context_vec = Variable(
inputs.data.new(B, 256).zero_())
# Time first (T_decoder, B, memory_dim)
if memory is not None:
memory = memory.transpose(0, 1)
outputs = []
alignments = []
t = 0
memory_input = initial_memory
while True:
@@ -291,6 +285,7 @@ class Decoder(nn.Module):
if greedy:
memory_input = outputs[-1]
else:
+ # TODO: try sampled teacher forcing
# combine prev. model output and prev. real target
# memory_input = torch.div(outputs[-1] + memory[t-1], 2.0)
# add a random noise
@@ -298,36 +293,26 @@ class Decoder(nn.Module):
# memory_input.data.new(memory_input.size()).normal_(0.0, 0.5))
# memory_input = memory_input + noise
memory_input = memory[t-1]
# Prenet
processed_memory = self.prenet(memory_input)
# Attention RNN
attention_rnn_hidden, current_context_vec, alignment = self.attention_rnn(
processed_memory, current_context_vec, attention_rnn_hidden, inputs)
# Concat RNN output and attention context vector
decoder_input = self.project_to_decoder_in(
torch.cat((attention_rnn_hidden, current_context_vec), -1))
# Pass through the decoder RNNs
for idx in range(len(self.decoder_rnns)):
decoder_rnn_hiddens[idx] = self.decoder_rnns[idx](
decoder_input, decoder_rnn_hiddens[idx])
# Residual connection
decoder_input = decoder_rnn_hiddens[idx] + decoder_input
output = decoder_input
# predict mel vectors from decoder vectors
output = self.proj_to_mel(output)
outputs += [output]
alignments += [alignment]
t += 1
if (not greedy and self.training) or (greedy and memory is not None):
if t >= T_decoder:
break
@@ -338,15 +323,12 @@ class Decoder(nn.Module):
print(" !! Decoder stopped with 'max_decoder_steps'. \
Something is probably wrong.")
break
assert greedy or len(outputs) == T_decoder
# Back to batch first
alignments = torch.stack(alignments).transpose(0, 1)
outputs = torch.stack(outputs).transpose(0, 1).contiguous()
return outputs, alignments
- def is_end_of_frames(output, eps=0.2): #0.2
+ def is_end_of_frames(output, eps=0.2): # 0.2
return (output.data <= eps).all()
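
In greedy mode (inference, memory=None), the decode loop above runs until this check fires or max_decoder_steps is reached. A toy illustration of the stop test, which treats a frame group whose normalized mel values have all decayed below eps as the end of speech:

    import torch

    def is_end_of_frames(output, eps=0.2):
        # True only when every predicted value in the frame group
        # has fallen to (normalized) near-silence.
        return (output.data <= eps).all()

    print(is_end_of_frames(torch.full((1, 80 * 5), 0.05)))  # silence -> True
    print(is_end_of_frames(torch.rand(1, 80 * 5)))          # speech-like -> False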

View File

@@ -9,7 +9,6 @@ from TTS.layers.tacotron import Prenet, Encoder, Decoder, CBHG
class Tacotron(nn.Module):
def __init__(self, embedding_dim=256, linear_dim=1025, mel_dim=80,
r=5, padding_idx=None):
super(Tacotron, self).__init__()
self.r = r
self.mel_dim = mel_dim
@@ -17,34 +16,23 @@ class Tacotron(nn.Module):
self.embedding = nn.Embedding(len(symbols), embedding_dim,
padding_idx=padding_idx)
print(" | > Embedding dim : {}".format(len(symbols)))
# Trying smaller std
self.embedding.weight.data.normal_(0, 0.3)
self.encoder = Encoder(embedding_dim)
self.decoder = Decoder(256, mel_dim, r)
self.postnet = CBHG(mel_dim, K=8, projections=[256, mel_dim])
self.last_linear = nn.Linear(mel_dim * 2, linear_dim)
def forward(self, characters, mel_specs=None):
B = characters.size(0)
inputs = self.embedding(characters)
- # (B, T', in_dim)
+ # batch x time x dim
encoder_outputs = self.encoder(inputs)
- # (B, T', mel_dim*r)
+ # batch x time x dim*r
mel_outputs, alignments = self.decoder(
encoder_outputs, mel_specs)
# Post net processing below
# Reshape
- # (B, T, mel_dim)
+ # batch x time x dim
mel_outputs = mel_outputs.view(B, -1, self.mel_dim)
linear_outputs = self.postnet(mel_outputs)
linear_outputs = self.last_linear(linear_outputs)
return mel_outputs, linear_outputs, alignments
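
For orientation, the shapes this forward pass produces, written as a usage sketch (the import path and symbol count are assumptions, and at this point in the repo's history the code still expected PyTorch Variables, so treat this as shape documentation rather than a drop-in script):

    import torch
    from TTS.models.tacotron import Tacotron  # assumed module path

    model = Tacotron(embedding_dim=256, linear_dim=1025, mel_dim=80, r=5)
    characters = torch.randint(0, 60, (2, 50))  # (batch, chars), ids < len(symbols)
    mel_specs = torch.rand(2, 60, 80)           # (batch, frames, mel), frames % r == 0
    mel_out, linear_out, alignments = model(characters, mel_specs)
    # mel_out:    (2, 60, 80)    decoder frames regrouped from r-frame chunks
    # linear_out: (2, 60, 1025)  CBHG postnet projected to the linear spectrogram
    # alignments: (2, 12, 50)    one attention row per decoder step (60 / r)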

View File

@@ -288,7 +288,8 @@ class AttentionDecoder(nn.Module):
bf_out = gru2_input + gru2_hidden
# Output
- output = self.out(bf_out).view(-1, self.num_mels, self.outputs_per_step)
+ output = self.out(bf_out).view(-1, self.num_mels,
+     self.outputs_per_step)
return output, d_t, gru1_hidden, gru2_hidden

View File

@@ -7,6 +7,7 @@ from matplotlib import pylab as plt
hop_length = 250
def create_speech(m, s, CONFIG, use_cuda, ap):
text_cleaner = [CONFIG.text_cleaner]
seq = np.array(text_to_sequence(s, text_cleaner))
@@ -14,13 +15,15 @@ def create_speech(m, s, CONFIG, use_cuda, ap):
# mel = np.zeros([seq.shape[0], CONFIG.num_mels, 1], dtype=np.float32)
if use_cuda:
- chars_var = torch.autograd.Variable(torch.from_numpy(seq), volatile=True).unsqueeze(0).cuda()
+ chars_var = torch.autograd.Variable(
+     torch.from_numpy(seq), volatile=True).unsqueeze(0).cuda()
# mel_var = torch.autograd.Variable(torch.from_numpy(mel).type(torch.cuda.FloatTensor), volatile=True).cuda()
else:
- chars_var = torch.autograd.Variable(torch.from_numpy(seq), volatile=True).unsqueeze(0)
+ chars_var = torch.autograd.Variable(
+     torch.from_numpy(seq), volatile=True).unsqueeze(0)
# mel_var = torch.autograd.Variable(torch.from_numpy(mel).type(torch.FloatTensor), volatile=True)
- mel_out, linear_out, alignments =m.forward(chars_var)
+ mel_out, linear_out, alignments = m.forward(chars_var)
linear_out = linear_out[0].data.cpu().numpy()
alignment = alignments[0].cpu().data.numpy()
spec = ap._denormalize(linear_out)
@@ -33,19 +36,18 @@ def create_speech(m, s, CONFIG, use_cuda, ap):
def visualize(alignment, spectrogram, CONFIG):
label_fontsize = 16
- plt.figure(figsize=(16,16))
+ plt.figure(figsize=(16, 16))
- plt.subplot(2,1,1)
+ plt.subplot(2, 1, 1)
plt.imshow(alignment.T, aspect="auto", origin="lower", interpolation=None)
plt.xlabel("Decoder timestamp", fontsize=label_fontsize)
plt.ylabel("Encoder timestamp", fontsize=label_fontsize)
plt.colorbar()
- plt.subplot(2,1,2)
+ plt.subplot(2, 1, 2)
librosa.display.specshow(spectrogram.T, sr=CONFIG.sample_rate,
hop_length=hop_length, x_axis="time", y_axis="linear")
plt.xlabel("Time", fontsize=label_fontsize)
plt.ylabel("Hz", fontsize=label_fontsize)
plt.tight_layout()
plt.colorbar()

View File

@@ -1,4 +1,4 @@
- #-*- coding: utf-8 -*-
+ # -*- coding: utf-8 -*-
from network import *
from data import inv_spectrogram, find_endpoint, save_wav, spectrogram

View File

@@ -6,6 +6,7 @@ from TTS.layers.tacotron import Prenet, CBHG, Decoder, Encoder
OUT_PATH = '/tmp/test.pth.tar'
class ModelSavingTests(unittest.TestCase):
def save_checkpoint_test(self):

View File

@@ -20,7 +20,7 @@ class PrenetTests(unittest.TestCase):
class CBHGTests(unittest.TestCase):
def test_in_out(self):
- layer = CBHG(128, K= 6, projections=[128, 128], num_highways=2)
+ layer = CBHG(128, K=6, projections=[128, 128], num_highways=2)
dummy_input = T.autograd.Variable(T.rand(4, 8, 128))
print(layer)
@@ -78,7 +78,8 @@ class L1LossMaskedTests(unittest.TestCase):
dummy_input = T.autograd.Variable(T.ones(4, 8, 128).float())
dummy_target = T.autograd.Variable(T.zeros(4, 8, 128).float())
- dummy_length = T.autograd.Variable((T.arange(5,9)).long())
- mask = ((_sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2)
+ dummy_length = T.autograd.Variable((T.arange(5, 9)).long())
+ mask = ((_sequence_mask(dummy_length).float() - 1.0)
+     * 100.0).unsqueeze(2)
output = layer(dummy_input + mask, dummy_target, dummy_length)
assert output.data[0] == 1.0, "1.0 vs {}".format(output.data[0])
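
The (mask - 1.0) * 100.0 trick builds a tensor that is 0 over valid timesteps and -100 over padding, so corrupting only the padded region must leave the masked loss at exactly 1.0. Concretely:

    import torch

    lengths = torch.tensor([2, 4])
    valid = (torch.arange(4).unsqueeze(0) < lengths.unsqueeze(1)).float()
    poison = (valid - 1.0) * 100.0  # 0 where valid, -100 where padded
    print(poison)
    # tensor([[   0.,    0., -100., -100.],
    #         [   0.,    0.,    0.,    0.]])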

View File

@@ -10,6 +10,7 @@ from TTS.datasets.LJSpeech import LJSpeechDataset
file_path = os.path.dirname(os.path.realpath(__file__))
c = load_config(os.path.join(file_path, 'test_config.json'))
class TestDataset(unittest.TestCase):
def __init__(self, *args, **kwargs):
@@ -140,7 +141,3 @@ class TestDataset(unittest.TestCase):
# check batch conditions
assert (mel_input * stop_target.unsqueeze(2)).sum() == 0
assert (linear_input * stop_target.unsqueeze(2)).sum() == 0

View File

@@ -83,7 +83,8 @@ def train(model, criterion, data_loader, optimizer, epoch):
mel_input = data[3]
mel_lengths = data[4]
- current_step = num_iter + args.restore_step + epoch * len(data_loader) + 1
+ current_step = num_iter + args.restore_step + \
+     epoch * len(data_loader) + 1
# setup lr
current_lr = lr_decay(c.lr, current_step, c.warmup_steps)
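
lr_decay is defined in the training utilities, not in this diff. A sketch consistent with the Noam-style warmup schedule this call signature suggests (treat the exact formula as an assumption):

    import numpy as np

    def lr_decay_sketch(init_lr, global_step, warmup_steps):
        # Linear warmup for warmup_steps, then inverse-sqrt decay.
        warmup_steps = float(warmup_steps)
        step = global_step + 1.0
        return init_lr * warmup_steps**0.5 * np.minimum(
            step * warmup_steps**-1.5, step**-0.5)

    for step in (0, 1000, 4000, 16000):
        print(step, lr_decay_sketch(0.002, step, 4000))
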
@@ -113,7 +114,7 @@ def train(model, criterion, data_loader, optimizer, epoch):
mel_loss = criterion(mel_output, mel_spec_var, mel_lengths_var)
linear_loss = 0.5 * criterion(linear_output, linear_spec_var, mel_lengths_var) \
+ 0.5 * criterion(linear_output[:, :, :n_priority_freq],
- linear_spec_var[: ,: ,:n_priority_freq],
+ linear_spec_var[:, :, :n_priority_freq],
mel_lengths_var)
loss = mel_loss + linear_loss
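
n_priority_freq is computed earlier in the script and is not part of this hunk; the usual Tacotron recipe maps a roughly 3 kHz priority band onto linear-spectrogram bins and gives it half of the linear-loss weight, along these lines:

    # Assumed derivation of the priority band (not shown in this diff):
    sample_rate = 22050   # c.sample_rate
    num_freq = 1025       # c.num_freq, linear spectrogram bins
    n_priority_freq = int(3000 / (sample_rate * 0.5) * num_freq)
    print(n_priority_freq)  # 278 bins cover 0-3 kHz, where most speech energy sits
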
@@ -131,7 +132,8 @@ def train(model, criterion, data_loader, optimizer, epoch):
# update
progbar.update(num_iter+1, values=[('total_loss', loss.data[0]),
- ('linear_loss', linear_loss.data[0]),
+ ('linear_loss',
+     linear_loss.data[0]),
('mel_loss', mel_loss.data[0]),
('grad_norm', grad_norm)])
@@ -167,7 +169,8 @@ def train(model, criterion, data_loader, optimizer, epoch):
# Sample audio
audio_signal = linear_output[0].data.cpu().numpy()
data_loader.dataset.ap.griffin_lim_iters = 60
- audio_signal = data_loader.dataset.ap.inv_spectrogram(audio_signal.T)
+ audio_signal = data_loader.dataset.ap.inv_spectrogram(
+     audio_signal.T)
try:
tb.add_audio('SampleAudio', audio_signal, current_step,
sample_rate=c.sample_rate)
@@ -177,14 +180,14 @@ def train(model, criterion, data_loader, optimizer, epoch):
# print(audio_signal.min())
pass
avg_linear_loss /= (num_iter + 1)
avg_mel_loss /= (num_iter + 1)
avg_total_loss = avg_mel_loss + avg_linear_loss
# Plot Training Epoch Stats
tb.add_scalar('TrainEpochLoss/TotalLoss', loss.data[0], current_step)
- tb.add_scalar('TrainEpochLoss/LinearLoss', linear_loss.data[0], current_step)
+ tb.add_scalar('TrainEpochLoss/LinearLoss',
+     linear_loss.data[0], current_step)
tb.add_scalar('TrainEpochLoss/MelLoss', mel_loss.data[0], current_step)
tb.add_scalar('Time/EpochTime', epoch_time, epoch)
epoch_time = 0
@@ -227,13 +230,14 @@ def evaluate(model, criterion, data_loader, current_step):
linear_spec_var = linear_spec_var.cuda()
# forward pass
- mel_output, linear_output, alignments = model.forward(text_input_var, mel_spec_var)
+ mel_output, linear_output, alignments = model.forward(
+     text_input_var, mel_spec_var)
# loss computation
mel_loss = criterion(mel_output, mel_spec_var, mel_lengths_var)
linear_loss = 0.5 * criterion(linear_output, linear_spec_var, mel_lengths_var) \
+ 0.5 * criterion(linear_output[:, :, :n_priority_freq],
- linear_spec_var[: ,: ,:n_priority_freq],
+ linear_spec_var[:, :, :n_priority_freq],
mel_lengths_var)
loss = mel_loss + linear_loss
@@ -242,7 +246,8 @@ def evaluate(model, criterion, data_loader, current_step):
# update
progbar.update(num_iter+1, values=[('total_loss', loss.data[0]),
- ('linear_loss', linear_loss.data[0]),
+ ('linear_loss',
+     linear_loss.data[0]),
('mel_loss', mel_loss.data[0])])
avg_linear_loss += linear_loss.data[0]
@@ -328,7 +333,7 @@ def main(args):
val_loader = DataLoader(val_dataset, batch_size=c.eval_batch_size,
shuffle=False, collate_fn=val_dataset.collate_fn,
- drop_last=False, num_workers= 4,
+ drop_last=False, num_workers=4,
pin_memory=True)
model = Tacotron(c.embedding_size,
@@ -369,12 +374,14 @@ def main(args):
best_loss = float('inf')
for epoch in range(0, c.epochs):
- train_loss, current_step = train(model, criterion, train_loader, optimizer, epoch)
+ train_loss, current_step = train(
+     model, criterion, train_loader, optimizer, epoch)
val_loss = evaluate(model, criterion, val_loader, current_step)
best_loss = save_best_model(model, optimizer, val_loss,
best_loss, OUT_PATH,
current_step, epoch)
if __name__ == '__main__':
signal.signal(signal.SIGINT, signal_handler)
main(args)

View File

@@ -23,61 +23,49 @@ class AudioProcessor(object):
self.power = power
self.griffin_lim_iters = griffin_lim_iters
def save_wav(self, wav, path):
wav *= 32767 / max(0.01, np.max(np.abs(wav)))
librosa.output.write_wav(path, wav.astype(np.int16), self.sample_rate)
def _linear_to_mel(self, spectrogram):
global _mel_basis
if _mel_basis is None:
_mel_basis = self._build_mel_basis()
return np.dot(_mel_basis, spectrogram)
def _build_mel_basis(self, ):
n_fft = (self.num_freq - 1) * 2
return librosa.filters.mel(self.sample_rate, n_fft, n_mels=self.num_mels)
def _normalize(self, S):
return np.clip((S - self.min_level_db) / -self.min_level_db, 0, 1)
def _denormalize(self, S):
return (np.clip(S, 0, 1) * -self.min_level_db) + self.min_level_db
def _stft_parameters(self, ):
n_fft = (self.num_freq - 1) * 2
hop_length = int(self.frame_shift_ms / 1000 * self.sample_rate)
win_length = int(self.frame_length_ms / 1000 * self.sample_rate)
return n_fft, hop_length, win_length
def _amp_to_db(self, x):
return 20 * np.log10(np.maximum(1e-5, x))
def _db_to_amp(self, x):
return np.power(10.0, x * 0.05)
def apply_preemphasis(self, x):
return signal.lfilter([1, -self.preemphasis], [1], x)
def apply_inv_preemphasis(self, x):
return signal.lfilter([1], [1, -self.preemphasis], x)
def spectrogram(self, y):
D = self._stft(self.apply_preemphasis(y))
S = self._amp_to_db(np.abs(D)) - self.ref_level_db
return self._normalize(S)
def inv_spectrogram(self, spectrogram):
'''Converts spectrogram to waveform using librosa'''
S = self._denormalize(spectrogram)
@@ -85,7 +73,6 @@ class AudioProcessor(object):
# Reconstruct phase
return self.apply_inv_preemphasis(self._griffin_lim(S ** self.power))
def _griffin_lim(self, S):
'''librosa implementation of Griffin-Lim
Based on https://github.com/librosa/librosa/issues/434
@@ -98,13 +85,11 @@ class AudioProcessor(object):
y = self._istft(S_complex * angles)
return y
def melspectrogram(self, y):
D = self._stft(self.apply_preemphasis(y))
S = self._amp_to_db(self._linear_to_mel(np.abs(D))) - self.ref_level_db
return self._normalize(S)
def _stft(self, y):
n_fft, hop_length, win_length = self._stft_parameters()
return librosa.stft(y=y, n_fft=n_fft, hop_length=hop_length, win_length=win_length)
@@ -113,7 +98,6 @@ class AudioProcessor(object):
_, hop_length, win_length = self._stft_parameters()
return librosa.istft(y, hop_length=hop_length, win_length=win_length)
def find_endpoint(self, wav, threshold_db=-40, min_silence_sec=0.8):
window_length = int(self.sample_rate * min_silence_sec)
hop_length = int(window_length / 4)
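
Only the tail of _griffin_lim appears above. A self-contained sketch of the algorithm the docstring references (random initial phase, then repeated STFT/ISTFT projections that keep the target magnitude; based on the librosa issue linked in the code, with parameters mirroring _stft_parameters for num_freq=1025 at 22050 Hz):

    import numpy as np
    import librosa

    def griffin_lim_sketch(S, n_iter=60, n_fft=2048, hop_length=275, win_length=1102):
        # S: magnitude spectrogram (1 + n_fft // 2, frames), already
        # raised to `power` by the caller.
        angles = np.exp(2j * np.pi * np.random.rand(*S.shape))  # random phase
        S_complex = np.abs(S).astype(np.complex64)
        y = librosa.istft(S_complex * angles, hop_length=hop_length,
                          win_length=win_length)
        for _ in range(n_iter):
            # Keep the magnitude, re-estimate the phase from the waveform.
            angles = np.exp(1j * np.angle(librosa.stft(
                y, n_fft=n_fft, hop_length=hop_length, win_length=win_length)))
            y = librosa.istft(S_complex * angles, hop_length=hop_length,
                              win_length=win_length)
        return y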

View File

@@ -17,9 +17,11 @@ def prepare_data(inputs):
def _pad_tensor(x, length):
_pad = 0
assert x.ndim == 2
- x = np.pad(x, [[0, 0], [0, length - x.shape[1]]], mode='constant', constant_values=_pad)
+ x = np.pad(x, [[0, 0], [0, length - x.shape[1]]],
+     mode='constant', constant_values=_pad)
return x
def prepare_tensor(inputs, out_steps):
max_len = max((x.shape[1] for x in inputs)) + 1 # zero-frame
remainder = max_len % out_steps
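
prepare_tensor is cut off above; it rounds the batch's longest spectrogram up to a multiple of out_steps and pads each item to that length with _pad_tensor. A sketch of the rounding this implies:

    import numpy as np

    def prepare_tensor_sketch(inputs, out_steps):
        # inputs: list of (dim, frames) arrays; the decoder emits
        # out_steps frames per step, so lengths must divide evenly.
        max_len = max(x.shape[1] for x in inputs) + 1  # +1 zero-frame
        remainder = max_len % out_steps
        pad_len = max_len + (out_steps - remainder) if remainder > 0 else max_len
        return np.stack([np.pad(x, [[0, 0], [0, pad_len - x.shape[1]]],
                                mode='constant', constant_values=0)
                         for x in inputs])

    batch = [np.ones((80, 7)), np.ones((80, 4))]
    print(prepare_tensor_sketch(batch, 5).shape)  # (2, 80, 10)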

View File

@@ -90,7 +90,8 @@ def save_best_model(model, optimizer, model_loss, best_loss, out_path,
best_loss = model_loss
bestmodel_path = 'best_model.pth.tar'
bestmodel_path = os.path.join(out_path, bestmodel_path)
- print("\n | > Best model saving with loss {0:.2f} : {1:}".format(model_loss, bestmodel_path))
+ print("\n | > Best model saving with loss {0:.2f} : {1:}".format(
+     model_loss, bestmodel_path))
torch.save(state, bestmodel_path)
return best_loss
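
The guard before this hunk (not shown) only reaches this branch when model_loss improves on best_loss, so the function keeps a single rolling best checkpoint. A minimal sketch of that pattern, with an assumed state layout:

    import os
    import torch

    def save_best_model_sketch(model, optimizer, model_loss, best_loss,
                               out_path, current_step, epoch):
        if model_loss >= best_loss:
            return best_loss
        state = {'model': model.state_dict(),  # assumed state keys
                 'optimizer': optimizer.state_dict(),
                 'step': current_step,
                 'epoch': epoch,
                 'linear_loss': model_loss}
        bestmodel_path = os.path.join(out_path, 'best_model.pth.tar')
        torch.save(state, bestmodel_path)
        return model_loss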

View File

@@ -1,4 +1,4 @@
- #-*- coding: utf-8 -*-
+ # -*- coding: utf-8 -*-
import re
from TTS.utils.text import cleaners

View File

@@ -1,4 +1,4 @@
- #-*- coding: utf-8 -*-
+ # -*- coding: utf-8 -*-
'''

View File

@@ -1,4 +1,4 @@
- #-*- coding: utf-8 -*-
+ # -*- coding: utf-8 -*-
import re

View File

@@ -1,4 +1,4 @@
- #-*- coding: utf-8 -*-
+ # -*- coding: utf-8 -*-
import inflect
import re

View File

@@ -1,4 +1,4 @@
- #-*- coding: utf-8 -*-
+ # -*- coding: utf-8 -*-
'''

View File

@@ -5,7 +5,7 @@ import matplotlib.pyplot as plt
def plot_alignment(alignment, info=None):
- fig, ax = plt.subplots(figsize=(16,10))
+ fig, ax = plt.subplots(figsize=(16, 10))
im = ax.imshow(alignment.T, aspect='auto', origin='lower',
interpolation='none')
fig.colorbar(im, ax=ax)