mirror of https://github.com/coqui-ai/TTS.git
fix Lint checks
commit 28bec238ca
@@ -0,0 +1,38 @@
+name: Publish Python 🐍 distributions 📦 to PyPI
+on:
+  release:
+    types: [published]
+defaults:
+  run:
+    shell:
+      bash
+jobs:
+  build-package:
+    runs-on: ubuntu-20.04
+    steps:
+      - uses: actions/checkout@v2
+      - name: Verify tag matches version
+        run: |
+          set -ex
+          version=$(cat TTS/VERSION)
+          tag="${GITHUB_REF/refs\/tags\/}"
+          if [[ "v$version" != "$tag" ]]; then
+            exit 1
+          fi
+      - uses: actions/setup-python@v2
+        with:
+          python-version: 3.8
+      - run: |
+          python -m pip install -U pip setuptools twine toml
+          python -c 'import toml; c = toml.load("pyproject.toml"); print("\n".join(c["build-system"]["requires"]))' | pip install -r /dev/stdin
+      - run: |
+          python setup.py sdist
+      - name: Setup PyPI config
+        run: |
+          cat << EOF > ~/.pypirc
+          [pypi]
+          username=__token__
+          password=${{ secrets.PYPI_TOKEN }}
+          EOF
+      - run: |
+          twine upload --repository pypi dist/*.tar.gz

@@ -158,7 +158,8 @@ disable=missing-docstring,
         deprecated-sys-function,
         exception-escape,
         comprehension-escape,
-        duplicate-code
+        duplicate-code,
+        not-callable

 # Enable the message, report, category or checker with the given id(s). You can
 # either give multiple identifier separated by comma (,) or put this option

@@ -253,7 +254,7 @@ contextmanager-decorators=contextlib.contextmanager
 # List of members which are set dynamically and missed by pylint inference
 # system, and so shouldn't trigger E1101 when accessed. Python regular
 # expressions are accepted.
-generated-members=
+generated-members=numpy.*,torch.*

 # Tells whether missing members accessed in mixin class should be ignored. A
 # mixin class is detected if its name ends with "mixin" (case insensitive).

@@ -1,6 +1,7 @@
 include README.md
 include LICENSE.txt
 include requirements.*.txt
+include TTS/VERSION
 recursive-include TTS *.json
 recursive-include TTS *.html
 recursive-include TTS *.png

@@ -149,6 +149,18 @@
                 "needs_phonemizer": true
             }
         }
+        },
+        "ja":{
+            "kokoro":{
+                "tacotron2-DDC":{
+                    "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.0.15/tts_models--jp--kokoro--tacotron2-DDC.zip",
+                    "default_vocoder": "vocoder_models/universal/libri-tts/wavegrad",
+                    "description": "Tacotron2 with Double Decoder Consistency trained with Kokoro Speech Dataset.",
+                    "author": "@kaiidams",
+                    "commit": "401fbd89",
+                    "needs_phonemizer": false
+                }
+            }
         }
     },
     "vocoder_models":{

@@ -0,0 +1 @@
+0.0.15

@@ -1 +1,7 @@
-from ._version import __version__
+import os
+
+
+with open(os.path.join(os.path.dirname(__file__), "VERSION")) as f:
+    version = f.read().strip()
+
+__version__ = version

@@ -1 +0,0 @@
-__version__ = "0.0.14"

@@ -6,12 +6,12 @@ import numpy as np
 from tqdm import tqdm

-from TTS.config import load_config
+from TTS.config import BaseDatasetConfig, load_config
 from TTS.speaker_encoder.utils.generic_utils import setup_model
 from TTS.tts.datasets.preprocess import load_meta_data
 from TTS.tts.utils.speakers import SpeakerManager
 from TTS.utils.audio import AudioProcessor


 parser = argparse.ArgumentParser(
     description='Compute embedding vectors for each wav file in a dataset.'
 )

@@ -74,6 +74,7 @@ for idx, wav_file in enumerate(tqdm(wav_files)):
 if speaker_mapping:
     # save speaker_mapping if target dataset is defined
     if '.json' not in args.output_path and '.npy' not in args.output_path:
+
         mapping_file_path = os.path.join(args.output_path, "speakers.json")
         mapping_npy_file_path = os.path.join(args.output_path, "speakers.npy")
     else:

@@ -51,7 +51,7 @@ def main():
         my_env["PYTHON_EGG_CACHE"] = "/tmp/tmp{}".format(i)
         command[-1] = "--rank={}".format(i)
         stdout = None if i == 0 else open(os.devnull, "w")
-        p = subprocess.Popen(["python3"] + command, stdout=stdout, env=my_env)
+        p = subprocess.Popen(["python3"] + command, stdout=stdout, env=my_env)  # pylint: disable=consider-using-with
         processes.append(p)
         print(command)

@@ -299,4 +299,5 @@ if __name__ == "__main__":
     args = parser.parse_args()

     c = load_config(args.config_path)
+    c.audio['do_trim_silence'] = False  # IMPORTANT!!!!!!!!!!!!!!! disable to align mel
     main(args)

@@ -10,10 +10,8 @@ import torch
 from torch.utils.data import DataLoader

 from TTS.speaker_encoder.dataset import SpeakerEncoderDataset
-
 from TTS.speaker_encoder.losses import AngleProtoLoss, GE2ELoss, SoftmaxAngleProtoLoss
 from TTS.speaker_encoder.utils.generic_utils import save_best_model, setup_model
-
 from TTS.speaker_encoder.utils.visual import plot_embeddings
 from TTS.tts.datasets.preprocess import load_meta_data
 from TTS.utils.arguments import init_training

@@ -45,7 +43,7 @@ def setup_loader(ap: AudioProcessor, is_val: bool = False, verbose: bool = False
         storage_size=c.storage["storage_size"],
         sample_from_storage_p=c.storage["sample_from_storage_p"],
         verbose=verbose,
-        augmentation_config=c.audio_augmentation
+        augmentation_config=c.audio_augmentation,
     )

     # sampler = DistributedSampler(dataset) if num_gpus > 1 else None

@@ -170,19 +168,18 @@ def main(args):  # pylint: disable=redefined-outer-name
     else:
         raise Exception("The %s not is a loss supported" % c.loss)

-
     if args.restore_path:
         checkpoint = torch.load(args.restore_path)
         try:
             model.load_state_dict(checkpoint["model"])

-            if 'criterion' in checkpoint:
+            if "criterion" in checkpoint:
                 criterion.load_state_dict(checkpoint["criterion"])

         except (KeyError, RuntimeError):
             print(" > Partial model initialization.")
             model_dict = model.state_dict()
-            model_dict = set_init_dict(model_dict, checkpoint['model'], c)
+            model_dict = set_init_dict(model_dict, checkpoint["model"], c)
             model.load_state_dict(model_dict)
             del model_dict
             for group in optimizer.param_groups:

@@ -99,7 +99,9 @@ if args.vocoder_path is not None:
     vocoder_config_path = args.vocoder_config_path

 # load models
-synthesizer = Synthesizer(model_path, config_path, speakers_file_path, vocoder_path, vocoder_config_path, args.use_cuda)
+synthesizer = Synthesizer(
+    model_path, config_path, speakers_file_path, vocoder_path, vocoder_config_path, use_cuda=args.use_cuda
+)

 use_multi_speaker = synthesizer.speaker_manager is not None
 # TODO: set this from SpeakerManager

@@ -1,11 +1,12 @@
 import random
+
 import numpy as np
 import torch
 from torch.utils.data import Dataset

 from TTS.speaker_encoder.utils.generic_utils import AugmentWAV, Storage


 class SpeakerEncoderDataset(Dataset):
     def __init__(
         self,

@@ -18,7 +19,7 @@ class SpeakerEncoderDataset(Dataset):
         num_utter_per_speaker=10,
         skip_speakers=False,
         verbose=False,
-        augmentation_config=None
+        augmentation_config=None,
     ):
         """
         Args:

@@ -38,7 +39,9 @@ class SpeakerEncoderDataset(Dataset):
         self.verbose = verbose
         self.__parse_items()
         storage_max_size = storage_size * num_speakers_in_batch
-        self.storage = Storage(maxsize=storage_max_size, storage_batchs=storage_size, num_speakers_in_batch=num_speakers_in_batch)
+        self.storage = Storage(
+            maxsize=storage_max_size, storage_batchs=storage_size, num_speakers_in_batch=num_speakers_in_batch
+        )
         self.sample_from_storage_p = float(sample_from_storage_p)

         speakers_aux = list(self.speakers)

@@ -49,12 +52,12 @@ class SpeakerEncoderDataset(Dataset):
         self.augmentator = None
         self.gaussian_augmentation_config = None
         if augmentation_config:
-            self.data_augmentation_p = augmentation_config['p']
-            if self.data_augmentation_p and ('additive' in augmentation_config or 'rir' in augmentation_config):
+            self.data_augmentation_p = augmentation_config["p"]
+            if self.data_augmentation_p and ("additive" in augmentation_config or "rir" in augmentation_config):
                 self.augmentator = AugmentWAV(ap, augmentation_config)

-            if 'gaussian' in augmentation_config.keys():
-                self.gaussian_augmentation_config = augmentation_config['gaussian']
+            if "gaussian" in augmentation_config.keys():
+                self.gaussian_augmentation_config = augmentation_config["gaussian"]

         if self.verbose:
             print("\n > DataLoader initialization")

@@ -231,9 +234,13 @@ class SpeakerEncoderDataset(Dataset):
                 offset = random.randint(0, wav.shape[0] - self.seq_len)
                 wav = wav[offset : offset + self.seq_len]
                 # add random gaussian noise
-                if self.gaussian_augmentation_config and self.gaussian_augmentation_config['p']:
-                    if random.random() < self.gaussian_augmentation_config['p']:
-                        wav += np.random.normal(self.gaussian_augmentation_config['min_amplitude'], self.gaussian_augmentation_config['max_amplitude'], size=len(wav))
+                if self.gaussian_augmentation_config and self.gaussian_augmentation_config["p"]:
+                    if random.random() < self.gaussian_augmentation_config["p"]:
+                        wav += np.random.normal(
+                            self.gaussian_augmentation_config["min_amplitude"],
+                            self.gaussian_augmentation_config["max_amplitude"],
+                            size=len(wav),
+                        )
                 mel = self.ap.melspectrogram(wav)
                 feats_.append(torch.FloatTensor(mel))

@@ -162,6 +162,7 @@ class AngleProtoLoss(nn.Module):
         L = self.criterion(cos_sim_matrix, label)
         return L

+
 class SoftmaxLoss(nn.Module):
     """
     Implementation of the Softmax loss as defined in https://arxiv.org/abs/2003.11982

@@ -169,13 +170,14 @@ class SoftmaxLoss(nn.Module):
         - embedding_dim (float): speaker embedding dim
         - n_speakers (float): number of speakers
     """
+
     def __init__(self, embedding_dim, n_speakers):
         super().__init__()

         self.criterion = torch.nn.CrossEntropyLoss()
         self.fc = nn.Linear(embedding_dim, n_speakers)

-        print('Initialised Softmax Loss')
+        print("Initialised Softmax Loss")

     def forward(self, x, label=None):
         # reshape for compatibility

@@ -187,6 +189,7 @@ class SoftmaxLoss(nn.Module):

         return L

+
 class SoftmaxAngleProtoLoss(nn.Module):
     """
     Implementation of the Softmax AnglePrototypical loss as defined in https://arxiv.org/abs/2009.14153

@@ -196,13 +199,14 @@ class SoftmaxAngleProtoLoss(nn.Module):
         - init_w (float): defines the initial value of w
         - init_b (float): definies the initial value of b
     """
+
     def __init__(self, embedding_dim, n_speakers, init_w=10.0, init_b=-5.0):
         super().__init__()

         self.softmax = SoftmaxLoss(embedding_dim, n_speakers)
         self.angleproto = AngleProtoLoss(init_w, init_b)

-        print('Initialised SoftmaxAnglePrototypical Loss')
+        print("Initialised SoftmaxAnglePrototypical Loss")

     def forward(self, x, label=None):
         """

@@ -1,7 +1,8 @@
-import torch
 import numpy as np
+import torch
 import torch.nn as nn


 class SELayer(nn.Module):
     def __init__(self, channel, reduction=8):
         super(SELayer, self).__init__()

@@ -10,7 +11,7 @@ class SELayer(nn.Module):
             nn.Linear(channel, channel // reduction),
             nn.ReLU(inplace=True),
             nn.Linear(channel // reduction, channel),
-            nn.Sigmoid()
+            nn.Sigmoid(),
         )

     def forward(self, x):

@@ -19,6 +20,7 @@ class SELayer(nn.Module):
         y = self.fc(y).view(b, c, 1, 1)
         return x * y

+
 class SEBasicBlock(nn.Module):
     expansion = 1

@@ -51,12 +53,22 @@ class SEBasicBlock(nn.Module):
         out = self.relu(out)
         return out

+
 class ResNetSpeakerEncoder(nn.Module):
     """Implementation of the model H/ASP without batch normalization in speaker embedding. This model was proposed in: https://arxiv.org/abs/2009.14153
     Adapted from: https://github.com/clovaai/voxceleb_trainer
     """

     # pylint: disable=W0102
-    def __init__(self, input_dim=64, proj_dim=512, layers=[3, 4, 6, 3], num_filters=[32, 64, 128, 256], encoder_type='ASP', log_input=False):
+    def __init__(
+        self,
+        input_dim=64,
+        proj_dim=512,
+        layers=[3, 4, 6, 3],
+        num_filters=[32, 64, 128, 256],
+        encoder_type="ASP",
+        log_input=False,
+    ):
         super(ResNetSpeakerEncoder, self).__init__()

         self.encoder_type = encoder_type

@@ -89,7 +101,7 @@ class ResNetSpeakerEncoder(nn.Module):
         elif self.encoder_type == "ASP":
             out_dim = num_filters[3] * outmap_size * 2
         else:
-            raise ValueError('Undefined encoder')
+            raise ValueError("Undefined encoder")

         self.fc = nn.Linear(out_dim, proj_dim)

@@ -98,7 +110,7 @@ class ResNetSpeakerEncoder(nn.Module):
     def _init_layers(self):
         for m in self.modules():
             if isinstance(m, nn.Conv2d):
-                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
+                nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
             elif isinstance(m, nn.BatchNorm2d):
                 nn.init.constant_(m.weight, 1)
                 nn.init.constant_(m.bias, 0)

@@ -107,8 +119,7 @@ class ResNetSpeakerEncoder(nn.Module):
         downsample = None
         if stride != 1 or self.inplanes != planes * block.expansion:
             downsample = nn.Sequential(
-                nn.Conv2d(self.inplanes, planes * block.expansion,
-                          kernel_size=1, stride=stride, bias=False),
+                nn.Conv2d(self.inplanes, planes * block.expansion, kernel_size=1, stride=stride, bias=False),
                 nn.BatchNorm2d(planes * block.expansion),
             )

@@ -25,10 +25,7 @@ class SpeakerEncoderConfig(BaseTrainingConfig):
         }
     )

-    audio_augmentation : dict = field(
-        default_factory=lambda: {
-        }
-    )
+    audio_augmentation: dict = field(default_factory=lambda: {})

     storage: dict = field(
         default_factory=lambda: {

@@ -1,18 +1,18 @@
-import re
+import datetime
+import glob
 import os
+import random
+import re
+from multiprocessing import Manager

 import numpy as np
 import torch
-import glob
-import random
-import datetime

 from scipy import signal
-from multiprocessing import Manager

 from TTS.speaker_encoder.models.lstm import LSTMSpeakerEncoder
 from TTS.speaker_encoder.models.resnet import ResNetSpeakerEncoder


 class Storage(object):
     def __init__(self, maxsize, storage_batchs, num_speakers_in_batch, num_threads=8):
         # use multiprocessing for threading safe

@@ -53,19 +53,19 @@ class Storage(object):
         return self.storage[random.randint(0, storage_size)]

     def get_random_sample_fast(self):
-        '''Call this method only when storage is full'''
+        """Call this method only when storage is full"""
         return self.storage[random.randint(0, self.safe_storage_size)]

-class AugmentWAV(object):
-
+
+class AugmentWAV(object):
     def __init__(self, ap, augmentation_config):

         self.ap = ap
         self.use_additive_noise = False

-        if 'additive' in augmentation_config.keys():
-            self.additive_noise_config = augmentation_config['additive']
-            additive_path = self.additive_noise_config['sounds_path']
+        if "additive" in augmentation_config.keys():
+            self.additive_noise_config = augmentation_config["additive"]
+            additive_path = self.additive_noise_config["sounds_path"]
             if additive_path:
                 self.use_additive_noise = True
                 # get noise types

@@ -74,12 +74,12 @@ class AugmentWAV(object):
             if isinstance(self.additive_noise_config[key], dict):
                 self.additive_noise_types.append(key)

-        additive_files = glob.glob(os.path.join(additive_path, '**/*.wav'), recursive=True)
+        additive_files = glob.glob(os.path.join(additive_path, "**/*.wav"), recursive=True)

         self.noise_list = {}

         for wav_file in additive_files:
-            noise_dir = wav_file.replace(additive_path, '').split(os.sep)[0]
+            noise_dir = wav_file.replace(additive_path, "").split(os.sep)[0]
             # ignore not listed directories
             if noise_dir not in self.additive_noise_types:
                 continue

@@ -87,14 +87,16 @@ class AugmentWAV(object):
                 self.noise_list[noise_dir] = []
             self.noise_list[noise_dir].append(wav_file)

-        print(f" | > Using Additive Noise Augmentation: with {len(additive_files)} audios instances from {self.additive_noise_types}")
+        print(
+            f" | > Using Additive Noise Augmentation: with {len(additive_files)} audios instances from {self.additive_noise_types}"
+        )

         self.use_rir = False

-        if 'rir' in augmentation_config.keys():
-            self.rir_config = augmentation_config['rir']
-            if self.rir_config['rir_path']:
-                self.rir_files = glob.glob(os.path.join(self.rir_config['rir_path'], '**/*.wav'), recursive=True)
+        if "rir" in augmentation_config.keys():
+            self.rir_config = augmentation_config["rir"]
+            if self.rir_config["rir_path"]:
+                self.rir_files = glob.glob(os.path.join(self.rir_config["rir_path"], "**/*.wav"), recursive=True)
                 self.use_rir = True

                 print(f" | > Using RIR Noise Augmentation: with {len(self.rir_files)} audios instances")

@@ -113,7 +115,13 @@ class AugmentWAV(object):

         clean_db = 10 * np.log10(np.mean(audio ** 2) + 1e-4)

-        noise_list = random.sample(self.noise_list[noise_type], random.randint(self.additive_noise_config[noise_type]['min_num_noises'], self.additive_noise_config[noise_type]['max_num_noises']))
+        noise_list = random.sample(
+            self.noise_list[noise_type],
+            random.randint(
+                self.additive_noise_config[noise_type]["min_num_noises"],
+                self.additive_noise_config[noise_type]["max_num_noises"],
+            ),
+        )

         audio_len = audio.shape[0]
         noises_wav = None

@@ -123,7 +131,10 @@ class AugmentWAV(object):
             if noiseaudio.shape[0] < audio_len:
                 continue

-            noise_snr = random.uniform(self.additive_noise_config[noise_type]['min_snr_in_db'], self.additive_noise_config[noise_type]['max_num_noises'])
+            noise_snr = random.uniform(
+                self.additive_noise_config[noise_type]["min_snr_in_db"],
+                self.additive_noise_config[noise_type]["max_num_noises"],
+            )
             noise_db = 10 * np.log10(np.mean(noiseaudio ** 2) + 1e-4)
             noise_wav = np.sqrt(10 ** ((clean_db - noise_db - noise_snr) / 10)) * noiseaudio

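Aside: the `noise_wav` line in the hunk above is standard SNR-based noise scaling; for a target SNR s in dB, the noise is multiplied by sqrt(10 ** ((clean_db - noise_db - s) / 10)). A self-contained sketch of that arithmetic (hypothetical helper name, not part of the patch):

    import numpy as np

    def scale_noise_for_snr(clean, noise, snr_db):
        # signal powers in dB; the 1e-4 term guards against log10(0), as in the patch
        clean_db = 10 * np.log10(np.mean(clean ** 2) + 1e-4)
        noise_db = 10 * np.log10(np.mean(noise ** 2) + 1e-4)
        # amplitude gain that leaves the scaled noise snr_db below the clean signal
        gain = np.sqrt(10 ** ((clean_db - noise_db - snr_db) / 10))
        return gain * noise
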
@@ -144,7 +155,7 @@ class AugmentWAV(object):
         rir_file = random.choice(self.rir_files)
         rir = self.ap.load_wav(rir_file, sr=self.ap.sample_rate)
         rir = rir / np.sqrt(np.sum(rir ** 2))
-        return signal.convolve(audio, rir, mode=self.rir_config['conv_mode'])[:audio_len]
+        return signal.convolve(audio, rir, mode=self.rir_config["conv_mode"])[:audio_len]

     def apply_one(self, audio):
         noise_type = random.choice(self.global_noise_list)

@@ -153,17 +164,25 @@ class AugmentWAV(object):

         return self.additive_noise(noise_type, audio)


 def to_camel(text):
     text = text.capitalize()
     return re.sub(r"(?!^)_([a-zA-Z])", lambda m: m.group(1).upper(), text)


 def setup_model(c):
-    if c.model_params['model_name'].lower() == 'lstm':
-        model = LSTMSpeakerEncoder(c.model_params["input_dim"], c.model_params["proj_dim"], c.model_params["lstm_dim"], c.model_params["num_lstm_layers"])
-    elif c.model_params['model_name'].lower() == 'resnet':
+    if c.model_params["model_name"].lower() == "lstm":
+        model = LSTMSpeakerEncoder(
+            c.model_params["input_dim"],
+            c.model_params["proj_dim"],
+            c.model_params["lstm_dim"],
+            c.model_params["num_lstm_layers"],
+        )
+    elif c.model_params["model_name"].lower() == "resnet":
         model = ResNetSpeakerEncoder(input_dim=c.model_params["input_dim"], proj_dim=c.model_params["proj_dim"])
     return model


 def save_checkpoint(model, optimizer, criterion, model_loss, out_path, current_step, epoch):
     checkpoint_path = "checkpoint_{}.pth.tar".format(current_step)
     checkpoint_path = os.path.join(out_path, checkpoint_path)

@@ -441,3 +441,17 @@ def baker(root_path: str, meta_file: str) -> List[List[str]]:
         wav_path = os.path.join(root_path, "clips_22", wav_name)
         items.append([text, wav_path, speaker_name])
     return items
+
+
+def kokoro(root_path, meta_file):
+    """Japanese single-speaker dataset from https://github.com/kaiidams/Kokoro-Speech-Dataset"""
+    txt_file = os.path.join(root_path, meta_file)
+    items = []
+    speaker_name = "kokoro"
+    with open(txt_file, "r") as ttf:
+        for line in ttf:
+            cols = line.split("|")
+            wav_file = os.path.join(root_path, "wavs", cols[0] + ".wav")
+            text = cols[2].replace(" ", "")
+            items.append([text, wav_file, speaker_name])
+    return items

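Aside: a minimal sketch of how the new kokoro formatter would be called (the dataset path here is hypothetical; the layout follows the Kokoro-Speech-Dataset instructions linked in the docstring):

    from TTS.tts.datasets.preprocess import kokoro

    # each item is [text, wav_path, speaker_name], matching the other formatters
    items = kokoro("/data/kokoro-speech-v1_1-small", "metadata.csv")
    print(items[0])
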
@@ -255,6 +255,7 @@ class Tacotron2(TacotronAbstract):
         if self.num_speakers > 1:
             if not self.embeddings_per_sample:
                 speaker_embeddings = self.speaker_embedding(speaker_ids)[:, None]
+                speaker_embeddings = torch.unsqueeze(speaker_embeddings, 0).transpose(1, 2)
             encoder_outputs = self._concat_speaker_embedding(encoder_outputs, speaker_embeddings)

         decoder_outputs, alignments, stop_tokens = self.decoder.inference(encoder_outputs)

@@ -277,6 +278,7 @@ class Tacotron2(TacotronAbstract):
         if self.num_speakers > 1:
             if not self.embeddings_per_sample:
                 speaker_embeddings = self.speaker_embedding(speaker_ids)[:, None]
+                speaker_embeddings = torch.unsqueeze(speaker_embeddings, 0).transpose(1, 2)
             encoder_outputs = self._concat_speaker_embedding(encoder_outputs, speaker_embeddings)

         mel_outputs, alignments, stop_tokens = self.decoder.inference_truncated(encoder_outputs)

@@ -6,6 +6,7 @@ from packaging import version

 from TTS.tts.utils.text import cleaners
 from TTS.tts.utils.text.chinese_mandarin.phonemizer import chinese_text_to_phonemes
+from TTS.tts.utils.text.japanese.phonemizer import japanese_text_to_phonemes
 from TTS.tts.utils.text.symbols import _bos, _eos, _punctuations, make_symbols, phonemes, symbols

 # pylint: disable=unnecessary-comprehension

@@ -39,6 +40,11 @@ def text2phone(text, language):
     if language == "zh-CN":
         ph = chinese_text_to_phonemes(text)
         return ph
+
+    if language == "ja-jp":
+        ph = japanese_text_to_phonemes(text)
+        return ph

     raise ValueError(f" [!] Language {language} is not supported for phonemization.")

@@ -1,18 +1,6 @@
-"""
-Cleaners are transformations that run over the input text at both training and eval time.
-
-Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
-hyperparameter. Some cleaners are English-specific. You'll typically want to use:
-  1. "english_cleaners" for English text
-  2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
-     the Unidecode library (https://pypi.python.org/pypi/Unidecode)
-  3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
-     the symbols in symbols.py to match your data).
-"""
-
 import re

-from unidecode import unidecode
+from anyascii import anyascii

 from TTS.tts.utils.text.chinese_mandarin.numbers import replace_numbers_to_characters_in_text

@@ -47,7 +35,7 @@ def collapse_whitespace(text):


 def convert_to_ascii(text):
-    return unidecode(text)
+    return anyascii(text)


 def remove_aux_symbols(text):

@@ -0,0 +1,380 @@
+# Convert Japanese text to phonemes which is
+# compatible with Julius https://github.com/julius-speech/segmentation-kit
+
+import re
+
+import MeCab
+
+_CONVRULES = [
+    # Conversion of 2 letters
+    "アァ/ a a",
+    "イィ/ i i",
+    "イェ/ i e",
+    "イャ/ y a",
+    "ウゥ/ u:",
+    "エェ/ e e",
+    "オォ/ o:",
+    "カァ/ k a:",
+    "キィ/ k i:",
+    "クゥ/ k u:",
+    "クャ/ ky a",
+    "クュ/ ky u",
+    "クョ/ ky o",
+    "ケェ/ k e:",
+    "コォ/ k o:",
+    "ガァ/ g a:",
+    "ギィ/ g i:",
+    "グゥ/ g u:",
+    "グャ/ gy a",
+    "グュ/ gy u",
+    "グョ/ gy o",
+    "ゲェ/ g e:",
+    "ゴォ/ g o:",
+    "サァ/ s a:",
+    "シィ/ sh i:",
+    "スゥ/ s u:",
+    "スャ/ sh a",
+    "スュ/ sh u",
+    "スョ/ sh o",
+    "セェ/ s e:",
+    "ソォ/ s o:",
+    "ザァ/ z a:",
+    "ジィ/ j i:",
+    "ズゥ/ z u:",
+    "ズャ/ zy a",
+    "ズュ/ zy u",
+    "ズョ/ zy o",
+    "ゼェ/ z e:",
+    "ゾォ/ z o:",
+    "タァ/ t a:",
+    "チィ/ ch i:",
+    "ツァ/ ts a",
+    "ツィ/ ts i",
+    "ツゥ/ ts u:",
+    "ツャ/ ch a",
+    "ツュ/ ch u",
+    "ツョ/ ch o",
+    "ツェ/ ts e",
+    "ツォ/ ts o",
+    "テェ/ t e:",
+    "トォ/ t o:",
+    "ダァ/ d a:",
+    "ヂィ/ j i:",
+    "ヅゥ/ d u:",
+    "ヅャ/ zy a",
+    "ヅュ/ zy u",
+    "ヅョ/ zy o",
+    "デェ/ d e:",
+    "ドォ/ d o:",
+    "ナァ/ n a:",
+    "ニィ/ n i:",
+    "ヌゥ/ n u:",
+    "ヌャ/ ny a",
+    "ヌュ/ ny u",
+    "ヌョ/ ny o",
+    "ネェ/ n e:",
+    "ノォ/ n o:",
+    "ハァ/ h a:",
+    "ヒィ/ h i:",
+    "フゥ/ f u:",
+    "フャ/ hy a",
+    "フュ/ hy u",
+    "フョ/ hy o",
+    "ヘェ/ h e:",
+    "ホォ/ h o:",
+    "バァ/ b a:",
+    "ビィ/ b i:",
+    "ブゥ/ b u:",
+    "フャ/ hy a",
+    "ブュ/ by u",
+    "フョ/ hy o",
+    "ベェ/ b e:",
+    "ボォ/ b o:",
+    "パァ/ p a:",
+    "ピィ/ p i:",
+    "プゥ/ p u:",
+    "プャ/ py a",
+    "プュ/ py u",
+    "プョ/ py o",
+    "ペェ/ p e:",
+    "ポォ/ p o:",
+    "マァ/ m a:",
+    "ミィ/ m i:",
+    "ムゥ/ m u:",
+    "ムャ/ my a",
+    "ムュ/ my u",
+    "ムョ/ my o",
+    "メェ/ m e:",
+    "モォ/ m o:",
+    "ヤァ/ y a:",
+    "ユゥ/ y u:",
+    "ユャ/ y a:",
+    "ユュ/ y u:",
+    "ユョ/ y o:",
+    "ヨォ/ y o:",
+    "ラァ/ r a:",
+    "リィ/ r i:",
+    "ルゥ/ r u:",
+    "ルャ/ ry a",
+    "ルュ/ ry u",
+    "ルョ/ ry o",
+    "レェ/ r e:",
+    "ロォ/ r o:",
+    "ワァ/ w a:",
+    "ヲォ/ o:",
+    "ディ/ d i",
+    "デェ/ d e:",
+    "デャ/ dy a",
+    "デュ/ dy u",
+    "デョ/ dy o",
+    "ティ/ t i",
+    "テェ/ t e:",
+    "テャ/ ty a",
+    "テュ/ ty u",
+    "テョ/ ty o",
+    "スィ/ s i",
+    "ズァ/ z u a",
+    "ズィ/ z i",
+    "ズゥ/ z u",
+    "ズャ/ zy a",
+    "ズュ/ zy u",
+    "ズョ/ zy o",
+    "ズェ/ z e",
+    "ズォ/ z o",
+    "キャ/ ky a",
+    "キュ/ ky u",
+    "キョ/ ky o",
+    "シャ/ sh a",
+    "シュ/ sh u",
+    "シェ/ sh e",
+    "ショ/ sh o",
+    "チャ/ ch a",
+    "チュ/ ch u",
+    "チェ/ ch e",
+    "チョ/ ch o",
+    "トゥ/ t u",
+    "トャ/ ty a",
+    "トュ/ ty u",
+    "トョ/ ty o",
+    "ドァ/ d o a",
+    "ドゥ/ d u",
+    "ドャ/ dy a",
+    "ドュ/ dy u",
+    "ドョ/ dy o",
+    "ドォ/ d o:",
+    "ニャ/ ny a",
+    "ニュ/ ny u",
+    "ニョ/ ny o",
+    "ヒャ/ hy a",
+    "ヒュ/ hy u",
+    "ヒョ/ hy o",
+    "ミャ/ my a",
+    "ミュ/ my u",
+    "ミョ/ my o",
+    "リャ/ ry a",
+    "リュ/ ry u",
+    "リョ/ ry o",
+    "ギャ/ gy a",
+    "ギュ/ gy u",
+    "ギョ/ gy o",
+    "ヂェ/ j e",
+    "ヂャ/ j a",
+    "ヂュ/ j u",
+    "ヂョ/ j o",
+    "ジェ/ j e",
+    "ジャ/ j a",
+    "ジュ/ j u",
+    "ジョ/ j o",
+    "ビャ/ by a",
+    "ビュ/ by u",
+    "ビョ/ by o",
+    "ピャ/ py a",
+    "ピュ/ py u",
+    "ピョ/ py o",
+    "ウァ/ u a",
+    "ウィ/ w i",
+    "ウェ/ w e",
+    "ウォ/ w o",
+    "ファ/ f a",
+    "フィ/ f i",
+    "フゥ/ f u",
+    "フャ/ hy a",
+    "フュ/ hy u",
+    "フョ/ hy o",
+    "フェ/ f e",
+    "フォ/ f o",
+    "ヴァ/ b a",
+    "ヴィ/ b i",
+    "ヴェ/ b e",
+    "ヴォ/ b o",
+    "ヴュ/ by u",
+    # Conversion of 1 letter
+    "ア/ a",
+    "イ/ i",
+    "ウ/ u",
+    "エ/ e",
+    "オ/ o",
+    "カ/ k a",
+    "キ/ k i",
+    "ク/ k u",
+    "ケ/ k e",
+    "コ/ k o",
+    "サ/ s a",
+    "シ/ sh i",
+    "ス/ s u",
+    "セ/ s e",
+    "ソ/ s o",
+    "タ/ t a",
+    "チ/ ch i",
+    "ツ/ ts u",
+    "テ/ t e",
+    "ト/ t o",
+    "ナ/ n a",
+    "ニ/ n i",
+    "ヌ/ n u",
+    "ネ/ n e",
+    "ノ/ n o",
+    "ハ/ h a",
+    "ヒ/ h i",
+    "フ/ f u",
+    "ヘ/ h e",
+    "ホ/ h o",
+    "マ/ m a",
+    "ミ/ m i",
+    "ム/ m u",
+    "メ/ m e",
+    "モ/ m o",
+    "ラ/ r a",
+    "リ/ r i",
+    "ル/ r u",
+    "レ/ r e",
+    "ロ/ r o",
+    "ガ/ g a",
+    "ギ/ g i",
+    "グ/ g u",
+    "ゲ/ g e",
+    "ゴ/ g o",
+    "ザ/ z a",
+    "ジ/ j i",
+    "ズ/ z u",
+    "ゼ/ z e",
+    "ゾ/ z o",
+    "ダ/ d a",
+    "ヂ/ j i",
+    "ヅ/ z u",
+    "デ/ d e",
+    "ド/ d o",
+    "バ/ b a",
+    "ビ/ b i",
+    "ブ/ b u",
+    "ベ/ b e",
+    "ボ/ b o",
+    "パ/ p a",
+    "ピ/ p i",
+    "プ/ p u",
+    "ペ/ p e",
+    "ポ/ p o",
+    "ヤ/ y a",
+    "ユ/ y u",
+    "ヨ/ y o",
+    "ワ/ w a",
+    "ヰ/ i",
+    "ヱ/ e",
+    "ヲ/ o",
+    "ン/ N",
+    "ッ/ q",
+    "ヴ/ b u",
+    "ー/:",
+    # Try converting broken text
+    "ァ/ a",
+    "ィ/ i",
+    "ゥ/ u",
+    "ェ/ e",
+    "ォ/ o",
+    "ヮ/ w a",
+    "ォ/ o",
+    # Symbols
+    "、/ ,",
+    "。/ .",
+    "!/ !",
+    "?/ ?",
+    "・/ ,",
+]
+
+_COLON_RX = re.compile(":+")
+_REJECT_RX = re.compile("[^ a-zA-Z:,.?]")
+
+
+def _makerulemap():
+    l = [tuple(x.split("/")) for x in _CONVRULES]
+    return tuple({k: v for k, v in l if len(k) == i} for i in (1, 2))
+
+
+_RULEMAP1, _RULEMAP2 = _makerulemap()
+
+
+def kata2phoneme(text: str) -> str:
+    """Convert katakana text to phonemes."""
+    text = text.strip()
+    res = ""
+    while text:
+        if len(text) >= 2:
+            x = _RULEMAP2.get(text[:2])
+            if x is not None:
+                text = text[2:]
+                res += x
+                continue
+        x = _RULEMAP1.get(text[0])
+        if x is not None:
+            text = text[1:]
+            res += x
+            continue
+        res += " " + text[0]
+        text = text[1:]
+    res = _COLON_RX.sub(":", res)
+    return res[1:]
+
+
+_KATAKANA = "".join(chr(ch) for ch in range(ord("ァ"), ord("ン") + 1))
+_HIRAGANA = "".join(chr(ch) for ch in range(ord("ぁ"), ord("ん") + 1))
+_HIRA2KATATRANS = str.maketrans(_HIRAGANA, _KATAKANA)
+
+
+def hira2kata(text: str) -> str:
+    text = text.translate(_HIRA2KATATRANS)
+    return text.replace("う゛", "ヴ")
+
+
+_SYMBOL_TOKENS = set(list("・、。?!"))
+_NO_YOMI_TOKENS = set(list("「」『』―()[][] …"))
+_TAGGER = MeCab.Tagger()
+
+
+def text2kata(text: str) -> str:
+    parsed = _TAGGER.parse(text)
+    res = []
+    for line in parsed.split("\n"):
+        if line == "EOS":
+            break
+        parts = line.split("\t")
+
+        word, yomi = parts[0], parts[1]
+        if yomi:
+            res.append(yomi)
+        else:
+            if word in _SYMBOL_TOKENS:
+                res.append(word)
+            elif word in ("っ", "ッ"):
+                res.append("ッ")
+            elif word in _NO_YOMI_TOKENS:
+                pass
+            else:
+                res.append(word)
+    return hira2kata("".join(res))
+
+
+def japanese_text_to_phonemes(text: str) -> str:
+    """Convert Japanese text to phonemes."""
+    res = text2kata(text)
+    res = kata2phoneme(res)
+    return res.replace(" ", "")

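Aside: a small usage sketch for the new phonemizer module. kata2phoneme is a deterministic rule lookup on katakana, while japanese_text_to_phonemes additionally runs MeCab, so it needs the mecab-python3 and unidic-lite dependencies added further down:

    from TTS.tts.utils.text.japanese.phonemizer import japanese_text_to_phonemes, kata2phoneme

    # pure table lookup; the long-vowel mark maps to ":"
    assert kata2phoneme("トーキョー") == "t o: ky o:"

    # full pipeline: MeCab reading -> katakana -> phonemes, with spaces stripped
    print(japanese_text_to_phonemes("こんにちは"))
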
@@ -152,6 +152,7 @@ def process_args(args):
     experiment_path = create_experiment_folder(config.output_path, config.run_name, args.debug)
     audio_path = os.path.join(experiment_path, "test_audios")
     # setup rank 0 process in distributed training
+    tb_logger = None
     if args.rank == 0:
         os.makedirs(audio_path, exist_ok=True)
         new_fields = {}

@@ -149,7 +149,7 @@ class ModelManager(object):
     def _download_zip_file(file_url, output):
         """Download the github releases"""
         r = requests.get(file_url)
-        z = zipfile.ZipFile(io.BytesIO(r.content))
+        with zipfile.ZipFile(io.BytesIO(r.content)) as z:
             z.extractall(output)
         for file_path in z.namelist()[1:]:
             src_path = os.path.join(output, file_path)

@@ -0,0 +1,23 @@
+#!/bin/bash
+# take the scripts's parent's directory to prefix all the output paths.
+RUN_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
+CORPUS=kokoro-speech-v1_1-small
+echo $RUN_DIR
+if [ \! -d $RUN_DIR/$CORPUS ] ; then
+  echo "$RUN_DIR/$CORPUS doesn't exist."
+  echo "Follow the instruction of https://github.com/kaiidams/Kokoro-Speech-Dataset to make the corpus."
+  exit 1
+fi
+# create train-val splits
+shuf $RUN_DIR/$CORPUS/metadata.csv > $RUN_DIR/$CORPUS/metadata_shuf.csv
+head -n 8000 $RUN_DIR/$CORPUS/metadata_shuf.csv > $RUN_DIR/$CORPUS/metadata_train.csv
+tail -n 812 $RUN_DIR/$CORPUS/metadata_shuf.csv > $RUN_DIR/$CORPUS/metadata_val.csv
+# compute dataset mean and variance for normalization
+python TTS/bin/compute_statistics.py $RUN_DIR/tacotron2-DDC.json $RUN_DIR/scale_stats.npy --data_path $RUN_DIR/$CORPUS/wavs/
+# training ....
+# change the GPU id if needed
+CUDA_VISIBLE_DEVICES="0" python TTS/bin/train_tacotron.py --config_path $RUN_DIR/tacotron2-DDC.json \
+    --coqpit.output_path $RUN_DIR \
+    --coqpit.datasets.0.path $RUN_DIR/$CORPUS \
+    --coqpit.audio.stats_path $RUN_DIR/scale_stats.npy \
+    --coqpit.phoneme_cache_path $RUN_DIR/phoneme_cache \

@@ -0,0 +1,125 @@
+{
+    "datasets": [
+        {
+            "name": "kokoro",
+            "path": "DEFINE THIS",
+            "meta_file_train": "metadata.csv",
+            "meta_file_val": null
+        }
+    ],
+    "audio": {
+        "fft_size": 1024,
+        "win_length": 1024,
+        "hop_length": 256,
+        "frame_length_ms": null,
+        "frame_shift_ms": null,
+        "sample_rate": 22050,
+        "preemphasis": 0.0,
+        "ref_level_db": 20,
+        "do_trim_silence": true,
+        "trim_db": 60,
+        "power": 1.5,
+        "griffin_lim_iters": 60,
+        "num_mels": 80,
+        "mel_fmin": 50.0,
+        "mel_fmax": 7600.0,
+        "spec_gain": 1,
+        "signal_norm": true,
+        "min_level_db": -100,
+        "symmetric_norm": true,
+        "max_norm": 4.0,
+        "clip_norm": true,
+        "stats_path": "scale_stats.npy"
+    },
+    "gst":{
+        "gst_style_input": null,
+        "gst_embedding_dim": 512,
+        "gst_num_heads": 4,
+        "gst_style_tokens": 10,
+        "gst_use_speaker_embedding": false
+    },
+    "model": "Tacotron2",
+    "run_name": "kokoro-ddc",
+    "run_description": "tacotron2 with DDC and differential spectral loss.",
+    "batch_size": 32,
+    "eval_batch_size": 16,
+    "mixed_precision": true,
+    "distributed": {
+        "backend": "nccl",
+        "url": "tcp:\/\/localhost:54321"
+    },
+    "reinit_layers": [],
+    "loss_masking": true,
+    "decoder_loss_alpha": 0.5,
+    "postnet_loss_alpha": 0.25,
+    "postnet_diff_spec_alpha": 0.25,
+    "decoder_diff_spec_alpha": 0.25,
+    "decoder_ssim_alpha": 0.5,
+    "postnet_ssim_alpha": 0.25,
+    "ga_alpha": 5.0,
+    "stopnet_pos_weight": 15.0,
+    "run_eval": true,
+    "test_delay_epochs": 10,
+    "test_sentences_file": null,
+    "noam_schedule": false,
+    "grad_clip": 1.0,
+    "epochs": 1000,
+    "lr": 0.0001,
+    "wd": 0.000001,
+    "warmup_steps": 4000,
+    "seq_len_norm": false,
+    "memory_size": -1,
+    "prenet_type": "original",
+    "prenet_dropout": true,
+    "attention_type": "original",
+    "windowing": false,
+    "use_forward_attn": false,
+    "forward_attn_mask": false,
+    "transition_agent": false,
+    "location_attn": true,
+    "bidirectional_decoder": false,
+    "double_decoder_consistency": true,
+    "ddc_r": 7,
+    "attention_heads": 4,
+    "attention_norm": "sigmoid",
+    "r": 7,
+    "gradual_training": [[0, 7, 64], [1, 5, 64], [50000, 3, 32], [130000, 2, 32], [290000, 1, 32]],
+    "stopnet": true,
+    "separate_stopnet": true,
+    "print_step": 25,
+    "tb_plot_step": 100,
+    "print_eval": false,
+    "save_step": 10000,
+    "checkpoint": true,
+    "keep_all_best": false,
+    "keep_after": 10000,
+    "tb_model_param_stats": false,
+    "text_cleaner": "basic_cleaners",
+    "enable_eos_bos_chars": false,
+    "num_loader_workers": 4,
+    "num_val_loader_workers": 4,
+    "batch_group_size": 4,
+    "min_seq_len": 6,
+    "max_seq_len": 153,
+    "compute_input_seq_cache": false,
+    "use_noise_augment": true,
+    "output_path": "DEFINE THIS",
+    "phoneme_cache_path": "DEFINE THIS",
+    "use_phonemes": true,
+    "phoneme_language": "ja-jp",
+    "characters": {
+        "pad": "_",
+        "eos": "~",
+        "bos": "^",
+        "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),-.:;? ",
+        "punctuations": "!'(),-.:;? ",
+        "phonemes": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
+    },
+    "use_speaker_embedding": false,
+    "use_gst": false,
+    "use_external_speaker_embedding_file": false,
+    "external_speaker_embedding_file": "../../speakers-vctk-en.json"
+}

@@ -2,4 +2,4 @@ black
 coverage
 isort
 nose
-pylint==2.7.4
+pylint==2.8.3

@@ -17,5 +17,8 @@ torch>=1.7
 tqdm
 numba==0.52
 umap-learn==0.4.6
-unidecode==0.4.20
+anyascii
 coqpit
+# japanese g2p deps
+mecab-python3==1.0.3
+unidic-lite==1.0.8

setup.py
@@ -4,7 +4,6 @@ import os
 import subprocess
 import sys
 from distutils.version import LooseVersion
-from TTS._version import __version__

 import numpy
 import setuptools.command.build_py

@@ -12,82 +11,85 @@ import setuptools.command.develop
 from Cython.Build import cythonize
 from setuptools import Extension, find_packages, setup
 
 
 if LooseVersion(sys.version) < LooseVersion("3.6") or LooseVersion(sys.version) > LooseVersion("3.9"):
-    raise RuntimeError(
-        "TTS requires python >= 3.6 and <3.9 "
-        "but your Python version is {}".format(sys.version)
-    )
+    raise RuntimeError("TTS requires python >= 3.6 and <3.9 " "but your Python version is {}".format(sys.version))
 
 
-version = __version__
 cwd = os.path.dirname(os.path.abspath(__file__))
+with open(os.path.join(cwd, "TTS", "VERSION")) as fin:
+    version = fin.read().strip()
 
 
 class build_py(setuptools.command.build_py.build_py):  # pylint: disable=too-many-ancestors
     def run(self):
-        self.create_version_file()
         setuptools.command.build_py.build_py.run(self)
 
-    @staticmethod
-    def create_version_file():
-        print('-- Building version ' + version)
-        version_path = os.path.join(cwd, 'version.py')
-        with open(version_path, 'w') as f:
-            f.write("__version__ = '{}'\n".format(version))
 
 class develop(setuptools.command.develop.develop):
     def run(self):
-        build_py.create_version_file()
         setuptools.command.develop.develop.run(self)
 
 
 # The documentation for this feature is in server/README.md
-package_data = ['TTS/server/templates/*']
+package_data = ["TTS/server/templates/*"]
 
 
 def pip_install(package_name):
-    subprocess.call([sys.executable, '-m', 'pip', 'install', package_name])
+    subprocess.call([sys.executable, "-m", "pip", "install", package_name])
 
 
-requirements = open(os.path.join(cwd, 'requirements.txt'), 'r').readlines()
-with open(os.path.join(cwd, 'requirements.notebooks.txt'), 'r') as f:
+requirements = open(os.path.join(cwd, "requirements.txt"), "r").readlines()
+with open(os.path.join(cwd, "requirements.notebooks.txt"), "r") as f:
     requirements_notebooks = f.readlines()
-with open(os.path.join(cwd, 'requirements.dev.txt'), 'r') as f:
+with open(os.path.join(cwd, "requirements.dev.txt"), "r") as f:
     requirements_dev = f.readlines()
-with open(os.path.join(cwd, 'requirements.tf.txt'), 'r') as f:
+with open(os.path.join(cwd, "requirements.tf.txt"), "r") as f:
     requirements_tf = f.readlines()
 requirements_all = requirements_dev + requirements_notebooks + requirements_tf
 
-with open('README.md', "r", encoding="utf-8") as readme_file:
+with open("README.md", "r", encoding="utf-8") as readme_file:
     README = readme_file.read()
 
-exts = [Extension(name='TTS.tts.layers.glow_tts.monotonic_align.core',
-                  sources=["TTS/tts/layers/glow_tts/monotonic_align/core.pyx"])]
+exts = [
+    Extension(
+        name="TTS.tts.layers.glow_tts.monotonic_align.core",
+        sources=["TTS/tts/layers/glow_tts/monotonic_align/core.pyx"],
+    )
+]
 setup(
-    name='TTS',
+    name="TTS",
     version=version,
-    url='https://github.com/coqui-ai/TTS',
-    author='Eren Gölge',
-    author_email='egolge@coqui.ai',
-    description='Deep learning for Text to Speech by Coqui.',
+    url="https://github.com/coqui-ai/TTS",
+    author="Eren Gölge",
+    author_email="egolge@coqui.ai",
+    description="Deep learning for Text to Speech by Coqui.",
     long_description=README,
     long_description_content_type="text/markdown",
-    license='MPL-2.0',
+    license="MPL-2.0",
     # cython
     include_dirs=numpy.get_include(),
     ext_modules=cythonize(exts, language_level=3),
     # ext_modules=find_cython_extensions(),
     # package
     include_package_data=True,
-    packages=find_packages(include=['TTS*']),
+    packages=find_packages(include=["TTS*"]),
+    package_data={
+        "TTS": [
+            "VERSION",
+        ]
+    },
     project_urls={
-        'Documentation': 'https://github.com/coqui-ai/TTS/wiki',
-        'Tracker': 'https://github.com/coqui-ai/TTS/issues',
-        'Repository': 'https://github.com/coqui-ai/TTS',
-        'Discussions': 'https://github.com/coqui-ai/TTS/discussions',
+        "Documentation": "https://github.com/coqui-ai/TTS/wiki",
+        "Tracker": "https://github.com/coqui-ai/TTS/issues",
+        "Repository": "https://github.com/coqui-ai/TTS",
+        "Discussions": "https://github.com/coqui-ai/TTS/discussions",
     },
     cmdclass={
-        'build_py': build_py,
-        'develop': develop,
+        "build_py": build_py,
+        "develop": develop,
         # 'build_ext': build_ext
     },
     install_requires=requirements,
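Note on the versioning change above: the generated version.py and the TTS._version import are replaced by a plain TTS/VERSION file that setup.py reads at build time and ships inside the package via package_data. A hedged sketch of how the installed package could resolve its own version under this layout (the actual TTS/__init__.py may do this differently):

import os

def get_version() -> str:
    # TTS/VERSION sits next to the package's __init__.py once installed,
    # because package_data={"TTS": ["VERSION"]} copies it into the wheel.
    here = os.path.dirname(os.path.abspath(__file__))
    with open(os.path.join(here, "VERSION")) as f:
        return f.read().strip()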
@@ -97,30 +99,25 @@ setup(
         "notebooks": requirements_notebooks,
         "tf": requirements_tf,
     },
-    python_requires='>=3.6.0, <3.9',
+    python_requires=">=3.6.0, <3.9",
-    entry_points={
-        'console_scripts': [
-            'tts=TTS.bin.synthesize:main',
-            'tts-server = TTS.server.server:main'
-        ]
-    },
+    entry_points={"console_scripts": ["tts=TTS.bin.synthesize:main", "tts-server = TTS.server.server:main"]},
     classifiers=[
         "Programming Language :: Python",
         "Programming Language :: Python :: 3",
         "Programming Language :: Python :: 3.6",
         "Programming Language :: Python :: 3.7",
         "Programming Language :: Python :: 3.8",
-        'Development Status :: 3 - Alpha',
+        "Development Status :: 3 - Alpha",
         "Intended Audience :: Science/Research",
         "Intended Audience :: Developers",
         "Operating System :: POSIX :: Linux",
-        'License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)',
+        "License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)",
         "Topic :: Software Development",
         "Topic :: Software Development :: Libraries :: Python Modules",
         "Topic :: Multimedia :: Sound/Audio :: Speech",
         "Topic :: Multimedia :: Sound/Audio",
         "Topic :: Multimedia",
-        "Topic :: Scientific/Engineering :: Artificial Intelligence"
+        "Topic :: Scientific/Engineering :: Artificial Intelligence",
     ],
-    zip_safe=False
+    zip_safe=False,
 )
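The collapsed entry_points line keeps the same two console scripts. What a console_scripts entry means, roughly: pip generates a small launcher that imports the target module and calls the named function. A hand-rolled equivalent of the "tts" entry (a sketch, not how pip actually writes launchers):

import sys

from TTS.bin.synthesize import main

if __name__ == "__main__":
    sys.exit(main())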
@@ -6,6 +6,7 @@ from tests import get_tests_input_path
 from TTS.speaker_encoder.losses import AngleProtoLoss, GE2ELoss, SoftmaxAngleProtoLoss
 from TTS.speaker_encoder.models.lstm import LSTMSpeakerEncoder
 from TTS.speaker_encoder.models.resnet import ResNetSpeakerEncoder
 
 file_path = get_tests_input_path()
 
 
@@ -39,6 +40,7 @@ class LSTMSpeakerEncoderTests(unittest.TestCase):
         assert output.shape[1] == 256
         assert len(output.shape) == 2
 
 
 class ResNetSpeakerEncoderTests(unittest.TestCase):
     # pylint: disable=R0201
     def test_in_out(self):
@@ -65,6 +67,7 @@ class ResNetSpeakerEncoderTests(unittest.TestCase):
         assert output.shape[1] == 256
         assert len(output.shape) == 2
 
 
 class GE2ELossTests(unittest.TestCase):
     # pylint: disable=R0201
     def test_in_out(self):
@@ -92,6 +95,7 @@ class GE2ELossTests(unittest.TestCase):
         output = loss.forward(dummy_input)
         assert output.item() < 0.005
 
 
 class AngleProtoLossTests(unittest.TestCase):
     # pylint: disable=R0201
     def test_in_out(self):
@@ -121,6 +125,7 @@ class AngleProtoLossTests(unittest.TestCase):
         output = loss.forward(dummy_input)
         assert output.item() < 0.005
 
 
 class SoftmaxAngleProtoLossTests(unittest.TestCase):
     # pylint: disable=R0201
     def test_in_out(self):
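The shape assertions in these tests pin down the speaker encoder contract: a batch of mel-spectrogram frames in, one fixed 256-dimensional embedding per utterance out. A hedged sketch of that contract (constructor arguments and input layout are assumptions, not necessarily the library's exact API):

import torch

from TTS.speaker_encoder.models.lstm import LSTMSpeakerEncoder

# proj_dim=256 matches the embedding size the tests assert on.
model = LSTMSpeakerEncoder(input_dim=80, proj_dim=256, lstm_dim=768, num_lstm_layers=3)
dummy_input = torch.rand(4, 20, 80)  # (batch, time, mel_dim), assumed layout
output = model.forward(dummy_input)
assert output.shape[1] == 256  # one 256-d embedding per utterance
assert len(output.shape) == 2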
@@ -46,7 +46,7 @@ run_cli(command_train)
 shutil.rmtree(continue_path)
 
 # test resnet speaker encoder
-config.model_params['model_name'] = "resnet"
+config.model_params["model_name"] = "resnet"
 config.save_json(config_path)
 
 # train the model for one epoch
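The resnet switch works because the test mutates the in-memory config and rewrites the JSON that the next training invocation loads. A sketch of that round trip, using only the names visible in the diff above:

# Flip the encoder variant, then persist it for the next CLI run.
config.model_params["model_name"] = "resnet"
config.save_json(config_path)  # the next run_cli(command_train) reads this file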
@@ -0,0 +1,24 @@
+import unittest
+
+from TTS.tts.utils.text.japanese.phonemizer import japanese_text_to_phonemes
+
+_TEST_CASES = """
+どちらに行きますか?/dochiraniikimasuka?
+今日は温泉に、行きます。/kyo:waoNseNni,ikimasu.
+「A」から「Z」までです。/AkaraZmadedesu.
+そうですね!/so:desune!
+クジラは哺乳類です。/kujirawahonyu:ruidesu.
+ヴィディオを見ます。/bidioomimasu.
+ky o: w a o N s e N n i , i k i m a s u ./kyo:waoNseNni,ikimasu.
+"""
+
+
+class TestText(unittest.TestCase):
+    def test_japanese_text_to_phonemes(self):
+        for line in _TEST_CASES.strip().split("\n"):
+            text, phone = line.split("/")
+            self.assertEqual(japanese_text_to_phonemes(text), phone)
+
+
+if __name__ == "__main__":
+    unittest.main()
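The test data doubles as documentation of the phonemizer's conventions: "N" is the moraic nasal (oNseN for 温泉), ":" marks a long vowel (kyo:), and already space-separated phoneme input (the last case) is normalized back to the same phoneme string. A quick interactive check through the same API the test exercises:

from TTS.tts.utils.text.japanese.phonemizer import japanese_text_to_phonemes

print(japanese_text_to_phonemes("今日は温泉に、行きます。"))  # kyo:waoNseNni,ikimasu.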
@@ -17,7 +17,7 @@ config = GlowTTSConfig(
     text_cleaner="english_cleaners",
     use_phonemes=True,
     phoneme_language="zh-CN",
-    phoneme_cache_path='tests/data/ljspeech/phoneme_cache/',
+    phoneme_cache_path="tests/data/ljspeech/phoneme_cache/",
     run_eval=True,
     test_delay_epochs=-1,
     epochs=1,
@@ -17,7 +17,7 @@ config = SpeedySpeechConfig(
     text_cleaner="english_cleaners",
     use_phonemes=True,
     phoneme_language="zh-CN",
-    phoneme_cache_path='tests/data/ljspeech/phoneme_cache/',
+    phoneme_cache_path="tests/data/ljspeech/phoneme_cache/",
     run_eval=True,
     test_delay_epochs=-1,
     epochs=1,
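Both training-test configs above share the same phoneme setup; only the quote style changes. For context, a hedged sketch of the pattern they follow (the import path is assumed for this era of the codebase):

from TTS.tts.configs import GlowTTSConfig  # assumed import location

config = GlowTTSConfig(
    text_cleaner="english_cleaners",
    use_phonemes=True,
    phoneme_language="zh-CN",
    phoneme_cache_path="tests/data/ljspeech/phoneme_cache/",  # phonemized text is cached and reused across runs
    run_eval=True,
    test_delay_epochs=-1,
    epochs=1,  # single epoch keeps the smoke test fast
)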
@@ -20,6 +20,7 @@ config = FullbandMelganConfig(
     eval_split_size=1,
     print_step=1,
     print_eval=True,
+    discriminator_model_params={"base_channels": 16, "max_channels": 256, "downsample_factors": [4, 4, 4]},
     data_path="tests/data/ljspeech",
     output_path=output_path,
 )
|
@ -19,6 +19,7 @@ config = MelganConfig(
|
||||||
seq_len=2048,
|
seq_len=2048,
|
||||||
eval_split_size=1,
|
eval_split_size=1,
|
||||||
print_step=1,
|
print_step=1,
|
||||||
|
discriminator_model_params={"base_channels": 16, "max_channels": 256, "downsample_factors": [4, 4, 4]},
|
||||||
print_eval=True,
|
print_eval=True,
|
||||||
data_path="tests/data/ljspeech",
|
data_path="tests/data/ljspeech",
|
||||||
output_path=output_path,
|
output_path=output_path,
|
||||||
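The added discriminator_model_params shrink the MelGAN discriminator so these one-epoch smoke tests stay fast: three strided stages of factor 4 give an overall downsampling of 4 * 4 * 4 = 64, with channel width capped at 256 (smaller than typical defaults, which is an assumption here). The same dict built programmatically, values taken straight from the diff:

import math

downsample_factors = [4, 4, 4]
discriminator_model_params = {
    "base_channels": 16,
    "max_channels": 256,
    "downsample_factors": downsample_factors,
}
# overall temporal downsampling across the discriminator's strided stages
assert math.prod(downsample_factors) == 64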